blob: 9d9611f86ec11321ed8e29ffe37abeabecfcc38c [file] [log] [blame]
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +01001# Copyright 2016 The Chromium Authors
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
Copybara854996b2021-09-07 19:36:02 +00004
5"""Autolink helps auto-link references to artifacts in text.
6
7This class maintains a registry of artifact autolink syntax specs and
8callbacks. The structure of that registry is:
9 { component_name: (lookup_callback, match_to_reference_function,
10 { regex: substitution_callback, ...}),
11 ...
12 }
13
14For example:
15 { 'tracker':
16 (GetReferencedIssues,
17 ExtractProjectAndIssueIds,
18 {_ISSUE_REF_RE: ReplaceIssueRef}),
19 'versioncontrol':
20 (GetReferencedRevisions,
21 ExtractProjectAndRevNum,
22 {_GIT_HASH_RE: ReplaceRevisionRef}),
23 }
24
25The dictionary of regexes is used here because, in the future, we
26might add more regexes for each component rather than have one complex
27regex per component.
28"""
29from __future__ import print_function
30from __future__ import division
31from __future__ import absolute_import
32
33import logging
34import re
Adrià Vilanova Martínezde942802022-07-15 14:06:55 +020035from six.moves import urllib
36from six.moves.urllib.parse import urlparse
Copybara854996b2021-09-07 19:36:02 +000037
38import settings
39from features import autolink_constants
40from framework import template_helpers
41from framework import validate
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +010042from mrproto import project_pb2
Copybara854996b2021-09-07 19:36:02 +000043from tracker import tracker_helpers
44
45
# Upper bound on the combined length of all comment text; beyond this,
# artifact lookups for autolinking are skipped.
_MAX_TOTAL_LENGTH = 1024 * 150  # 150KB

# Sentinel returned in place of the all_referenced_artifacts dict when the
# text content is too big to look up all referenced artifacts quickly.
SKIP_LOOKUPS = 'skip lookups'

# Matches an HTML closing tag (e.g. "</b>") at the very end of a string.
_CLOSING_TAG_RE = re.compile('</[a-z0-9]+>$', flags=re.IGNORECASE)

# (opening, closing) delimiter pairs.  These characters are legal inside a
# link, but when a closing delimiter is the last character of the matched
# text and its opening counterpart does not occur earlier in the text, the
# closing delimiter is trimmed off the link.  An opening value of None means
# the closing character is always trimmed.
_LINK_TRAILING_CHARS = [
    (None, ':'), (None, '.'), (None, ','),
    ('(', ')'), ('[', ']'), ('{', '}'), ('<', '>'),
    ("'", "'"), ('"', '"'),
]
68
69
def LinkifyEmail(_mr, autolink_regex_match, component_ref_artifacts):
  """Turn a matched email address into a profile link or a mailto: link.

  This is an autolink substitution callback; it takes the standard
  (mr, match, artifacts) arguments of such callbacks.

  Args:
    _mr: unused information parsed from the HTTP request.
    autolink_regex_match: regex match whose full text is the email address.
    component_ref_artifacts: result of the GetReferencedUsers lookup; may be
        falsy, in which case no address is treated as a known user.

  Returns:
    A one-element list of TextRuns.  An invalid address is returned as plain
    text.  An address found in component_ref_artifacts links to its
    '/u/<email>' page; any other valid address becomes a mailto: link.
  """
  address = autolink_regex_match.group(0)

  # Anything that does not validate as an email stays plain text.
  if not validate.IsValidEmail(address):
    return [template_helpers.TextRun(address)]

  is_known_user = bool(
      component_ref_artifacts and address in component_ref_artifacts)
  target = '/u/%s' % address if is_known_user else 'mailto:' + address
  return [template_helpers.TextRun(address, tag='a', href=target)]
97
98
def CurryGetReferencedUsers(services):
  """Build a user-lookup callback with the services object bound to it.

  Binding services in a closure lets the callback reach the persistence
  layer without the autolink registry having to pass services objects
  through every function.

  Args:
    services: connection to the user persistence layer.

  Returns:
    A function taking the (mr, refs) arguments that autolink passes to
    lookup callbacks.
  """

  def GetReferencedUsers(mr, emails):
    """Look up the existing users among the given email addresses.

    Args:
      mr: commonly used info parsed from the request.
      emails: email address strings mentioned in the comment text.

    Returns:
      A dictionary {email: user} with one entry for each email that
      LookupExistingUserIDs resolved to a user ID.
    """
    ids_by_email = services.user.LookupExistingUserIDs(mr.cnxn, emails)
    found_ids = list(ids_by_email.values())
    users_by_id = services.user.GetUsersByIDs(mr.cnxn, found_ids)

    users_by_email = {}
    for email, user_id in ids_by_email.items():
      users_by_email[email] = users_by_id[user_id]
    return users_by_email

  return GetReferencedUsers
134
135
def Linkify(_mr, autolink_regex_match, _component_ref_artifacts):
  """Turn a matched URL-like string into a hyperlink, if it is valid.

  This is an autolink substitution callback; it takes the standard
  (mr, match, artifacts) arguments of such callbacks.

  Args:
    _mr: unused information parsed from the HTTP request.
    autolink_regex_match: regex match for the textual reference.
    _component_ref_artifacts: unused lookup result.

  Returns:
    A list of TextRuns.  When the link validates, the first run has tag=a
    and it is followed by a plain run holding any trimmed trailing text.
    Otherwise a single plain run with the full matched text is returned.
  """
  matched_text = autolink_regex_match.group(0)
  link_text = matched_text
  suffix = ''

  # Peel off trailing punctuation and unbalanced closing delimiters.  The
  # list is walked once, in order, so each delimiter is considered once.
  for opener, closer in _LINK_TRAILING_CHARS:
    if not link_text.endswith(closer):
      continue
    body = link_text[:-len(closer)]
    if opener and opener in body:
      # The opening delimiter is part of the link, so keep the closer.
      continue
    suffix = closer + suffix
    link_text = body

  # A closing HTML tag stuck to the end of the link is not part of it.
  closing_tag = _CLOSING_TAG_RE.search(link_text)
  if closing_tag:
    cut = closing_tag.start(0)
    suffix = link_text[cut:] + suffix
    link_text = link_text[:cut]

  if link_text.lower().startswith(('http', 'ftp', 'mailto')):
    href = link_text
  else:
    # We use http because redirects for https are not all set up.
    href = 'http://' + link_text

  # href[7:] is the part after the 7-character 'mailto:' prefix.
  if not (validate.IsValidURL(href) or
          (href.startswith('mailto') and validate.IsValidEmail(href[7:]))):
    return [template_helpers.TextRun(matched_text)]

  runs = [template_helpers.TextRun(link_text, tag='a', href=href)]
  if suffix:
    runs.append(template_helpers.TextRun(suffix))
  return runs
179
180
# Regular expression to detect git hashes.
# Used to auto-link to Git hashes when displaying issue details.
# Matches a 40-character hexadecimal string that stands on its own as a
# word, optionally prefixed by "r", "revision " or "revision #".
_GIT_HASH_RE = re.compile(
    r'\b(?P<prefix>r(evision\s+#?)?)?(?P<revnum>([a-f0-9]{40}))\b',
    flags=re.MULTILINE | re.IGNORECASE)

# This is for SVN revisions and Git commit positions.
# Matches "rN", "revision N" and "revision #N" where N has 4 to 7 decimal
# digits.  Unlike the git hash pattern, the prefix is required.
_SVN_REF_RE = re.compile(
    r'\b(?P<prefix>r(evision\s+#?)?)(?P<revnum>([0-9]{4,7}))\b',
    flags=re.MULTILINE | re.IGNORECASE)
193
194
def GetReferencedRevisions(_mr, _refs):
  """Lookup callback for revisions; currently looks nothing up.

  Args:
    _mr: unused info parsed from the request.
    _refs: unused revision references found in the comment text.

  Returns:
    Always None.
  """
  # Every revision reference is autolinked without first checking that
  # such a revision exists.
  # TODO(jrobbins): Hit crrev.com and check that the revision exists
  # and show a rollover with revision info.
  referenced_revisions = None
  return referenced_revisions
202
203
def ExtractRevNums(_mr, autolink_regex_match):
  """Convert a revision regex match into a list of revision references.

  Args:
    _mr: unused info parsed from the request.
    autolink_regex_match: regex match that has a 'revnum' named group.

  Returns:
    A one-element list holding the text of the 'revnum' group.
  """
  revnum = autolink_regex_match.group('revnum')
  logging.debug('revision ref = %s', revnum)
  refs = [revnum]
  return refs
209
210
def ReplaceRevisionRef(
    mr, autolink_regex_match, _component_ref_artifacts):
  """Return a link run for a matched revision reference.

  Args:
    mr: info parsed from the request; mr.project is used to select the
        revision URL format.
    autolink_regex_match: regex match with 'prefix' and 'revnum' groups.
    _component_ref_artifacts: unused lookup result.

  Returns:
    A one-element list with a TextRun (tag=a) whose text is the prefix, if
    any, followed by the revision, and whose href is the revision URL
    format filled in with the revision.
  """
  revnum = autolink_regex_match.group('revnum')
  prefix = autolink_regex_match.group('prefix')

  url_format = _GetRevisionURLFormat(mr.project)
  url = url_format.format(revnum=revnum)

  # Keep any "r" / "revision" prefix as part of the visible link text.
  content = '%s%s' % (prefix, revnum) if prefix else revnum
  return [template_helpers.TextRun(content, tag='a', href=url)]
221
222
def _GetRevisionURLFormat(project):
  """Return the revision URL format string to use for a project.

  Args:
    project: object with a revision_url_format attribute.

  Returns:
    The project's own revision_url_format when it is set (truthy),
    otherwise settings.revision_url_format.
  """
  # TODO(jrobbins): Expose a UI to customize it to point to whatever site
  # hosts the source code. Also, site-wide default.
  project_format = project.revision_url_format
  if project_format:
    return project_format
  return settings.revision_url_format
227
228
# Regular expression to detect issue references.
# Used to auto-link to other issues when displaying issue details.
# Matches "issue", "issues", "bug" or "bugs" at the start of a word,
# optionally followed by ":" or "=", and then one or more issue numbers.
# Each number may be qualified with "project:" or "project#" and/or a "#",
# and numbers may be separated by commas, "and", or "or".
_ISSUE_REF_RE = re.compile(r"""
    (?P<prefix>\b(issues?|bugs?)[ \t]*(:|=)?)
    ([ \t]*(?P<project_name>\b[-a-z0-9]+[:\#])?
     (?P<number_sign>\#?)
     (?P<local_id>\d+)\b
     (,?[ \t]*(and|or)?)?)+""", re.IGNORECASE | re.VERBOSE)

# This is for chromium.org's crbug.com shorthand domain.
# Matches "crbug.com/N" and "crbug.com/project/N", with an optional
# "http://" or "https://" in front and an optional "#cN" comment anchor.
# The dot is escaped so that only a literal "crbug.com" is accepted.
_CRBUG_REF_RE = re.compile(r"""
    (?P<prefix>\b(https?://)?crbug\.com/)
    ((?P<project_name>\b[-a-z0-9]+)(?P<separator>/))?
    (?P<local_id>\d+)\b
    (?P<anchor>\#c[0-9]+)?""", re.IGNORECASE | re.VERBOSE)

# Once the overall issue reference has been detected, pick out the specific
# issue project:id items within it.  Often there is just one, but the "and|or"
# syntax can allow multiple issues.
_SINGLE_ISSUE_REF_RE = re.compile(r"""
    (?P<prefix>\b(issue|bug)[ \t]*)?
    (?P<project_name>\b[-a-z0-9]+[:\#])?
    (?P<number_sign>\#?)
    (?P<local_id>\d+)\b""", re.IGNORECASE | re.VERBOSE)
255
256
def CurryGetReferencedIssues(services):
  """Build an issue-lookup callback with the services object bound to it.

  Binding services in a closure lets the callback reach the persistence
  layers without the autolink registry having to pass services objects
  through every function.

  Args:
    services: connection to issue, config, and project persistence layers.

  Returns:
    A function taking the (mr, refs) arguments that autolink passes to
    lookup callbacks.
  """

  def GetReferencedIssues(mr, ref_tuples):
    """Look up the open and closed issues referenced by these comments.

    Args:
      mr: commonly used info parsed from the request.
      ref_tuples: (project_name, local_id) tuples for each issue that is
          mentioned in the comment text.  A falsy project_name means the
          current project (mr.project_name) is used for the project lookup.

    Returns:
      A pair (open_dict, closed_dict).  Each dict maps the key produced by
      _IssueProjectKey(project_name, local_id) to the issue.
    """
    project_names = [
        ref_pn or mr.project_name for ref_pn, _local_id in ref_tuples]
    ref_projects = services.project.GetProjectsByName(mr.cnxn, project_names)
    issue_ids, _misses = services.issue.ResolveIssueRefs(
        mr.cnxn, ref_projects, mr.project_name, ref_tuples)
    open_issues, closed_issues = (
        tracker_helpers.GetAllowedOpenedAndClosedIssues(
            mr, issue_ids, services))

    open_dict = {
        _IssueProjectKey(issue.project_name, issue.local_id): issue
        for issue in open_issues}
    closed_dict = {
        _IssueProjectKey(issue.project_name, issue.local_id): issue
        for issue in closed_issues}

    logging.info('autolinking dicts %r and %r', open_dict, closed_dict)
    return open_dict, closed_dict

  return GetReferencedIssues
306
307
def _ParseProjectNameMatch(project_name):
  """Reduce a regex-matched project name to its minimal representation.

  Leading whitespace and any trailing '#', ':', space, tab or newline
  characters are removed.

  Args:
    project_name: a string with the project name matched in a regex, or
        None/empty when the regex group did not match.

  Returns:
    The stripped project name, or None when the input is empty or nothing
    is left after stripping.
  """
  if not project_name:
    return None
  cleaned = project_name.lstrip().rstrip('#: \t\n')
  # Input made only of separators or whitespace has no valid content, so
  # report it as None rather than as an empty string.
  return cleaned or None
320
321
def _ExtractProjectAndIssueIds(
    autolink_regex_match, subregex, default_project_name=None):
  """Convert a regex match for a textual reference into our internal form.

  Args:
    autolink_regex_match: regex match covering one or more issue references.
    subregex: compiled regex with 'project_name' and 'local_id' groups, used
        to find each individual reference inside the matched text.
    default_project_name: project name to use for references that do not
        name a project.

  Returns:
    A list of (project_name, local_id) tuples, with local_id as an int.
  """
  matched_text = autolink_regex_match.group(0)
  refs = []
  for single in subregex.finditer(matched_text):
    parsed_name = _ParseProjectNameMatch(single.group('project_name'))
    ref = (parsed_name or default_project_name, int(single.group('local_id')))
    refs.append(ref)
    logging.info('issue ref = %s', ref)

  return refs
336
337
def ExtractProjectAndIssueIdsNormal(_mr, autolink_regex_match):
  """Extract (project_name, local_id) refs from an 'issue N' style match.

  Args:
    _mr: unused info parsed from the request.
    autolink_regex_match: regex match for the textual reference.

  Returns:
    The list of refs found with _SINGLE_ISSUE_REF_RE; no default project
    name is supplied.
  """
  refs = _ExtractProjectAndIssueIds(
      autolink_regex_match, subregex=_SINGLE_ISSUE_REF_RE)
  return refs
342
343
def ExtractProjectAndIssueIdsCrBug(_mr, autolink_regex_match):
  """Extract (project_name, local_id) refs from a crbug.com style match.

  Args:
    _mr: unused info parsed from the request.
    autolink_regex_match: regex match for the textual reference.

  Returns:
    The list of refs found with _CRBUG_REF_RE; references that do not name
    a project use 'chromium'.
  """
  refs = _ExtractProjectAndIssueIds(
      autolink_regex_match,
      subregex=_CRBUG_REF_RE,
      default_project_name='chromium')
  return refs
348
349
# Keys are built from the project name, not the project ID, so that no
# lookup is needed in functions that have no services object.
def _IssueProjectKey(project_name, local_id):
  """Return the '<project_name>:<local_id>' key for a referenced issue."""
  key_parts = (project_name, local_id)
  return '%s:%d' % key_parts
355
356
class IssueRefRun(object):
  """A text run that links to a referenced issue."""

  def __init__(self, issue, is_closed, project_name, content, anchor):
    """Set the link attributes for one issue reference.

    Args:
      issue: object with summary and local_id attributes.
      is_closed: True to render the reference as a closed issue.
      project_name: project name used in the issue detail URL.
      content: the text shown for the link.
      anchor: string appended to the URL; may be empty.
    """
    self.tag = 'a'
    self.title = issue.summary
    self.href = '/p/%s/issues/detail?id=%d%s' % (
        project_name, issue.local_id, anchor)

    if is_closed:
      # Closed references get a CSS class and space-padded content.
      self.css_class = 'closed_ref'
      self.content = ' %s ' % content
    else:
      self.css_class = None
      self.content = content
370
371
def _ReplaceIssueRef(
    autolink_regex_match, component_ref_artifacts, single_issue_regex,
    default_project_name):
  """Examine a textual reference and replace it with an autolink or not.

  Args:
    autolink_regex_match: regex match for the textual reference.
    component_ref_artifacts: (open_dict, closed_dict) pair from an earlier
        call to GetReferencedIssues, or a falsy value, in which case both
        dicts are treated as empty.
    single_issue_regex: regular expression to parse individual issue
        references out of a multi-issue-reference phrase, e.g.,
        "issues 12 and 34".
    default_project_name: project name to use when not specified.

  Returns:
    A list of runs that together cover the matched text: one run per
    individual reference, as produced by _ReplaceSingleIssueRef, with plain
    TextRuns for the text between and after them.
  """
  if component_ref_artifacts:
    open_dict, closed_dict = component_ref_artifacts
  else:
    open_dict, closed_dict = {}, {}

  original = autolink_regex_match.group(0)
  logging.info('called ReplaceIssueRef on %r', original)

  runs = []
  cursor = 0  # Index just past the last text already emitted.
  for single in single_issue_regex.finditer(original):
    start = single.start()
    if start < cursor:
      continue
    between = original[cursor:start]
    if between:
      runs.append(template_helpers.TextRun(between))
    runs.append(_ReplaceSingleIssueRef(
        single, open_dict, closed_dict, default_project_name))
    cursor = single.end()

  tail = original[cursor:]
  if tail:
    runs.append(template_helpers.TextRun(tail))

  return runs
411
412
def ReplaceIssueRefNormal(mr, autolink_regex_match, component_ref_artifacts):
  """Replace occurrences of 'issue 123' with link TextRuns as needed.

  Args:
    mr: info parsed from the request; mr.project_name is the project used
        for references that do not name one.
    autolink_regex_match: regex match for the textual reference.
    component_ref_artifacts: result of earlier call to GetReferencedIssues.

  Returns:
    The list of runs produced by _ReplaceIssueRef.
  """
  runs = _ReplaceIssueRef(
      autolink_regex_match,
      component_ref_artifacts,
      single_issue_regex=_SINGLE_ISSUE_REF_RE,
      default_project_name=mr.project_name)
  return runs
418
419
def ReplaceIssueRefCrBug(_mr, autolink_regex_match, component_ref_artifacts):
  """Replace occurrences of 'crbug.com/123' with link TextRuns as needed.

  Args:
    _mr: unused info parsed from the request.
    autolink_regex_match: regex match for the textual reference.
    component_ref_artifacts: result of earlier call to GetReferencedIssues.

  Returns:
    The list of runs produced by _ReplaceIssueRef, with 'chromium' as the
    project for references that do not name one.
  """
  runs = _ReplaceIssueRef(
      autolink_regex_match,
      component_ref_artifacts,
      single_issue_regex=_CRBUG_REF_RE,
      default_project_name='chromium')
  return runs
425
426
def _ReplaceSingleIssueRef(
    submatch, open_dict, closed_dict, default_project_name):
  """Replace one issue reference with a link, or the original text.

  Args:
    submatch: regex match for one issue reference, with 'project_name' and
        'local_id' groups and optionally an 'anchor' group.
    open_dict: dict of open issues keyed by _IssueProjectKey.
    closed_dict: dict of closed issues keyed by _IssueProjectKey.
    default_project_name: project name to use when the text has none.

  Returns:
    An IssueRefRun when the issue is found in open_dict or closed_dict,
    otherwise a plain TextRun holding the matched text.
  """
  content = submatch.group(0)
  # Not every issue regex defines an 'anchor' group.
  anchor = submatch.groupdict().get('anchor') or ''

  raw_name = submatch.group('project_name')
  if raw_name:
    project_name = raw_name.lstrip().rstrip(':#')
  else:
    # We need project_name for the URL, even if it is not in the text.
    project_name = default_project_name

  issue_key = _IssueProjectKey(project_name, int(submatch.group('local_id')))
  for issues_by_key, is_closed in ((open_dict, False), (closed_dict, True)):
    if issue_key in issues_by_key:
      return IssueRefRun(
          issues_by_key[issue_key], is_closed, project_name, content, anchor)

  # Don't link to non-existent issues.
  return template_helpers.TextRun(content)
449
450
class Autolink(object):
  """Registry of autolink components that can mark up comment text."""

  def __init__(self):
    # Maps component_name -> (artifact lookup function,
    #                         match-to-reference function,
    #                         {compiled regex: substitution function}).
    self.registry = {}

  def RegisterComponent(self, component_name, artifact_lookup_function,
                        match_to_reference_function, autolink_re_subst_dict):
    """Register all the autolink info for a software component.

    Registering a name that is already present replaces its entry.

    Args:
      component_name: string name of software component, must be unique.
      artifact_lookup_function: function to batch lookup all artifacts that
          might have been referenced in a set of comments:
              function(mr, refs) -> referenced_artifacts
          The referenced_artifacts are passed to each substitution function.
      match_to_reference_function: converts a regex match object into a
          collection of internal artifact references:
              function(mr, match) -> refs (or a falsy value for none)
      autolink_re_subst_dict: dictionary of compiled regular expressions and
          the substitution function that should be called for each match:
              function(mr, match, referenced_artifacts) -> list of runs
    """
    entry = (artifact_lookup_function,
             match_to_reference_function,
             autolink_re_subst_dict)
    self.registry[component_name] = entry

  def GetAllReferencedArtifacts(
      self, mr, comment_text_list, max_total_length=_MAX_TOTAL_LENGTH):
    """Call callbacks to lookup all artifacts possibly referenced.

    Args:
      mr: information parsed out of the user HTTP request.
      comment_text_list: list of comment content strings.
      max_total_length: int max combined length of the comments to accept:
          if the total is more than this, no lookups are done.

    Returns:
      Opaque object that can be passed to MarkupAutolinks.  Its structure
      happens to be {component_name: lookup_result, ...}, or the special
      value SKIP_LOOKUPS.
    """
    total_len = 0
    for comment_text in comment_text_list:
      total_len += len(comment_text)
    if total_len > max_total_length:
      return SKIP_LOOKUPS

    all_referenced_artifacts = {}
    for comp, entry in self.registry.items():
      lookup, match_to_refs, re_dict = entry
      # Gather every reference for this component across all comments so
      # that the lookup is done once per component.
      refs = set()
      for comment_text in comment_text_list:
        for regex in re_dict:
          for match in regex.finditer(comment_text):
            refs.update(match_to_refs(mr, match) or ())

      all_referenced_artifacts[comp] = lookup(mr, refs)

    return all_referenced_artifacts

  def MarkupAutolinks(self, mr, text_runs, all_referenced_artifacts):
    """Loop over components and regexes, applying all substitutions.

    Args:
      mr: info parsed from the user's HTTP request.
      text_runs: List of text runs for the user's comment.
      all_referenced_artifacts: result of previous call to
          GetAllReferencedArtifacts.

    Returns:
      List of text runs for the entire user comment, some of which may have
      tag and link attributes set by the substitution functions.
    """
    # Sorting by component name makes the order of application determinate.
    for component, entry in sorted(self.registry.items()):
      _lookup, _match_ref, re_subst_dict = entry
      component_ref_artifacts = (
          None if all_referenced_artifacts == SKIP_LOOKUPS
          else all_referenced_artifacts[component])
      for regex, subst_fun in re_subst_dict.items():
        text_runs = self._ApplySubstFunctionToRuns(
            text_runs, regex, subst_fun, mr, component_ref_artifacts)

    return text_runs

  def _ApplySubstFunctionToRuns(
      self, text_runs, regex, subst_fun, mr, component_ref_artifacts):
    """Apply autolink regex and substitution function to each text run.

    Args:
      text_runs: list of TextRun objects with parts of the original comment.
      regex: Regular expression for detecting textual references to artifacts.
      subst_fun: function to return autolink markup, or original text.
      mr: common info parsed from the user HTTP request.
      component_ref_artifacts: already-looked-up destination artifacts to use
          when computing substitution text.

    Returns:
      A new list of runs.  Runs that already have a tag are kept as they
      are; each untagged run is replaced by the runs from subst_fun for
      every match, plus plain TextRuns for the text between matches.
    """
    result_runs = []
    for run in text_runs:
      text = run.content
      if run.tag:
        # This chunk has already been substituted, don't allow nested
        # autolinking to mess up our output.
        result_runs.append(run)
        continue

      cursor = 0  # Index just past the last text already emitted.
      for match in regex.finditer(text):
        start = match.start()
        if start > cursor:
          result_runs.append(template_helpers.TextRun(text[cursor:start]))
        result_runs.extend(subst_fun(mr, match, component_ref_artifacts))
        cursor = match.end()

      # Keep any text that came after the last match.
      remainder = text[cursor:]
      if remainder:
        result_runs.append(template_helpers.TextRun(remainder))

    # TODO(jrobbins): ideally we would merge consecutive plain text runs
    # so that regexes can match across those run boundaries.

    return result_runs
576
577
def RegisterAutolink(services):
  """Register all the autolink hooks on services.autolink.

  Args:
    services: object whose autolink attribute is the registry to fill in;
        it is also bound into the issue and user lookup callbacks.
  """
  # The order of the registrations below does not matter, so this could be
  # done from separate modules in the future if needed.  The order in which
  # components are applied is determined by their names, which are sorted
  # in MarkupAutolinks().

  def _NoLookup(_mr, _refs):
    """Lookup callback for components that need no artifacts."""
    return None

  def _NoRefs(_mr, _match):
    """Match-to-reference callback for components with nothing to look up."""
    return None

  def _WholeMatchAsRef(_mr, match):
    """Use the full matched text as the single reference."""
    return [match.group(0)]

  register = services.autolink.RegisterComponent

  register(
      '01-tracker-crbug',
      CurryGetReferencedIssues(services),
      ExtractProjectAndIssueIdsCrBug,
      {_CRBUG_REF_RE: ReplaceIssueRefCrBug})

  register(
      '02-linkify-full-urls',
      _NoLookup,
      _NoRefs,
      {autolink_constants.IS_A_LINK_RE: Linkify})

  register(
      '03-linkify-user-profiles-or-mailto',
      CurryGetReferencedUsers(services),
      _WholeMatchAsRef,
      {autolink_constants.IS_IMPLIED_EMAIL_RE: LinkifyEmail})

  register(
      '04-tracker-regular',
      CurryGetReferencedIssues(services),
      ExtractProjectAndIssueIdsNormal,
      {_ISSUE_REF_RE: ReplaceIssueRefNormal})

  register(
      '05-linkify-shorthand',
      _NoLookup,
      _NoRefs,
      {
          autolink_constants.IS_A_SHORT_LINK_RE: Linkify,
          autolink_constants.IS_A_NUMERIC_SHORT_LINK_RE: Linkify,
          autolink_constants.IS_IMPLIED_LINK_RE: Linkify,
      })

  register(
      '06-versioncontrol',
      GetReferencedRevisions,
      ExtractRevNums,
      {
          _GIT_HASH_RE: ReplaceRevisionRef,
          _SVN_REF_RE: ReplaceRevisionRef,
      })