blob: 67c898a8b26d0e30a58dad5d43772a7bc1761b97 [file] [log] [blame]
Copybara854996b2021-09-07 19:36:02 +00001# Copyright 2016 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style
3# license that can be found in the LICENSE file or at
4# https://developers.google.com/open-source/licenses/bsd
5
6"""Autolink helps auto-link references to artifacts in text.
7
8This class maintains a registry of artifact autolink syntax specs and
9callbacks. The structure of that registry is:
10 { component_name: (lookup_callback, match_to_reference_function,
11 { regex: substitution_callback, ...}),
12 ...
13 }
14
15For example:
16 { 'tracker':
17 (GetReferencedIssues,
18 ExtractProjectAndIssueIds,
19 {_ISSUE_REF_RE: ReplaceIssueRef}),
20 'versioncontrol':
21 (GetReferencedRevisions,
22 ExtractProjectAndRevNum,
23 {_GIT_HASH_RE: ReplaceRevisionRef}),
24 }
25
26The dictionary of regexes is used here because, in the future, we
27might add more regexes for each component rather than have one complex
28regex per component.
29"""
30from __future__ import print_function
31from __future__ import division
32from __future__ import absolute_import
33
34import logging
35import re
Adrià Vilanova Martínezde942802022-07-15 14:06:55 +020036from six.moves import urllib
37from six.moves.urllib.parse import urlparse
Copybara854996b2021-09-07 19:36:02 +000038
39import settings
40from features import autolink_constants
41from framework import template_helpers
42from framework import validate
43from proto import project_pb2
44from tracker import tracker_helpers
45
46
# Autolinking is skipped entirely when the combined length of all comment
# text exceeds this many characters.
_MAX_TOTAL_LENGTH = 1024 * 150  # 150KB
# Sentinel used in place of the all_referenced_artifacts dict to signal that
# the text content is too big to look up all referenced artifacts quickly.
SKIP_LOOKUPS = 'skip lookups'

# Matches an HTML closing tag (e.g. "</b>") at the very end of a string.
_CLOSING_TAG_RE = re.compile(r'</[a-z0-9]+>$', flags=re.IGNORECASE)

# (opening, closing) delimiter pairs. These characters are allowed inside
# links, but a closing delimiter at the very end of a link is trimmed off
# when its opening partner is not part of the link. An opening value of None
# means the closing character is always trimmed from the end.
_LINK_TRAILING_CHARS = [
    (None, ':'), (None, '.'), (None, ','),
    ('(', ')'), ('[', ']'), ('{', '}'), ('<', '>'),
    ("'", "'"), ('"', '"'),
]
69
70
def LinkifyEmail(_mr, autolink_regex_match, component_ref_artifacts):
  """Turn a matched email address into a profile link or a mailto: link.

  This is an autolink substitution callback; its parameters follow the
  standard signature for such callbacks.

  Args:
    _mr: unused information parsed from the HTTP request.
    autolink_regex_match: regex match whose full text is the email address.
    component_ref_artifacts: result of the call to GetReferencedUsers; a
      container of the email addresses that belong to existing users. May
      be None or empty.

  Returns:
    A one-element list. If the address fails validate.IsValidEmail, the
    element is a plain TextRun of the address. Otherwise it is a TextRun
    with tag=a that points at /u/<email> when the address is found in
    component_ref_artifacts, or at mailto:<email> when it is not.
  """
  address = autolink_regex_match.group(0)
  if not validate.IsValidEmail(address):
    return [template_helpers.TextRun(address)]

  if component_ref_artifacts and address in component_ref_artifacts:
    target = '/u/%s' % address
  else:
    target = 'mailto:' + address
  return [template_helpers.TextRun(address, tag='a', href=target)]
98
99
def CurryGetReferencedUsers(services):
  """Bind the services object into a user-lookup callback for autolink.

  Currying gives the callback access to the services object without
  requiring that every possible services object be passed through the
  autolink registry and functions.

  Args:
    services: connection to the user persistence layer; its `user`
      attribute is used for the lookups.

  Returns:
    A function taking (mr, emails), which is the argument list autolink
    passes to artifact lookup functions.
  """

  def GetReferencedUsers(mr, emails):
    """Look up the existing users behind a collection of email addresses.

    Args:
      mr: commonly used info parsed from the request; mr.cnxn is passed
        to the user service.
      emails: email address strings mentioned in the comment text.

    Returns:
      A dictionary {email: user} with one entry for each address that
      LookupExistingUserIDs resolved to a user ID.
    """
    ids_by_email = services.user.LookupExistingUserIDs(mr.cnxn, emails)
    found_ids = list(ids_by_email.values())
    users_by_id = services.user.GetUsersByIDs(mr.cnxn, found_ids)
    return dict(
        (address, users_by_id[uid])
        for address, uid in ids_by_email.items())

  return GetReferencedUsers
135
136
def Linkify(_mr, autolink_regex_match, _component_ref_artifacts):
  """Examine a textual reference and replace it with a hyperlink or not.

  This is a callback for use with the autolink feature. The function
  parameters are standard for this type of callback.

  Args:
    _mr: unused information parsed from the HTTP request.
    autolink_regex_match: regex match for the textual reference.
    _component_ref_artifacts: unused artifact lookup result.

  Returns:
    A list of TextRuns. When the (trimmed) match validates as a URL or as
    a mailto: address, the list holds a TextRun with tag=a followed, if
    anything was trimmed, by a plain TextRun of the trimmed trailing text.
    Otherwise the list holds a single plain TextRun of the original match.
  """
  hyperlink = autolink_regex_match.group(0)

  # Single pass over the delimiter table, in table order. Punctuation
  # (begin is None) is always trimmed from the end; a paired closing
  # delimiter is trimmed only when its opener is absent from the rest of
  # the link.
  trailing = ''
  for begin, end in _LINK_TRAILING_CHARS:
    if hyperlink.endswith(end):
      if not begin or hyperlink[:-len(end)].find(begin) == -1:
        trailing = end + trailing
        hyperlink = hyperlink[:-len(end)]

  # A closing HTML tag stuck to the end of the link is not part of it.
  tag_match = _CLOSING_TAG_RE.search(hyperlink)
  if tag_match:
    trailing = hyperlink[tag_match.start(0):] + trailing
    hyperlink = hyperlink[:tag_match.start(0)]

  href = hyperlink
  lowered_href = href.lower()
  if not lowered_href.startswith(('http', 'ftp', 'mailto')):
    # We use http because redirects for https are not all set up.
    href = 'http://' + href
    lowered_href = href.lower()

  # Schemes are case-insensitive, so the mailto check uses the lowercased
  # href just like the scheme detection above. href[7:] skips "mailto:".
  is_mailto = lowered_href.startswith('mailto')
  if (not validate.IsValidURL(href) and
      not (is_mailto and validate.IsValidEmail(href[7:]))):
    return [template_helpers.TextRun(autolink_regex_match.group(0))]

  result = [template_helpers.TextRun(hyperlink, tag='a', href=href)]
  if trailing:
    result.append(template_helpers.TextRun(trailing))

  return result
180
181
# Regular expression to detect git hashes.
# Used to auto-link to Git hashes on crrev.com when displaying issue details.
# Matches a 40-character hexadecimal string that is not part of a larger
# word, optionally preceded by "r" or by "revision" plus whitespace and an
# optional "#". Matching is case-insensitive.
_GIT_HASH_RE = re.compile(
    r'\b(?P<prefix>r(evision\s+#?)?)?(?P<revnum>([a-f0-9]{40}))\b',
    re.IGNORECASE | re.MULTILINE)

# This is for SVN revisions and Git commit positions.
# Unlike _GIT_HASH_RE, the "r" / "revision " prefix is required here, and
# the revision number is 4 to 7 decimal digits.
_SVN_REF_RE = re.compile(
    r'\b(?P<prefix>r(evision\s+#?)?)(?P<revnum>([0-9]{4,7}))\b',
    re.IGNORECASE | re.MULTILINE)
194
195
def GetReferencedRevisions(_mr, _refs):
  """Artifact lookup callback for revisions; currently loads nothing.

  Any revision reference is autolinked without checking that such a
  revision exists, so there are no artifacts to return.

  Args:
    _mr: unused info parsed from the request.
    _refs: unused revision references found in the comment text.

  Returns:
    Always None.
  """
  # TODO(jrobbins): Hit crrev.com and check that the revision exists
  # and show a rollover with revision info.
  return None
203
204
def ExtractRevNums(_mr, autolink_regex_match):
  """Return the internal representation of a revision reference.

  Args:
    _mr: unused info parsed from the request.
    autolink_regex_match: regex match that has a 'revnum' named group.

  Returns:
    A one-element list holding the text of the 'revnum' group.
  """
  refs = [autolink_regex_match.group('revnum')]
  logging.debug('revision ref = %s', refs[0])
  return refs
210
211
def ReplaceRevisionRef(
    mr, autolink_regex_match, _component_ref_artifacts):
  """Return markup that links a revision reference to its revision URL.

  Args:
    mr: info parsed from the request; mr.project selects the URL format.
    autolink_regex_match: regex match with 'prefix' and 'revnum' groups.
    _component_ref_artifacts: unused artifact lookup result.

  Returns:
    A one-element list holding a TextRun with tag=a. The link text is the
    matched prefix (if any) followed by the revision number, and the href
    is the revision URL format with {revnum} filled in.
  """
  prefix, revnum = autolink_regex_match.group('prefix', 'revnum')
  url = _GetRevisionURLFormat(mr.project).format(revnum=revnum)
  link_text = '%s%s' % (prefix, revnum) if prefix else revnum
  return [template_helpers.TextRun(link_text, tag='a', href=url)]
222
223
def _GetRevisionURLFormat(project):
  """Return the revision URL format string to use for a project.

  Args:
    project: object with a revision_url_format attribute.

  Returns:
    The project's own revision_url_format when it is set (truthy),
    otherwise settings.revision_url_format.
  """
  # TODO(jrobbins): Expose a UI to customize it to point to whatever site
  # hosts the source code. Also, site-wide default.
  project_format = project.revision_url_format
  if project_format:
    return project_format
  return settings.revision_url_format
228
229
# Regular expression to detect issue references.
# Used to auto-link to other issues when displaying issue details.
# Matches "issue", "issues", "bug" or "bugs" (when not part of a larger
# word), optionally followed by ":" or "=", and then one or more issue IDs.
# Each ID may have a "project:" or "project#" qualifier and/or a "#" in
# front of the number, and IDs may be separated by a comma, "and" or "or",
# e.g. "issues 12, chromium:34 and #56". Matching is case-insensitive.
_ISSUE_REF_RE = re.compile(r"""
    (?P<prefix>\b(issues?|bugs?)[ \t]*(:|=)?)
    ([ \t]*(?P<project_name>\b[-a-z0-9]+[:\#])?
     (?P<number_sign>\#?)
     (?P<local_id>\d+)\b
     (,?[ \t]*(and|or)?)?)+""", re.IGNORECASE | re.VERBOSE)
240
# This is for chromium.org's crbug.com shorthand domain.
# Matches an optional "http://" or "https://", then "crbug.com/", an
# optional "<project>/" path segment, the numeric issue ID, and an optional
# "#c<N>" comment anchor. The dot is escaped so that only a literal
# "crbug.com" matches; an unescaped "." would also accept text such as
# "crbugXcom/123".
_CRBUG_REF_RE = re.compile(r"""
    (?P<prefix>\b(https?://)?crbug\.com/)
    ((?P<project_name>\b[-a-z0-9]+)(?P<separator>/))?
    (?P<local_id>\d+)\b
    (?P<anchor>\#c[0-9]+)?""", re.IGNORECASE | re.VERBOSE)
247
# Once the overall issue reference has been detected, pick out the specific
# issue project:id items within it. Often there is just one, but the "and|or"
# syntax can allow multiple issues.
# Every part except the digits of local_id is optional: an "issue"/"bug"
# prefix, a "project:" or "project#" qualifier, and a "#" number sign.
_SINGLE_ISSUE_REF_RE = re.compile(r"""
    (?P<prefix>\b(issue|bug)[ \t]*)?
    (?P<project_name>\b[-a-z0-9]+[:\#])?
    (?P<number_sign>\#?)
    (?P<local_id>\d+)\b""", re.IGNORECASE | re.VERBOSE)
256
257
def CurryGetReferencedIssues(services):
  """Bind the services object into an issue-lookup callback for autolink.

  Currying gives the callback access to the services object without
  requiring that every possible services object be passed through the
  autolink registry and functions.

  Args:
    services: connection to issue, config, and project persistence layers.

  Returns:
    A function taking (mr, ref_tuples), which is the argument list
    autolink passes to artifact lookup functions.
  """

  def _KeyByProjectAndId(issues):
    """Index issues by their _IssueProjectKey string."""
    return {
        _IssueProjectKey(issue.project_name, issue.local_id): issue
        for issue in issues}

  def GetReferencedIssues(mr, ref_tuples):
    """Look up the open and closed issues referenced by these comments.

    Args:
      mr: commonly used info parsed from the request.
      ref_tuples: (project_name, local_id) tuples for each issue that is
        mentioned in the comment text. The project_name may be None, in
        which case the current project (mr.project_name) is used.

    Returns:
      A pair (open_dict, closed_dict). Each dict maps an issue key, as
      built by _IssueProjectKey, to the corresponding issue.
    """
    project_names = [
        (ref_pn or mr.project_name) for ref_pn, _ in ref_tuples]
    ref_projects = services.project.GetProjectsByName(
        mr.cnxn, project_names)
    issue_ids, _misses = services.issue.ResolveIssueRefs(
        mr.cnxn, ref_projects, mr.project_name, ref_tuples)
    open_issues, closed_issues = (
        tracker_helpers.GetAllowedOpenedAndClosedIssues(
            mr, issue_ids, services))

    open_dict = _KeyByProjectAndId(open_issues)
    closed_dict = _KeyByProjectAndId(closed_issues)
    logging.info('autolinking dicts %r and %r', open_dict, closed_dict)
    return open_dict, closed_dict

  return GetReferencedIssues
307
308
def _ParseProjectNameMatch(project_name):
  """Reduce a regex-matched project name to its minimal representation.

  Args:
    project_name: the project name text matched by a regex; may be None
      or empty when the reference had no project qualifier.

  Returns:
    The name with leading whitespace removed and any trailing '#', ':',
    space, tab or newline characters removed; None when project_name is
    None or empty.
  """
  return project_name.lstrip().rstrip('#: \t\n') if project_name else None
321
322
def _ExtractProjectAndIssueIds(
    autolink_regex_match, subregex, default_project_name=None):
  """Convert a regex match for a textual reference into our internal form.

  Args:
    autolink_regex_match: regex match covering the whole reference phrase.
    subregex: compiled regex, with 'project_name' and 'local_id' groups,
      used to pick the individual issue references out of the phrase.
    default_project_name: project name to use for references that do not
      name a project.

  Returns:
    A list of (project_name, local_id) tuples, one per sub-match, where
    local_id is an int.
  """
  refs = []
  for submatch in subregex.finditer(autolink_regex_match.group(0)):
    parsed_name = _ParseProjectNameMatch(submatch.group('project_name'))
    ref = (
        parsed_name or default_project_name,
        int(submatch.group('local_id')))
    refs.append(ref)
    logging.info('issue ref = %s', ref)
  return refs
337
338
def ExtractProjectAndIssueIdsNormal(_mr, autolink_regex_match):
  """Extract (project_name, local_id) refs from an "issue 123" style match.

  Sub-references are parsed with _SINGLE_ISSUE_REF_RE and no default
  project name is supplied, so unqualified references get a project name
  of None.
  """
  return _ExtractProjectAndIssueIds(
      autolink_regex_match, subregex=_SINGLE_ISSUE_REF_RE)
343
344
def ExtractProjectAndIssueIdsCrBug(_mr, autolink_regex_match):
  """Extract (project_name, local_id) refs from a crbug.com style match.

  Sub-references are parsed with _CRBUG_REF_RE; references without a
  project path segment are attributed to the 'chromium' project.
  """
  return _ExtractProjectAndIssueIds(
      autolink_regex_match,
      subregex=_CRBUG_REF_RE,
      default_project_name='chromium')
349
350
# The key is built from the project name, rather than the project ID, so
# that no project lookup is needed in functions that have no services
# object.
def _IssueProjectKey(project_name, local_id):
  """Build the '<project_name>:<local_id>' key for a referenced issue."""
  key_parts = (project_name, local_id)
  return '%s:%d' % key_parts
356
357
class IssueRefRun(object):
  """A text run that renders as a link to a referenced issue."""

  def __init__(self, issue, is_closed, project_name, content, anchor):
    """Build the link attributes for one issue reference.

    Args:
      issue: the referenced issue; its summary and local_id are used.
      is_closed: True if the referenced issue is closed.
      project_name: project name used in the link URL.
      content: the original reference text to display.
      anchor: URL fragment (possibly empty) appended to the link.
    """
    self.tag = 'a'
    self.css_class = None
    self.content = content
    if is_closed:
      # Closed issues get a CSS class, and their text is padded with a
      # space on each side.
      self.css_class = 'closed_ref'
      self.content = ' %s ' % content

    self.title = issue.summary
    self.href = '/p/%s/issues/detail?id=%d%s' % (
        project_name, issue.local_id, anchor)
371
372
def _ReplaceIssueRef(
    autolink_regex_match, component_ref_artifacts, single_issue_regex,
    default_project_name):
  """Examine a textual reference and replace it with autolinks or not.

  Args:
    autolink_regex_match: regex match for the textual reference.
    component_ref_artifacts: result of earlier call to GetReferencedIssues,
      i.e. an (open_dict, closed_dict) pair, or a falsy value when no
      lookup was done.
    single_issue_regex: regular expression to parse individual issue
      references out of a multi-issue-reference phrase, e.g.,
      "issues 12 and 34".
    default_project_name: project name to use when not specified.

  Returns:
    A list of runs that together cover the whole matched text. Each
    individual issue reference becomes whatever _ReplaceSingleIssueRef
    returns for it, and the text between and after the references becomes
    plain TextRuns.
  """
  if component_ref_artifacts:
    open_dict, closed_dict = component_ref_artifacts
  else:
    open_dict, closed_dict = {}, {}

  original = autolink_regex_match.group(0)
  logging.info('called ReplaceIssueRef on %r', original)

  runs = []
  cursor = 0  # Index in original just past the last replaced reference.
  for submatch in single_issue_regex.finditer(original):
    if submatch.start() < cursor:
      continue
    gap = original[cursor: submatch.start()]
    if gap:
      runs.append(template_helpers.TextRun(gap))
    runs.append(_ReplaceSingleIssueRef(
        submatch, open_dict, closed_dict, default_project_name))
    cursor = submatch.end()

  tail = original[cursor:]
  if tail:
    runs.append(template_helpers.TextRun(tail))

  return runs
412
413
def ReplaceIssueRefNormal(mr, autolink_regex_match, component_ref_artifacts):
  """Replace occurrences of 'issue 123' with link TextRuns as needed.

  References that do not name a project are resolved against the current
  project, mr.project_name.
  """
  return _ReplaceIssueRef(
      autolink_regex_match,
      component_ref_artifacts,
      single_issue_regex=_SINGLE_ISSUE_REF_RE,
      default_project_name=mr.project_name)
419
420
def ReplaceIssueRefCrBug(_mr, autolink_regex_match, component_ref_artifacts):
  """Replace occurrences of 'crbug.com/123' with link TextRuns as needed.

  References without a project path segment are resolved against the
  'chromium' project.
  """
  return _ReplaceIssueRef(
      autolink_regex_match,
      component_ref_artifacts,
      single_issue_regex=_CRBUG_REF_RE,
      default_project_name='chromium')
426
427
def _ReplaceSingleIssueRef(
    submatch, open_dict, closed_dict, default_project_name):
  """Replace one issue reference with a link, or with the original text.

  Args:
    submatch: regex match for a single issue reference. It has
      'project_name' and 'local_id' groups and may have an 'anchor' group.
    open_dict: open issues keyed by _IssueProjectKey.
    closed_dict: closed issues keyed by _IssueProjectKey.
    default_project_name: project name to use when the reference does not
      name one.

  Returns:
    An IssueRefRun when the issue is found in open_dict or closed_dict
    (open_dict is checked first); otherwise a plain TextRun holding the
    matched text.
  """
  content = submatch.group(0)
  raw_name = submatch.group('project_name')
  # Only some of the issue regexes define an 'anchor' group.
  anchor = submatch.groupdict().get('anchor') or ''
  # We need project_name for the URL, even if it is not in the text.
  project_name = (
      raw_name.lstrip().rstrip(':#') if raw_name else default_project_name)

  local_id = int(submatch.group('local_id'))
  issue_key = _IssueProjectKey(project_name, local_id)
  for issues_by_key, is_closed in ((open_dict, False), (closed_dict, True)):
    if issue_key in issues_by_key:
      return IssueRefRun(
          issues_by_key[issue_key], is_closed, project_name, content, anchor)

  # Don't link to non-existent issues.
  return template_helpers.TextRun(content)
450
451
class Autolink(object):
  """Registry of autolink syntax specs that can be applied to comments."""

  def __init__(self):
    # Maps component_name -> (artifact lookup function,
    # match-to-reference function, {regex: substitution function}).
    self.registry = {}

  def RegisterComponent(self, component_name, artifact_lookup_function,
                        match_to_reference_function, autolink_re_subst_dict):
    """Register all the autolink info for a software component.

    Args:
      component_name: string name of software component, must be unique;
        registering the same name again replaces the earlier entry.
      artifact_lookup_function: function to batch lookup all artifacts
        that might have been referenced in a set of comments. It is called
        as function(mr, refs) and its result is passed to each
        substitution function.
      match_to_reference_function: called as function(mr, match) to
        convert a regex match object into an iterable of internal
        artifact references (or a falsy value for none).
      autolink_re_subst_dict: dictionary of regular expressions and the
        substitution function that should be called for each match, as
        function(mr, match, referenced_artifacts) -> replacement runs.
    """
    component_spec = (
        artifact_lookup_function,
        match_to_reference_function,
        autolink_re_subst_dict,
    )
    self.registry[component_name] = component_spec

  def GetAllReferencedArtifacts(
      self, mr, comment_text_list, max_total_length=_MAX_TOTAL_LENGTH):
    """Call callbacks to look up all artifacts possibly referenced.

    Args:
      mr: information parsed out of the user HTTP request.
      comment_text_list: list of comment content strings.
      max_total_length: int max number of characters to accept;
        if the comments total more than this, no lookups are done.

    Returns:
      Opaque object that can be passed to MarkupAutolinks. Its structure
      happens to be {component_name: lookup_result, ...}, or the special
      value SKIP_LOOKUPS.
    """
    if sum(map(len, comment_text_list)) > max_total_length:
      return SKIP_LOOKUPS

    artifacts_by_component = {}
    for component_name, component_spec in self.registry.items():
      lookup_fn, match_to_refs_fn, regex_dict = component_spec
      found_refs = set()
      for text in comment_text_list:
        for pattern in regex_dict:
          for hit in pattern.finditer(text):
            # A falsy result means the match contributed no references.
            found_refs.update(match_to_refs_fn(mr, hit) or ())

      artifacts_by_component[component_name] = lookup_fn(mr, found_refs)

    return artifacts_by_component

  def MarkupAutolinks(self, mr, text_runs, all_referenced_artifacts):
    """Loop over components and regexes, applying all substitutions.

    Args:
      mr: info parsed from the user's HTTP request.
      text_runs: List of text runs for the user's comment.
      all_referenced_artifacts: result of previous call to
        GetAllReferencedArtifacts.

    Returns:
      List of text runs for the entire user comment, some of which may
      have attributes that cause them to render as links.
    """
    skip_lookups = all_referenced_artifacts == SKIP_LOOKUPS
    # Sorting makes components apply in a determinate alphabetical order.
    for component, component_spec in sorted(self.registry.items()):
      _lookup, _match_ref, re_subst_dict = component_spec
      component_ref_artifacts = (
          None if skip_lookups else all_referenced_artifacts[component])
      for regex, subst_fun in re_subst_dict.items():
        text_runs = self._ApplySubstFunctionToRuns(
            text_runs, regex, subst_fun, mr, component_ref_artifacts)

    return text_runs

  def _ApplySubstFunctionToRuns(
      self, text_runs, regex, subst_fun, mr, component_ref_artifacts):
    """Apply autolink regex and substitution function to each text run.

    Args:
      text_runs: list of TextRun objects with parts of the original
        comment.
      regex: Regular expression for detecting textual references to
        artifacts.
      subst_fun: function to return autolink markup, or original text.
      mr: common info parsed from the user HTTP request.
      component_ref_artifacts: already-looked-up destination artifacts to
        use when computing substitution text.

    Returns:
      A new list with more and smaller runs, some of which may have tag
      and link attributes set.
    """
    output_runs = []
    for run in text_runs:
      text = run.content
      if run.tag:
        # This chunk has already been substituted, don't allow nested
        # autolinking to mess up our output.
        output_runs.append(run)
        continue

      cursor = 0  # Index in text just past the last match handled.
      for hit in regex.finditer(text):
        if hit.start() > cursor:
          # Keep the plain text between the previous match and this one.
          output_runs.append(
              template_helpers.TextRun(text[cursor: hit.start()]))
        output_runs.extend(subst_fun(mr, hit, component_ref_artifacts))
        cursor = hit.end()

      remainder = text[cursor:]
      if remainder:  # Keep any text that came after the last match
        output_runs.append(template_helpers.TextRun(remainder))

    # TODO(jrobbins): ideally we would merge consecutive plain text runs
    # so that regexes can match across those run boundaries.

    return output_runs
577
578
def RegisterAutolink(services):
  """Register all the autolink hooks on services.autolink.

  Args:
    services: object whose `autolink` attribute receives the
      RegisterComponent calls, and which is bound into the issue and user
      lookup callbacks.
  """
  # The order of the RegisterComponent() calls does not matter so that we
  # could do this registration from separate modules in the future if
  # needed. Priority order of application is determined by the names of the
  # registered handlers, which are sorted in MarkupAutolinks().

  def _NoLookup(_mr, _refs):
    """Artifact lookup for components that need no artifacts."""
    return None

  def _NoRefs(_mr, _match):
    """Match-to-reference function for components that track no refs."""
    return None

  components = [
      ('01-tracker-crbug',
       CurryGetReferencedIssues(services),
       ExtractProjectAndIssueIdsCrBug,
       {_CRBUG_REF_RE: ReplaceIssueRefCrBug}),
      ('02-linkify-full-urls',
       _NoLookup,
       _NoRefs,
       {autolink_constants.IS_A_LINK_RE: Linkify}),
      ('03-linkify-user-profiles-or-mailto',
       CurryGetReferencedUsers(services),
       lambda _mr, match: [match.group(0)],
       {autolink_constants.IS_IMPLIED_EMAIL_RE: LinkifyEmail}),
      ('04-tracker-regular',
       CurryGetReferencedIssues(services),
       ExtractProjectAndIssueIdsNormal,
       {_ISSUE_REF_RE: ReplaceIssueRefNormal}),
      ('05-linkify-shorthand',
       _NoLookup,
       _NoRefs,
       {autolink_constants.IS_A_SHORT_LINK_RE: Linkify,
        autolink_constants.IS_A_NUMERIC_SHORT_LINK_RE: Linkify,
        autolink_constants.IS_IMPLIED_LINK_RE: Linkify,
       }),
      ('06-versioncontrol',
       GetReferencedRevisions,
       ExtractRevNums,
       {_GIT_HASH_RE: ReplaceRevisionRef,
        _SVN_REF_RE: ReplaceRevisionRef}),
  ]
  for name, lookup, match_to_refs, re_subst_dict in components:
    services.autolink.RegisterComponent(
        name, lookup, match_to_refs, re_subst_dict)