| # Copyright 2016 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Autolink helps auto-link references to artifacts in text. |
| |
| This class maintains a registry of artifact autolink syntax specs and |
| callbacks. The structure of that registry is: |
| { component_name: (lookup_callback, match_to_reference_function, |
| { regex: substitution_callback, ...}), |
| ... |
| } |
| |
| For example: |
| { 'tracker': |
| (GetReferencedIssues, |
| ExtractProjectAndIssueIds, |
| {_ISSUE_REF_RE: ReplaceIssueRef}), |
| 'versioncontrol': |
| (GetReferencedRevisions, |
| ExtractProjectAndRevNum, |
| {_GIT_HASH_RE: ReplaceRevisionRef}), |
| } |
| |
| The dictionary of regexes is used here because, in the future, we |
| might add more regexes for each component rather than have one complex |
| regex per component. |
| """ |
| from __future__ import print_function |
| from __future__ import division |
| from __future__ import absolute_import |
| |
| import logging |
| import re |
| from six.moves import urllib |
| from six.moves.urllib.parse import urlparse |
| |
| import settings |
| from features import autolink_constants |
| from framework import template_helpers |
| from framework import validate |
| from mrproto import project_pb2 |
| from tracker import tracker_helpers |
| |
| |
| # If the total length of all comments is too large, we don't autolink. |
| _MAX_TOTAL_LENGTH = 150 * 1024 # 150KB |
| # Special all_referenced_artifacts value used to indicate that the |
| # text content is too big to lookup all referenced artifacts quickly. |
| SKIP_LOOKUPS = 'skip lookups' |
| |
| _CLOSING_TAG_RE = re.compile('</[a-z0-9]+>$', re.IGNORECASE) |
| |
| # These are allowed in links, but if any of closing delimiters appear |
| # at the end of the link, and the opening one is not part of the link, |
| # then trim off the closing delimiters. |
| _LINK_TRAILING_CHARS = [ |
| (None, ':'), |
| (None, '.'), |
| (None, ','), |
| ('(', ')'), |
| ('[', ']'), |
| ('{', '}'), |
| ('<', '>'), |
| ("'", "'"), |
| ('"', '"'), |
| ] |
| |
| |
| def LinkifyEmail(_mr, autolink_regex_match, component_ref_artifacts): |
| """Examine a textual reference and replace it with a hyperlink or not. |
| |
| This is a callback for use with the autolink feature. The function |
| parameters are standard for this type of callback. |
| |
| Args: |
| _mr: unused information parsed from the HTTP request. |
| autolink_regex_match: regex match for the textual reference. |
| component_ref_artifacts: result of call to GetReferencedUsers. |
| |
| Returns: |
| A list of TextRuns with tag=a linking to the user profile page of |
| any defined users, otherwise a mailto: link is generated. |
| """ |
| email = autolink_regex_match.group(0) |
| |
| if not validate.IsValidEmail(email): |
| return [template_helpers.TextRun(email)] |
| |
| if component_ref_artifacts and email in component_ref_artifacts: |
| href = '/u/%s' % email |
| else: |
| href = 'mailto:' + email |
| |
| result = [template_helpers.TextRun(email, tag='a', href=href)] |
| return result |
| |
| |
| def CurryGetReferencedUsers(services): |
| """Return a function to get ref'd users with these services objects bound. |
| |
| Currying is a convienent way to give the callback access to the services |
| objects, but without requiring that all possible services objects be passed |
| through the autolink registry and functions. |
| |
| Args: |
| services: connection to the user persistence layer. |
| |
| Returns: |
| A ready-to-use function that accepts the arguments that autolink |
| expects to pass to it. |
| """ |
| |
| def GetReferencedUsers(mr, emails): |
| """Return a dict of users referenced by these comments. |
| |
| Args: |
| mr: commonly used info parsed from the request. |
| ref_tuples: email address strings for each user |
| that is mentioned in the comment text. |
| |
| Returns: |
| A dictionary {email: user_pb} including all existing users. |
| """ |
| user_id_dict = services.user.LookupExistingUserIDs(mr.cnxn, emails) |
| users_by_id = services.user.GetUsersByIDs(mr.cnxn, |
| list(user_id_dict.values())) |
| users_by_email = { |
| email: users_by_id[user_id] |
| for email, user_id in user_id_dict.items()} |
| return users_by_email |
| |
| return GetReferencedUsers |
| |
| |
| def Linkify(_mr, autolink_regex_match, _component_ref_artifacts): |
| """Examine a textual reference and replace it with a hyperlink or not. |
| |
| This is a callback for use with the autolink feature. The function |
| parameters are standard for this type of callback. |
| |
| Args: |
| _mr: unused information parsed from the HTTP request. |
| autolink_regex_match: regex match for the textual reference. |
| _component_ref_artifacts: unused result of call to GetReferencedIssues. |
| |
| Returns: |
| A list of TextRuns with tag=a for all matched ftp, http, https and mailto |
| links converted into HTML hyperlinks. |
| """ |
| hyperlink = autolink_regex_match.group(0) |
| |
| trailing = '' |
| for begin, end in _LINK_TRAILING_CHARS: |
| if hyperlink.endswith(end): |
| if not begin or hyperlink[:-len(end)].find(begin) == -1: |
| trailing = end + trailing |
| hyperlink = hyperlink[:-len(end)] |
| |
| tag_match = _CLOSING_TAG_RE.search(hyperlink) |
| if tag_match: |
| trailing = hyperlink[tag_match.start(0):] + trailing |
| hyperlink = hyperlink[:tag_match.start(0)] |
| |
| href = hyperlink |
| if not href.lower().startswith(('http', 'ftp', 'mailto')): |
| # We use http because redirects for https are not all set up. |
| href = 'http://' + href |
| |
| if (not validate.IsValidURL(href) and |
| not (href.startswith('mailto') and validate.IsValidEmail(href[7:]))): |
| return [template_helpers.TextRun(autolink_regex_match.group(0))] |
| |
| result = [template_helpers.TextRun(hyperlink, tag='a', href=href)] |
| if trailing: |
| result.append(template_helpers.TextRun(trailing)) |
| |
| return result |
| |
| |
| # Regular expression to detect git hashes. |
| # Used to auto-link to Git hashes on crrev.com when displaying issue details. |
| # Matches "rN", "r#N", and "revision N" when "rN" is not part of a larger word |
| # and N is a hexadecimal string of 40 chars. |
| _GIT_HASH_RE = re.compile( |
| r'\b(?P<prefix>r(evision\s+#?)?)?(?P<revnum>([a-f0-9]{40}))\b', |
| re.IGNORECASE | re.MULTILINE) |
| |
| # This is for SVN revisions and Git commit posisitons. |
| _SVN_REF_RE = re.compile( |
| r'\b(?P<prefix>r(evision\s+#?)?)(?P<revnum>([0-9]{4,7}))\b', |
| re.IGNORECASE | re.MULTILINE) |
| |
| |
| def GetReferencedRevisions(_mr, _refs): |
| """Load the referenced revision objects.""" |
| # For now we just autolink any revision hash without actually |
| # checking that such a revision exists, |
| # TODO(jrobbins): Hit crrev.com and check that the revision exists |
| # and show a rollover with revision info. |
| return None |
| |
| |
| def ExtractRevNums(_mr, autolink_regex_match): |
| """Return internal representation of a rev reference.""" |
| ref = autolink_regex_match.group('revnum') |
| logging.debug('revision ref = %s', ref) |
| return [ref] |
| |
| |
| def ReplaceRevisionRef( |
| mr, autolink_regex_match, _component_ref_artifacts): |
| """Return HTML markup for an autolink reference.""" |
| prefix = autolink_regex_match.group('prefix') |
| revnum = autolink_regex_match.group('revnum') |
| url = _GetRevisionURLFormat(mr.project).format(revnum=revnum) |
| content = revnum |
| if prefix: |
| content = '%s%s' % (prefix, revnum) |
| return [template_helpers.TextRun(content, tag='a', href=url)] |
| |
| |
| def _GetRevisionURLFormat(project): |
| # TODO(jrobbins): Expose a UI to customize it to point to whatever site |
| # hosts the source code. Also, site-wide default. |
| return (project.revision_url_format or settings.revision_url_format) |
| |
| |
| # Regular expression to detect issue references. |
| # Used to auto-link to other issues when displaying issue details. |
| # Matches "issue " when "issue" is not part of a larger word, or |
| # "issue #", or just a "#" when it is preceeded by a space. |
| _ISSUE_REF_RE = re.compile(r""" |
| (?P<prefix>\b(issues?|bugs?)[ \t]*(:|=)?) |
| ([ \t]*(?P<project_name>\b[-a-z0-9]+[:\#])? |
| (?P<number_sign>\#?) |
| (?P<local_id>\d+)\b |
| (,?[ \t]*(and|or)?)?)+""", re.IGNORECASE | re.VERBOSE) |
| |
| # This is for chromium.org's crbug.com shorthand domain. |
| _CRBUG_REF_RE = re.compile(r""" |
| (?P<prefix>\b(https?://)?crbug.com/) |
| ((?P<project_name>\b[-a-z0-9]+)(?P<separator>/))? |
| (?P<local_id>\d+)\b |
| (?P<anchor>\#c[0-9]+)?""", re.IGNORECASE | re.VERBOSE) |
| |
| # Once the overall issue reference has been detected, pick out the specific |
| # issue project:id items within it. Often there is just one, but the "and|or" |
| # syntax can allow multiple issues. |
| _SINGLE_ISSUE_REF_RE = re.compile(r""" |
| (?P<prefix>\b(issue|bug)[ \t]*)? |
| (?P<project_name>\b[-a-z0-9]+[:\#])? |
| (?P<number_sign>\#?) |
| (?P<local_id>\d+)\b""", re.IGNORECASE | re.VERBOSE) |
| |
| |
| def CurryGetReferencedIssues(services): |
| """Return a function to get ref'd issues with these services objects bound. |
| |
| Currying is a convienent way to give the callback access to the services |
| objects, but without requiring that all possible services objects be passed |
| through the autolink registry and functions. |
| |
| Args: |
| services: connection to issue, config, and project persistence layers. |
| |
| Returns: |
| A ready-to-use function that accepts the arguments that autolink |
| expects to pass to it. |
| """ |
| |
| def GetReferencedIssues(mr, ref_tuples): |
| """Return lists of open and closed issues referenced by these comments. |
| |
| Args: |
| mr: commonly used info parsed from the request. |
| ref_tuples: list of (project_name, local_id) tuples for each issue |
| that is mentioned in the comment text. The project_name may be None, |
| in which case the issue is assumed to be in the current project. |
| |
| Returns: |
| A list of open and closed issue dicts. |
| """ |
| ref_projects = services.project.GetProjectsByName( |
| mr.cnxn, |
| [(ref_pn or mr.project_name) for ref_pn, _ in ref_tuples]) |
| issue_ids, _misses = services.issue.ResolveIssueRefs( |
| mr.cnxn, ref_projects, mr.project_name, ref_tuples) |
| open_issues, closed_issues = ( |
| tracker_helpers.GetAllowedOpenedAndClosedIssues( |
| mr, issue_ids, services)) |
| |
| open_dict = {} |
| for issue in open_issues: |
| open_dict[_IssueProjectKey(issue.project_name, issue.local_id)] = issue |
| |
| closed_dict = {} |
| for issue in closed_issues: |
| closed_dict[_IssueProjectKey(issue.project_name, issue.local_id)] = issue |
| |
| logging.info('autolinking dicts %r and %r', open_dict, closed_dict) |
| |
| return open_dict, closed_dict |
| |
| return GetReferencedIssues |
| |
| |
| def _ParseProjectNameMatch(project_name): |
| """Process the passed project name and determine the best representation. |
| |
| Args: |
| project_name: a string with the project name matched in a regex |
| |
| Returns: |
| A minimal representation of the project name, None if no valid content. |
| """ |
| if not project_name: |
| return None |
| return project_name.lstrip().rstrip('#: \t\n') |
| |
| |
| def _ExtractProjectAndIssueIds( |
| autolink_regex_match, subregex, default_project_name=None): |
| """Convert a regex match for a textual reference into our internal form.""" |
| whole_str = autolink_regex_match.group(0) |
| refs = [] |
| for submatch in subregex.finditer(whole_str): |
| project_name = ( |
| _ParseProjectNameMatch(submatch.group('project_name')) or |
| default_project_name) |
| ref = (project_name, int(submatch.group('local_id'))) |
| refs.append(ref) |
| logging.info('issue ref = %s', ref) |
| |
| return refs |
| |
| |
| def ExtractProjectAndIssueIdsNormal(_mr, autolink_regex_match): |
| """Convert a regex match for a textual reference into our internal form.""" |
| return _ExtractProjectAndIssueIds( |
| autolink_regex_match, _SINGLE_ISSUE_REF_RE) |
| |
| |
| def ExtractProjectAndIssueIdsCrBug(_mr, autolink_regex_match): |
| """Convert a regex match for a textual reference into our internal form.""" |
| return _ExtractProjectAndIssueIds( |
| autolink_regex_match, _CRBUG_REF_RE, default_project_name='chromium') |
| |
| |
| # This uses project name to avoid a lookup on project ID in a function |
| # that has no services object. |
| def _IssueProjectKey(project_name, local_id): |
| """Make a dictionary key to identify a referenced issue.""" |
| return '%s:%d' % (project_name, local_id) |
| |
| |
| class IssueRefRun(object): |
| """A text run that links to a referenced issue.""" |
| |
| def __init__(self, issue, is_closed, project_name, content, anchor): |
| self.tag = 'a' |
| self.css_class = 'closed_ref' if is_closed else None |
| self.title = issue.summary |
| self.href = '/p/%s/issues/detail?id=%d%s' % ( |
| project_name, issue.local_id, anchor) |
| |
| self.content = content |
| if is_closed: |
| self.content = ' %s ' % self.content |
| |
| |
| def _ReplaceIssueRef( |
| autolink_regex_match, component_ref_artifacts, single_issue_regex, |
| default_project_name): |
| """Examine a textual reference and replace it with an autolink or not. |
| |
| Args: |
| autolink_regex_match: regex match for the textual reference. |
| component_ref_artifacts: result of earlier call to GetReferencedIssues. |
| single_issue_regex: regular expression to parse individual issue references |
| out of a multi-issue-reference phrase. E.g., "issues 12 and 34". |
| default_project_name: project name to use when not specified. |
| |
| Returns: |
| A list of IssueRefRuns and TextRuns to replace the textual |
| reference. If there is an issue to autolink to, we return an HTML |
| hyperlink. Otherwise, we the run will have the original plain |
| text. |
| """ |
| open_dict, closed_dict = {}, {} |
| if component_ref_artifacts: |
| open_dict, closed_dict = component_ref_artifacts |
| original = autolink_regex_match.group(0) |
| logging.info('called ReplaceIssueRef on %r', original) |
| result_runs = [] |
| pos = 0 |
| for submatch in single_issue_regex.finditer(original): |
| if submatch.start() >= pos: |
| if original[pos: submatch.start()]: |
| result_runs.append(template_helpers.TextRun( |
| original[pos: submatch.start()])) |
| replacement_run = _ReplaceSingleIssueRef( |
| submatch, open_dict, closed_dict, default_project_name) |
| result_runs.append(replacement_run) |
| pos = submatch.end() |
| |
| if original[pos:]: |
| result_runs.append(template_helpers.TextRun(original[pos:])) |
| |
| return result_runs |
| |
| |
| def ReplaceIssueRefNormal(mr, autolink_regex_match, component_ref_artifacts): |
| """Replaces occurances of 'issue 123' with link TextRuns as needed.""" |
| return _ReplaceIssueRef( |
| autolink_regex_match, component_ref_artifacts, |
| _SINGLE_ISSUE_REF_RE, mr.project_name) |
| |
| |
| def ReplaceIssueRefCrBug(_mr, autolink_regex_match, component_ref_artifacts): |
| """Replaces occurances of 'crbug.com/123' with link TextRuns as needed.""" |
| return _ReplaceIssueRef( |
| autolink_regex_match, component_ref_artifacts, |
| _CRBUG_REF_RE, 'chromium') |
| |
| |
| def _ReplaceSingleIssueRef( |
| submatch, open_dict, closed_dict, default_project_name): |
| """Replace one issue reference with a link, or the original text.""" |
| content = submatch.group(0) |
| project_name = submatch.group('project_name') |
| anchor = submatch.groupdict().get('anchor') or '' |
| if project_name: |
| project_name = project_name.lstrip().rstrip(':#') |
| else: |
| # We need project_name for the URL, even if it is not in the text. |
| project_name = default_project_name |
| |
| local_id = int(submatch.group('local_id')) |
| issue_key = _IssueProjectKey(project_name, local_id) |
| if issue_key in open_dict: |
| return IssueRefRun( |
| open_dict[issue_key], False, project_name, content, anchor) |
| elif issue_key in closed_dict: |
| return IssueRefRun( |
| closed_dict[issue_key], True, project_name, content, anchor) |
| else: # Don't link to non-existent issues. |
| return template_helpers.TextRun(content) |
| |
| |
| class Autolink(object): |
| """Maintains a registry of autolink syntax and can apply it to comments.""" |
| |
| def __init__(self): |
| self.registry = {} |
| |
| def RegisterComponent(self, component_name, artifact_lookup_function, |
| match_to_reference_function, autolink_re_subst_dict): |
| """Register all the autolink info for a software component. |
| |
| Args: |
| component_name: string name of software component, must be unique. |
| artifact_lookup_function: function to batch lookup all artifacts that |
| might have been referenced in a set of comments: |
| function(all_matches) -> referenced_artifacts |
| the referenced_artifacts will be pased to each subst function. |
| match_to_reference_function: convert a regex match object to |
| some internal representation of the artifact reference. |
| autolink_re_subst_dict: dictionary of regular expressions and |
| the substitution function that should be called for each match: |
| function(match, referenced_artifacts) -> replacement_markup |
| """ |
| self.registry[component_name] = (artifact_lookup_function, |
| match_to_reference_function, |
| autolink_re_subst_dict) |
| |
| def GetAllReferencedArtifacts( |
| self, mr, comment_text_list, max_total_length=_MAX_TOTAL_LENGTH): |
| """Call callbacks to lookup all artifacts possibly referenced. |
| |
| Args: |
| mr: information parsed out of the user HTTP request. |
| comment_text_list: list of comment content strings. |
| max_total_length: int max number of characters to accept: |
| if more than this, then skip autolinking entirely. |
| |
| Returns: |
| Opaque object that can be pased to MarkupAutolinks. It's |
| structure happens to be {component_name: artifact_list, ...}, |
| or the special value SKIP_LOOKUPS. |
| """ |
| total_len = sum(len(comment_text) for comment_text in comment_text_list) |
| if total_len > max_total_length: |
| return SKIP_LOOKUPS |
| |
| all_referenced_artifacts = {} |
| for comp, (lookup, match_to_refs, re_dict) in self.registry.items(): |
| refs = set() |
| for comment_text in comment_text_list: |
| for regex in re_dict: |
| for match in regex.finditer(comment_text): |
| additional_refs = match_to_refs(mr, match) |
| if additional_refs: |
| refs.update(additional_refs) |
| |
| all_referenced_artifacts[comp] = lookup(mr, refs) |
| |
| return all_referenced_artifacts |
| |
| def MarkupAutolinks(self, mr, text_runs, all_referenced_artifacts): |
| """Loop over components and regexes, applying all substitutions. |
| |
| Args: |
| mr: info parsed from the user's HTTP request. |
| text_runs: List of text runs for the user's comment. |
| all_referenced_artifacts: result of previous call to |
| GetAllReferencedArtifacts. |
| |
| Returns: |
| List of text runs for the entire user comment, some of which may have |
| attribures that cause them to render as links in render-rich-text.ezt. |
| """ |
| items = list(self.registry.items()) |
| items.sort() # Process components in determinate alphabetical order. |
| for component, (_lookup, _match_ref, re_subst_dict) in items: |
| if all_referenced_artifacts == SKIP_LOOKUPS: |
| component_ref_artifacts = None |
| else: |
| component_ref_artifacts = all_referenced_artifacts[component] |
| for regex, subst_fun in re_subst_dict.items(): |
| text_runs = self._ApplySubstFunctionToRuns( |
| text_runs, regex, subst_fun, mr, component_ref_artifacts) |
| |
| return text_runs |
| |
| def _ApplySubstFunctionToRuns( |
| self, text_runs, regex, subst_fun, mr, component_ref_artifacts): |
| """Apply autolink regex and substitution function to each text run. |
| |
| Args: |
| text_runs: list of TextRun objects with parts of the original comment. |
| regex: Regular expression for detecting textual references to artifacts. |
| subst_fun: function to return autolink markup, or original text. |
| mr: common info parsed from the user HTTP request. |
| component_ref_artifacts: already-looked-up destination artifacts to use |
| when computing substitution text. |
| |
| Returns: |
| A new list with more and smaller runs, some of which may have tag |
| and link attributes set. |
| """ |
| result_runs = [] |
| for run in text_runs: |
| content = run.content |
| if run.tag: |
| # This chunk has already been substituted, don't allow nested |
| # autolinking to mess up our output. |
| result_runs.append(run) |
| else: |
| pos = 0 |
| for match in regex.finditer(content): |
| if match.start() > pos: |
| result_runs.append(template_helpers.TextRun( |
| content[pos: match.start()])) |
| replacement_runs = subst_fun(mr, match, component_ref_artifacts) |
| result_runs.extend(replacement_runs) |
| pos = match.end() |
| |
| if run.content[pos:]: # Keep any text that came after the last match |
| result_runs.append(template_helpers.TextRun(run.content[pos:])) |
| |
| # TODO(jrobbins): ideally we would merge consecutive plain text runs |
| # so that regexes can match across those run boundaries. |
| |
| return result_runs |
| |
| |
| def RegisterAutolink(services): |
| """Register all the autolink hooks.""" |
| # The order of the RegisterComponent() calls does not matter so that we could |
| # do this registration from separate modules in the future if needed. |
| # Priority order of application is determined by the names of the registered |
| # handers, which are sorted in MarkupAutolinks(). |
| |
| services.autolink.RegisterComponent( |
| '01-tracker-crbug', |
| CurryGetReferencedIssues(services), |
| ExtractProjectAndIssueIdsCrBug, |
| {_CRBUG_REF_RE: ReplaceIssueRefCrBug}) |
| |
| services.autolink.RegisterComponent( |
| '02-linkify-full-urls', |
| lambda request, mr: None, |
| lambda mr, match: None, |
| {autolink_constants.IS_A_LINK_RE: Linkify}) |
| |
| services.autolink.RegisterComponent( |
| '03-linkify-user-profiles-or-mailto', |
| CurryGetReferencedUsers(services), |
| lambda _mr, match: [match.group(0)], |
| {autolink_constants.IS_IMPLIED_EMAIL_RE: LinkifyEmail}) |
| |
| services.autolink.RegisterComponent( |
| '04-tracker-regular', |
| CurryGetReferencedIssues(services), |
| ExtractProjectAndIssueIdsNormal, |
| {_ISSUE_REF_RE: ReplaceIssueRefNormal}) |
| |
| services.autolink.RegisterComponent( |
| '05-linkify-shorthand', |
| lambda request, mr: None, |
| lambda mr, match: None, |
| {autolink_constants.IS_A_SHORT_LINK_RE: Linkify, |
| autolink_constants.IS_A_NUMERIC_SHORT_LINK_RE: Linkify, |
| autolink_constants.IS_IMPLIED_LINK_RE: Linkify, |
| }) |
| |
| services.autolink.RegisterComponent( |
| '06-versioncontrol', |
| GetReferencedRevisions, |
| ExtractRevNums, |
| {_GIT_HASH_RE: ReplaceRevisionRef, |
| _SVN_REF_RE: ReplaceRevisionRef}) |