Project import generated by Copybara.
GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/search/ast2ast.py b/search/ast2ast.py
new file mode 100644
index 0000000..bf4de4f
--- /dev/null
+++ b/search/ast2ast.py
@@ -0,0 +1,558 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+"""Convert a user's issue search AST into a simplified AST.
+
+This phase of query processing simplifies the user's query by looking up
+the int IDs of any labels, statuses, or components that are mentioned by
+name in the original query. The data needed for lookups is typically cached
+in RAM in each backend job, so this will not put much load on the DB. The
+simplified ASTs are later converted into SQL which is simpler and has
+fewer joins.
+
+The simplified main query is better because:
+ + It is clearly faster, especially in the most common case where config
+ data is in RAM.
+ + Since less RAM is used to process the main query on each shard, query
+ execution time is more consistent with less variability under load. Less
+ variability is good because the user must wait for the slowest shard.
+ + The config tables (LabelDef, StatusDef, etc.) exist only on the primary DB,
+ so they cannot be mentioned in a query that runs on a shard.
+ + The query string itself is shorter when numeric IDs are substituted, which
+ means that we can handle user queries with long lists of labels in a
+ reasonable-sized query.
+ + It bisects the complexity of the operation: it's easier to test and debug
+ the lookup and simplification logic plus the main query logic this way
+ than it would be to deal with an even more complex SQL main query.
+"""
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+import collections
+import logging
+import re
+
+from framework import exceptions
+from proto import ast_pb2
+from proto import tracker_pb2
+# TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could
+# remove this dep.
+from search import query2ast
+from tracker import tracker_bizobj
+from features import federated
+
+
+def PreprocessAST(
+ cnxn, query_ast, project_ids, services, harmonized_config, is_member=True):
+ """Preprocess the query by doing lookups so that the SQL query is simpler.
+
+ Args:
+ cnxn: connection to SQL database.
+ query_ast: user query abstract syntax tree parsed by query2ast.py.
+ project_ids: collection of int project IDs to use to look up status values
+ and labels.
+ services: Connections to persistence layer for users and configs.
+ harmonized_config: harmonized config for all projects being searched.
+ is_member: True if user is a member of all the projects being searched,
+ so they can do user substring searches.
+
+ Returns:
+ A new QueryAST PB with simplified conditions. Specifically, string values
+ for labels, statuses, and components are replaced with the int IDs of
+ those items. Also, is:open is distilled down to
+ status_id != closed_status_ids.
+ """
+ new_conjs = []
+ for conj in query_ast.conjunctions:
+ new_conds = [
+ _PreprocessCond(
+ cnxn, cond, project_ids, services, harmonized_config, is_member)
+ for cond in conj.conds]
+ new_conjs.append(ast_pb2.Conjunction(conds=new_conds))
+
+ return ast_pb2.QueryAST(conjunctions=new_conjs)
+
+
+def _PreprocessIsOpenCond(
+ cnxn, cond, project_ids, services, _harmonized_config, _is_member):
+ """Preprocess an is:open cond into status_id != closed_status_ids."""
+ if project_ids:
+ closed_status_ids = []
+ for project_id in project_ids:
+ closed_status_ids.extend(services.config.LookupClosedStatusIDs(
+ cnxn, project_id))
+ else:
+ closed_status_ids = services.config.LookupClosedStatusIDsAnyProject(cnxn)
+
+ # Invert the operator, because we're comparing against *closed* statuses.
+ if cond.op == ast_pb2.QueryOp.EQ:
+ op = ast_pb2.QueryOp.NE
+ elif cond.op == ast_pb2.QueryOp.NE:
+ op = ast_pb2.QueryOp.EQ
+ else:
+ raise MalformedQuery('Open condition got nonsensical op %r' % cond.op)
+
+ return ast_pb2.Condition(
+ op=op, field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['status_id']],
+ int_values=closed_status_ids)
+
+
+def _PreprocessIsBlockedCond(
+ _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
+ """Preprocess an is:blocked cond into issues that are blocked."""
+ if cond.op == ast_pb2.QueryOp.EQ:
+ op = ast_pb2.QueryOp.IS_DEFINED
+ elif cond.op == ast_pb2.QueryOp.NE:
+ op = ast_pb2.QueryOp.IS_NOT_DEFINED
+ else:
+ raise MalformedQuery('Blocked condition got nonsensical op %r' % cond.op)
+
+ return ast_pb2.Condition(
+ op=op, field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']])
+
+
+def _PreprocessIsSpamCond(
+ _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
+ """Preprocess an is:spam cond into is_spam == 1."""
+ if cond.op == ast_pb2.QueryOp.EQ:
+ int_values = [1]
+ elif cond.op == ast_pb2.QueryOp.NE:
+ int_values = [0]
+ else:
+ raise MalformedQuery('Spam condition got nonsensical op %r' % cond.op)
+
+ return ast_pb2.Condition(
+ op=ast_pb2.QueryOp.EQ,
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['is_spam']],
+ int_values=int_values)
+
+
+def _PreprocessBlockedOnCond(
+ cnxn, cond, project_ids, services, _harmonized_config, _is_member):
+ """Preprocess blockedon=xyz and has:blockedon conds.
+
+ Preprocesses blockedon=xyz cond into blockedon_id:issue_ids.
+ Preprocesses has:blockedon cond into issues that are blocked on other issues.
+ """
+ issue_ids, ext_issue_ids = _GetIssueIDsFromLocalIdsCond(cnxn,
+ cond, project_ids, services)
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']],
+ int_values=issue_ids,
+ str_values=ext_issue_ids)
+
+
+def _PreprocessBlockingCond(
+ cnxn, cond, project_ids, services, _harmonized_config, _is_member):
+ """Preprocess blocking=xyz and has:blocking conds.
+
+ Preprocesses blocking=xyz cond into blocking_id:issue_ids.
+ Preprocesses has:blocking cond into issues that are blocking other issues.
+ """
+ issue_ids, ext_issue_ids = _GetIssueIDsFromLocalIdsCond(cnxn,
+ cond, project_ids, services)
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']],
+ int_values=issue_ids,
+ str_values=ext_issue_ids)
+
+
+def _PreprocessMergedIntoCond(
+ cnxn, cond, project_ids, services, _harmonized_config, _is_member):
+ """Preprocess mergedinto=xyz and has:mergedinto conds.
+
+ Preprocesses mergedinto=xyz cond into mergedinto_id:issue_ids.
+ Preprocesses has:mergedinto cond into has:mergedinto_id.
+ """
+ issue_ids, ext_issue_ids = _GetIssueIDsFromLocalIdsCond(cnxn,
+ cond, project_ids, services)
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['mergedinto_id']],
+ int_values=issue_ids,
+ str_values=ext_issue_ids)
+
+
+def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services):
+ """Returns global IDs from the local IDs provided in the cond."""
+ # Get {project_name: project} for all projects in project_ids.
+ ids_to_projects = services.project.GetProjects(cnxn, project_ids)
+ ref_projects = {pb.project_name: pb for pb in ids_to_projects.values()}
+ # Populate default_project_name if there is only one project id provided.
+ default_project_name = None
+ if len(ref_projects) == 1:
+ default_project_name = list(ref_projects.values())[0].project_name
+
+ # Populate refs with (project_name, local_id) pairs.
+ refs = []
+ # Populate ext_issue_ids with strings like 'b/1234'.
+ ext_issue_ids = []
+ for val in cond.str_values:
+ try:
+ project_name, local_id = tracker_bizobj.ParseIssueRef(val)
+ if not project_name:
+ if not default_project_name:
+ # TODO(rmistry): Support the below.
+ raise MalformedQuery(
+ 'Searching for issues accross multiple/all projects without '
+ 'project prefixes is ambiguous and is currently not supported.')
+ project_name = default_project_name
+ refs.append((project_name, int(local_id)))
+ except MalformedQuery as e:
+ raise e
+ # Can't parse issue id, try external issue pattern.
+ except ValueError as e:
+ if federated.FromShortlink(val):
+ ext_issue_ids.append(val)
+ else:
+ raise MalformedQuery('Could not parse issue reference: %s' % val)
+
+ issue_ids, _misses = services.issue.ResolveIssueRefs(
+ cnxn, ref_projects, default_project_name, refs)
+ return issue_ids, ext_issue_ids
+
+
+def _PreprocessStatusCond(
+ cnxn, cond, project_ids, services, _harmonized_config, _is_member):
+ """Preprocess a status=names cond into status_id=IDs."""
+ if project_ids:
+ status_ids = []
+ for project_id in project_ids:
+ status_ids.extend(services.config.LookupStatusIDs(
+ cnxn, project_id, cond.str_values))
+ else:
+ status_ids = services.config.LookupStatusIDsAnyProject(
+ cnxn, cond.str_values)
+
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['status_id']],
+ int_values=status_ids)
+
+
+def _IsEqualityOp(op):
+ """Return True for EQ and NE."""
+ return op in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE)
+
+
+def _IsDefinedOp(op):
+ """Return True for IS_DEFINED and IS_NOT_DEFINED."""
+ return op in (ast_pb2.QueryOp.IS_DEFINED, ast_pb2.QueryOp.IS_NOT_DEFINED)
+
+
+def _TextOpToIntOp(op):
+ """If a query is optimized from string to ID matching, use an equality op."""
+ if op == ast_pb2.QueryOp.TEXT_HAS or op == ast_pb2.QueryOp.KEY_HAS:
+ return ast_pb2.QueryOp.EQ
+ elif op == ast_pb2.QueryOp.NOT_TEXT_HAS:
+ return ast_pb2.QueryOp.NE
+ return op
+
+
+def _MakePrefixRegex(cond):
+ """Return a regex to match strings that start with cond values."""
+ all_prefixes = '|'.join(map(re.escape, cond.str_values))
+ return re.compile(r'(%s)-.+' % all_prefixes, re.I)
+
+
+def _MakeKeyValueRegex(cond):
+ """Return a regex to match the first token and remaining text separately."""
+ keys, values = list(zip(*[x.split('-', 1) for x in cond.str_values]))
+ if len(set(keys)) != 1:
+ raise MalformedQuery(
+ "KeyValue query with multiple different keys: %r" % cond.str_values)
+ all_values = '|'.join(map(re.escape, values))
+ return re.compile(r'%s-.*\b(%s)\b.*' % (keys[0], all_values), re.I)
+
+
+def _MakeWordBoundaryRegex(cond):
+ """Return a regex to match the cond values as whole words."""
+ all_words = '|'.join(map(re.escape, cond.str_values))
+ return re.compile(r'.*\b(%s)\b.*' % all_words, re.I)
+
+
+def _PreprocessLabelCond(
+ cnxn, cond, project_ids, services, _harmonized_config, _is_member):
+ """Preprocess a label=names cond into label_id=IDs."""
+ if project_ids:
+ label_ids = []
+ for project_id in project_ids:
+ if _IsEqualityOp(cond.op):
+ label_ids.extend(services.config.LookupLabelIDs(
+ cnxn, project_id, cond.str_values))
+ elif _IsDefinedOp(cond.op):
+ label_ids.extend(services.config.LookupIDsOfLabelsMatching(
+ cnxn, project_id, _MakePrefixRegex(cond)))
+ elif cond.op == ast_pb2.QueryOp.KEY_HAS:
+ label_ids.extend(services.config.LookupIDsOfLabelsMatching(
+ cnxn, project_id, _MakeKeyValueRegex(cond)))
+ else:
+ label_ids.extend(services.config.LookupIDsOfLabelsMatching(
+ cnxn, project_id, _MakeWordBoundaryRegex(cond)))
+ else:
+ if _IsEqualityOp(cond.op):
+ label_ids = services.config.LookupLabelIDsAnyProject(
+ cnxn, cond.str_values)
+ elif _IsDefinedOp(cond.op):
+ label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
+ cnxn, _MakePrefixRegex(cond))
+ elif cond.op == ast_pb2.QueryOp.KEY_HAS:
+ label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
+ cnxn, _MakeKeyValueRegex(cond))
+ else:
+ label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
+ cnxn, _MakeWordBoundaryRegex(cond))
+
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
+ int_values=label_ids)
+
+
+def _PreprocessComponentCond(
+ cnxn, cond, project_ids, services, harmonized_config, _is_member):
+ """Preprocess a component= or component:name cond into component_id=IDs."""
+ exact = _IsEqualityOp(cond.op)
+ component_ids = []
+ if project_ids:
+ # We are searching within specific projects, so harmonized_config
+ # holds the config data for all those projects.
+ for comp_path in cond.str_values:
+ component_ids.extend(tracker_bizobj.FindMatchingComponentIDs(
+ comp_path, harmonized_config, exact=exact))
+ else:
+ # We are searching across the whole site, so we have no harmonized_config
+ # to use.
+ component_ids = services.config.FindMatchingComponentIDsAnyProject(
+ cnxn, cond.str_values, exact=exact)
+
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['component_id']],
+ int_values=component_ids)
+
+
+def _PreprocessExactUsers(
+ cnxn, cond, user_service, id_fields, is_member):
+ """Preprocess a foo=emails cond into foo_id=IDs, if exact user match.
+
+ This preprocesing step converts string conditions to int ID conditions.
+ E.g., [owner=email] to [owner_id=ID]. It only does it in cases
+ where (a) the email was "me", so it was already converted to an string of
+ digits in the search pipeline, or (b) it is "user@domain" which resolves to
+ a known Monorail user. It is also possible to search for, e.g.,
+ [owner:substring], but such searches remain 'owner' field searches rather
+ than 'owner_id', and they cannot be combined with the "me" keyword.
+
+ Args:
+ cnxn: connection to the DB.
+ cond: original parsed query Condition PB.
+ user_service: connection to user persistence layer.
+ id_fields: list of the search fields to use if the conversion to IDs
+ succeed.
+ is_member: True if user is a member of all the projects being searchers,
+ so they can do user substring searches.
+
+ Returns:
+ A new Condition PB that checks the id_field. Or, the original cond.
+
+ Raises:
+ MalformedQuery: A non-member used a query term that could be used to
+ guess full user email addresses.
+ """
+ op = _TextOpToIntOp(cond.op)
+ if _IsDefinedOp(op):
+ # No need to look up any IDs if we are just testing for any defined value.
+ return ast_pb2.Condition(op=op, field_defs=id_fields,
+ key_suffix=cond.key_suffix,
+ phase_name=cond.phase_name)
+
+ # This preprocessing step is only for ops that compare whole values, not
+ # substrings.
+ if not _IsEqualityOp(op):
+ logging.info('could not convert to IDs because op is %r', op)
+ if not is_member:
+ raise MalformedQuery('Only project members may compare user strings')
+ return cond
+
+ user_ids = []
+ for val in cond.str_values:
+ try:
+ user_ids.append(int(val))
+ except ValueError:
+ try:
+ user_ids.append(user_service.LookupUserID(cnxn, val))
+ except exceptions.NoSuchUserException:
+ if not is_member and val != 'me' and not val.startswith('@'):
+ logging.info('could not convert user %r to int ID', val)
+ if '@' in val:
+ raise MalformedQuery('User email address not found')
+ else:
+ raise MalformedQuery(
+ 'Only project members may search for user substrings')
+ return cond # preprocessing failed, stick with the original cond.
+
+ return ast_pb2.MakeCond(
+ op, id_fields, [], user_ids, key_suffix=cond.key_suffix,
+ phase_name=cond.phase_name)
+
+
+def _PreprocessOwnerCond(
+ cnxn, cond, _project_ids, services, _harmonized_config, is_member):
+ """Preprocess a owner=emails cond into owner_id=IDs, if exact user match."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user, [query2ast.BUILTIN_ISSUE_FIELDS['owner_id']],
+ is_member)
+
+
+def _PreprocessCcCond(
+ cnxn, cond, _project_ids, services, _harmonized_config, is_member):
+ """Preprocess a cc=emails cond into cc_id=IDs, if exact user match."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user, [query2ast.BUILTIN_ISSUE_FIELDS['cc_id']],
+ is_member)
+
+
+def _PreprocessReporterCond(
+ cnxn, cond, _project_ids, services, _harmonized_config, is_member):
+ """Preprocess a reporter=emails cond into reporter_id=IDs, if exact."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user,
+ [query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']], is_member)
+
+
+def _PreprocessStarredByCond(
+ cnxn, cond, _project_ids, services, _harmonized_config, is_member):
+ """Preprocess a starredby=emails cond into starredby_id=IDs, if exact."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user,
+ [query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']], is_member)
+
+
+def _PreprocessCommentByCond(
+ cnxn, cond, _project_ids, services, _harmonized_config, is_member):
+ """Preprocess a commentby=emails cond into commentby_id=IDs, if exact."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user,
+ [query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']], is_member)
+
+
+def _PreprocessHotlistCond(
+ cnxn, cond, _project_ids, services, _harmonized_config, _is_member):
+ """Preprocess hotlist query
+
+ Preprocesses a hotlist query in the form:
+ 'hotlist=<user_email>:<hotlist-name>,<hotlist-name>,<user2_email>:...
+ into hotlist_id=IDs, if exact.
+ """
+ # TODO(jojwang): add support for searches that don't contain domain names.
+ # eg jojwang:hotlist-name
+ users_to_hotlists = collections.defaultdict(list)
+ cur_user = ''
+ for val in cond.str_values:
+ if ':' in val:
+ cur_user, hotlists_str = val.split(':', 1)
+ else:
+ hotlists_str = val
+ try:
+ users_to_hotlists[int(cur_user)].append(hotlists_str)
+ except ValueError:
+ try:
+ user_id = services.user.LookupUserID(cnxn, cur_user)
+ users_to_hotlists[user_id].append(hotlists_str)
+ except exceptions.NoSuchUserException:
+ logging.info('could not convert user %r to int ID', val)
+ return cond
+ hotlist_ids = set()
+ for user_id, hotlists in users_to_hotlists.items():
+ if not hotlists[0]:
+ user_hotlists = services.features.GetHotlistsByUserID(cnxn, user_id)
+ user_hotlist_ids = [hotlist.hotlist_id for hotlist in user_hotlists if
+ user_id in hotlist.owner_ids]
+ else:
+ user_hotlist_ids = list(services.features.LookupHotlistIDs(
+ cnxn, hotlists, [user_id]).values())
+ for hotlist_id in user_hotlist_ids:
+ hotlist_ids.add(hotlist_id)
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['hotlist_id']],
+ int_values=list(hotlist_ids))
+
+
+def _PreprocessCustomCond(cnxn, cond, services, is_member):
+ """Preprocess a custom_user_field=emails cond into IDs, if exact matches."""
+ # TODO(jrobbins): better support for ambiguous fields.
+ # For now, if any field is USER_TYPE and the value being searched
+ # for is the email address of an existing account, it will convert
+ # to a user ID and we go with exact ID matching. Otherwise, we
+ # leave the cond as-is for ast2select to do string matching on.
+ user_field_defs = [fd for fd in cond.field_defs
+ if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE]
+ if user_field_defs:
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user, user_field_defs, is_member)
+
+ approval_field_defs = [fd for fd in cond.field_defs
+ if (fd.field_type ==
+ tracker_pb2.FieldTypes.APPROVAL_TYPE)]
+ if approval_field_defs:
+ if cond.key_suffix in [query2ast.APPROVER_SUFFIX, query2ast.SET_BY_SUFFIX]:
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user, approval_field_defs, is_member)
+
+ return cond
+
+
+_PREPROCESSORS = {
+ 'open': _PreprocessIsOpenCond,
+ 'blocked': _PreprocessIsBlockedCond,
+ 'spam': _PreprocessIsSpamCond,
+ 'blockedon': _PreprocessBlockedOnCond,
+ 'blocking': _PreprocessBlockingCond,
+ 'mergedinto': _PreprocessMergedIntoCond,
+ 'status': _PreprocessStatusCond,
+ 'label': _PreprocessLabelCond,
+ 'component': _PreprocessComponentCond,
+ 'owner': _PreprocessOwnerCond,
+ 'cc': _PreprocessCcCond,
+ 'reporter': _PreprocessReporterCond,
+ 'starredby': _PreprocessStarredByCond,
+ 'commentby': _PreprocessCommentByCond,
+ 'hotlist': _PreprocessHotlistCond,
+ }
+
+
+def _PreprocessCond(
+ cnxn, cond, project_ids, services, harmonized_config, is_member):
+ """Preprocess query by looking up status, label and component IDs."""
+ # All the fields in a cond share the same name because they are parsed
+ # from a user query term, and the term syntax allows just one field name.
+ field_name = cond.field_defs[0].field_name
+ assert all(fd.field_name == field_name for fd in cond.field_defs)
+
+ # Case 1: The user is searching custom fields.
+ if any(fd.field_id for fd in cond.field_defs):
+ # There can't be a mix of custom and built-in fields because built-in
+ # field names are reserved and take priority over any conflicting ones.
+ assert all(fd.field_id for fd in cond.field_defs)
+ return _PreprocessCustomCond(cnxn, cond, services, is_member)
+
+ # Case 2: The user is searching a built-in field.
+ preproc = _PREPROCESSORS.get(field_name)
+ if preproc:
+ # We have a preprocessor for that built-in field.
+ return preproc(
+ cnxn, cond, project_ids, services, harmonized_config, is_member)
+ else:
+ # We don't have a preprocessor for it.
+ return cond
+
+
+class MalformedQuery(ValueError):
+ pass