| # Copyright 2016 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Convert a user's issue search AST into a simplified AST. |
| |
| This phase of query processing simplifies the user's query by looking up |
| the int IDs of any labels, statuses, or components that are mentioned by |
| name in the original query. The data needed for lookups is typically cached |
| in RAM in each backend job, so this will not put much load on the DB. The |
| simplified ASTs are later converted into SQL which is simpler and has |
| fewer joins. |
| |
| The simplified main query is better because: |
| + It is clearly faster, especially in the most common case where config |
| data is in RAM. |
| + Since less RAM is used to process the main query on each shard, query |
| execution time is more consistent with less variability under load. Less |
| variability is good because the user must wait for the slowest shard. |
| + The config tables (LabelDef, StatusDef, etc.) exist only on the primary DB, |
| so they cannot be mentioned in a query that runs on a shard. |
| + The query string itself is shorter when numeric IDs are substituted, which |
| means that we can handle user queries with long lists of labels in a |
| reasonable-sized query. |
| + It bisects the complexity of the operation: it's easier to test and debug |
| the lookup and simplification logic plus the main query logic this way |
| than it would be to deal with an even more complex SQL main query. |
| """ |
| from __future__ import print_function |
| from __future__ import division |
| from __future__ import absolute_import |
| |
| import collections |
| import logging |
| import re |
| |
| from framework import exceptions |
| from mrproto import ast_pb2 |
| from mrproto import tracker_pb2 |
| # TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could |
| # remove this dep. |
| from search import query2ast |
| from tracker import tracker_bizobj |
| from features import federated |
| |
| |
def PreprocessAST(
    cnxn, query_ast, project_ids, services, harmonized_config, is_member=True):
  """Build a simplified copy of a parsed user query.

  Each condition in each conjunction of the query is passed through
  _PreprocessCond, and the results are assembled into a new QueryAST.

  Args:
    cnxn: connection to SQL database.
    query_ast: user query abstract syntax tree parsed by query2ast.py.
    project_ids: collection of int project IDs used to look up status values
        and labels.
    services: connections to the persistence layer for users and configs.
    harmonized_config: harmonized config for all projects being searched.
    is_member: True if the user is a member of all the projects being
        searched, so they can do user substring searches.

  Returns:
    A new QueryAST PB with simplified conditions.  Specifically, string values
    for labels, statuses, and components are replaced with the int IDs of
    those items.  Also, is:open is distilled down to
    status_id != closed_status_ids.
  """
  simplified_conjs = []
  for conjunction in query_ast.conjunctions:
    simplified_conds = []
    for cond in conjunction.conds:
      simplified_conds.append(_PreprocessCond(
          cnxn, cond, project_ids, services, harmonized_config, is_member))
    simplified_conjs.append(ast_pb2.Conjunction(conds=simplified_conds))

  return ast_pb2.QueryAST(conjunctions=simplified_conjs)
| |
| |
def _PreprocessIsOpenCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite an is:open cond as a comparison against closed status IDs.

  Raises:
    MalformedQuery: if the cond's operator is neither EQ nor NE.
  """
  config_service = services.config
  if not project_ids:
    closed_ids = config_service.LookupClosedStatusIDsAnyProject(cnxn)
  else:
    closed_ids = []
    for pid in project_ids:
      closed_ids.extend(config_service.LookupClosedStatusIDs(cnxn, pid))

  # The lookup yields *closed* statuses, so "is open" becomes "status is not
  # one of these" and "is not open" becomes "status is one of these".
  wants_open = cond.op == ast_pb2.QueryOp.EQ
  if not wants_open and not cond.op == ast_pb2.QueryOp.NE:
    raise MalformedQuery('Open condition got nonsensical op %r' % cond.op)
  inverted_op = ast_pb2.QueryOp.NE if wants_open else ast_pb2.QueryOp.EQ

  status_field = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=inverted_op, field_defs=[status_field], int_values=closed_ids)
| |
| |
def _PreprocessIsBlockedCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
  """Rewrite an is:blocked cond as a defined/not-defined test on blockedon_id.

  Raises:
    MalformedQuery: if the cond's operator is neither EQ nor NE.
  """
  wants_blocked = cond.op == ast_pb2.QueryOp.EQ
  if not wants_blocked and not cond.op == ast_pb2.QueryOp.NE:
    raise MalformedQuery('Blocked condition got nonsensical op %r' % cond.op)

  if wants_blocked:
    defined_op = ast_pb2.QueryOp.IS_DEFINED
  else:
    defined_op = ast_pb2.QueryOp.IS_NOT_DEFINED

  blockedon_field = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(op=defined_op, field_defs=[blockedon_field])
| |
| |
def _PreprocessIsSpamCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
  """Rewrite an is:spam cond as is_spam == 1, or is_spam == 0 when negated.

  Raises:
    MalformedQuery: if the cond's operator is neither EQ nor NE.
  """
  wants_spam = cond.op == ast_pb2.QueryOp.EQ
  if not wants_spam and not cond.op == ast_pb2.QueryOp.NE:
    raise MalformedQuery('Spam condition got nonsensical op %r' % cond.op)

  # The output op is always EQ; negation is expressed through the flag value.
  spam_flag = 1 if wants_spam else 0
  spam_field = query2ast.BUILTIN_ISSUE_FIELDS['is_spam']
  return ast_pb2.Condition(
      op=ast_pb2.QueryOp.EQ, field_defs=[spam_field], int_values=[spam_flag])
| |
| |
def _PreprocessBlockedOnCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Simplify blockedon=xyz and has:blockedon conds.

  A blockedon=xyz cond becomes blockedon_id:issue_ids, and a has:blockedon
  cond becomes a test for issues that are blocked on other issues.
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  global_issue_ids, external_refs = resolved
  int_op = _TextOpToIntOp(cond.op)
  target_field = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(
      op=int_op,
      field_defs=[target_field],
      int_values=global_issue_ids,
      str_values=external_refs)
| |
| |
def _PreprocessBlockingCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Simplify blocking=xyz and has:blocking conds.

  A blocking=xyz cond becomes blocking_id:issue_ids, and a has:blocking cond
  becomes a test for issues that are blocking other issues.
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  global_issue_ids, external_refs = resolved
  int_op = _TextOpToIntOp(cond.op)
  target_field = query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']
  return ast_pb2.Condition(
      op=int_op,
      field_defs=[target_field],
      int_values=global_issue_ids,
      str_values=external_refs)
| |
| |
def _PreprocessMergedIntoCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Simplify mergedinto=xyz and has:mergedinto conds.

  A mergedinto=xyz cond becomes mergedinto_id:issue_ids, and a has:mergedinto
  cond becomes has:mergedinto_id.
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  global_issue_ids, external_refs = resolved
  int_op = _TextOpToIntOp(cond.op)
  target_field = query2ast.BUILTIN_ISSUE_FIELDS['mergedinto_id']
  return ast_pb2.Condition(
      op=int_op,
      field_defs=[target_field],
      int_values=global_issue_ids,
      str_values=external_refs)
| |
| |
def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services):
  """Resolve the issue references in cond.str_values.

  Args:
    cnxn: connection to SQL database.
    cond: Condition PB whose str_values hold issue references.
    project_ids: int IDs of the projects being searched.
    services: connections to the persistence layer.

  Returns:
    A pair (issue_ids, ext_issue_ids): the first item returned by
    services.issue.ResolveIssueRefs for the parsed references, and the list
    of values that were accepted as external shortlinks instead.

  Raises:
    MalformedQuery: if a reference has no project prefix while more than one
        (or no) project is being searched, or if a value is neither a
        parseable issue reference nor an external shortlink.
  """
  # Index the searched projects by name: {project_name: project}.
  projects_by_id = services.project.GetProjects(cnxn, project_ids)
  projects_by_name = {
      project.project_name: project for project in projects_by_id.values()}

  # A reference without a prefix is only unambiguous with exactly one project.
  sole_project_name = None
  if len(projects_by_name) == 1:
    sole_project_name = list(projects_by_name.values())[0].project_name

  local_refs = []  # (project_name, local_id) pairs.
  external_refs = []  # Strings like 'b/1234'.
  for ref_str in cond.str_values:
    try:
      project_name, local_id = tracker_bizobj.ParseIssueRef(ref_str)
      if not project_name:
        if not sole_project_name:
          # TODO(rmistry): Support the below.
          raise MalformedQuery(
              'Searching for issues accross multiple/all projects without '
              'project prefixes is ambiguous and is currently not supported.')
        project_name = sole_project_name
      local_refs.append((project_name, int(local_id)))
    except MalformedQuery:
      # MalformedQuery is itself a ValueError, so it must be let through
      # before the generic ValueError handler below can claim it.
      raise
    except ValueError:
      # Not a parseable issue ID; see whether it is an external shortlink.
      if not federated.FromShortlink(ref_str):
        raise MalformedQuery('Could not parse issue reference: %s' % ref_str)
      external_refs.append(ref_str)

  resolved_ids, _unresolved = services.issue.ResolveIssueRefs(
      cnxn, projects_by_name, sole_project_name, local_refs)
  return resolved_ids, external_refs
| |
| |
def _PreprocessStatusCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite a status=names cond as status_id=IDs."""
  if not project_ids:
    matching_ids = services.config.LookupStatusIDsAnyProject(
        cnxn, cond.str_values)
  else:
    # Gather the IDs for the named statuses from each searched project.
    matching_ids = [
        status_id
        for pid in project_ids
        for status_id in services.config.LookupStatusIDs(
            cnxn, pid, cond.str_values)]

  int_op = _TextOpToIntOp(cond.op)
  status_field = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[status_field], int_values=matching_ids)
| |
| |
def _IsEqualityOp(op):
  """Tell whether op is one of the two equality operators, EQ or NE."""
  equality_ops = (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE)
  return op in equality_ops
| |
| |
def _IsDefinedOp(op):
  """Tell whether op is IS_DEFINED or IS_NOT_DEFINED."""
  defined_ops = (
      ast_pb2.QueryOp.IS_DEFINED, ast_pb2.QueryOp.IS_NOT_DEFINED)
  return op in defined_ops
| |
| |
def _TextOpToIntOp(op):
  """Map a text-matching op to the equality op used for ID matching.

  TEXT_HAS and KEY_HAS become EQ, NOT_TEXT_HAS becomes NE, and every other
  op is returned unchanged.
  """
  if op in (ast_pb2.QueryOp.TEXT_HAS, ast_pb2.QueryOp.KEY_HAS):
    return ast_pb2.QueryOp.EQ
  if op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    return ast_pb2.QueryOp.NE
  return op
| |
| |
def _MakePrefixRegex(cond):
  """Compile a pattern for strings of the form '<cond value>-<anything>'.

  The cond values are matched literally and without regard to case, and at
  least one character must follow the dash.
  """
  escaped_prefixes = [re.escape(prefix) for prefix in cond.str_values]
  pattern = r'(%s)-.+' % '|'.join(escaped_prefixes)
  return re.compile(pattern, re.I)
| |
| |
def _MakeKeyValueRegex(cond):
  """Return a regex to match the first token and remaining text separately.

  Every entry of cond.str_values must look like 'key-value', and all entries
  must share one key (the text before the first dash).  The compiled pattern
  matches that key, a dash, and then any of the values as a whole word
  somewhere in the rest of the string, ignoring case.

  Args:
    cond: Condition PB whose str_values are 'key-value' strings.

  Returns:
    A compiled, case-insensitive regular expression object.

  Raises:
    MalformedQuery: if there are no values, if any value lacks a dash, or if
        the values do not all share the same key.
  """
  pairs = [term.split('-', 1) for term in cond.str_values]
  if not pairs or any(len(pair) != 2 for pair in pairs):
    # Without this check the failure would surface as an opaque tuple
    # unpacking ValueError.  MalformedQuery is still a ValueError.
    raise MalformedQuery(
        'KeyValue query needs terms of the form key-value: %r' %
        (cond.str_values,))

  if len({key for key, _value in pairs}) != 1:
    raise MalformedQuery(
        "KeyValue query with multiple different keys: %r" %
        (cond.str_values,))

  # Escape the key as well as the values: both come from the user's query and
  # must be matched literally rather than interpreted as regex syntax.
  key_pattern = re.escape(pairs[0][0])
  all_values = '|'.join(re.escape(value) for _key, value in pairs)
  return re.compile(r'%s-.*\b(%s)\b.*' % (key_pattern, all_values), re.I)
| |
| |
def _MakeWordBoundaryRegex(cond):
  """Compile a pattern that finds any cond value as a whole word.

  The cond values are matched literally and without regard to case.
  """
  escaped_words = [re.escape(word) for word in cond.str_values]
  pattern = r'.*\b(%s)\b.*' % '|'.join(escaped_words)
  return re.compile(pattern, re.I)
| |
| |
def _PreprocessLabelCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Preprocess a label=names cond into label_id=IDs.

  How labels are matched depends on the cond's operator:
    + EQ / NE: look up the IDs of labels with exactly the given names.
    + IS_DEFINED / IS_NOT_DEFINED: labels that start with a given prefix
      followed by a dash.
    + KEY_HAS: labels with the given key whose remainder contains a value.
    + anything else: labels that contain a given value as a whole word.

  Args:
    cnxn: connection to SQL database.
    cond: Condition PB for the label term of the user's query.
    project_ids: int IDs of the projects being searched; when empty, labels
        are looked up across all projects.
    services: connections to the persistence layer; only services.config is
        used.
    _harmonized_config: unused.
    _is_member: unused.

  Returns:
    A new Condition PB on the label_id field whose int_values are the IDs
    found.

  Raises:
    MalformedQuery: propagated from _MakeKeyValueRegex for a bad KEY_HAS term.
  """
  config_service = services.config

  # Choose the matcher once.  It depends only on the cond, so there is no
  # reason to rebuild the regex for every project.  None means "exact names".
  if _IsEqualityOp(cond.op):
    label_regex = None
  elif _IsDefinedOp(cond.op):
    label_regex = _MakePrefixRegex(cond)
  elif cond.op == ast_pb2.QueryOp.KEY_HAS:
    label_regex = _MakeKeyValueRegex(cond)
  else:
    label_regex = _MakeWordBoundaryRegex(cond)

  if project_ids:
    label_ids = []
    for project_id in project_ids:
      if label_regex is None:
        label_ids.extend(config_service.LookupLabelIDs(
            cnxn, project_id, cond.str_values))
      else:
        label_ids.extend(config_service.LookupIDsOfLabelsMatching(
            cnxn, project_id, label_regex))
  elif label_regex is None:
    label_ids = config_service.LookupLabelIDsAnyProject(
        cnxn, cond.str_values)
  else:
    label_ids = config_service.LookupIDsOfLabelsMatchingAnyProject(
        cnxn, label_regex)

  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
      int_values=label_ids)
| |
| |
def _PreprocessComponentCond(
    cnxn, cond, project_ids, services, harmonized_config, _is_member):
  """Rewrite a component= or component:name cond as component_id=IDs."""
  # EQ and NE ask for exact component matches; any other op does not.
  exact = _IsEqualityOp(cond.op)

  if not project_ids:
    # Site-wide search: there is no harmonized_config to consult, so ask the
    # config service instead.
    matching_ids = services.config.FindMatchingComponentIDsAnyProject(
        cnxn, cond.str_values, exact=exact)
  else:
    # Searching specific projects: harmonized_config holds the config data
    # for all of them.
    matching_ids = []
    for component_path in cond.str_values:
      matching_ids.extend(tracker_bizobj.FindMatchingComponentIDs(
          component_path, harmonized_config, exact=exact))

  int_op = _TextOpToIntOp(cond.op)
  component_field = query2ast.BUILTIN_ISSUE_FIELDS['component_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[component_field], int_values=matching_ids)
| |
| |
def _PreprocessExactUsers(
    cnxn, cond, user_service, id_fields, is_member):
  """Rewrite a foo=emails cond as foo_id=IDs when users match exactly.

  This preprocessing step turns string conditions into int ID conditions,
  e.g., [owner=email] into [owner_id=ID].  It only does so when (a) the email
  was "me", so it was already converted to a string of digits earlier in the
  search pipeline, or (b) it is "user@domain" and resolves to a known
  Monorail user.  Searches such as [owner:substring] are also possible, but
  they remain 'owner' field searches rather than 'owner_id' searches, and
  they cannot be combined with the "me" keyword.

  Args:
    cnxn: connection to the DB.
    cond: original parsed query Condition PB.
    user_service: connection to user persistence layer.
    id_fields: list of the search fields to use if the conversion to IDs
        succeeds.
    is_member: True if the user is a member of all the projects being
        searched, so they can do user substring searches.

  Returns:
    A new Condition PB that checks the id_fields.  Or, the original cond.

  Raises:
    MalformedQuery: A non-member used a query term that could be used to
        guess full user email addresses.
  """
  int_op = _TextOpToIntOp(cond.op)

  # Testing for any defined value needs no ID lookups at all.
  if _IsDefinedOp(int_op):
    return ast_pb2.Condition(
        op=int_op, field_defs=id_fields, key_suffix=cond.key_suffix,
        phase_name=cond.phase_name)

  # Only ops that compare whole values can be converted; substring ops stay
  # as they are, and only for project members.
  if not _IsEqualityOp(int_op):
    logging.info('could not convert to IDs because op is %r', int_op)
    if is_member:
      return cond
    raise MalformedQuery('Only project members may compare user strings')

  resolved_ids = []
  for term in cond.str_values:
    try:
      user_id = int(term)
    except ValueError:
      try:
        user_id = user_service.LookupUserID(cnxn, term)
      except exceptions.NoSuchUserException:
        if is_member or term == 'me' or term.startswith('@'):
          return cond  # preprocessing failed, stick with the original cond.
        logging.info('could not convert user %r to int ID', term)
        if '@' in term:
          raise MalformedQuery('User email address not found')
        raise MalformedQuery(
            'Only project members may search for user substrings')
    resolved_ids.append(user_id)

  return ast_pb2.MakeCond(
      int_op, id_fields, [], resolved_ids, key_suffix=cond.key_suffix,
      phase_name=cond.phase_name)
| |
| |
def _PreprocessOwnerCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite an owner=emails cond as owner_id=IDs, if users match exactly."""
  owner_id_field = query2ast.BUILTIN_ISSUE_FIELDS['owner_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [owner_id_field], is_member)
| |
| |
def _PreprocessCcCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a cc=emails cond as cc_id=IDs, if users match exactly."""
  cc_id_field = query2ast.BUILTIN_ISSUE_FIELDS['cc_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [cc_id_field], is_member)
| |
| |
def _PreprocessReporterCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a reporter=emails cond as reporter_id=IDs, if exact."""
  reporter_id_field = query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [reporter_id_field], is_member)
| |
| |
def _PreprocessStarredByCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a starredby=emails cond as starredby_id=IDs, if exact."""
  starredby_id_field = query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [starredby_id_field], is_member)
| |
| |
def _PreprocessCommentByCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a commentby=emails cond as commentby_id=IDs, if exact."""
  commentby_id_field = query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [commentby_id_field], is_member)
| |
| |
def _PreprocessHotlistCond(
    cnxn, cond, _project_ids, services, _harmonized_config, _is_member):
  """Rewrite a hotlist query as hotlist_id=IDs, if exact.

  The incoming query has the form:
    'hotlist=<user_email>:<hotlist-name>,<hotlist-name>,<user2_email>:...
  A value without a colon belongs to the user named by the most recent value
  that had one.  If any user cannot be resolved to an ID, the original cond
  is returned unchanged.
  """
  # TODO(jojwang): add support for searches that don't contain domain names.
  # eg jojwang:hotlist-name
  names_by_user_id = collections.defaultdict(list)
  current_user = ''
  for term in cond.str_values:
    if ':' in term:
      current_user, hotlist_name = term.split(':', 1)
    else:
      hotlist_name = term

    # The user part is either already a numeric ID or needs a lookup.
    try:
      owner_id = int(current_user)
    except ValueError:
      try:
        owner_id = services.user.LookupUserID(cnxn, current_user)
      except exceptions.NoSuchUserException:
        logging.info('could not convert user %r to int ID', term)
        return cond
    names_by_user_id[owner_id].append(hotlist_name)

  found_ids = set()
  for owner_id, hotlist_names in names_by_user_id.items():
    if hotlist_names[0]:
      found_for_user = list(services.features.LookupHotlistIDs(
          cnxn, hotlist_names, [owner_id]).values())
    else:
      # An empty first name selects every hotlist that this user owns.
      found_for_user = [
          hotlist.hotlist_id
          for hotlist in services.features.GetHotlistsByUserID(cnxn, owner_id)
          if owner_id in hotlist.owner_ids]
    for hotlist_id in found_for_user:
      found_ids.add(hotlist_id)

  int_op = _TextOpToIntOp(cond.op)
  hotlist_field = query2ast.BUILTIN_ISSUE_FIELDS['hotlist_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[hotlist_field], int_values=list(found_ids))
| |
| |
def _PreprocessCustomCond(cnxn, cond, services, is_member):
  """Rewrite a custom_user_field=emails cond as IDs, if exact matches.

  Conds on user-type fields, and conds on approval-type fields that carry the
  approver or set-by suffix, are handed to _PreprocessExactUsers.  Any other
  cond is returned unchanged.
  """
  # TODO(jrobbins): better support for ambiguous fields.
  # For now, if any field is USER_TYPE and the value being searched
  # for is the email address of an existing account, it will convert
  # to a user ID and we go with exact ID matching. Otherwise, we
  # leave the cond as-is for ast2select to do string matching on.
  user_fds = []
  for fd in cond.field_defs:
    if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE:
      user_fds.append(fd)
  if user_fds:
    return _PreprocessExactUsers(
        cnxn, cond, services.user, user_fds, is_member)

  approval_fds = []
  for fd in cond.field_defs:
    if fd.field_type == tracker_pb2.FieldTypes.APPROVAL_TYPE:
      approval_fds.append(fd)
  if not approval_fds:
    return cond

  user_suffixes = [query2ast.APPROVER_SUFFIX, query2ast.SET_BY_SUFFIX]
  if cond.key_suffix not in user_suffixes:
    return cond
  return _PreprocessExactUsers(
      cnxn, cond, services.user, approval_fds, is_member)
| |
| |
# Dispatch table from a built-in search field name to the function that
# simplifies conds on that field.  Every entry is called as
# f(cnxn, cond, project_ids, services, harmonized_config, is_member).
_PREPROCESSORS = {
    # is:xyz terms.
    'open': _PreprocessIsOpenCond,
    'blocked': _PreprocessIsBlockedCond,
    'spam': _PreprocessIsSpamCond,
    # References to other issues.
    'blockedon': _PreprocessBlockedOnCond,
    'blocking': _PreprocessBlockingCond,
    'mergedinto': _PreprocessMergedIntoCond,
    # Names defined in project configuration.
    'status': _PreprocessStatusCond,
    'label': _PreprocessLabelCond,
    'component': _PreprocessComponentCond,
    # Users.
    'owner': _PreprocessOwnerCond,
    'cc': _PreprocessCcCond,
    'reporter': _PreprocessReporterCond,
    'starredby': _PreprocessStarredByCond,
    'commentby': _PreprocessCommentByCond,
    # Hotlists.
    'hotlist': _PreprocessHotlistCond,
}
| |
| |
def _PreprocessCond(
    cnxn, cond, project_ids, services, harmonized_config, is_member):
  """Simplify one cond by looking up status, label and component IDs.

  Conds on custom fields go to _PreprocessCustomCond.  Conds on built-in
  fields go to the matching entry of _PREPROCESSORS, or are returned
  unchanged when that table has no entry for the field.
  """
  field_defs = cond.field_defs
  # A cond is parsed from one user query term, and the term syntax allows just
  # one field name, so every field def in it carries that same name.
  field_name = field_defs[0].field_name
  assert all(fd.field_name == field_name for fd in field_defs)

  # Case 1: The user is searching custom fields.
  if any(fd.field_id for fd in field_defs):
    # Built-in field names are reserved and take priority over conflicting
    # custom ones, so custom and built-in fields are never mixed in one cond.
    assert all(fd.field_id for fd in field_defs)
    return _PreprocessCustomCond(cnxn, cond, services, is_member)

  # Case 2: The user is searching a built-in field.
  handler = _PREPROCESSORS.get(field_name)
  if not handler:
    # No preprocessor exists for this built-in field.
    return cond
  return handler(
      cnxn, cond, project_ids, services, harmonized_config, is_member)
| |
| |
class MalformedQuery(ValueError):
  """Raised when a user's query cannot be preprocessed as written."""