blob: bf4de4f72b3cf779956149352c153be59af5c861 [file] [log] [blame]
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
"""Convert a user's issue search AST into a simplified AST.
This phase of query processing simplifies the user's query by looking up
the int IDs of any labels, statuses, or components that are mentioned by
name in the original query. The data needed for lookups is typically cached
in RAM in each backend job, so this will not put much load on the DB. The
simplified ASTs are later converted into SQL which is simpler and has
fewer joins.
The simplified main query is better because:
+ It is clearly faster, especially in the most common case where config
data is in RAM.
+ Since less RAM is used to process the main query on each shard, query
execution time is more consistent with less variability under load. Less
variability is good because the user must wait for the slowest shard.
+ The config tables (LabelDef, StatusDef, etc.) exist only on the primary DB,
so they cannot be mentioned in a query that runs on a shard.
+ The query string itself is shorter when numeric IDs are substituted, which
means that we can handle user queries with long lists of labels in a
reasonable-sized query.
+ It bisects the complexity of the operation: it's easier to test and debug
the lookup and simplification logic plus the main query logic this way
than it would be to deal with an even more complex SQL main query.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import collections
import logging
import re
from framework import exceptions
from proto import ast_pb2
from proto import tracker_pb2
# TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could
# remove this dep.
from search import query2ast
from tracker import tracker_bizobj
from features import federated
def PreprocessAST(
    cnxn, query_ast, project_ids, services, harmonized_config, is_member=True):
  """Build a simplified copy of a parsed query by resolving names to IDs.

  Every condition of every conjunction is passed through _PreprocessCond and
  the results are assembled into a brand new QueryAST; the input AST is not
  modified by this function.

  Args:
    cnxn: connection to SQL database.
    query_ast: user query abstract syntax tree parsed by query2ast.py.
    project_ids: collection of int project IDs to use to look up status values
        and labels.
    services: Connections to persistence layer for users and configs.
    harmonized_config: harmonized config for all projects being searched.
    is_member: True if user is a member of all the projects being searched,
        so they can do user substring searches.

  Returns:
    A new QueryAST PB with simplified conditions. Specifically, string values
    for labels, statuses, and components are replaced with the int IDs of
    those items. Also, is:open is distilled down to
    status_id != closed_status_ids.
  """
  simplified_conjs = []
  for conj in query_ast.conjunctions:
    simplified_conds = []
    for cond in conj.conds:
      simplified_conds.append(_PreprocessCond(
          cnxn, cond, project_ids, services, harmonized_config, is_member))
    simplified_conjs.append(ast_pb2.Conjunction(conds=simplified_conds))
  return ast_pb2.QueryAST(conjunctions=simplified_conjs)
def _PreprocessIsOpenCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite an is:open cond as a comparison against closed status IDs.

  The closed status IDs come from each listed project, or from all projects
  when project_ids is empty.  Because the IDs describe *closed* statuses,
  the operator is flipped: EQ becomes NE and NE becomes EQ.

  Raises:
    MalformedQuery: if the cond's operator is neither EQ nor NE.
  """
  config_svc = services.config
  if not project_ids:
    closed_ids = config_svc.LookupClosedStatusIDsAnyProject(cnxn)
  else:
    closed_ids = [
        status_id
        for project_id in project_ids
        for status_id in config_svc.LookupClosedStatusIDs(cnxn, project_id)]

  # The lookups above intentionally happen before the operator is validated.
  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Open condition got nonsensical op %r' % cond.op)
  if cond.op == ast_pb2.QueryOp.EQ:
    inverted_op = ast_pb2.QueryOp.NE
  else:
    inverted_op = ast_pb2.QueryOp.EQ

  status_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=inverted_op, field_defs=[status_id_fd], int_values=closed_ids)
def _PreprocessIsBlockedCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
  """Rewrite an is:blocked cond as a has/has-not test on blockedon_id.

  EQ maps to IS_DEFINED and NE maps to IS_NOT_DEFINED.

  Raises:
    MalformedQuery: if the cond's operator is neither EQ nor NE.
  """
  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Blocked condition got nonsensical op %r' % cond.op)

  if cond.op == ast_pb2.QueryOp.EQ:
    defined_op = ast_pb2.QueryOp.IS_DEFINED
  else:
    defined_op = ast_pb2.QueryOp.IS_NOT_DEFINED

  blockedon_fd = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(op=defined_op, field_defs=[blockedon_fd])
def _PreprocessIsSpamCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
  """Rewrite an is:spam cond as an equality test on the is_spam field.

  The resulting cond always uses EQ: is_spam == 1 for an EQ input and
  is_spam == 0 for an NE input.

  Raises:
    MalformedQuery: if the cond's operator is neither EQ nor NE.
  """
  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Spam condition got nonsensical op %r' % cond.op)

  spam_flag = 1 if cond.op == ast_pb2.QueryOp.EQ else 0
  is_spam_fd = query2ast.BUILTIN_ISSUE_FIELDS['is_spam']
  return ast_pb2.Condition(
      op=ast_pb2.QueryOp.EQ, field_defs=[is_spam_fd], int_values=[spam_flag])
def _PreprocessBlockedOnCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite blockedon=xyz and has:blockedon conds to use blockedon_id.

  Issue references in the cond are resolved to global issue IDs (int_values);
  references recognized as external shortlinks are passed through unchanged
  as str_values.  Text operators are converted to their equality equivalents.
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  local_issue_ids, external_refs = resolved
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']],
      int_values=local_issue_ids,
      str_values=external_refs)
def _PreprocessBlockingCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite blocking=xyz and has:blocking conds to use blocking_id.

  Issue references in the cond are resolved to global issue IDs (int_values);
  references recognized as external shortlinks are passed through unchanged
  as str_values.  Text operators are converted to their equality equivalents.
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  local_issue_ids, external_refs = resolved
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']],
      int_values=local_issue_ids,
      str_values=external_refs)
def _PreprocessMergedIntoCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite mergedinto=xyz and has:mergedinto conds to use mergedinto_id.

  Issue references in the cond are resolved to global issue IDs (int_values);
  references recognized as external shortlinks are passed through unchanged
  as str_values.  Text operators are converted to their equality equivalents.
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  local_issue_ids, external_refs = resolved
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['mergedinto_id']],
      int_values=local_issue_ids,
      str_values=external_refs)
def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services):
  """Resolve the issue references in cond.str_values.

  Args:
    cnxn: connection to SQL database.
    cond: Condition PB whose str_values hold issue references.
    project_ids: IDs of the projects being searched.
    services: connections to the persistence layer.

  Returns:
    A pair (issue_ids, ext_issue_ids).  issue_ids is the first item returned
    by services.issue.ResolveIssueRefs for the parsed (project_name,
    local_id) refs.  ext_issue_ids is the list of values that could not be
    parsed as issue refs but were accepted by federated.FromShortlink.

  Raises:
    MalformedQuery: if a reference has no project prefix while zero or
        several projects are being searched, or if a value is neither a
        parseable issue ref nor a recognized shortlink.
  """
  # Index the searched projects by name: {project_name: project}.
  projects_by_id = services.project.GetProjects(cnxn, project_ids)
  ref_projects = {}
  for project in projects_by_id.values():
    ref_projects[project.project_name] = project

  # A default project only exists when exactly one project is searched.
  default_project_name = None
  if len(ref_projects) == 1:
    default_project_name = next(iter(ref_projects.values())).project_name

  refs = []  # (project_name, local_id) pairs.
  ext_issue_ids = []  # Strings like 'b/1234'.
  for ref_str in cond.str_values:
    try:
      project_name, local_id = tracker_bizobj.ParseIssueRef(ref_str)
      if not project_name and not default_project_name:
        # TODO(rmistry): Support the below.
        raise MalformedQuery(
            'Searching for issues accross multiple/all projects without '
            'project prefixes is ambiguous and is currently not supported.')
      project_name = project_name or default_project_name
      refs.append((project_name, int(local_id)))
    except MalformedQuery:
      # MalformedQuery is a ValueError, so it must be let through before the
      # generic ValueError handler below gets a chance to swallow it.
      raise
    except ValueError:
      # Can't parse issue id, try external issue pattern.
      if not federated.FromShortlink(ref_str):
        raise MalformedQuery('Could not parse issue reference: %s' % ref_str)
      ext_issue_ids.append(ref_str)

  issue_ids, _misses = services.issue.ResolveIssueRefs(
      cnxn, ref_projects, default_project_name, refs)
  return issue_ids, ext_issue_ids
def _PreprocessStatusCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite a status=names cond as a status_id=IDs cond.

  Status names are looked up in each listed project, or across all projects
  when project_ids is empty.
  """
  config_svc = services.config
  if not project_ids:
    matching_ids = config_svc.LookupStatusIDsAnyProject(
        cnxn, cond.str_values)
  else:
    matching_ids = [
        status_id
        for project_id in project_ids
        for status_id in config_svc.LookupStatusIDs(
            cnxn, project_id, cond.str_values)]

  status_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[status_id_fd],
      int_values=matching_ids)
def _IsEqualityOp(op):
  """Tell whether op is one of the whole-value comparisons, EQ or NE."""
  return op == ast_pb2.QueryOp.EQ or op == ast_pb2.QueryOp.NE
def _IsDefinedOp(op):
  """Tell whether op is a presence test, IS_DEFINED or IS_NOT_DEFINED."""
  return (op == ast_pb2.QueryOp.IS_DEFINED or
          op == ast_pb2.QueryOp.IS_NOT_DEFINED)
def _TextOpToIntOp(op):
  """Map a text-matching op to the equality op used for ID matching.

  TEXT_HAS and KEY_HAS become EQ, NOT_TEXT_HAS becomes NE, and any other op
  is returned unchanged.
  """
  if op in (ast_pb2.QueryOp.TEXT_HAS, ast_pb2.QueryOp.KEY_HAS):
    return ast_pb2.QueryOp.EQ
  if op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    return ast_pb2.QueryOp.NE
  return op
def _MakePrefixRegex(cond):
  """Compile a case-insensitive regex for '<value>-<anything>' strings.

  Each of cond.str_values is regex-escaped and offered as an alternative
  prefix; the prefix must be followed by a dash and at least one character.
  """
  escaped_prefixes = [re.escape(prefix) for prefix in cond.str_values]
  pattern = r'(%s)-.+' % '|'.join(escaped_prefixes)
  return re.compile(pattern, re.I)
def _MakeKeyValueRegex(cond):
  """Return a regex to match the first token and remaining text separately.

  Each of cond.str_values is expected to look like 'Key-Value' and all of
  them must share the same key.  The resulting case-insensitive regex matches
  strings that start with 'Key-' and contain any one of the values as a whole
  word somewhere after the dash.

  Args:
    cond: Condition PB (or any object with a str_values sequence).

  Returns:
    A compiled regular expression.

  Raises:
    MalformedQuery: if there are no values, if a value has no '-' separator,
        or if the values do not all share one key.
  """
  if not cond.str_values:
    raise MalformedQuery('KeyValue query with no values')

  keys = []
  values = []
  for key_value in cond.str_values:
    key, dash, value = key_value.partition('-')
    if not dash:
      raise MalformedQuery(
          'KeyValue query term is missing a "-" separator: %r' % key_value)
    keys.append(key)
    values.append(value)

  if len(set(keys)) != 1:
    raise MalformedQuery(
        "KeyValue query with multiple different keys: %r" % cond.str_values)

  all_values = '|'.join(map(re.escape, values))
  # The key is user-supplied text, so it must be escaped just like the values;
  # otherwise a key such as 'a.b' would also match 'axb'.
  return re.compile(
      r'%s-.*\b(%s)\b.*' % (re.escape(keys[0]), all_values), re.I)
def _MakeWordBoundaryRegex(cond):
  """Compile a case-insensitive regex that finds any cond value as a word.

  Each of cond.str_values is regex-escaped and must appear delimited by word
  boundaries anywhere in the matched string.
  """
  escaped_words = [re.escape(word) for word in cond.str_values]
  pattern = r'.*\b(%s)\b.*' % '|'.join(escaped_words)
  return re.compile(pattern, re.I)
def _PreprocessLabelCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Preprocess a label=names cond into label_id=IDs.

  How the label IDs are found depends on the cond's operator:
    + EQ / NE: exact lookup of the names in cond.str_values.
    + IS_DEFINED / IS_NOT_DEFINED: labels matching _MakePrefixRegex(cond).
    + KEY_HAS: labels matching _MakeKeyValueRegex(cond).
    + anything else: labels matching _MakeWordBoundaryRegex(cond).
  Lookups are done in each listed project, or across all projects when
  project_ids is empty.

  Args:
    cnxn: connection to SQL database.
    cond: original parsed query Condition PB.
    project_ids: collection of int project IDs being searched.
    services: connections to the persistence layer.

  Returns:
    A new Condition PB on the label_id field whose int_values are the IDs
    that were found and whose op is the int equivalent of the original op.
  """
  config_svc = services.config

  # Decide once how labels are matched.  The regex depends only on cond, so
  # it is built a single time rather than once per project.
  exact = _IsEqualityOp(cond.op)
  if exact:
    label_regex = None
  elif _IsDefinedOp(cond.op):
    label_regex = _MakePrefixRegex(cond)
  elif cond.op == ast_pb2.QueryOp.KEY_HAS:
    label_regex = _MakeKeyValueRegex(cond)
  else:
    label_regex = _MakeWordBoundaryRegex(cond)

  if project_ids:
    label_ids = []
    for project_id in project_ids:
      if exact:
        label_ids.extend(config_svc.LookupLabelIDs(
            cnxn, project_id, cond.str_values))
      else:
        label_ids.extend(config_svc.LookupIDsOfLabelsMatching(
            cnxn, project_id, label_regex))
  elif exact:
    label_ids = config_svc.LookupLabelIDsAnyProject(cnxn, cond.str_values)
  else:
    label_ids = config_svc.LookupIDsOfLabelsMatchingAnyProject(
        cnxn, label_regex)

  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
      int_values=label_ids)
def _PreprocessComponentCond(
    cnxn, cond, project_ids, services, harmonized_config, _is_member):
  """Rewrite a component= or component:name cond as component_id=IDs.

  Matching is exact when the cond's operator is EQ or NE, and non-exact
  otherwise.
  """
  exact = _IsEqualityOp(cond.op)
  if project_ids:
    # We are searching within specific projects, so harmonized_config
    # holds the config data for all those projects.
    matching_ids = [
        component_id
        for comp_path in cond.str_values
        for component_id in tracker_bizobj.FindMatchingComponentIDs(
            comp_path, harmonized_config, exact=exact)]
  else:
    # We are searching across the whole site, so we have no harmonized_config
    # to use.
    matching_ids = services.config.FindMatchingComponentIDsAnyProject(
        cnxn, cond.str_values, exact=exact)

  component_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['component_id']
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[component_id_fd],
      int_values=matching_ids)
def _PreprocessExactUsers(
    cnxn, cond, user_service, id_fields, is_member):
  """Rewrite a foo=emails cond as foo_id=IDs when users match exactly.

  This preprocessing step converts string conditions to int ID conditions.
  E.g., [owner=email] to [owner_id=ID].  It only does so when every value is
  either (a) already a string of digits, which is what the "me" keyword has
  been turned into earlier in the search pipeline, or (b) a "user@domain"
  string that resolves to a known Monorail user.  It is also possible to
  search for, e.g., [owner:substring], but such searches remain 'owner' field
  searches rather than 'owner_id', and they cannot be combined with the "me"
  keyword.

  Args:
    cnxn: connection to the DB.
    cond: original parsed query Condition PB.
    user_service: connection to user persistence layer.
    id_fields: list of the search fields to use if the conversion to IDs
        succeeds.
    is_member: True if user is a member of all the projects being searched,
        so they can do user substring searches.

  Returns:
    A new Condition PB that checks the id_fields.  Or, the original cond.

  Raises:
    MalformedQuery: A non-member used a query term that could be used to
        guess full user email addresses.
  """
  int_op = _TextOpToIntOp(cond.op)

  # No need to look up any IDs if we are just testing for any defined value.
  if _IsDefinedOp(int_op):
    return ast_pb2.Condition(
        op=int_op, field_defs=id_fields, key_suffix=cond.key_suffix,
        phase_name=cond.phase_name)

  # This preprocessing step is only for ops that compare whole values, not
  # substrings.
  if not _IsEqualityOp(int_op):
    logging.info('could not convert to IDs because op is %r', int_op)
    if is_member:
      return cond
    raise MalformedQuery('Only project members may compare user strings')

  resolved_ids = []
  for user_str in cond.str_values:
    try:
      user_id = int(user_str)
    except ValueError:
      try:
        user_id = user_service.LookupUserID(cnxn, user_str)
      except exceptions.NoSuchUserException:
        may_keep_text_search = (
            is_member or user_str == 'me' or user_str.startswith('@'))
        if may_keep_text_search:
          return cond  # preprocessing failed, stick with the original cond.
        logging.info('could not convert user %r to int ID', user_str)
        if '@' in user_str:
          raise MalformedQuery('User email address not found')
        raise MalformedQuery(
            'Only project members may search for user substrings')
    resolved_ids.append(user_id)

  return ast_pb2.MakeCond(
      int_op, id_fields, [], resolved_ids, key_suffix=cond.key_suffix,
      phase_name=cond.phase_name)
def _PreprocessOwnerCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite an owner=emails cond as owner_id=IDs, if exact user match."""
  owner_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['owner_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [owner_id_fd], is_member)
def _PreprocessCcCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a cc=emails cond as cc_id=IDs, if exact user match."""
  cc_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['cc_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [cc_id_fd], is_member)
def _PreprocessReporterCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a reporter=emails cond as reporter_id=IDs, if exact."""
  reporter_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [reporter_id_fd], is_member)
def _PreprocessStarredByCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a starredby=emails cond as starredby_id=IDs, if exact."""
  starredby_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [starredby_id_fd], is_member)
def _PreprocessCommentByCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a commentby=emails cond as commentby_id=IDs, if exact."""
  commentby_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']
  return _PreprocessExactUsers(
      cnxn, cond, services.user, [commentby_id_fd], is_member)
def _PreprocessHotlistCond(
    cnxn, cond, _project_ids, services, _harmonized_config, _is_member):
  """Rewrite a hotlist cond as hotlist_id=IDs.

  Handles a hotlist query in the form:
    'hotlist=<user_email>:<hotlist-name>,<hotlist-name>,<user2_email>:...
  A value containing ':' names a new user; a value without one is another
  hotlist name for the most recently named user.  If the first name recorded
  for a user is empty, all hotlists owned by that user are used.

  Returns:
    A new Condition PB on the hotlist_id field, or the original cond when a
    user string is neither an int nor a known user.
  """
  # TODO(jojwang): add support for searches that don't contain domain names.
  # eg jojwang:hotlist-name
  names_by_user_id = collections.defaultdict(list)
  owner_str = ''
  for val in cond.str_values:
    if ':' in val:
      owner_str, hotlist_name = val.split(':', 1)
    else:
      hotlist_name = val

    try:
      owner_id = int(owner_str)
    except ValueError:
      try:
        owner_id = services.user.LookupUserID(cnxn, owner_str)
      except exceptions.NoSuchUserException:
        logging.info('could not convert user %r to int ID', val)
        return cond
    names_by_user_id[owner_id].append(hotlist_name)

  hotlist_ids = set()
  for user_id, hotlist_names in names_by_user_id.items():
    if hotlist_names[0]:
      found_ids = services.features.LookupHotlistIDs(
          cnxn, hotlist_names, [user_id]).values()
    else:
      owned_or_shared = services.features.GetHotlistsByUserID(cnxn, user_id)
      found_ids = [
          hotlist.hotlist_id for hotlist in owned_or_shared
          if user_id in hotlist.owner_ids]
    hotlist_ids.update(found_ids)

  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['hotlist_id']],
      int_values=list(hotlist_ids))
def _PreprocessCustomCond(cnxn, cond, services, is_member):
  """Rewrite a custom_user_field=emails cond to use IDs, if exact matches.

  User-typed fields are always handed to _PreprocessExactUsers.  Approval
  fields are handed to it only when the cond's key_suffix is the approver or
  set-by suffix.  Any other cond is returned unchanged.
  """
  # TODO(jrobbins): better support for ambiguous fields.
  # For now, if any field is USER_TYPE and the value being searched
  # for is the email address of an existing account, it will convert
  # to a user ID and we go with exact ID matching. Otherwise, we
  # leave the cond as-is for ast2select to do string matching on.
  def _FieldDefsOfType(field_type):
    """Return the cond's field defs that have the given field type."""
    return [fd for fd in cond.field_defs if fd.field_type == field_type]

  user_fds = _FieldDefsOfType(tracker_pb2.FieldTypes.USER_TYPE)
  if user_fds:
    return _PreprocessExactUsers(
        cnxn, cond, services.user, user_fds, is_member)

  approval_fds = _FieldDefsOfType(tracker_pb2.FieldTypes.APPROVAL_TYPE)
  if approval_fds and cond.key_suffix in [
      query2ast.APPROVER_SUFFIX, query2ast.SET_BY_SUFFIX]:
    return _PreprocessExactUsers(
        cnxn, cond, services.user, approval_fds, is_member)

  return cond
# Dispatch table used by _PreprocessCond: maps a built-in field name to the
# function that preprocesses conds on that field.  Every function here is
# called as f(cnxn, cond, project_ids, services, harmonized_config, is_member).
# Built-in fields with no entry are left unchanged.
_PREPROCESSORS = {
    'open': _PreprocessIsOpenCond,
    'blocked': _PreprocessIsBlockedCond,
    'spam': _PreprocessIsSpamCond,
    'blockedon': _PreprocessBlockedOnCond,
    'blocking': _PreprocessBlockingCond,
    'mergedinto': _PreprocessMergedIntoCond,
    'status': _PreprocessStatusCond,
    'label': _PreprocessLabelCond,
    'component': _PreprocessComponentCond,
    'owner': _PreprocessOwnerCond,
    'cc': _PreprocessCcCond,
    'reporter': _PreprocessReporterCond,
    'starredby': _PreprocessStarredByCond,
    'commentby': _PreprocessCommentByCond,
    'hotlist': _PreprocessHotlistCond,
    }
def _PreprocessCond(
    cnxn, cond, project_ids, services, harmonized_config, is_member):
  """Preprocess one cond by dispatching on the field it searches.

  Custom-field conds go to _PreprocessCustomCond.  Built-in field conds go to
  the matching entry of _PREPROCESSORS, or are returned unchanged when there
  is no entry for the field name.
  """
  field_defs = cond.field_defs
  # All the fields in a cond share the same name because they are parsed
  # from a user query term, and the term syntax allows just one field name.
  field_name = field_defs[0].field_name
  assert all(fd.field_name == field_name for fd in field_defs)

  # Case 1: The user is searching custom fields.
  if any(fd.field_id for fd in field_defs):
    # There can't be a mix of custom and built-in fields because built-in
    # field names are reserved and take priority over any conflicting ones.
    assert all(fd.field_id for fd in field_defs)
    return _PreprocessCustomCond(cnxn, cond, services, is_member)

  # Case 2: The user is searching a built-in field.
  handler = _PREPROCESSORS.get(field_name)
  if not handler:
    # We don't have a preprocessor for it.
    return cond

  # We have a preprocessor for that built-in field.
  return handler(
      cnxn, cond, project_ids, services, harmonized_config, is_member)
class MalformedQuery(ValueError):
  """Raised when a user query cannot be preprocessed as written."""