# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Backend issue issue search and sorting.
Each of several "besearch" backend jobs manages one shard of the overall set
of issues in the system. The backend search pipeline retrieves the issues
that match the user query, puts them into memcache, and returns them to
the frontend search pipeline.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import logging
import re
import time

from google.appengine.api import memcache

import settings
from features import savedqueries_helpers
from framework import authdata
from framework import framework_constants
from framework import framework_helpers
from framework import sorting
from framework import sql
from mrproto import ast_pb2
from mrproto import tracker_pb2
from search import ast2ast
from search import ast2select
from search import ast2sort
from search import query2ast
from search import searchpipeline
from services import tracker_fulltext
from services import fulltext_helpers
from tracker import tracker_bizobj

# Used in constructing the at-risk query.
AT_RISK_LABEL_RE = re.compile(r'^(restrict-view-.+)$', re.IGNORECASE)

# Limit on the number of list items to show in debug log statements.
MAX_LOG = 200


class BackendSearchPipeline(object):
  """Manage the process of issue search, including Promises and caching.

  Even though the code is divided into several methods, the public
  methods should be called in sequence, so the execution of the code
  is pretty much in the order of the source code lines here.
  """

def __init__(
self, mr, services, default_results_per_page,
query_project_names, logged_in_user_id, me_user_ids):
self.mr = mr
self.services = services
self.default_results_per_page = default_results_per_page
self.query_project_list = list(services.project.GetProjectsByName(
mr.cnxn, query_project_names).values())
self.query_project_ids = [
p.project_id for p in self.query_project_list]
self.me_user_ids = me_user_ids
self.mr.auth = authdata.AuthData.FromUserID(
mr.cnxn, logged_in_user_id, services)
# The following fields are filled in as the pipeline progresses.
# The value None means that we still need to compute that value.
self.result_iids = None # Sorted issue IDs that match the query
self.search_limit_reached = False # True if search results limit is hit.
self.error = None
self._MakePromises()

  def _MakePromises(self):
config_dict = self.services.config.GetProjectConfigs(
self.mr.cnxn, self.query_project_ids)
self.harmonized_config = tracker_bizobj.HarmonizeConfigs(
list(config_dict.values()))
self.canned_query = savedqueries_helpers.SavedQueryIDToCond(
self.mr.cnxn, self.services.features, self.mr.can)
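    # Replace any "me" keywords in the canned query and the user's query
    # with the actual user IDs in me_user_ids.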
self.canned_query, warnings = searchpipeline.ReplaceKeywordsWithUserIDs(
self.me_user_ids, self.canned_query)
self.mr.warnings.extend(warnings)
self.user_query, warnings = searchpipeline.ReplaceKeywordsWithUserIDs(
self.me_user_ids, self.mr.query)
self.mr.warnings.extend(warnings)
logging.debug('Searching query: %s %s', self.canned_query, self.user_query)
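    # Each besearch job handles one logical shard of the Issue table, so
    # every SQL query below is narrowed to rows with this shard ID, e.g.
    # 'Issue.shard = %s' with [3] bound for shard 3.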
slice_term = ('Issue.shard = %s', [self.mr.shard_id])
sd = sorting.ComputeSortDirectives(
self.harmonized_config, self.mr.group_by_spec, self.mr.sort_spec)
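    # Defer the actual search work to a Promise; SearchForIIDs() below
    # waits on it and stores the results.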
self.result_iids_promise = framework_helpers.Promise(
_GetQueryResultIIDs, self.mr.cnxn,
self.services, self.canned_query, self.user_query,
self.query_project_ids, self.harmonized_config, sd,
slice_term, self.mr.shard_id, self.mr.invalidation_timestep)

  def SearchForIIDs(self):
"""Wait for the search Promises and store their results."""
with self.mr.profiler.Phase('WaitOnPromises'):
self.result_iids, self.search_limit_reached, self.error = (
self.result_iids_promise.WaitAndGetValue())


def SearchProjectCan(
cnxn, services, project_ids, query_ast, shard_id, harmonized_config,
left_joins=None, where=None, sort_directives=None, query_desc=''):
"""Return a list of issue global IDs in the projects that satisfy the query.
Args:
cnxn: Regular database connection to the primary DB.
services: interface to issue storage backends.
project_ids: list of int IDs of the project to search
query_ast: A QueryAST PB with conjunctions and conditions.
shard_id: limit search to the specified shard ID int.
harmonized_config: harmonized config for all projects being searched.
left_joins: SQL LEFT JOIN clauses that are needed in addition to
anything generated from the query_ast.
where: SQL WHERE clauses that are needed in addition to
anything generated from the query_ast.
sort_directives: list of strings specifying the columns to sort on.
query_desc: descriptive string for debugging.
Returns:
(issue_ids, capped, error) where issue_ids is a list of issue issue_ids
that satisfy the query, capped is True if the number of results were
capped due to an implementation limit, and error is any well-known error
(probably a query parsing error) encountered during search.
"""
logging.info('searching projects %r for AST %r', project_ids, query_ast)
start_time = time.time()
left_joins = left_joins or []
where = where or []
if project_ids:
cond_str = 'Issue.project_id IN (%s)' % sql.PlaceHolders(project_ids)
where.append((cond_str, project_ids))
try:
query_ast = ast2ast.PreprocessAST(
cnxn, query_ast, project_ids, services, harmonized_config)
logging.info('simplified AST is %r', query_ast)
query_left_joins, query_where, _ = ast2select.BuildSQLQuery(query_ast)
left_joins.extend(query_left_joins)
where.extend(query_where)
except ast2ast.MalformedQuery as e:
# TODO(jrobbins): inform the user that their query had invalid tokens.
logging.info('Invalid query tokens %s.\n %r\n\n', str(e), query_ast)
return [], False, e
except ast2select.NoPossibleResults as e:
# TODO(jrobbins): inform the user that their query was impossible.
logging.info('Impossible query %s.\n %r\n\n', str(e), query_ast)
return [], False, e
logging.info('translated to left_joins %r', left_joins)
logging.info('translated to where %r', where)
fts_capped = False
if query_ast.conjunctions:
# TODO(jrobbins): Handle "OR" in queries. For now, we just process the
# first conjunction.
assert len(query_ast.conjunctions) == 1
conj = query_ast.conjunctions[0]
full_text_iids, fts_capped = tracker_fulltext.SearchIssueFullText(
project_ids, conj, shard_id)
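    # SearchIssueFullText() returns None for the IIDs when fulltext was not
    # consulted, in which case the SQL query below is the only filter. An
    # empty list means fulltext matched nothing, so no SQL query is needed.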
if full_text_iids is not None:
if not full_text_iids:
return [], False, None # No match on fulltext, so don't bother DB.
cond_str = 'Issue.id IN (%s)' % sql.PlaceHolders(full_text_iids)
where.append((cond_str, full_text_iids))
label_def_rows = []
status_def_rows = []
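  # Sorting by status or label depends on the rank of well-known values,
  # so fetch the definition rows for the projects being searched.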
if sort_directives:
if project_ids:
for pid in project_ids:
label_def_rows.extend(services.config.GetLabelDefRows(cnxn, pid))
status_def_rows.extend(services.config.GetStatusDefRows(cnxn, pid))
else:
label_def_rows = services.config.GetLabelDefRowsAnyProject(cnxn)
status_def_rows = services.config.GetStatusDefRowsAnyProject(cnxn)
harmonized_labels = tracker_bizobj.HarmonizeLabelOrStatusRows(
label_def_rows)
harmonized_statuses = tracker_bizobj.HarmonizeLabelOrStatusRows(
status_def_rows)
harmonized_fields = harmonized_config.field_defs
sort_left_joins, order_by = ast2sort.BuildSortClauses(
sort_directives, harmonized_labels, harmonized_statuses,
harmonized_fields)
logging.info('translated to sort left_joins %r', sort_left_joins)
logging.info('translated to order_by %r', order_by)
issue_ids, db_capped = services.issue.RunIssueQuery(
cnxn, left_joins + sort_left_joins, where, order_by, shard_id=shard_id)
logging.warning(
'executed "%s" query %r for %d issues in %dms', query_desc, query_ast,
len(issue_ids), int((time.time() - start_time) * 1000))
capped = fts_capped or db_capped
return issue_ids, capped, None


def _FilterSpam(query_ast):
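  """Return a query_ast that excludes spam, unless the query mentions spam."""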
uses_spam = False
# TODO(jrobbins): Handle "OR" in queries. For now, we just modify the
# first conjunction.
conjunction = query_ast.conjunctions[0]
for condition in conjunction.conds:
for field in condition.field_defs:
if field.field_name == 'spam':
uses_spam = True
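  # If the query does not mention the spam field at all, implicitly add a
  # condition that excludes spam issues.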
if not uses_spam:
query_ast.conjunctions[0].conds.append(
ast_pb2.MakeCond(
ast_pb2.QueryOp.NE,
[tracker_pb2.FieldDef(
field_name='spam',
field_type=tracker_pb2.FieldTypes.BOOL_TYPE)
],
[], []))
return query_ast


def _GetQueryResultIIDs(
cnxn, services, canned_query, user_query,
query_project_ids, harmonized_config, sd, slice_term,
shard_id, invalidation_timestep):
"""Do a search and return a list of matching issue IDs.
Args:
cnxn: connection to the database.
services: interface to issue storage backends.
canned_query: string part of the query from the drop-down menu.
user_query: string part of the query that the user typed in.
query_project_ids: list of project IDs to search.
harmonized_config: combined configs for all the queried projects.
sd: list of sort directives.
slice_term: additional query term to narrow results to a logical shard
within a physical shard.
shard_id: int number of the database shard to search.
invalidation_timestep: int timestep to use keep memcached items fresh.
Returns:
Tuple consisting of:
A list of issue issue_ids that match the user's query. An empty list, [],
is returned if no issues match the query.
Boolean that is set to True if the search results limit of this shard is
hit.
An error (subclass of Exception) encountered during query processing. None
means that no error was encountered.
"""
query_ast = _FilterSpam(query2ast.ParseUserQuery(
user_query, canned_query, query2ast.BUILTIN_ISSUE_FIELDS,
harmonized_config))
logging.info('query_project_ids is %r', query_project_ids)
is_fulltext_query = bool(
query_ast.conjunctions and
fulltext_helpers.BuildFTSQuery(
query_ast.conjunctions[0], tracker_fulltext.ISSUE_FULLTEXT_FIELDS))
expiration = framework_constants.CACHE_EXPIRATION
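  # Results that depend on the fulltext index get their own expiration,
  # since the index is updated outside this invalidation scheme.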
if is_fulltext_query:
expiration = framework_constants.FULLTEXT_MEMCACHE_EXPIRATION
# Might raise ast2ast.MalformedQuery or ast2select.NoPossibleResults.
result_iids, search_limit_reached, error = SearchProjectCan(
cnxn, services, query_project_ids, query_ast, shard_id,
harmonized_config, sort_directives=sd, where=[slice_term],
query_desc='getting query issue IDs')
logging.info('Found %d result_iids', len(result_iids))
if error:
logging.warning('Got error %r', error)
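  # Cache the result IIDs under a key that encodes every input that affects
  # them: projects, canned query, user query, sort spec, and shard, e.g.
  # '175;is:open;owner:42;priority id;2' (hypothetical values).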
projects_str = ','.join(str(pid) for pid in sorted(query_project_ids))
projects_str = projects_str or 'all'
memcache_key = ';'.join([
projects_str, canned_query, user_query, ' '.join(sd), str(shard_id)])
memcache.set(memcache_key, (result_iids, invalidation_timestep),
time=expiration, namespace=settings.memcache_namespace)
logging.info('set memcache key %r', memcache_key)
search_limit_memcache_key = ';'.join([
projects_str, canned_query, user_query, ' '.join(sd),
'search_limit_reached', str(shard_id)])
memcache.set(search_limit_memcache_key,
(search_limit_reached, invalidation_timestep),
time=expiration, namespace=settings.memcache_namespace)
logging.info('set search limit memcache key %r',
search_limit_memcache_key)
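  # Make sure an invalidation timestep is recorded for each queried project
  # (or for 'all') in this shard, so that these cached results can later be
  # recognized as stale.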
  timestamps_for_projects = memcache.get_multi(
      keys=(['%d;%d' % (pid, shard_id) for pid in query_project_ids] +
            ['all;%d' % shard_id]),
      namespace=settings.memcache_namespace)
if query_project_ids:
for pid in query_project_ids:
key = '%d;%d' % (pid, shard_id)
if key not in timestamps_for_projects:
memcache.set(
key,
invalidation_timestep,
time=framework_constants.CACHE_EXPIRATION,
namespace=settings.memcache_namespace)
else:
key = 'all;%d' % shard_id
if key not in timestamps_for_projects:
memcache.set(
key,
invalidation_timestep,
time=framework_constants.CACHE_EXPIRATION,
namespace=settings.memcache_namespace)
return result_iids, search_limit_reached, error