| # Copyright 2016 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Backend issue issue search and sorting. |
| |
| Each of several "besearch" backend jobs manages one shard of the overall set |
| of issues in the system. The backend search pipeline retrieves the issues |
| that match the user query, puts them into memcache, and returns them to |
| the frontend search pipeline. |
| """ |
| from __future__ import print_function |
| from __future__ import division |
| from __future__ import absolute_import |
| |
| import logging |
| import re |
| import time |
| |
| from google.appengine.api import memcache |
| |
| import settings |
| from features import savedqueries_helpers |
| from framework import authdata |
| from framework import framework_constants |
| from framework import framework_helpers |
| from framework import sorting |
| from framework import sql |
| from mrproto import ast_pb2 |
| from mrproto import tracker_pb2 |
| from search import ast2ast |
| from search import ast2select |
| from search import ast2sort |
| from search import query2ast |
| from search import searchpipeline |
| from services import tracker_fulltext |
| from services import fulltext_helpers |
| from tracker import tracker_bizobj |
| |
| |
| # Used in constructing the at-risk query. |
| AT_RISK_LABEL_RE = re.compile(r'^(restrict-view-.+)$', re.IGNORECASE) |
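# For example, this matches 'Restrict-View-Commit' or 'restrict-view-EditIssue',
# but not 'Restrict-AddIssueComment-Commit'.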
| |
# Limit on the number of list items to show in debug log statements.
| MAX_LOG = 200 |
| |
| |
| class BackendSearchPipeline(object): |
| """Manage the process of issue search, including Promises and caching. |
| |
| Even though the code is divided into several methods, the public |
| methods should be called in sequence, so the execution of the code |
| is pretty much in the order of the source code lines here. |
| """ |
| |
| def __init__( |
| self, mr, services, default_results_per_page, |
| query_project_names, logged_in_user_id, me_user_ids): |
| |
| self.mr = mr |
| self.services = services |
| self.default_results_per_page = default_results_per_page |
| |
| self.query_project_list = list(services.project.GetProjectsByName( |
| mr.cnxn, query_project_names).values()) |
| self.query_project_ids = [ |
| p.project_id for p in self.query_project_list] |
| |
| self.me_user_ids = me_user_ids |
| self.mr.auth = authdata.AuthData.FromUserID( |
| mr.cnxn, logged_in_user_id, services) |
| |
| # The following fields are filled in as the pipeline progresses. |
| # The value None means that we still need to compute that value. |
| self.result_iids = None # Sorted issue IDs that match the query |
| self.search_limit_reached = False # True if search results limit is hit. |
| self.error = None |
| |
| self._MakePromises() |
| |
| def _MakePromises(self): |
| config_dict = self.services.config.GetProjectConfigs( |
| self.mr.cnxn, self.query_project_ids) |
| self.harmonized_config = tracker_bizobj.HarmonizeConfigs( |
| list(config_dict.values())) |
| |
| self.canned_query = savedqueries_helpers.SavedQueryIDToCond( |
| self.mr.cnxn, self.services.features, self.mr.can) |
| |
| self.canned_query, warnings = searchpipeline.ReplaceKeywordsWithUserIDs( |
| self.me_user_ids, self.canned_query) |
| self.mr.warnings.extend(warnings) |
| self.user_query, warnings = searchpipeline.ReplaceKeywordsWithUserIDs( |
| self.me_user_ids, self.mr.query) |
| self.mr.warnings.extend(warnings) |
| logging.debug('Searching query: %s %s', self.canned_query, self.user_query) |
| |
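    # Each backend job serves one shard, so every SQL query below is
    # narrowed to the Issue rows that belong to this shard.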
| slice_term = ('Issue.shard = %s', [self.mr.shard_id]) |
| |
| sd = sorting.ComputeSortDirectives( |
| self.harmonized_config, self.mr.group_by_spec, self.mr.sort_spec) |
| |
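    # Kick off the search in a Promise now; SearchForIIDs() blocks on it.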
| self.result_iids_promise = framework_helpers.Promise( |
| _GetQueryResultIIDs, self.mr.cnxn, |
| self.services, self.canned_query, self.user_query, |
| self.query_project_ids, self.harmonized_config, sd, |
| slice_term, self.mr.shard_id, self.mr.invalidation_timestep) |
| |
| def SearchForIIDs(self): |
| """Wait for the search Promises and store their results.""" |
| with self.mr.profiler.Phase('WaitOnPromises'): |
| self.result_iids, self.search_limit_reached, self.error = ( |
| self.result_iids_promise.WaitAndGetValue()) |
| |
| |
| def SearchProjectCan( |
| cnxn, services, project_ids, query_ast, shard_id, harmonized_config, |
| left_joins=None, where=None, sort_directives=None, query_desc=''): |
| """Return a list of issue global IDs in the projects that satisfy the query. |
| |
| Args: |
| cnxn: Regular database connection to the primary DB. |
| services: interface to issue storage backends. |
    project_ids: list of int IDs of the projects to search.
| query_ast: A QueryAST PB with conjunctions and conditions. |
| shard_id: limit search to the specified shard ID int. |
| harmonized_config: harmonized config for all projects being searched. |
| left_joins: SQL LEFT JOIN clauses that are needed in addition to |
| anything generated from the query_ast. |
| where: SQL WHERE clauses that are needed in addition to |
| anything generated from the query_ast. |
| sort_directives: list of strings specifying the columns to sort on. |
| query_desc: descriptive string for debugging. |
| |
| Returns: |
    (issue_ids, capped, error) where issue_ids is a list of issue IDs
    that satisfy the query, capped is True if the number of results was
    capped due to an implementation limit, and error is any well-known error
    (probably a query parsing error) encountered during search.
| """ |
| logging.info('searching projects %r for AST %r', project_ids, query_ast) |
| start_time = time.time() |
| left_joins = left_joins or [] |
| where = where or [] |
| if project_ids: |
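    # sql.PlaceHolders emits one '%s' per argument, so three project IDs
    # yield 'Issue.project_id IN (%s,%s,%s)'.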
| cond_str = 'Issue.project_id IN (%s)' % sql.PlaceHolders(project_ids) |
| where.append((cond_str, project_ids)) |
| |
| try: |
| query_ast = ast2ast.PreprocessAST( |
| cnxn, query_ast, project_ids, services, harmonized_config) |
| logging.info('simplified AST is %r', query_ast) |
| query_left_joins, query_where, _ = ast2select.BuildSQLQuery(query_ast) |
| left_joins.extend(query_left_joins) |
| where.extend(query_where) |
| except ast2ast.MalformedQuery as e: |
| # TODO(jrobbins): inform the user that their query had invalid tokens. |
| logging.info('Invalid query tokens %s.\n %r\n\n', str(e), query_ast) |
| return [], False, e |
| except ast2select.NoPossibleResults as e: |
| # TODO(jrobbins): inform the user that their query was impossible. |
| logging.info('Impossible query %s.\n %r\n\n', str(e), query_ast) |
| return [], False, e |
| logging.info('translated to left_joins %r', left_joins) |
| logging.info('translated to where %r', where) |
| |
| fts_capped = False |
| if query_ast.conjunctions: |
| # TODO(jrobbins): Handle "OR" in queries. For now, we just process the |
| # first conjunction. |
| assert len(query_ast.conjunctions) == 1 |
| conj = query_ast.conjunctions[0] |
| full_text_iids, fts_capped = tracker_fulltext.SearchIssueFullText( |
| project_ids, conj, shard_id) |
| if full_text_iids is not None: |
| if not full_text_iids: |
| return [], False, None # No match on fulltext, so don't bother DB. |
| cond_str = 'Issue.id IN (%s)' % sql.PlaceHolders(full_text_iids) |
| where.append((cond_str, full_text_iids)) |
| |
| label_def_rows = [] |
| status_def_rows = [] |
| if sort_directives: |
| if project_ids: |
| for pid in project_ids: |
| label_def_rows.extend(services.config.GetLabelDefRows(cnxn, pid)) |
| status_def_rows.extend(services.config.GetStatusDefRows(cnxn, pid)) |
| else: |
| label_def_rows = services.config.GetLabelDefRowsAnyProject(cnxn) |
| status_def_rows = services.config.GetStatusDefRowsAnyProject(cnxn) |
| |
| harmonized_labels = tracker_bizobj.HarmonizeLabelOrStatusRows( |
| label_def_rows) |
| harmonized_statuses = tracker_bizobj.HarmonizeLabelOrStatusRows( |
| status_def_rows) |
| harmonized_fields = harmonized_config.field_defs |
| sort_left_joins, order_by = ast2sort.BuildSortClauses( |
| sort_directives, harmonized_labels, harmonized_statuses, |
| harmonized_fields) |
| logging.info('translated to sort left_joins %r', sort_left_joins) |
| logging.info('translated to order_by %r', order_by) |
| |
| issue_ids, db_capped = services.issue.RunIssueQuery( |
| cnxn, left_joins + sort_left_joins, where, order_by, shard_id=shard_id) |
  logging.info(
| 'executed "%s" query %r for %d issues in %dms', query_desc, query_ast, |
| len(issue_ids), int((time.time() - start_time) * 1000)) |
| capped = fts_capped or db_capped |
| return issue_ids, capped, None |
| |

def _FilterSpam(query_ast):
| uses_spam = False |
| # TODO(jrobbins): Handle "OR" in queries. For now, we just modify the |
| # first conjunction. |
| conjunction = query_ast.conjunctions[0] |
| for condition in conjunction.conds: |
| for field in condition.field_defs: |
| if field.field_name == 'spam': |
| uses_spam = True |
| |
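  # Unless the query already referenced the spam field, append a NE
  # condition on the built-in bool 'spam' field to filter out spam issues.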
| if not uses_spam: |
| query_ast.conjunctions[0].conds.append( |
| ast_pb2.MakeCond( |
| ast_pb2.QueryOp.NE, |
| [tracker_pb2.FieldDef( |
| field_name='spam', |
| field_type=tracker_pb2.FieldTypes.BOOL_TYPE) |
| ], |
| [], [])) |
| |
| return query_ast |
| |

def _GetQueryResultIIDs(
| cnxn, services, canned_query, user_query, |
| query_project_ids, harmonized_config, sd, slice_term, |
| shard_id, invalidation_timestep): |
| """Do a search and return a list of matching issue IDs. |
| |
| Args: |
| cnxn: connection to the database. |
| services: interface to issue storage backends. |
| canned_query: string part of the query from the drop-down menu. |
| user_query: string part of the query that the user typed in. |
| query_project_ids: list of project IDs to search. |
| harmonized_config: combined configs for all the queried projects. |
| sd: list of sort directives. |
| slice_term: additional query term to narrow results to a logical shard |
| within a physical shard. |
| shard_id: int number of the database shard to search. |
    invalidation_timestep: int timestep used to keep memcached items fresh.
| |
| Returns: |
| Tuple consisting of: |
      A list of issue IDs that match the user's query. An empty list, [],
      is returned if no issues match the query.
| Boolean that is set to True if the search results limit of this shard is |
| hit. |
| An error (subclass of Exception) encountered during query processing. None |
| means that no error was encountered. |
| """ |
| query_ast = _FilterSpam(query2ast.ParseUserQuery( |
| user_query, canned_query, query2ast.BUILTIN_ISSUE_FIELDS, |
| harmonized_config)) |
| |
| logging.info('query_project_ids is %r', query_project_ids) |
| |
| is_fulltext_query = bool( |
| query_ast.conjunctions and |
| fulltext_helpers.BuildFTSQuery( |
| query_ast.conjunctions[0], tracker_fulltext.ISSUE_FULLTEXT_FIELDS)) |
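  # Cached fulltext results expire sooner: the fulltext index is updated
  # asynchronously, so it cannot be invalidated as precisely as the DB.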
| expiration = framework_constants.CACHE_EXPIRATION |
| if is_fulltext_query: |
| expiration = framework_constants.FULLTEXT_MEMCACHE_EXPIRATION |
| |
| # Might raise ast2ast.MalformedQuery or ast2select.NoPossibleResults. |
| result_iids, search_limit_reached, error = SearchProjectCan( |
| cnxn, services, query_project_ids, query_ast, shard_id, |
| harmonized_config, sort_directives=sd, where=[slice_term], |
| query_desc='getting query issue IDs') |
| logging.info('Found %d result_iids', len(result_iids)) |
| if error: |
| logging.warning('Got error %r', error) |
| |
| projects_str = ','.join(str(pid) for pid in sorted(query_project_ids)) |
| projects_str = projects_str or 'all' |
| memcache_key = ';'.join([ |
| projects_str, canned_query, user_query, ' '.join(sd), str(shard_id)]) |
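  # e.g. '12,34;is:open;owner:a@example.com;priority id;2' for hypothetical
  # project IDs, canned query, user query, sort directives, and shard ID.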
| memcache.set(memcache_key, (result_iids, invalidation_timestep), |
| time=expiration, namespace=settings.memcache_namespace) |
| logging.info('set memcache key %r', memcache_key) |
| |
| search_limit_memcache_key = ';'.join([ |
| projects_str, canned_query, user_query, ' '.join(sd), |
| 'search_limit_reached', str(shard_id)]) |
| memcache.set(search_limit_memcache_key, |
| (search_limit_reached, invalidation_timestep), |
| time=expiration, namespace=settings.memcache_namespace) |
| logging.info('set search limit memcache key %r', |
| search_limit_memcache_key) |
| |
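  # Make sure each queried project (or 'all') has an invalidation timestamp
  # recorded, so the cached results above can later be judged fresh or stale.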
| timestamps_for_projects = memcache.get_multi( |
| keys=(['%d;%d' % (pid, shard_id) for pid in query_project_ids] + |
            ['all;%d' % shard_id]),
| namespace=settings.memcache_namespace) |
| |
| if query_project_ids: |
| for pid in query_project_ids: |
| key = '%d;%d' % (pid, shard_id) |
| if key not in timestamps_for_projects: |
| memcache.set( |
| key, |
| invalidation_timestep, |
| time=framework_constants.CACHE_EXPIRATION, |
| namespace=settings.memcache_namespace) |
| else: |
| key = 'all;%d' % shard_id |
| if key not in timestamps_for_projects: |
| memcache.set( |
| key, |
| invalidation_timestep, |
| time=framework_constants.CACHE_EXPIRATION, |
| namespace=settings.memcache_namespace) |
| |
| return result_iids, search_limit_reached, error |