Blame - services/tracker_fulltext.py - monorail-avm99963

blob: ecbfc44bb236415f4017a158f5e094a7b21bf571 [file] [log] [blame]

Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame^]	1	# Copyright 2016 The Chromium Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style
				3	# license that can be found in the LICENSE file or at
				4	# https://developers.google.com/open-source/licenses/bsd
				5
				6	"""A set of functions that provide fulltext search for issues."""
				7	from __future__ import print_function
				8	from __future__ import division
				9	from __future__ import absolute_import
				10
				11	import collections
				12	import logging
				13	import time
				14
				15	from six import string_types
				16
				17	from google.appengine.api import search
				18
				19	import settings
				20	from framework import framework_constants
				21	from framework import framework_helpers
				22	from framework import framework_views
				23	from services import fulltext_helpers
				24	from tracker import tracker_bizobj
				25
				26
				27	# When updating and re-indexing all issues in a project, work in batches
				28	# of this size to manage memory usage and avoid rpc timeouts.
				29	_INDEX_BATCH_SIZE = 40
				30
				31
				32	# The user can search for text that occurs specifically in these
				33	# parts of an issue.
				34	ISSUE_FULLTEXT_FIELDS = ['summary', 'description', 'comment']
				35	# Note: issue documents also contain a "metadata" field, but we do not
				36	# expose that to users. Issue metadata can be searched in a structured way
				37	# by giving a specific field name such as "owner:" or "status:". The metadata
				38	# search field exists only for fulltext queries that do not specify any field.
				39
				40
				41	def IndexIssues(cnxn, issues, user_service, issue_service, config_service):
				42	"""(Re)index all the given issues.
				43
				44	Args:
				45	cnxn: connection to SQL database.
				46	issues: list of Issue PBs to index.
				47	user_service: interface to user data storage.
				48	issue_service: interface to issue data storage.
				49	config_service: interface to configuration data storage.
				50	"""
				51	issues = list(issues)
				52	config_dict = config_service.GetProjectConfigs(
				53	cnxn, {issue.project_id for issue in issues})
				54	for start in range(0, len(issues), _INDEX_BATCH_SIZE):
				55	logging.info('indexing issues: %d remaining', len(issues) - start)
				56	_IndexIssueBatch(
				57	cnxn, issues[start:start + _INDEX_BATCH_SIZE], user_service,
				58	issue_service, config_dict)
				59
				60
				61	def _IndexIssueBatch(cnxn, issues, user_service, issue_service, config_dict):
				62	"""Internal method to (re)index the given batch of issues.
				63
				64	Args:
				65	cnxn: connection to SQL database.
				66	issues: list of Issue PBs to index.
				67	user_service: interface to user data storage.
				68	issue_service: interface to issue data storage.
				69	config_dict: dict {project_id: config} for all the projects that
				70	the given issues are in.
				71	"""
				72	user_ids = tracker_bizobj.UsersInvolvedInIssues(issues)
				73	comments_dict = issue_service.GetCommentsForIssues(
				74	cnxn, [issue.issue_id for issue in issues])
				75	for comments in comments_dict.values():
				76	user_ids.update([ic.user_id for ic in comments])
				77
				78	users_by_id = framework_views.MakeAllUserViews(
				79	cnxn, user_service, user_ids)
				80	_CreateIssueSearchDocuments(issues, comments_dict, users_by_id, config_dict)
				81
				82
				83	def _CreateIssueSearchDocuments(
				84	issues, comments_dict, users_by_id, config_dict):
				85	"""Make the GAE search index documents for the given issue batch.
				86
				87	Args:
				88	issues: list of issues to index.
				89	comments_dict: prefetched dictionary of comments on those issues.
				90	users_by_id: dictionary {user_id: UserView} so that the email
				91	addresses of users who left comments can be found via search.
				92	config_dict: dict {project_id: config} for all the projects that
				93	the given issues are in.
				94	"""
				95	documents_by_shard = collections.defaultdict(list)
				96	for issue in issues:
				97	summary = issue.summary
				98	# TODO(jrobbins): allow search specifically on explicit vs derived
				99	# fields.
				100	owner_id = tracker_bizobj.GetOwnerId(issue)
				101	owner_email = users_by_id[owner_id].email
				102	config = config_dict[issue.project_id]
				103	component_paths = []
				104	for component_id in issue.component_ids:
				105	cd = tracker_bizobj.FindComponentDefByID(component_id, config)
				106	if cd:
				107	component_paths.append(cd.path)
				108
				109	field_values = [tracker_bizobj.GetFieldValue(fv, users_by_id)
				110	for fv in issue.field_values]
				111	# Convert to string only the values that are not strings already.
				112	# This is done because the default encoding in appengine seems to be 'ascii'
				113	# and string values might contain unicode characters, so str will fail to
				114	# encode them.
				115	field_values = [value if isinstance(value, string_types) else str(value)
				116	for value in field_values]
				117
				118	metadata = '%s %s %s %s %s %s' % (
				119	tracker_bizobj.GetStatus(issue),
				120	owner_email,
				121	[users_by_id[cc_id].email for cc_id in
				122	tracker_bizobj.GetCcIds(issue)],
				123	' '.join(component_paths),
				124	' '.join(field_values),
				125	' '.join(tracker_bizobj.GetLabels(issue)))
				126	custom_fields = _BuildCustomFTSFields(issue)
				127
				128	comments = comments_dict.get(issue.issue_id, [])
				129	room_for_comments = (framework_constants.MAX_FTS_FIELD_SIZE -
				130	len(summary) -
				131	len(metadata) -
				132	sum(len(cf.value) for cf in custom_fields))
				133	comments = _IndexableComments(
				134	comments, users_by_id, remaining_chars=room_for_comments)
				135	logging.info('len(comments) is %r', len(comments))
				136	if comments:
				137	description = _ExtractCommentText(comments[0], users_by_id)
				138	description = description[:framework_constants.MAX_FTS_FIELD_SIZE]
				139	all_comments = ' '. join(
				140	_ExtractCommentText(c, users_by_id) for c in comments[1:])
				141	all_comments = all_comments[:framework_constants.MAX_FTS_FIELD_SIZE]
				142	else:
				143	description = ''
				144	all_comments = ''
				145	logging.info(
				146	'Issue %s:%r has zero indexable comments',
				147	issue.project_name, issue.local_id)
				148
				149	logging.info('Building document for %s:%d',
				150	issue.project_name, issue.local_id)
				151	logging.info('len(summary) = %d', len(summary))
				152	logging.info('len(metadata) = %d', len(metadata))
				153	logging.info('len(description) = %d', len(description))
				154	logging.info('len(comment) = %d', len(all_comments))
				155	for cf in custom_fields:
				156	logging.info('len(%s) = %d', cf.name, len(cf.value))
				157
				158	doc = search.Document(
				159	doc_id=str(issue.issue_id),
				160	fields=[
				161	search.NumberField(name='project_id', value=issue.project_id),
				162	search.TextField(name='summary', value=summary),
				163	search.TextField(name='metadata', value=metadata),
				164	search.TextField(name='description', value=description),
				165	search.TextField(name='comment', value=all_comments),
				166	] + custom_fields)
				167
				168	shard_id = issue.issue_id % settings.num_logical_shards
				169	documents_by_shard[shard_id].append(doc)
				170
				171	start_time = time.time()
				172	promises = []
				173	for shard_id, documents in documents_by_shard.items():
				174	if documents:
				175	promises.append(framework_helpers.Promise(
				176	_IndexDocsInShard, shard_id, documents))
				177
				178	for promise in promises:
				179	promise.WaitAndGetValue()
				180
				181	logging.info('Finished %d indexing in shards in %d ms',
				182	len(documents_by_shard), int((time.time() - start_time) * 1000))
				183
				184
				185	def _IndexableComments(comments, users_by_id, remaining_chars=None):
				186	"""We only index the comments that are not deleted or banned.
				187
				188	Args:
				189	comments: list of Comment PBs for one issue.
				190	users_by_id: Dict of (user_id -> UserView) for all users.
				191	remaining_chars: number of characters available for comment text
				192	without hitting the GAE search index max document size.
				193
				194	Returns:
				195	A list of comments filtered to not have any deleted comments or
				196	comments from banned users. If the issue has a huge number of
				197	comments, only a certain number of the first and last comments
				198	are actually indexed.
				199	"""
				200	if remaining_chars is None:
				201	remaining_chars = framework_constants.MAX_FTS_FIELD_SIZE
				202	allowed_comments = []
				203	for comment in comments:
				204	user_view = users_by_id.get(comment.user_id)
				205	if not (comment.deleted_by or (user_view and user_view.banned)):
				206	if comment.is_description and allowed_comments:
				207	# index the latest description, but not older descriptions
				208	allowed_comments[0] = comment
				209	else:
				210	allowed_comments.append(comment)
				211
				212	reasonable_size = (framework_constants.INITIAL_COMMENTS_TO_INDEX +
				213	framework_constants.FINAL_COMMENTS_TO_INDEX)
				214	if len(allowed_comments) <= reasonable_size:
				215	candidates = allowed_comments
				216	else:
				217	candidates = ( # Prioritize the description and recent comments.
				218	allowed_comments[0:1] +
				219	allowed_comments[-framework_constants.FINAL_COMMENTS_TO_INDEX:] +
				220	allowed_comments[1:framework_constants.INITIAL_COMMENTS_TO_INDEX])
				221
				222	total_length = 0
				223	result = []
				224	for comment in candidates:
				225	total_length += len(comment.content)
				226	if total_length > remaining_chars:
				227	break
				228	result.append(comment)
				229
				230	return result
				231
				232
				233	def _IndexDocsInShard(shard_id, documents):
				234	search_index = search.Index(
				235	name=settings.search_index_name_format % shard_id)
				236	search_index.put(documents)
				237	logging.info('FTS indexed %d docs in shard %d', len(documents), shard_id)
				238	# TODO(jrobbins): catch OverQuotaError and add the issues to the
				239	# ReindexQueue table instead.
				240
				241
				242	def _ExtractCommentText(comment, users_by_id):
				243	"""Return a string with all the searchable text of the given Comment PB."""
				244	commenter_email = users_by_id[comment.user_id].email
				245	return '%s %s %s' % (
				246	commenter_email,
				247	comment.content,
				248	' '.join(attach.filename
				249	for attach in comment.attachments
				250	if not attach.deleted))
				251
				252
				253	def _BuildCustomFTSFields(issue):
				254	"""Return a list of FTS Fields to index string-valued custom fields."""
				255	fts_fields = []
				256	for fv in issue.field_values:
				257	if fv.str_value:
				258	# TODO(jrobbins): also indicate which were derived vs. explicit.
				259	# TODO(jrobbins): also toss in the email addresses of any users in
				260	# user-valued custom fields, ints for int-valued fields, etc.
				261	fts_field = search.TextField(
				262	name='custom_%d' % fv.field_id, value=fv.str_value)
				263	fts_fields.append(fts_field)
				264
				265	return fts_fields
				266
				267
				268	def UnindexIssues(issue_ids):
				269	"""Remove many issues from the sharded search indexes."""
				270	iids_by_shard = {}
				271	for issue_id in issue_ids:
				272	shard_id = issue_id % settings.num_logical_shards
				273	iids_by_shard.setdefault(shard_id, [])
				274	iids_by_shard[shard_id].append(issue_id)
				275
				276	for shard_id, iids_in_shard in iids_by_shard.items():
				277	try:
				278	logging.info(
				279	'unindexing %r issue_ids in %r', len(iids_in_shard), shard_id)
				280	search_index = search.Index(
				281	name=settings.search_index_name_format % shard_id)
				282	search_index.delete([str(iid) for iid in iids_in_shard])
				283	except search.Error:
				284	logging.exception('FTS deletion failed')
				285
				286
				287	def SearchIssueFullText(project_ids, query_ast_conj, shard_id):
				288	"""Do full-text search in GAE FTS.
				289
				290	Args:
				291	project_ids: list of project ID numbers to consider.
				292	query_ast_conj: One conjuctive clause from the AST parsed
				293	from the user's query.
				294	shard_id: int shard ID for the shard to consider.
				295
				296	Returns:
				297	(issue_ids, capped) where issue_ids is a list of issue issue_ids that match
				298	the full-text query. And, capped is True if the results were capped due to
				299	an implementation limitation. Or, return (None, False) if the given AST
				300	conjunction contains no full-text conditions.
				301	"""
				302	fulltext_query = fulltext_helpers.BuildFTSQuery(
				303	query_ast_conj, ISSUE_FULLTEXT_FIELDS)
				304	if fulltext_query is None:
				305	return None, False
				306
				307	if project_ids:
				308	project_clause = ' OR '.join(
				309	'project_id:%d' % pid for pid in project_ids)
				310	fulltext_query = '(%s) %s' % (project_clause, fulltext_query)
				311
				312	# TODO(jrobbins): it would be good to also include some other
				313	# structured search terms to narrow down the set of index
				314	# documents considered. E.g., most queries are only over the
				315	# open issues.
				316	logging.info('FTS query is %r', fulltext_query)
				317	issue_ids = fulltext_helpers.ComprehensiveSearch(
				318	fulltext_query, settings.search_index_name_format % shard_id)
				319	capped = len(issue_ids) >= settings.fulltext_limit_per_shard
				320	return issue_ids, capped