# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""A set of functions that integrate the GAE search index with Monorail."""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import collections
import datetime
import logging
import re
import time

from google.appengine.api import search

from mrproto import ast_pb2
from mrproto import tracker_pb2


# TODO(jrobbins): Consider re-implementing this whole file by using a
# BNF syntax specification and a parser generator or library.

# encodings
UTF8 = 'utf-8'

# Operator used for OR statements.
OR_SYMBOL = ' OR '

# Token types used for parentheses parsing.
SUBQUERY = ast_pb2.TokenType.SUBQUERY
LEFT_PAREN = ast_pb2.TokenType.LEFT_PAREN
RIGHT_PAREN = ast_pb2.TokenType.RIGHT_PAREN
OR = ast_pb2.TokenType.OR

# Field types and operators
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE
APPROVAL = tracker_pb2.FieldTypes.APPROVAL_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
}

# When the query has a leading minus, switch the operator for its opposite.
NEGATED_OPS = {
    EQ: NE,
    NE: EQ,
    LT: GE,
    GT: LE,
    LE: GT,
    GE: LT,
    TEXT_HAS: NOT_TEXT_HAS,
    # IS_DEFINED is handled separately.
}
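
# For illustration (not part of the original logic): a leading minus on a
# term flips its operator via NEGATED_OPS. E.g., for [-stars>10]:
#
#   op = OPS['>']                 # GT
#   op = NEGATED_OPS.get(op, op)  # LE, i.e., stars<=10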

# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones are listed first so that
# the shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(list(OPS.keys()), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|'  # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|'  # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|'  # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)'  # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)
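
# For illustration: findall() returns one 6-tuple per term, with the matched
# text in whichever group applied. E.g.:
#
#   TERM_RE.findall('stars>10 "hot dog" -foo')
#   # -> [('', 'stars>10', '>', '', '', ''),
#   #     ('"hot dog"', '', '', '', '', ''),
#   #     ('', '', '', '', '', '-foo')]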

# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_.\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-@\w][-\*,./:<=>@\w]*|"[^"]*"))$' %
    OPS_PATTERN,
    flags=re.UNICODE)
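
# For illustration: OP_RE decomposes one comparison term into named groups.
#
#   m = OP_RE.match('-summary:"memory leak"')
#   # m.group('prefix') == '-summary'
#   # m.group('op') == ':'
#   # m.group('value') == '"memory leak"'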


# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT),  # attachment file names
    ('attachments', NUM),  # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('gate', TXT),
    ('hotlist', TXT),
    ('hotlist_id', NUM),
    ('id', NUM),
    ('is_spam', BOOL),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('mergedinto_id', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('ownerbouncing', BOOL),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
]

_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
    'ownermodified',
    'ownerlastvisit',
    'statusmodified',
    'componentmodified',
)

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
}
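
# For illustration: these suffixes let _ParseCond keep supporting old
# code.google.com style date terms. E.g., [modified-after:2020/1/1] is
# rewritten to the equivalent of [modified>2020/1/1] before parsing.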

SET_BY_SUFFIX = '-by'
SET_ON_SUFFIX = '-on'
APPROVER_SUFFIX = '-approver'
STATUS_SUFFIX = '-status'

_APPROVAL_SUFFIXES = (
    SET_BY_SUFFIX,
    SET_ON_SUFFIX,
    APPROVER_SUFFIX,
    STATUS_SUFFIX,
)

BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}


# Do not treat strings that start with the below as key:value search terms.
# See bugs.chromium.org/p/monorail/issues/detail?id=419 for more detail.
NON_OP_PREFIXES = (
    'http:',
    'https:',
)


def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None,
    now=None):
  # type: (str, str, Mapping[str, mrproto.tracker_pb2.FieldDef],
  #     mrproto.tracker_pb2.ProjectIssueConfig, Sequence[str], int) ->
  #     mrproto.ast_pb2.QueryAST
  """Parse a user query and return a set of structured query terms.

  Args:
    query: string with the user's query. E.g., 'Priority=High'.
    scope: string of search terms that define the scope in which the
        query should be executed. They are expressed in the same
        user query language, e.g., the currently selected canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
        mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
        @@@ custom field name is not unique in cross project search.
          - custom_fields = {field_name: [fd, ...]}
          - query build needs to OR each possible interpretation
          - could be label in one project and field in another project.
        @@@ what about searching across all projects?
    warnings: optional list to accumulate warning messages.
    now: optional timestamp for tests, otherwise time.time() is used.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list of
    Condition PBs with op, fields, str_values and int_values. E.g., the query
    [priority=high leak OR stars>100] over open issues would return
    QueryAST(
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(EQ, [label_fd], ['priority-high'], []),
                  Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = QueryToSubqueries(query)

  # Make a dictionary of all fields: built-in + custom in each project.
  combined_fields = collections.defaultdict(
      list, {field_name: [field_def]
             for field_name, field_def in builtin_fields.items()})
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      # Only include non-enum fields because enums are stored as labels.
      combined_fields[fd.field_name.lower()].append(fd)
      if fd.field_type == APPROVAL:
        for approval_suffix in _APPROVAL_SUFFIXES:
          combined_fields[fd.field_name.lower() + approval_suffix].append(fd)

  conjunctions = [
      _ParseConjunction(sq, scope, combined_fields, warnings, now=now)
      for sq in subqueries]
  return ast_pb2.QueryAST(conjunctions=conjunctions)


def _ParseConjunction(subquery, scope, fields, warnings, now=None):
  # type: (str, str, Mapping[str, mrproto.tracker_pb2.FieldDef], Sequence[str],
  #     int) -> mrproto.ast_pb2.Condition
  """Parse part of a user query into a Conjunction PB."""
  scoped_query = ('%s %s' % (scope, subquery)).lower()
  cond_strs = _ExtractConds(scoped_query, warnings)
  conds = [_ParseCond(cond_str, fields, warnings, now=now)
           for cond_str in cond_strs]
  conds = [cond for cond in conds if cond]
  return ast_pb2.Conjunction(conds=conds)


def _ParseCond(cond_str, fields, warnings, now=None):
  # type: (str, Mapping[str, mrproto.tracker_pb2.FieldDef], Sequence[str],
  #     int) -> mrproto.ast_pb2.Condition
  """Parse one user query condition string into a Condition PB."""
  op_match = OP_RE.match(cond_str)
  # Do not treat this as a key:value search term if any of the special
  # prefixes match.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # code.google.com. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
        if prefix.endswith(date_suffix):
          # Slice off the exact suffix. rstrip() would strip by character
          # set and could eat trailing characters of the field name itself.
          prefix = prefix[:-len(date_suffix)]
          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
    return _ParseStructuredTerm(prefix, op, val, fields, now=now)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Construct a full-text Query object as a dry-run to validate that
  # the syntax is acceptable.
  try:
    _fts_query = search.Query(cond_str)
  except search.QueryError:
    warnings.append('Ignoring full-text term: %s' % cond_str)
    return None

  # Flag a potential user misunderstanding.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
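
# For illustration: given the built-in fields above,
#
#   _ParseCond('-has:owner', fields, [])
#   # -> MakeCond(IS_NOT_DEFINED, fields['owner'], [], [])
#
#   _ParseCond('leak', fields, [])
#   # -> MakeCond(TEXT_HAS, [any_field_fd], ['leak'], [])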


def _ParseStructuredTerm(prefix, op_str, value, fields, now=None):
  # type: (str, str, str, Mapping[str, mrproto.tracker_pb2.FieldDef]) ->
  #     mrproto.ast_pb2.Condition
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name. E.g., summary. It can
        also be a special operator like "is" to test boolean fields.
    op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.
    now: optional timestamp for tests, otherwise time.time() is used.

  Returns:
    A Condition PB.
  """
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one of
  # multiple values, like set membership. E.g., [Priority=High,Critical].
  # Ignore empty values caused by duplicated or trailing commas. E.g.,
  # [Priority=High,,Critical,] is equivalent to [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',') if v.strip()]

  op = OPS[op_str]
  negate = False
  if prefix.startswith('-'):
    negate = True
    op = NEGATED_OPS.get(op, op)
    prefix = prefix[1:]

  if prefix == 'is' and unquoted_value in [
      'open', 'blocked', 'spam', 'ownerbouncing']:
    return ast_pb2.MakeCond(
        NE if negate else EQ, fields[unquoted_value], [], [])

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if '.' in unquoted_value:  # Possible search for phase field with any value.
      phase_name, possible_field = unquoted_value.split('.', 1)
      if possible_field in fields:
        return ast_pb2.MakeCond(
            op, fields[possible_field], [], [], phase_name=phase_name)
    elif unquoted_value in fields:  # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else:  # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  # Search entries with certain gates.
  if prefix == 'gate':
    return ast_pb2.MakeCond(op, fields['gate'], quick_or_vals, [])
  # Determine the hotlist query type. is_fields stays True unless the prefix
  # is 'hotlist' and the first value has no ':' (or there are no values).
  is_fields = True
  if prefix == 'hotlist':
    try:
      if ':' not in quick_or_vals[0]:
        is_fields = False
    except IndexError:
      is_fields = False

  phase_name = None
  if '.' in prefix and is_fields:
    split_prefix = prefix.split('.', 1)
    if split_prefix[1] in fields:
      phase_name, prefix = split_prefix

  # Search built-in and custom fields. E.g., summary.
  if prefix in fields and is_fields:
    # Note: if the first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for the rare case where multiple projects
    # define the same custom field name, and one is a date and another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
      return ast_pb2.MakeCond(op, fields[prefix], [], date_values)
    elif first_field.field_type == APPROVAL and prefix.endswith(SET_ON_SUFFIX):
      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
      return ast_pb2.MakeCond(
          op,
          fields[prefix], [],
          date_values,
          key_suffix=SET_ON_SUFFIX,
          phase_name=phase_name)
    else:
      quick_or_ints = []
      for qov in quick_or_vals:
        try:
          quick_or_ints.append(int(qov))
        except ValueError:
          pass
      if first_field.field_type == APPROVAL:
        for approval_suffix in _APPROVAL_SUFFIXES:
          if prefix.endswith(approval_suffix):
            return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
                                    quick_or_ints, key_suffix=approval_suffix,
                                    phase_name=phase_name)
      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
                              quick_or_ints, phase_name=phase_name)

  # Since it is not a field, treat it as a label. E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if the user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
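
# For illustration (field names and values are already lowercased by
# _ParseConjunction before reaching this function):
#
#   _ParseStructuredTerm('stars', '>', '100', fields)
#   # -> MakeCond(GT, fields['stars'], ['100'], [100])
#
#   _ParseStructuredTerm('priority', '=', 'high,critical', fields)
#   # 'priority' is not a field, so it becomes a label condition:
#   # -> MakeCond(EQ, fields['label'], ['priority-high', 'priority-critical'],
#   #             [])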


def _ExtractConds(query, warnings):
  # type: (str, Sequence[str]) -> Sequence[str]
  """Parse a query string into a list of individual condition strings.

  Args:
    query: UTF-8 encoded search query string.
    warnings: list to accumulate warning messages.

  Returns:
    A list of query condition strings.
  """
  # Convert to unicode then search for distinct terms.
  term_matches = TERM_RE.findall(query)

  terms = []
  for (phrase, word_label, _op1, phrase_label, _op2,
       word) in term_matches:
    # Case 1: Quoted phrases, e.g., ["hot dog"].
    if phrase_label or phrase:
      terms.append(phrase_label or phrase)

    # Case 2: Comparisons.
    elif word_label:
      special_prefixes_match = any(
          word_label.startswith(p) for p in NON_OP_PREFIXES)
      match = OP_RE.match(word_label)
      if match and not special_prefixes_match:
        label = match.group('prefix')
        op = match.group('op')
        word = match.group('value')
        terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as full-text search. It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    # Case 3: Simple words.
    elif word:
      terms.append(word)

    else:  # pragma: no coverage
      warnings.append('Unparsable search term')

  return terms
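
# For illustration:
#
#   _ExtractConds('stars>10 "hot dog" -foo', warnings=[])
#   # -> ['stars>"10"', '"hot dog"', '-foo']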


def _ParseDateValue(val, now=None):
  # type: (str, int) -> int
  """Convert the user-entered date into a timestamp."""
  # Support timestamp values such as opened>1437671476.
  try:
    return int(val)
  except ValueError:
    pass

  # TODO(jrobbins): future: take timezones into account.
  # TODO(jrobbins): for now, explain to users that "today" is
  # actually now: the current time, not 12:01am in their timezone.
  # In fact, it is not very useful because everything in the system
  # happened before the current time.
  if val == 'today':
    return _CalculatePastDate(0, now=now)
  elif val.startswith('today-'):
    try:
      days_ago = int(val.split('-')[1])
    except ValueError:
      raise InvalidQueryError('Could not parse date: ' + val)
    return _CalculatePastDate(days_ago, now=now)

  try:
    if '/' in val:
      year, month, day = [int(x) for x in val.split('/')]
    elif '-' in val:
      year, month, day = [int(x) for x in val.split('-')]
    else:
      raise InvalidQueryError('Could not parse date: ' + val)
  except ValueError:
    raise InvalidQueryError('Could not parse date: ' + val)

  try:
    return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
  except ValueError:
    raise InvalidQueryError('Could not parse date: ' + val)


def _CalculatePastDate(days_ago, now=None):
  # type: (int, int) -> int
  """Calculates the timestamp N days ago from now."""
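  # E.g., days_ago=2 with now=1600000000 gives 1600000000 - 172800 = 1599827200.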
  if now is None:
    now = int(time.time())
  ts = now - days_ago * 24 * 60 * 60
  return ts


def QueryToSubqueries(query):
  # type: (str) -> Sequence[str]
  """Splits a query into smaller queries based on Monorail's search syntax.

  This function handles parsing parentheses and OR statements in Monorail's
  search syntax. By doing this parsing for OR statements and parentheses up
  front in FrontendSearchPipeline, we are able to convert complex queries
  with lots of ORs into smaller, more easily cacheable query terms.

  The output subqueries should collectively return the same query results
  as the initial input query without containing any ORs or parentheses,
  allowing later search layers to parse queries without worrying about ORs
  or parentheses.

  Some examples of possible queries and their expected output:

  - '(A OR B) (C OR D) OR (E OR F)' -> ['A C', 'A D', 'B C', 'B D', 'E', 'F']
  - '(A) OR (B)' -> ['A', 'B']
  - '(A ((C) OR (D OR (E OR F))))' -> ['A C', 'A D', 'A E', 'A F']

  where A, B, C, D, etc. could be any list of conjunctions, e.g., "owner:me",
  "Pri=1", "hello world Hotlist=test", "label!=a11y", etc.

  Note: Monorail implicitly ANDs any query terms separated by a space. For
  the most part, AND functionality is handled at a later layer in search
  processing. However, this case becomes important here when considering the
  fact that a parentheses group can either be ANDed or ORed with the terms
  that surround it.

  The _MultiplySubqueries helper is used to AND the results of different
  groups together, whereas concatenating lists is used to OR subqueries
  together.

  Args:
    query: The initial query that was sent to the search.

  Returns:
    List of query fragments to be independently processed as search terms.

  Raises:
    InvalidQueryError: If parentheses are unmatched.
  """
  tokens = _ValidateAndTokenizeQuery(query)

  # Using an iterator allows us to keep our current loop position across
  # helpers. This makes recursion a lot easier.
  token_iterator = PeekIterator(tokens)

  subqueries = _ParseQuery(token_iterator)

  if not len(subqueries):
    # Several cases, such as an empty query or a query with only parentheses,
    # will result in an empty set of subqueries. In these cases, we still want
    # to give the search pipeline a single empty query to process.
    return ['']

  return subqueries
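
# For illustration:
#
#   QueryToSubqueries('(A OR B) C')
#   # -> ['A C', 'B C']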


def _ParseQuery(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert query tokens into a list of subqueries.

  Parses a Query based on the following grammar (EBNF):

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  An important nuance is that two groups can be next to each other, separated
  only by a word boundary (i.e., a space or parentheses). In this case, they
  are implicitly ANDed. In practice, because unparenthesized fragments ANDed
  by spaces are stored as single tokens, we only need to handle the AND case
  when a parentheses group is implicitly ANDed with an adjacent group.

  Order of precedence is implemented by recursing through OR groups before
  recursing through AND groups.

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError: If tokens were input in a format that does not follow
        our search grammar.
  """
  subqueries = []
  try:
    if token_iterator.peek().token_type == OR:
      # Edge case: Ignore empty OR groups at the start of a ParenthesesGroup.
      # E.g., "(OR A)" will be processed as "A".
      next(token_iterator)

    subqueries = _ParseOrGroup(token_iterator)

    while token_iterator.peek().token_type == OR:
      # Consume the OR token without doing anything with it.
      next(token_iterator)

      next_token = token_iterator.peek()
      if next_token.token_type == RIGHT_PAREN:
        # Edge case: Ignore empty OR groups at the end of a ParenthesesGroup.
        # E.g., "(A OR)" will be processed as "A".
        return subqueries

      next_subqueries = _ParseOrGroup(token_iterator)

      # Concatenate the results of OR groups together.
      subqueries = subqueries + next_subqueries

  except StopIteration:
    pass
  # Return when we've reached the end of the string.
  return subqueries


def _ParseOrGroup(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert a single "OrGroup" into subqueries.

  An OrGroup here is based on the following grammar:

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError: If tokens were input in a format that does not follow
        our search grammar.
  """
  subqueries = _ParseAndGroup(token_iterator)

  try:
    # Iterate until there are no more AND groups left to see.
    # A subquery or a left parenthesis marks the possible start of an AndGroup.
    while (token_iterator.peek().token_type == SUBQUERY or
           token_iterator.peek().token_type == LEFT_PAREN):

      # Find subqueries from the next AND group.
      next_subqueries = _ParseAndGroup(token_iterator)

      # Multiply all results across AND groups together.
      subqueries = _MultiplySubqueries(subqueries, next_subqueries)
  except StopIteration:
    pass

  return subqueries


def _ParseAndGroup(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert a single "AndGroup" into subqueries.

  An AndGroup here is based on the following grammar:

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError: If tokens were input in a format that does not follow
        our search grammar.
  """
  try:
    token = next(token_iterator)
    if token.token_type == LEFT_PAREN:
      if token_iterator.peek().token_type == RIGHT_PAREN:
        # Don't recurse into the ParenthesesGroup if there's nothing inside.
        next(token_iterator)
        return []

      # Recurse into the ParenthesesGroup.
      subqueries = _ParseQuery(token_iterator)

      # The next token should be a right parenthesis.
      next(token_iterator)

      return subqueries
    elif token.token_type == SUBQUERY:
      return [token.value]
    else:
      # This should not happen if the other QueryToSubqueries helpers are
      # working properly.
      raise InvalidQueryError('Inputted tokens do not follow grammar.')
  except StopIteration:
    pass
  return []


def _ValidateAndTokenizeQuery(query):
  # type: (str) -> Sequence[mrproto.ast_pb2.QueryToken]
  """Converts the input query into a set of tokens for easier parsing.

  Tokenizing the query string before parsing means we need fewer string
  manipulations while parsing, which simplifies our later code.

  Args:
    query: Query to tokenize.

  Returns:
    List of Token objects for use in query processing.

  Raises:
    InvalidQueryError: If parentheses are unmatched.
  """
  tokens = []  # Function result.
  count = 0  # Used for checking if parentheses are balanced.
  s = ''  # Records the current string fragment. Cleared when a token is added.

  for ch in query:
    if ch == '(':
      count += 1

      # Add the subquery from before we hit this parenthesis.
      tokens.extend(_TokenizeSubqueryOnOr(s))
      s = ''

      tokens.append(ast_pb2.QueryToken(token_type=LEFT_PAREN))
    elif ch == ')':
      count -= 1

      if count < 0:
        # More closing parentheses than opening parentheses.
        raise InvalidQueryError('Search query has unbalanced parentheses.')

      # Add the subquery from before we hit this parenthesis.
      tokens.extend(_TokenizeSubqueryOnOr(s))
      s = ''

      tokens.append(ast_pb2.QueryToken(token_type=RIGHT_PAREN))
    else:
      s += ch

  if count != 0:
    raise InvalidQueryError('Search query has unbalanced parentheses.')

  # Add any trailing tokens.
  tokens.extend(_TokenizeSubqueryOnOr(s))

  return tokens


def _TokenizeSubqueryOnOr(subquery):
  # type: (str) -> Sequence[mrproto.ast_pb2.QueryToken]
  """Helper to split a subquery by OR and convert the result into tokens.

  Args:
    subquery: A string without parentheses to tokenize.

  Returns:
    Tokens for the subquery, with OR tokens separating query strings where
    applicable.
  """
  if len(subquery) == 0:
    return []

  result = []
  fragments = subquery.split(OR_SYMBOL)
  for f in fragments:
    # Interleave the string fragments with OR tokens.
    result.append(ast_pb2.QueryToken(token_type=SUBQUERY, value=f.strip()))
    result.append(ast_pb2.QueryToken(token_type=OR))

  # Remove the trailing OR.
  result.pop()

  # Trim empty strings at the beginning or end. E.g., if subquery is ' OR ',
  # we want the list to be ['OR'], not ['', 'OR', ''].
  if len(result) > 1 and result[0].value == '':
    result.pop(0)
  if len(result) > 1 and result[-1].value == '':
    result.pop()
  return result
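
# For illustration (using SUBQUERY('A') as shorthand for
# ast_pb2.QueryToken(token_type=SUBQUERY, value='A')):
#
#   _TokenizeSubqueryOnOr('A OR B')
#   # -> [SUBQUERY('A'), OR, SUBQUERY('B')]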


def _MultiplySubqueries(a, b):
  # type: (Sequence[str], Sequence[str]) -> Sequence[str]
  """Helper to AND subqueries from two separate lists.

  Args:
    a: First list of subqueries.
    b: Second list of subqueries.

  Returns:
    List with n x m subqueries.
  """
  if not len(a):
    return b
  if not len(b):
    return a
  res = []
  for q1 in a:
    for q2 in b:
      # AND two subqueries together by concatenating them.
      query = (q1.strip() + ' ' + q2.strip()).strip()
      res.append(query)
  return res
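
# For illustration:
#
#   _MultiplySubqueries(['A', 'B'], ['C', 'D'])
#   # -> ['A C', 'A D', 'B C', 'B D']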


class PeekIterator:
  """Simple iterator with peek() functionality.

  Used by QueryToSubqueries to maintain state easily across recursive calls.
  """

  def __init__(self, source):
    # type: (Sequence[Any]) -> None
    self.__source = source
    self.__i = 0

  def peek(self):
    # type: () -> Any
    """Gets the next value in the iterator without side effects.

    Returns:
      Next value in the iterator.

    Raises:
      StopIteration: If you're at the end of the iterator.
    """
    if self.__i >= len(self.__source):
      raise StopIteration
    return self.__source[self.__i]

  def __iter__(self):
    # type: () -> Sequence[Any]
    """Return self to make the iterator iterable."""
    return self

  def __repr__(self):
    # type: () -> str
    """Allow logging the current iterator value for debugging."""
    try:
      return str(self.peek())
    except StopIteration:
      pass
    return 'End of PeekIterator'

  def __next__(self):
    # type: () -> Any
    """Gets the next value in the iterator and increments the pointer.

    Returns:
      Next value in the iterator.

    Raises:
      StopIteration: If you're at the end of the iterator.
    """
    return self.next()

  def next(self):
    # type: () -> Any
    """Gets the next value in the iterator and increments the pointer.

    Kept for backwards compatibility with Python 2.

    Returns:
      Next value in the iterator.

    Raises:
      StopIteration: If you're at the end of the iterator.
    """
    if self.__i >= len(self.__source):
      raise StopIteration
    value = self.__source[self.__i]
    self.__i += 1
    return value

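# For illustration:
#
#   it = PeekIterator([1, 2])
#   it.peek()  # -> 1, without advancing
#   next(it)   # -> 1
#   next(it)   # -> 2
#   next(it)   # raises StopIteration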


class Error(Exception):
  """Base exception class for this package."""
  pass


class InvalidQueryError(Error):
  """Error raised when an invalid query is requested."""
  pass