Blame - search/query2ast.py - monorail-avm99963

blob: 235f9b3923c52f495e1343e564da7cd84af41814 [file] [log] [blame]

Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame^]	1	# Copyright 2016 The Chromium Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style
				3	# license that can be found in the LICENSE file or at
				4	# https://developers.google.com/open-source/licenses/bsd
				5
				6	"""A set of functions that integrate the GAE search index with Monorail."""
				7	from __future__ import print_function
				8	from __future__ import division
				9	from __future__ import absolute_import
				10
				11	import collections
				12	import datetime
				13	import logging
				14	import re
				15	import time
				16
				17	from google.appengine.api import search
				18
				19	from proto import ast_pb2
				20	from proto import tracker_pb2
				21
				22
				23	# TODO(jrobbins): Consider re-implementing this whole file by using a
				24	# BNF syntax specification and a parser generator or library.
				25
				26	# encodings
				27	UTF8 = 'utf-8'
				28
				29	# Operator used for OR statements.
				30	OR_SYMBOL = ' OR '
				31
				32	# Token types used for parentheses parsing.
				33	SUBQUERY = ast_pb2.TokenType.SUBQUERY
				34	LEFT_PAREN = ast_pb2.TokenType.LEFT_PAREN
				35	RIGHT_PAREN = ast_pb2.TokenType.RIGHT_PAREN
				36	OR = ast_pb2.TokenType.OR
				37
				38	# Field types and operators
				39	BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
				40	DATE = tracker_pb2.FieldTypes.DATE_TYPE
				41	NUM = tracker_pb2.FieldTypes.INT_TYPE
				42	TXT = tracker_pb2.FieldTypes.STR_TYPE
				43	APPROVAL = tracker_pb2.FieldTypes.APPROVAL_TYPE
				44
				45	EQ = ast_pb2.QueryOp.EQ
				46	NE = ast_pb2.QueryOp.NE
				47	LT = ast_pb2.QueryOp.LT
				48	GT = ast_pb2.QueryOp.GT
				49	LE = ast_pb2.QueryOp.LE
				50	GE = ast_pb2.QueryOp.GE
				51	TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
				52	NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
				53	IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
				54	IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
				55	KEY_HAS = ast_pb2.QueryOp.KEY_HAS
				56
				57	# Mapping from user query comparison operators to our internal representation.
				58	OPS = {
				59	':': TEXT_HAS,
				60	'=': EQ,
				61	'!=': NE,
				62	'<': LT,
				63	'>': GT,
				64	'<=': LE,
				65	'>=': GE,
				66	}
				67
				68	# When the query has a leading minus, switch the operator for its opposite.
				69	NEGATED_OPS = {
				70	EQ: NE,
				71	NE: EQ,
				72	LT: GE,
				73	GT: LE,
				74	LE: GT,
				75	GE: LT,
				76	TEXT_HAS: NOT_TEXT_HAS,
				77	# IS_DEFINED is handled separately.
				78	}
				79
				80	# This is a partial regular expression that matches all of our comparison
				81	# operators, such as =, 1=, >, and <. Longer ones listed first so that the
				82	# shorter ones don't cause premature matches.
				83	OPS_PATTERN = '\|'.join(
				84	map(re.escape, sorted(list(OPS.keys()), key=lambda op: -len(op))))
				85
				86	# This RE extracts search terms from a subquery string.
				87	TERM_RE = re.compile(
				88	r'(-?"[^"]+")\|' # E.g., ["division by zero"]
				89	r'(\S+(%s)[^ "]+)\|' # E.g., [stars>10]
				90	r'(\w+(%s)"[^"]+")\|' # E.g., [summary:"memory leak"]
				91	r'(-?[._\\w][-._\\w]+)' # E.g., [-workaround]
				92	% (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)
				93
				94	# This RE is used to further decompose a comparison term into prefix, op, and
				95	# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
				96	# can include a leading "-" to negate the comparison.
				97	OP_RE = re.compile(
				98	r'^(?P<prefix>[-_.\w]*?)'
				99	r'(?P<op>%s)'
				100	r'(?P<value>([-@\w][-\,./:<=>@\w]\|"[^"]*"))$' %
				101	OPS_PATTERN,
				102	flags=re.UNICODE)
				103
				104
				105	# Predefined issue fields passed to the query parser.
				106	_ISSUE_FIELDS_LIST = [
				107	(ast_pb2.ANY_FIELD, TXT),
				108	('attachment', TXT), # attachment file names
				109	('attachments', NUM), # number of attachment files
				110	('blocked', BOOL),
				111	('blockedon', TXT),
				112	('blockedon_id', NUM),
				113	('blocking', TXT),
				114	('blocking_id', NUM),
				115	('cc', TXT),
				116	('cc_id', NUM),
				117	('comment', TXT),
				118	('commentby', TXT),
				119	('commentby_id', NUM),
				120	('component', TXT),
				121	('component_id', NUM),
				122	('description', TXT),
				123	('gate', TXT),
				124	('hotlist', TXT),
				125	('hotlist_id', NUM),
				126	('id', NUM),
				127	('is_spam', BOOL),
				128	('label', TXT),
				129	('label_id', NUM),
				130	('mergedinto', NUM),
				131	('mergedinto_id', NUM),
				132	('open', BOOL),
				133	('owner', TXT),
				134	('ownerbouncing', BOOL),
				135	('owner_id', NUM),
				136	('project', TXT),
				137	('reporter', TXT),
				138	('reporter_id', NUM),
				139	('spam', BOOL),
				140	('stars', NUM),
				141	('starredby', TXT),
				142	('starredby_id', NUM),
				143	('status', TXT),
				144	('status_id', NUM),
				145	('summary', TXT),
				146	]
				147
				148	_DATE_FIELDS = (
				149	'closed',
				150	'modified',
				151	'opened',
				152	'ownermodified',
				153	'ownerlastvisit',
				154	'statusmodified',
				155	'componentmodified',
				156	)
				157
				158	# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
				159	_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)
				160
				161	_DATE_FIELD_SUFFIX_TO_OP = {
				162	'-after': '>',
				163	'-before': '<',
				164	}
				165
				166	SET_BY_SUFFIX = '-by'
				167	SET_ON_SUFFIX = '-on'
				168	APPROVER_SUFFIX = '-approver'
				169	STATUS_SUFFIX = '-status'
				170
				171	_APPROVAL_SUFFIXES = (
				172	SET_BY_SUFFIX,
				173	SET_ON_SUFFIX,
				174	APPROVER_SUFFIX,
				175	STATUS_SUFFIX,
				176	)
				177
				178	BUILTIN_ISSUE_FIELDS = {
				179	f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
				180	for f_name, f_type in _ISSUE_FIELDS_LIST}
				181
				182
				183	# Do not treat strings that start with the below as key:value search terms.
				184	# See bugs.chromium.org/p/monorail/issues/detail?id=419 for more detail.
				185	NON_OP_PREFIXES = (
				186	'http:',
				187	'https:',
				188	)
				189
				190
				191	def ParseUserQuery(
				192	query, scope, builtin_fields, harmonized_config, warnings=None,
				193	now=None):
				194	# type: (str, str, Mapping[str, proto.tracker_pb2.FieldDef],
				195	# proto.tracker_pb2.ProjectIssueConfig, Sequence[str], int) ->
				196	# proto.ast_pb2.QueryAST
				197	"""Parse a user query and return a set of structure terms.
				198
				199	Args:
				200	query: string with user's query. E.g., 'Priority=High'.
				201	scope: string search terms that define the scope in which the
				202	query should be executed. They are expressed in the same
				203	user query language. E.g., adding the canned query.
				204	builtin_fields: dict {field_name: FieldDef(field_name, type)}
				205	mapping field names to FieldDef objects for built-in fields.
				206	harmonized_config: config for all the projects being searched.
				207	@@@ custom field name is not unique in cross project search.
				208	- custom_fields = {field_name: [fd, ...]}
				209	- query build needs to OR each possible interpretation
				210	- could be label in one project and field in another project.
				211	@@@ what about searching across all projects?
				212	warnings: optional list to accumulate warning messages.
				213	now: optional timestamp for tests, otherwise time.time() is used.
				214
				215	Returns:
				216	A QueryAST with conjunctions (usually just one), where each has a list of
				217	Condition PBs with op, fields, str_values and int_values. E.g., the query
				218	[priority=high leak OR stars>100] over open issues would return
				219	QueryAST(
				220	Conjunction(Condition(EQ, [open_fd], [], [1]),
				221	Condition(EQ, [label_fd], ['priority-high'], []),
				222	Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
				223	Conjunction(Condition(EQ, [open_fd], [], [1]),
				224	Condition(GT, [stars_fd], [], [100])))
				225
				226	Raises:
				227	InvalidQueryError: If a problem was detected in the user's query.
				228	"""
				229	if warnings is None:
				230	warnings = []
				231
				232	# Convert the overall query into one or more OR'd subqueries.
				233	subqueries = QueryToSubqueries(query)
				234
				235	# Make a dictionary of all fields: built-in + custom in each project.
				236	combined_fields = collections.defaultdict(
				237	list, {field_name: [field_def]
				238	for field_name, field_def in builtin_fields.items()})
				239	for fd in harmonized_config.field_defs:
				240	if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
				241	# Only do non-enum fields because enums are stored as labels
				242	combined_fields[fd.field_name.lower()].append(fd)
				243	if fd.field_type == APPROVAL:
				244	for approval_suffix in _APPROVAL_SUFFIXES:
				245	combined_fields[fd.field_name.lower() + approval_suffix].append(fd)
				246
				247	conjunctions = [
				248	_ParseConjunction(sq, scope, combined_fields, warnings, now=now)
				249	for sq in subqueries]
				250	return ast_pb2.QueryAST(conjunctions=conjunctions)
				251
				252
				253	def _ParseConjunction(subquery, scope, fields, warnings, now=None):
				254	# type: (str, str, Mapping[str, proto.tracker_pb2.FieldDef], Sequence[str],
				255	# int) -> proto.ast_pb2.Condition
				256	"""Parse part of a user query into a Conjunction PB."""
				257	scoped_query = ('%s %s' % (scope, subquery)).lower()
				258	cond_strs = _ExtractConds(scoped_query, warnings)
				259	conds = [_ParseCond(cond_str, fields, warnings, now=now)
				260	for cond_str in cond_strs]
				261	conds = [cond for cond in conds if cond]
				262	return ast_pb2.Conjunction(conds=conds)
				263
				264
				265	def _ParseCond(cond_str, fields, warnings, now=None):
				266	# type: (str, Mapping[str, proto.tracker_pb2.FieldDef], Sequence[str],
				267	# int) -> proto.ast_pb2.Condition
				268	"""Parse one user query condition string into a Condition PB."""
				269	op_match = OP_RE.match(cond_str)
				270	# Do not treat as key:value search terms if any of the special prefixes match.
				271	special_prefixes_match = any(
				272	cond_str.startswith(p) for p in NON_OP_PREFIXES)
				273	if op_match and not special_prefixes_match:
				274	prefix = op_match.group('prefix')
				275	op = op_match.group('op')
				276	val = op_match.group('value')
				277	# Special case handling to continue to support old date query terms from
				278	# code.google.com. See monorail:151 for more details.
				279	if prefix.startswith(_DATE_FIELDS):
				280	for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
				281	if prefix.endswith(date_suffix):
				282	prefix = prefix.rstrip(date_suffix)
				283	op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
				284	return _ParseStructuredTerm(prefix, op, val, fields, now=now)
				285
				286	# Treat the cond as a full-text search term, which might be negated.
				287	if cond_str.startswith('-'):
				288	op = NOT_TEXT_HAS
				289	cond_str = cond_str[1:]
				290	else:
				291	op = TEXT_HAS
				292
				293	# Construct a full-text Query object as a dry-run to validate that
				294	# the syntax is acceptable.
				295	try:
				296	_fts_query = search.Query(cond_str)
				297	except search.QueryError:
				298	warnings.append('Ignoring full-text term: %s' % cond_str)
				299	return None
				300
				301	# Flag a potential user misunderstanding.
				302	if cond_str.lower() in ('and', 'or', 'not'):
				303	warnings.append(
				304	'The only supported boolean operator is OR (all capitals).')
				305
				306	return ast_pb2.MakeCond(
				307	op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
				308
				309
				310	def _ParseStructuredTerm(prefix, op_str, value, fields, now=None):
				311	# type: (str, str, str, Mapping[str, proto.tracker_pb2.FieldDef]) ->
				312	# proto.ast_pb2.Condition
				313	"""Parse one user structured query term into an internal representation.
				314
				315	Args:
				316	prefix: The query operator, usually a field name. E.g., summary. It can
				317	also be special operators like "is" to test boolean fields.
				318	op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
				319	value: the value to compare against, e.g., term to find in that field.
				320	fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.
				321	now: optional timestamp for tests, otherwise time.time() is used.
				322
				323	Returns:
				324	A Condition PB.
				325	"""
				326	unquoted_value = value.strip('"')
				327	# Quick-OR is a convenient way to write one condition that matches any one of
				328	# multiple values, like set membership. E.g., [Priority=High,Critical].
				329	# Ignore empty values caused by duplicated or trailing commas. E.g.,
				330	# [Priority=High,,Critical,] is equivalent to [Priority=High,Critical].
				331	quick_or_vals = [v.strip() for v in unquoted_value.split(',') if v.strip()]
				332
				333	op = OPS[op_str]
				334	negate = False
				335	if prefix.startswith('-'):
				336	negate = True
				337	op = NEGATED_OPS.get(op, op)
				338	prefix = prefix[1:]
				339
				340	if prefix == 'is' and unquoted_value in [
				341	'open', 'blocked', 'spam', 'ownerbouncing']:
				342	return ast_pb2.MakeCond(
				343	NE if negate else EQ, fields[unquoted_value], [], [])
				344
				345	# Search entries with or without any value in the specified field.
				346	if prefix == 'has':
				347	op = IS_NOT_DEFINED if negate else IS_DEFINED
				348	if '.' in unquoted_value: # Possible search for phase field with any value.
				349	phase_name, possible_field = unquoted_value.split('.', 1)
				350	if possible_field in fields:
				351	return ast_pb2.MakeCond(
				352	op, fields[possible_field], [], [], phase_name=phase_name)
				353	elif unquoted_value in fields: # Look for that field with any value.
				354	return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
				355	else: # Look for any label with that prefix.
				356	return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])
				357
				358	# Search entries with certain gates.
				359	if prefix == 'gate':
				360	return ast_pb2.MakeCond(op, fields['gate'], quick_or_vals, [])
				361
				362	# Determine hotlist query type.
				363	# If prefix is not 'hotlist', quick_or_vals is empty, or qov
				364	# does not contain ':', is_fields will remain True
				365	is_fields = True
				366	if prefix == 'hotlist':
				367	try:
				368	if ':' not in quick_or_vals[0]:
				369	is_fields = False
				370	except IndexError:
				371	is_fields = False
				372
				373	phase_name = None
				374	if '.' in prefix and is_fields:
				375	split_prefix = prefix.split('.', 1)
				376	if split_prefix[1] in fields:
				377	phase_name, prefix = split_prefix
				378
				379	# search built-in and custom fields. E.g., summary.
				380	if prefix in fields and is_fields:
				381	# Note: if first matching field is date-type, we assume they all are.
				382	# TODO(jrobbins): better handling for rare case where multiple projects
				383	# define the same custom field name, and one is a date and another is not.
				384	first_field = fields[prefix][0]
				385	if first_field.field_type == DATE:
				386	date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
				387	return ast_pb2.MakeCond(op, fields[prefix], [], date_values)
				388	elif first_field.field_type == APPROVAL and prefix.endswith(SET_ON_SUFFIX):
				389	date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
				390	return ast_pb2.MakeCond(
				391	op,
				392	fields[prefix], [],
				393	date_values,
				394	key_suffix=SET_ON_SUFFIX,
				395	phase_name=phase_name)
				396	else:
				397	quick_or_ints = []
				398	for qov in quick_or_vals:
				399	try:
				400	quick_or_ints.append(int(qov))
				401	except ValueError:
				402	pass
				403	if first_field.field_type == APPROVAL:
				404	for approval_suffix in _APPROVAL_SUFFIXES:
				405	if prefix.endswith(approval_suffix):
				406	return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
				407	quick_or_ints, key_suffix=approval_suffix,
				408	phase_name=phase_name)
				409	return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
				410	quick_or_ints, phase_name=phase_name)
				411
				412	# Since it is not a field, treat it as labels, E.g., Priority.
				413	quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
				414	# Convert substring match to key-value match if user typed 'foo:bar'.
				415	if op == TEXT_HAS:
				416	op = KEY_HAS
				417	return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
				418
				419
				420	def _ExtractConds(query, warnings):
				421	# type: (str, Sequence[str]) -> Sequence[str]
				422	"""Parse a query string into a list of individual condition strings.
				423
				424	Args:
				425	query: UTF-8 encoded search query string.
				426	warnings: list to accumulate warning messages.
				427
				428	Returns:
				429	A list of query condition strings.
				430	"""
				431	# Convert to unicode then search for distinct terms.
				432	term_matches = TERM_RE.findall(query)
				433
				434	terms = []
				435	for (phrase, word_label, _op1, phrase_label, _op2,
				436	word) in term_matches:
				437	# Case 1: Quoted phrases, e.g., ["hot dog"].
				438	if phrase_label or phrase:
				439	terms.append(phrase_label or phrase)
				440
				441	# Case 2: Comparisons
				442	elif word_label:
				443	special_prefixes_match = any(
				444	word_label.startswith(p) for p in NON_OP_PREFIXES)
				445	match = OP_RE.match(word_label)
				446	if match and not special_prefixes_match:
				447	label = match.group('prefix')
				448	op = match.group('op')
				449	word = match.group('value')
				450	terms.append('%s%s"%s"' % (label, op, word))
				451	else:
				452	# It looked like a key:value cond, but not exactly, so treat it
				453	# as fulltext search. It is probably a tiny bit of source code.
				454	terms.append('"%s"' % word_label)
				455
				456	# Case 3: Simple words.
				457	elif word:
				458	terms.append(word)
				459
				460	else: # pragma: no coverage
				461	warnings.append('Unparsable search term')
				462
				463	return terms
				464
				465
				466	def _ParseDateValue(val, now=None):
				467	# type: (str, int) -> int
				468	"""Convert the user-entered date into timestamp."""
				469	# Support timestamp value such as opened>1437671476
				470	try:
				471	return int(val)
				472	except ValueError:
				473	pass
				474
				475	# TODO(jrobbins): future: take timezones into account.
				476	# TODO(jrobbins): for now, explain to users that "today" is
				477	# actually now: the current time, not 12:01am in their timezone.
				478	# In fact, it is not very useful because everything in the system
				479	# happened before the current time.
				480	if val == 'today':
				481	return _CalculatePastDate(0, now=now)
				482	elif val.startswith('today-'):
				483	try:
				484	days_ago = int(val.split('-')[1])
				485	except ValueError:
				486	raise InvalidQueryError('Could not parse date: ' + val)
				487	return _CalculatePastDate(days_ago, now=now)
				488
				489	try:
				490	if '/' in val:
				491	year, month, day = [int(x) for x in val.split('/')]
				492	elif '-' in val:
				493	year, month, day = [int(x) for x in val.split('-')]
				494	else:
				495	raise InvalidQueryError('Could not parse date: ' + val)
				496	except ValueError:
				497	raise InvalidQueryError('Could not parse date: ' + val)
				498
				499	try:
				500	return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
				501	except ValueError:
				502	raise InvalidQueryError('Could not parse date: ' + val)
				503
				504
				505	def _CalculatePastDate(days_ago, now=None):
				506	# type: (int, int) -> int
				507	"""Calculates the timestamp N days ago from now."""
				508	if now is None:
				509	now = int(time.time())
				510	ts = now - days_ago * 24 * 60 * 60
				511	return ts
				512
				513
				514	def QueryToSubqueries(query):
				515	# type (str) -> Sequence[str]
				516	"""Splits a query into smaller queries based on Monorail's search syntax.
				517
				518	This function handles parsing parentheses and OR statements in Monorail's
				519	search syntax. By doing this parsing for OR statements and parentheses up
				520	front in FrontendSearchPipeline, we are able to convert complex queries
				521	with lots of ORs into smaller, more easily cacheable query terms.
				522
				523	These outputted subqueries should collectively return the same query results
				524	as the initial input query without containing any ORs or parentheses,
				525	allowing later search layers to parse queries without worrying about ORs
				526	or parentheses.
				527
				528	Some examples of possible queries and their expected output:
				529
				530	- '(A OR B) (C OR D) OR (E OR F)' -> ['A C', 'A D', 'B C', 'B D', 'E', 'F']
				531	- '(A) OR (B)' -> ['A', 'B']
				532	- '(A ((C) OR (D OR (E OR F))))' -> ['A C', 'A D', 'A E', 'A F']
				533
				534	Where A, B, C, D, etc could be any list of conjunctions. ie: "owner:me",
				535	"Pri=1", "hello world Hotlist=test", "label!=a11y", etc
				536
				537	Note: Monorail implicitly ANDs any query terms separated by a space. For
				538	the most part, AND functionality is handled at a later layer in search
				539	processing. However, this case becomes important here when considering the
				540	fact that a prentheses group can either be ANDed or ORed with terms that
				541	surround it.
				542
				543	The _MultiplySubqueries helper is used to AND the results of different
				544	groups together whereas concatenating lists is used to OR subqueries
				545	together.
				546
				547	Args:
				548	query: The initial query that was sent to the search.
				549
				550	Returns:
				551	List of query fragments to be independently processed as search terms.
				552
				553	Raises:
				554	InvalidQueryError if parentheses are unmatched.
				555	"""
				556	tokens = _ValidateAndTokenizeQuery(query)
				557
				558	# Using an iterator allows us to keep our current loop position across
				559	# helpers. This makes recursion a lot easier.
				560	token_iterator = PeekIterator(tokens)
				561
				562	subqueries = _ParseQuery(token_iterator)
				563
				564	if not len(subqueries):
				565	# Several cases, such as an empty query or a query with only parentheses
				566	# will result in an empty set of subqueries. In these cases, we still want
				567	# to give the search pipeline a single empty query to process.
				568	return ['']
				569
				570	return subqueries
				571
				572
				573	def _ParseQuery(token_iterator):
				574	# type (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
				575	"""Recursive helper to convert query tokens into a list of subqueries.
				576
				577	Parses a Query based on the following grammar (EBNF):
				578
				579	Query := OrGroup { [OrOperator] OrGroup }
				580	OrGroup := AndGroup { AndGroup }
				581	AndGroup := Subquery \| ParenthesesGroup
				582	ParenthesesGroup := "(" Query ")"
				583	Subquery := /.+/
				584	OrOperator := " OR "
				585
				586	An important nuance is that two groups can be next to each other, separated
				587	only by a word boundary (ie: space or parentheses). In this case, they are
				588	implicitly ANDed. In practice, because unparenthesized fragments ANDed by
				589	spaces are stored as single tokens, we only need to handle the AND case when
				590	a parentheses group is implicitly ANDed with an adjacent group.
				591
				592	Order of precedence is implemented by recursing through OR groups before
				593	recursing through AND groups.
				594
				595	Args:
				596	token_iterator: Iterator over a list of query tokens.
				597
				598	Returns:
				599	List of query fragments to be processed as search terms.
				600
				601	Raises:
				602	InvalidQueryError if tokens were inputted in a format that does not follow
				603	our search grammar.
				604	"""
				605	subqueries = []
				606	try:
				607	if token_iterator.peek().token_type == OR:
				608	# Edge case: Ignore empty OR groups at the starte of a ParenthesesGroup.
				609	# ie: "(OR A)" will be processed as "A"
				610	next(token_iterator)
				611
				612	subqueries = _ParseOrGroup(token_iterator)
				613
				614	while token_iterator.peek().token_type == OR:
				615	# Consume the OR tokens without doing anything with it.
				616	next(token_iterator)
				617
				618	next_token = token_iterator.peek()
				619	if next_token.token_type == RIGHT_PAREN:
				620	# Edge case: Ignore empty OR groups at the end of a ParenthesesGroup.
				621	# ie: "(A OR)" will be processed as "A"
				622	return subqueries
				623
				624	next_subqueries = _ParseOrGroup(token_iterator)
				625
				626	# Concatenate results of OR groups together.
				627	subqueries = subqueries + next_subqueries
				628
				629	except StopIteration:
				630	pass
				631	# Return when we've reached the end of the string.
				632	return subqueries
				633
				634
				635	def _ParseOrGroup(token_iterator):
				636	# type (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
				637	"""Recursive helper to convert a single "OrGroup" into subqueries.
				638
				639	An OrGroup here is based on the following grammar:
				640
				641	Query := OrGroup { [OrOperator] OrGroup }
				642	OrGroup := AndGroup { AndGroup }
				643	AndGroup := Subquery \| ParenthesesGroup
				644	ParenthesesGroup := "(" Query ")"
				645	Subquery := /.+/
				646	OrOperator := " OR "
				647
				648	Args:
				649	token_iterator: Iterator over a list of query tokens.
				650
				651	Returns:
				652	List of query fragments to be processed as search terms.
				653
				654	Raises:
				655	InvalidQueryError if tokens were inputted in a format that does not follow
				656	our search grammar.
				657	"""
				658	subqueries = _ParseAndGroup(token_iterator)
				659
				660	try:
				661	# Iterate until there are no more AND groups left to see.
				662	# Subquery or left parentheses are the possible starts of an AndGroup.
				663	while (token_iterator.peek().token_type == SUBQUERY or
				664	token_iterator.peek().token_type == LEFT_PAREN):
				665
				666	# Find subqueries from the next AND group.
				667	next_subqueries = _ParseAndGroup(token_iterator)
				668
				669	# Multiply all results across AND groups together.
				670	subqueries = _MultiplySubqueries(subqueries, next_subqueries)
				671	except StopIteration:
				672	pass
				673
				674	return subqueries
				675
				676
				677	def _ParseAndGroup(token_iterator):
				678	# type (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
				679	"""Recursive helper to convert a single "AndGroup" into subqueries.
				680
				681	An OrGroup here is based on the following grammar:
				682
				683	Query := OrGroup { [OrOperator] OrGroup }
				684	OrGroup := AndGroup { AndGroup }
				685	AndGroup := Subquery \| ParenthesesGroup
				686	ParenthesesGroup := "(" Query ")"
				687	Subquery := /.+/
				688	OrOperator := " OR "
				689
				690	Args:
				691	token_iterator: Iterator over a list of query tokens.
				692
				693	Returns:
				694	List of query fragments to be processed as search terms.
				695
				696	Raises:
				697	InvalidQueryError if tokens were inputted in a format that does not follow
				698	our search grammar.
				699	"""
				700	try:
				701	token = next(token_iterator)
				702	if token.token_type == LEFT_PAREN:
				703	if token_iterator.peek().token_type == RIGHT_PAREN:
				704	# Don't recurse into the ParenthesesGroup if there's nothing inside.
				705	next(token_iterator)
				706	return []
				707
				708	# Recurse into the ParenthesesGroup.
				709	subqueries = _ParseQuery(token_iterator)
				710
				711	# Next token should be a right parenthesis.
				712	next(token_iterator)
				713
				714	return subqueries
				715	elif token.token_type == SUBQUERY:
				716	return [token.value]
				717	else:
				718	# This should not happen if other QueryToSubqueries helpers are working
				719	# properly.
				720	raise InvalidQueryError('Inputted tokens do not follow grammar.')
				721	except StopIteration:
				722	pass
				723	return []
				724
				725
				726	def _ValidateAndTokenizeQuery(query):
				727	# type: (str) -> Sequence[proto.ast_pb2.QueryToken]
				728	"""Converts the input query into a set of tokens for easier parsing.
				729
				730	Tokenizing the query string before parsing allows us to not have to as many
				731	string manipulations while parsing, which simplifies our later code.
				732
				733	Args:
				734	query: Query to tokenize.
				735
				736	Returns:
				737	List of Token objects for use in query processing.
				738
				739	Raises:
				740	InvalidQueryError if parentheses are unmatched.
				741	"""
				742	tokens = [] # Function result
				743	count = 0 # Used for checking if parentheses are balanced
				744	s = '' # Records current string fragment. Cleared when a token is added.
				745
				746	for ch in query:
				747	if ch == '(':
				748	count += 1
				749
				750	# Add subquery from before we hit this parenthesis.
				751	tokens.extend(_TokenizeSubqueryOnOr(s))
				752	s = ''
				753
				754	tokens.append(ast_pb2.QueryToken(token_type=LEFT_PAREN))
				755	elif ch == ')':
				756	count -= 1
				757
				758	if count < 0:
				759	# More closing parentheses then open parentheses.
				760	raise InvalidQueryError('Search query has unbalanced parentheses.')
				761
				762	# Add subquery from before we hit this parenthesis.
				763	tokens.extend(_TokenizeSubqueryOnOr(s))
				764	s = ''
				765
				766	tokens.append(ast_pb2.QueryToken(token_type=RIGHT_PAREN))
				767	else:
				768	s += ch
				769
				770	if count != 0:
				771	raise InvalidQueryError('Search query has unbalanced parentheses.')
				772
				773	# Add any trailing tokens.
				774	tokens.extend(_TokenizeSubqueryOnOr(s))
				775
				776	return tokens
				777
				778
				779	def _TokenizeSubqueryOnOr(subquery):
				780	# type: (str) -> Sequence[proto.ast_pb2.QueryToken]
				781	"""Helper to split a subquery by OR and convert the result into tokens.
				782
				783	Args:
				784	subquery: A string without parentheses to tokenize.
				785
				786	Returns:
				787	Tokens for the subquery with OR tokens separating query strings if
				788	applicable.
				789	"""
				790	if len(subquery) == 0:
				791	return []
				792
				793	result = []
				794	fragments = subquery.split(OR_SYMBOL)
				795	for f in fragments:
				796	# Interleave the string fragments with OR tokens.
				797	result.append(ast_pb2.QueryToken(token_type=SUBQUERY, value=f.strip()))
				798	result.append(ast_pb2.QueryToken(token_type=OR))
				799
				800	# Remove trailing OR.
				801	result.pop()
				802
				803	# Trim empty strings at the beginning or end. ie: if subquery is ' OR ',
				804	# we want the list to be ['OR'], not ['', 'OR', ''].
				805	if len(result) > 1 and result[0].value == '':
				806	result.pop(0)
				807	if len(result) > 1 and result[-1].value == '':
				808	result.pop()
				809	return result
				810
				811
				812	def _MultiplySubqueries(a, b):
				813	# type: (Sequence[str], Sequence[str]) -> Sequence[str]
				814	"""Helper to AND subqueries from two separate lists.
				815
				816	Args:
				817	a: First list of subqueries.
				818	b: Second list of subqueries.
				819
				820	Returns:
				821	List with n x m subqueries.
				822	"""
				823	if not len(a):
				824	return b
				825	if not len(b):
				826	return a
				827	res = []
				828	for q1 in a:
				829	for q2 in b:
				830	# AND two subqueries together by concatenating them.
				831	query = (q1.strip() + ' ' + q2.strip()).strip()
				832	res.append(query)
				833	return res
				834
				835
				836	class PeekIterator:
				837	"""Simple iterator with peek() functionality.
				838
				839	Used by QueryToSubqueries to maintain state easily across recursive calls.
				840	"""
				841
				842	def __init__(self, source):
				843	# type: (Sequence[Any])
				844	self.__source = source
				845	self.__i = 0
				846
				847	def peek(self):
				848	# type: () -> Any
				849	"""Gets the next value in the iterator without side effects.
				850
				851	Returns:
				852	Next value in iterator.
				853
				854	Raises:
				855	StopIteration if you're at the end of the iterator.
				856	"""
				857	if self.__i >= len(self.__source):
				858	raise StopIteration
				859	return self.__source[self.__i]
				860
				861	def __iter__(self):
				862	# type: () -> Sequence[Any]
				863	"""Return self to make iterator iterable."""
				864	return self
				865
				866	def __repr__(self):
				867	# type: () -> str
				868	"""Allow logging current iterator value for debugging."""
				869	try:
				870	return str(self.peek())
				871	except StopIteration:
				872	pass
				873	return 'End of PeekIterator'
				874
				875	def next(self):
				876	# type: () -> Any
				877	"""Gets the next value in the iterator and increments pointer.
				878
				879	Returns:
				880	Next value in iterator.
				881
				882	Raises:
				883	StopIteration if you're at the end of the iterator.
				884	"""
				885	if self.__i >= len(self.__source):
				886	raise StopIteration
				887	value = self.__source[self.__i]
				888	self.__i += 1
				889	return value
				890
				891
				892	class Error(Exception):
				893	"""Base exception class for this package."""
				894	pass
				895
				896
				897	class InvalidQueryError(Error):
				898	"""Error raised when an invalid query is requested."""
				899	pass