# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

5"""A set of functions that integrate the GAE search index with Monorail."""
6from __future__ import print_function
7from __future__ import division
8from __future__ import absolute_import
9
10import collections
11import datetime
12import logging
13import re
14import time
15
16from google.appengine.api import search
17
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +010018from mrproto import ast_pb2
19from mrproto import tracker_pb2
Copybara854996b2021-09-07 19:36:02 +000020

# TODO(jrobbins): Consider re-implementing this whole file by using a
# BNF syntax specification and a parser generator or library.

# encodings
UTF8 = 'utf-8'

# Operator used for OR statements.
OR_SYMBOL = ' OR '

# Token types used for parentheses parsing.
SUBQUERY = ast_pb2.TokenType.SUBQUERY
LEFT_PAREN = ast_pb2.TokenType.LEFT_PAREN
RIGHT_PAREN = ast_pb2.TokenType.RIGHT_PAREN
OR = ast_pb2.TokenType.OR

# Field types and operators
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE
APPROVAL = tracker_pb2.FieldTypes.APPROVAL_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
}

# When the query has a leading minus, switch the operator for its opposite.
NEGATED_OPS = {
    EQ: NE,
    NE: EQ,
    LT: GE,
    GT: LE,
    LE: GT,
    GE: LT,
    TEXT_HAS: NOT_TEXT_HAS,
    # IS_DEFINED is handled separately.
    }

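# Illustrative note (hand-traced, not executed at import): a leading minus on
# a comparison term flips its operator via NEGATED_OPS. E.g., in [-stars>10]
# the parser strips the '-' and negates GT to LE, matching stars <= 10.
#   NEGATED_OPS.get(GT, GT)  # -> LE (see _ParseStructuredTerm below)
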
# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones are listed first so that
# the shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(list(OPS.keys()), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|'  # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|'  # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|'  # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)'  # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)

# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_.\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-@\w][-\*,./:<=>@\w]*|"[^"]*"))$' %
    OPS_PATTERN,
    flags=re.UNICODE)

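# A hand-traced sketch of how OP_RE decomposes comparison terms (illustrative
# only; not executed at import time):
#   m = OP_RE.match('stars>10')
#   m.group('prefix'), m.group('op'), m.group('value')  # 'stars', '>', '10'
#   m = OP_RE.match('summary:"memory leak"')
#   m.group('value')  # '"memory leak"' (quotes are stripped later)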

# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT),  # attachment file names
    ('attachments', NUM),  # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('gate', TXT),
    ('hotlist', TXT),
    ('hotlist_id', NUM),
    ('id', NUM),
    ('is_spam', BOOL),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('mergedinto_id', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('ownerbouncing', BOOL),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
    ]

_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
    'ownermodified',
    'ownerlastvisit',
    'statusmodified',
    'componentmodified',
    )

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
}

SET_BY_SUFFIX = '-by'
SET_ON_SUFFIX = '-on'
APPROVER_SUFFIX = '-approver'
STATUS_SUFFIX = '-status'

_APPROVAL_SUFFIXES = (
    SET_BY_SUFFIX,
    SET_ON_SUFFIX,
    APPROVER_SUFFIX,
    STATUS_SUFFIX,
)

BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}


# Do not treat strings that start with the below as key:value search terms.
# See bugs.chromium.org/p/monorail/issues/detail?id=419 for more detail.
NON_OP_PREFIXES = (
    'http:',
    'https:',
)


def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None,
    now=None):
  # type: (str, str, Mapping[str, mrproto.tracker_pb2.FieldDef],
  #     mrproto.tracker_pb2.ProjectIssueConfig, Sequence[str], int) ->
  #     mrproto.ast_pb2.QueryAST
  """Parse a user query and return a set of structured terms.

  Args:
    query: string with user's query. E.g., 'Priority=High'.
    scope: string search terms that define the scope in which the
        query should be executed. They are expressed in the same
        user query language. E.g., adding the canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
        mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
        @@@ custom field name is not unique in cross project search.
         - custom_fields = {field_name: [fd, ...]}
         - query build needs to OR each possible interpretation
         - could be label in one project and field in another project.
        @@@ what about searching across all projects?
    warnings: optional list to accumulate warning messages.
    now: optional timestamp for tests, otherwise time.time() is used.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list of
    Condition PBs with op, fields, str_values and int_values. E.g., the query
    [priority=high leak OR stars>100] over open issues would return
    QueryAST(
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(EQ, [label_fd], ['priority-high'], []),
                  Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = QueryToSubqueries(query)

  # Make a dictionary of all fields: built-in + custom in each project.
  combined_fields = collections.defaultdict(
      list, {field_name: [field_def]
             for field_name, field_def in builtin_fields.items()})
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      # Only do non-enum fields because enums are stored as labels.
      combined_fields[fd.field_name.lower()].append(fd)
      if fd.field_type == APPROVAL:
        for approval_suffix in _APPROVAL_SUFFIXES:
          combined_fields[fd.field_name.lower() + approval_suffix].append(fd)

  conjunctions = [
      _ParseConjunction(sq, scope, combined_fields, warnings, now=now)
      for sq in subqueries]
  return ast_pb2.QueryAST(conjunctions=conjunctions)

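# Illustrative usage sketch. It assumes a bare ProjectIssueConfig with no
# custom field_defs, so only built-in fields are known; real callers pass a
# harmonized config from the tracker layer.
#   ast = ParseUserQuery(
#       'Priority=High stars>100', '', BUILTIN_ISSUE_FIELDS,
#       tracker_pb2.ProjectIssueConfig())
#   # One conjunction with two conds: EQ on label 'priority-high' (priority
#   # is not a built-in field, so it falls back to a label match) and
#   # GT on the stars field with int value 100.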

def _ParseConjunction(subquery, scope, fields, warnings, now=None):
  # type: (str, str, Mapping[str, mrproto.tracker_pb2.FieldDef], Sequence[str],
  #     int) -> mrproto.ast_pb2.Conjunction
  """Parse part of a user query into a Conjunction PB."""
  scoped_query = ('%s %s' % (scope, subquery)).lower()
  cond_strs = _ExtractConds(scoped_query, warnings)
  conds = [_ParseCond(cond_str, fields, warnings, now=now)
           for cond_str in cond_strs]
  conds = [cond for cond in conds if cond]
  return ast_pb2.Conjunction(conds=conds)


def _ParseCond(cond_str, fields, warnings, now=None):
  # type: (str, Mapping[str, mrproto.tracker_pb2.FieldDef], Sequence[str],
  #     int) -> mrproto.ast_pb2.Condition
  """Parse one user query condition string into a Condition PB."""
  op_match = OP_RE.match(cond_str)
  # Do not treat as key:value search terms if any of the special prefixes match.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # code.google.com. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
        if prefix.endswith(date_suffix):
          # Slice off the suffix rather than using rstrip(), which would
          # strip any of the suffix's characters, not the suffix as a whole.
          prefix = prefix[:-len(date_suffix)]
          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
    return _ParseStructuredTerm(prefix, op, val, fields, now=now)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Construct a full-text Query object as a dry-run to validate that
  # the syntax is acceptable.
  try:
    _fts_query = search.Query(cond_str)
  except search.QueryError:
    warnings.append('Ignoring full-text term: %s' % cond_str)
    return None

  # Flag a potential user misunderstanding.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])


def _ParseStructuredTerm(prefix, op_str, value, fields, now=None):
  # type: (str, str, str, Mapping[str, mrproto.tracker_pb2.FieldDef]) ->
  #     mrproto.ast_pb2.Condition
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name. E.g., summary. It can
        also be special operators like "is" to test boolean fields.
    op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.
    now: optional timestamp for tests, otherwise time.time() is used.

  Returns:
    A Condition PB.
  """
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one of
  # multiple values, like set membership. E.g., [Priority=High,Critical].
  # Ignore empty values caused by duplicated or trailing commas. E.g.,
  # [Priority=High,,Critical,] is equivalent to [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',') if v.strip()]

  op = OPS[op_str]
  negate = False
  if prefix.startswith('-'):
    negate = True
    op = NEGATED_OPS.get(op, op)
    prefix = prefix[1:]

  if prefix == 'is' and unquoted_value in [
      'open', 'blocked', 'spam', 'ownerbouncing']:
    return ast_pb2.MakeCond(
        NE if negate else EQ, fields[unquoted_value], [], [])

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if '.' in unquoted_value:  # Possible search for phase field with any value.
      phase_name, possible_field = unquoted_value.split('.', 1)
      if possible_field in fields:
        return ast_pb2.MakeCond(
            op, fields[possible_field], [], [], phase_name=phase_name)
    elif unquoted_value in fields:  # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else:  # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  # Search entries with certain gates.
  if prefix == 'gate':
    return ast_pb2.MakeCond(op, fields['gate'], quick_or_vals, [])

  # Determine hotlist query type.
  # is_fields stays True unless the prefix is 'hotlist' and the first
  # value is missing or does not contain ':'.
  is_fields = True
  if prefix == 'hotlist':
    try:
      if ':' not in quick_or_vals[0]:
        is_fields = False
    except IndexError:
      is_fields = False

  phase_name = None
  if '.' in prefix and is_fields:
    split_prefix = prefix.split('.', 1)
    if split_prefix[1] in fields:
      phase_name, prefix = split_prefix

  # Search built-in and custom fields. E.g., summary.
  if prefix in fields and is_fields:
    # Note: if first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for rare case where multiple projects
    # define the same custom field name, and one is a date and another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
      return ast_pb2.MakeCond(op, fields[prefix], [], date_values)
    elif first_field.field_type == APPROVAL and prefix.endswith(SET_ON_SUFFIX):
      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
      return ast_pb2.MakeCond(
          op,
          fields[prefix], [],
          date_values,
          key_suffix=SET_ON_SUFFIX,
          phase_name=phase_name)
    else:
      quick_or_ints = []
      for qov in quick_or_vals:
        try:
          quick_or_ints.append(int(qov))
        except ValueError:
          pass
      if first_field.field_type == APPROVAL:
        for approval_suffix in _APPROVAL_SUFFIXES:
          if prefix.endswith(approval_suffix):
            return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
                                    quick_or_ints, key_suffix=approval_suffix,
                                    phase_name=phase_name)
      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
                              quick_or_ints, phase_name=phase_name)

  # Since it is not a field, treat it as labels. E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])

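# Example of the label fallback above (hand-traced): the query has already
# been lowercased by _ParseConjunction, so [Priority=High,Critical] arrives
# as prefix 'priority' with quick_or_vals ['high', 'critical'] and becomes an
# EQ condition on labels ['priority-high', 'priority-critical'].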

def _ExtractConds(query, warnings):
  # type: (str, Sequence[str]) -> Sequence[str]
  """Parse a query string into a list of individual condition strings.

  Args:
    query: UTF-8 encoded search query string.
    warnings: list to accumulate warning messages.

  Returns:
    A list of query condition strings.
  """
  # Convert to unicode then search for distinct terms.
  term_matches = TERM_RE.findall(query)

  terms = []
  for (phrase, word_label, _op1, phrase_label, _op2,
       word) in term_matches:
    # Case 1: Quoted phrases, e.g., ["hot dog"].
    if phrase_label or phrase:
      terms.append(phrase_label or phrase)

    # Case 2: Comparisons.
    elif word_label:
      special_prefixes_match = any(
          word_label.startswith(p) for p in NON_OP_PREFIXES)
      match = OP_RE.match(word_label)
      if match and not special_prefixes_match:
        label = match.group('prefix')
        op = match.group('op')
        word = match.group('value')
        terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as fulltext search. It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    # Case 3: Simple words.
    elif word:
      terms.append(word)

    else:  # pragma: no coverage
      warnings.append('Unparsable search term')

  return terms

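# A hand-traced example of _ExtractConds output (illustrative, not a doctest):
#   _ExtractConds('stars>10 "memory leak" -regression', warnings=[])
#   # -> ['stars>"10"', '"memory leak"', '-regression']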

def _ParseDateValue(val, now=None):
  # type: (str, int) -> int
  """Convert the user-entered date into a timestamp."""
  # Support timestamp values such as opened>1437671476.
  try:
    return int(val)
  except ValueError:
    pass

  # TODO(jrobbins): future: take timezones into account.
  # TODO(jrobbins): for now, explain to users that "today" is
  # actually now: the current time, not 12:01am in their timezone.
  # In fact, it is not very useful because everything in the system
  # happened before the current time.
  if val == 'today':
    return _CalculatePastDate(0, now=now)
  elif val.startswith('today-'):
    try:
      days_ago = int(val.split('-')[1])
    except ValueError:
      raise InvalidQueryError('Could not parse date: ' + val)
    return _CalculatePastDate(days_ago, now=now)

  try:
    if '/' in val:
      year, month, day = [int(x) for x in val.split('/')]
    elif '-' in val:
      year, month, day = [int(x) for x in val.split('-')]
    else:
      raise InvalidQueryError('Could not parse date: ' + val)
  except ValueError:
    raise InvalidQueryError('Could not parse date: ' + val)

  try:
    return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
  except ValueError:
    raise InvalidQueryError('Could not parse date: ' + val)

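# Date forms accepted by _ParseDateValue, one per branch above (illustrative):
#   [opened>1437671476]    raw Unix timestamp
#   [modified<today-5]     five days (in 24-hour increments) before now
#   [closed>2021/9/7] or [closed>2021-9-7]   calendar date at local midnight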

def _CalculatePastDate(days_ago, now=None):
  # type: (int, int) -> int
  """Calculates the timestamp N days ago from now."""
  if now is None:
    now = int(time.time())
  ts = now - days_ago * 24 * 60 * 60
  return ts

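# E.g., _CalculatePastDate(3, now=1000000) -> 740800 (3 * 86400 seconds back).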

def QueryToSubqueries(query):
  # type: (str) -> Sequence[str]
  """Splits a query into smaller queries based on Monorail's search syntax.

  This function handles parsing parentheses and OR statements in Monorail's
  search syntax. By doing this parsing for OR statements and parentheses up
  front in FrontendSearchPipeline, we are able to convert complex queries
  with lots of ORs into smaller, more easily cacheable query terms.

  These output subqueries should collectively return the same query results
  as the initial input query without containing any ORs or parentheses,
  allowing later search layers to parse queries without worrying about ORs
  or parentheses.

  Some examples of possible queries and their expected output:

  - '(A OR B) (C OR D) OR (E OR F)' -> ['A C', 'A D', 'B C', 'B D', 'E', 'F']
  - '(A) OR (B)' -> ['A', 'B']
  - '(A ((C) OR (D OR (E OR F))))' -> ['A C', 'A D', 'A E', 'A F']

  Where A, B, C, D, etc. could be any list of conjunctions, e.g., "owner:me",
  "Pri=1", "hello world Hotlist=test", "label!=a11y", etc.

  Note: Monorail implicitly ANDs any query terms separated by a space. For
  the most part, AND functionality is handled at a later layer in search
  processing. However, this case becomes important here when considering the
  fact that a parentheses group can either be ANDed or ORed with terms that
  surround it.

  The _MultiplySubqueries helper is used to AND the results of different
  groups together whereas concatenating lists is used to OR subqueries
  together.

  Args:
    query: The initial query that was sent to the search.

  Returns:
    List of query fragments to be independently processed as search terms.

  Raises:
    InvalidQueryError if parentheses are unmatched.
  """
  tokens = _ValidateAndTokenizeQuery(query)

  # Using an iterator allows us to keep our current loop position across
  # helpers. This makes recursion a lot easier.
  token_iterator = PeekIterator(tokens)

  subqueries = _ParseQuery(token_iterator)

  if not len(subqueries):
    # Several cases, such as an empty query or a query with only parentheses,
    # will result in an empty set of subqueries. In these cases, we still want
    # to give the search pipeline a single empty query to process.
    return ['']

  return subqueries

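# Hand-traced sanity checks mirroring the docstring examples (not executed):
#   QueryToSubqueries('(A OR B) C')  # -> ['A C', 'B C']
#   QueryToSubqueries('')            # -> ['']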

def _ParseQuery(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert query tokens into a list of subqueries.

  Parses a Query based on the following grammar (EBNF):

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  An important nuance is that two groups can be next to each other, separated
  only by a word boundary (i.e., a space or parentheses). In this case, they
  are implicitly ANDed. In practice, because unparenthesized fragments ANDed
  by spaces are stored as single tokens, we only need to handle the AND case
  when a parentheses group is implicitly ANDed with an adjacent group.

  Order of precedence is implemented by recursing through OR groups before
  recursing through AND groups.

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError if tokens were input in a format that does not follow
    our search grammar.
  """
  subqueries = []
  try:
    if token_iterator.peek().token_type == OR:
      # Edge case: Ignore empty OR groups at the start of a ParenthesesGroup.
      # E.g., "(OR A)" will be processed as "A".
      next(token_iterator)

    subqueries = _ParseOrGroup(token_iterator)

    while token_iterator.peek().token_type == OR:
      # Consume the OR token without doing anything with it.
      next(token_iterator)

      next_token = token_iterator.peek()
      if next_token.token_type == RIGHT_PAREN:
        # Edge case: Ignore empty OR groups at the end of a ParenthesesGroup.
        # E.g., "(A OR)" will be processed as "A".
        return subqueries

      next_subqueries = _ParseOrGroup(token_iterator)

      # Concatenate results of OR groups together.
      subqueries = subqueries + next_subqueries

  except StopIteration:
    pass
  # Return when we've reached the end of the string.
  return subqueries


def _ParseOrGroup(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert a single "OrGroup" into subqueries.

  An OrGroup here is based on the following grammar:

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError if tokens were input in a format that does not follow
    our search grammar.
  """
  subqueries = _ParseAndGroup(token_iterator)

  try:
    # Iterate until there are no more AND groups left to see.
    # Subquery or left parentheses are the possible starts of an AndGroup.
    while (token_iterator.peek().token_type == SUBQUERY or
           token_iterator.peek().token_type == LEFT_PAREN):

      # Find subqueries from the next AND group.
      next_subqueries = _ParseAndGroup(token_iterator)

      # Multiply all results across AND groups together.
      subqueries = _MultiplySubqueries(subqueries, next_subqueries)
  except StopIteration:
    pass

  return subqueries


def _ParseAndGroup(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert a single "AndGroup" into subqueries.

  An AndGroup here is based on the following grammar:

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError if tokens were input in a format that does not follow
    our search grammar.
  """
  try:
    token = next(token_iterator)
    if token.token_type == LEFT_PAREN:
      if token_iterator.peek().token_type == RIGHT_PAREN:
        # Don't recurse into the ParenthesesGroup if there's nothing inside.
        next(token_iterator)
        return []

      # Recurse into the ParenthesesGroup.
      subqueries = _ParseQuery(token_iterator)

      # Next token should be a right parenthesis.
      next(token_iterator)

      return subqueries
    elif token.token_type == SUBQUERY:
      return [token.value]
    else:
      # This should not happen if other QueryToSubqueries helpers are working
      # properly.
      raise InvalidQueryError('Inputted tokens do not follow grammar.')
  except StopIteration:
    pass
  return []

def _ValidateAndTokenizeQuery(query):
  # type: (str) -> Sequence[mrproto.ast_pb2.QueryToken]
  """Converts the input query into a set of tokens for easier parsing.

  Tokenizing the query string before parsing means we don't have to do as
  many string manipulations while parsing, which simplifies our later code.

  Args:
    query: Query to tokenize.

  Returns:
    List of Token objects for use in query processing.

  Raises:
    InvalidQueryError if parentheses are unmatched.
  """
  tokens = []  # Function result
  count = 0  # Used for checking if parentheses are balanced
  s = ''  # Records current string fragment. Cleared when a token is added.

  for ch in query:
    if ch == '(':
      count += 1

      # Add subquery from before we hit this parenthesis.
      tokens.extend(_TokenizeSubqueryOnOr(s))
      s = ''

      tokens.append(ast_pb2.QueryToken(token_type=LEFT_PAREN))
    elif ch == ')':
      count -= 1

      if count < 0:
        # More closing parentheses than open parentheses.
        raise InvalidQueryError('Search query has unbalanced parentheses.')

      # Add subquery from before we hit this parenthesis.
      tokens.extend(_TokenizeSubqueryOnOr(s))
      s = ''

      tokens.append(ast_pb2.QueryToken(token_type=RIGHT_PAREN))
    else:
      s += ch

  if count != 0:
    raise InvalidQueryError('Search query has unbalanced parentheses.')

  # Add any trailing tokens.
  tokens.extend(_TokenizeSubqueryOnOr(s))

  return tokens


def _TokenizeSubqueryOnOr(subquery):
  # type: (str) -> Sequence[mrproto.ast_pb2.QueryToken]
  """Helper to split a subquery by OR and convert the result into tokens.

  Args:
    subquery: A string without parentheses to tokenize.

  Returns:
    Tokens for the subquery with OR tokens separating query strings if
    applicable.
  """
  if len(subquery) == 0:
    return []

  result = []
  fragments = subquery.split(OR_SYMBOL)
  for f in fragments:
    # Interleave the string fragments with OR tokens.
    result.append(ast_pb2.QueryToken(token_type=SUBQUERY, value=f.strip()))
    result.append(ast_pb2.QueryToken(token_type=OR))

  # Remove trailing OR.
  result.pop()

  # Trim empty strings at the beginning or end. E.g., if subquery is ' OR ',
  # we want the list to be ['OR'], not ['', 'OR', ''].
  if len(result) > 1 and result[0].value == '':
    result.pop(0)
  if len(result) > 1 and result[-1].value == '':
    result.pop()
  return result

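# E.g. (hand-traced): _TokenizeSubqueryOnOr('A OR B') yields
# [SUBQUERY('A'), OR, SUBQUERY('B')], while ' OR ' trims to just [OR].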

def _MultiplySubqueries(a, b):
  # type: (Sequence[str], Sequence[str]) -> Sequence[str]
  """Helper to AND subqueries from two separate lists.

  Args:
    a: First list of subqueries.
    b: Second list of subqueries.

  Returns:
    List with n x m subqueries.
  """
  if not len(a):
    return b
  if not len(b):
    return a
  res = []
  for q1 in a:
    for q2 in b:
      # AND two subqueries together by concatenating them.
      query = (q1.strip() + ' ' + q2.strip()).strip()
      res.append(query)
  return res

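# E.g., _MultiplySubqueries(['A', 'B'], ['C', 'D']) returns the space-joined
# cross product: ['A C', 'A D', 'B C', 'B D'].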

class PeekIterator:
  """Simple iterator with peek() functionality.

  Used by QueryToSubqueries to maintain state easily across recursive calls.
  """

  def __init__(self, source):
    # type: (Sequence[Any]) -> None
    self.__source = source
    self.__i = 0

  def peek(self):
    # type: () -> Any
    """Gets the next value in the iterator without side effects.

    Returns:
      Next value in iterator.

    Raises:
      StopIteration if you're at the end of the iterator.
    """
    if self.__i >= len(self.__source):
      raise StopIteration
    return self.__source[self.__i]

  def __iter__(self):
    # type: () -> Sequence[Any]
    """Return self to make iterator iterable."""
    return self

  def __repr__(self):
    # type: () -> str
    """Allow logging current iterator value for debugging."""
    try:
      return str(self.peek())
    except StopIteration:
      pass
    return 'End of PeekIterator'

  def __next__(self):
    # type: () -> Any
    """Gets the next value in the iterator and increments the pointer.

    Returns:
      Next value in iterator.

    Raises:
      StopIteration if you're at the end of the iterator.
    """
    return self.next()

  def next(self):
    # type: () -> Any
    """Gets the next value in the iterator and increments the pointer.

    Kept as a named method for backwards compatibility with Python 2.

    Returns:
      Next value in iterator.

    Raises:
      StopIteration if you're at the end of the iterator.
    """
    if self.__i >= len(self.__source):
      raise StopIteration
    value = self.__source[self.__i]
    self.__i += 1
    return value

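# Illustrative use (hand-traced):
#   it = PeekIterator(['a', 'b'])
#   it.peek()  # -> 'a' (does not advance)
#   next(it)   # -> 'a'
#   next(it)   # -> 'b'
#   next(it)   # raises StopIteration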

class Error(Exception):
  """Base exception class for this package."""
  pass


class InvalidQueryError(Error):
  """Error raised when an invalid query is requested."""
  pass