# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd

"""A set of functions that integrate the GAE search index with Monorail."""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import collections
import datetime
import logging
import re
import time

from google.appengine.api import search

from proto import ast_pb2
from proto import tracker_pb2


# TODO(jrobbins): Consider re-implementing this whole file by using a
# BNF syntax specification and a parser generator or library.

# encodings
UTF8 = 'utf-8'

# Operator used for OR statements.
OR_SYMBOL = ' OR '

# Token types used for parentheses parsing.
SUBQUERY = ast_pb2.TokenType.SUBQUERY
LEFT_PAREN = ast_pb2.TokenType.LEFT_PAREN
RIGHT_PAREN = ast_pb2.TokenType.RIGHT_PAREN
OR = ast_pb2.TokenType.OR

# Field types and operators
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE
APPROVAL = tracker_pb2.FieldTypes.APPROVAL_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
}

# When the query has a leading minus, switch the operator for its opposite.
NEGATED_OPS = {
    EQ: NE,
    NE: EQ,
    LT: GE,
    GT: LE,
    LE: GT,
    GE: LT,
    TEXT_HAS: NOT_TEXT_HAS,
    # IS_DEFINED is handled separately.
    }

# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones are listed first so that
# the shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(list(OPS.keys()), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|'  # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|'  # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|'  # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)'  # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)

# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_.\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-@\w][-\*,./:<=>@\w]*|"[^"]*"))$' %
    OPS_PATTERN,
    flags=re.UNICODE)
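
# Illustrative sketch (comments only, not executed): OP_RE splits one
# comparison term into its named groups, e.g.:
#   m = OP_RE.match('stars>10')
#   (m.group('prefix'), m.group('op'), m.group('value'))
#   # -> ('stars', '>', '10')
#   m = OP_RE.match('summary:"memory leak"')
#   (m.group('prefix'), m.group('op'), m.group('value'))
#   # -> ('summary', ':', '"memory leak"')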


# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT),  # attachment file names
    ('attachments', NUM),  # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('gate', TXT),
    ('hotlist', TXT),
    ('hotlist_id', NUM),
    ('id', NUM),
    ('is_spam', BOOL),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('mergedinto_id', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('ownerbouncing', BOOL),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
    ]

_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
    'ownermodified',
    'ownerlastvisit',
    'statusmodified',
    'componentmodified',
    )

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
}

SET_BY_SUFFIX = '-by'
SET_ON_SUFFIX = '-on'
APPROVER_SUFFIX = '-approver'
STATUS_SUFFIX = '-status'

_APPROVAL_SUFFIXES = (
    SET_BY_SUFFIX,
    SET_ON_SUFFIX,
    APPROVER_SUFFIX,
    STATUS_SUFFIX,
)

BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}


# Do not treat strings that start with the below as key:value search terms.
# See bugs.chromium.org/p/monorail/issues/detail?id=419 for more detail.
NON_OP_PREFIXES = (
    'http:',
    'https:',
)
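
# For example, a term like [https://example.com/path] (hypothetical URL, for
# illustration only) must be kept as a single full-text term rather than
# being parsed as a comparison with prefix 'https'.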


def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None,
    now=None):
  # type: (str, str, Mapping[str, proto.tracker_pb2.FieldDef],
  #     proto.tracker_pb2.ProjectIssueConfig, Sequence[str], int) ->
  #     proto.ast_pb2.QueryAST
  """Parse a user query and return a set of structured terms.

  Args:
    query: string with user's query. E.g., 'Priority=High'.
    scope: string search terms that define the scope in which the
        query should be executed. They are expressed in the same
        user query language. E.g., the terms added by a canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
        mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
        @@@ custom field name is not unique in cross project search.
          - custom_fields = {field_name: [fd, ...]}
          - query build needs to OR each possible interpretation
          - could be label in one project and field in another project.
        @@@ what about searching across all projects?
    warnings: optional list to accumulate warning messages.
    now: optional timestamp for tests, otherwise time.time() is used.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list of
    Condition PBs with op, fields, str_values and int_values. E.g., the query
    [priority=high leak OR stars>100] over open issues would return
    QueryAST(
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(EQ, [label_fd], ['priority-high'], []),
                  Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = QueryToSubqueries(query)

  # Make a dictionary of all fields: built-in + custom in each project.
  combined_fields = collections.defaultdict(
      list, {field_name: [field_def]
             for field_name, field_def in builtin_fields.items()})
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      # Only do non-enum fields because enums are stored as labels.
      combined_fields[fd.field_name.lower()].append(fd)
      if fd.field_type == APPROVAL:
        for approval_suffix in _APPROVAL_SUFFIXES:
          combined_fields[fd.field_name.lower() + approval_suffix].append(fd)

  conjunctions = [
      _ParseConjunction(sq, scope, combined_fields, warnings, now=now)
      for sq in subqueries]
  return ast_pb2.QueryAST(conjunctions=conjunctions)
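
# Illustrative usage of ParseUserQuery (a sketch in comments; `config` stands
# in for a real tracker_pb2.ProjectIssueConfig and is an assumption here):
#   warnings = []
#   ast = ParseUserQuery(
#       'priority=high leak OR stars>100', 'is:open',
#       BUILTIN_ISSUE_FIELDS, config, warnings=warnings)
#   # ast.conjunctions has one entry per OR'd subquery, as described above.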


def _ParseConjunction(subquery, scope, fields, warnings, now=None):
  # type: (str, str, Mapping[str, proto.tracker_pb2.FieldDef], Sequence[str],
  #     int) -> proto.ast_pb2.Conjunction
  """Parse part of a user query into a Conjunction PB."""
  scoped_query = ('%s %s' % (scope, subquery)).lower()
  cond_strs = _ExtractConds(scoped_query, warnings)
  conds = [_ParseCond(cond_str, fields, warnings, now=now)
           for cond_str in cond_strs]
  conds = [cond for cond in conds if cond]
  return ast_pb2.Conjunction(conds=conds)


def _ParseCond(cond_str, fields, warnings, now=None):
  # type: (str, Mapping[str, proto.tracker_pb2.FieldDef], Sequence[str],
  #     int) -> proto.ast_pb2.Condition
  """Parse one user query condition string into a Condition PB."""
  op_match = OP_RE.match(cond_str)
  # Do not treat as key:value search terms if any of the special prefixes
  # match.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # code.google.com. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
        if prefix.endswith(date_suffix):
          # Trim the suffix itself; note that rstrip() would strip a
          # character set rather than the suffix string.
          prefix = prefix[:-len(date_suffix)]
          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
    return _ParseStructuredTerm(prefix, op, val, fields, now=now)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Construct a full-text Query object as a dry-run to validate that
  # the syntax is acceptable.
  try:
    _fts_query = search.Query(cond_str)
  except search.QueryError:
    warnings.append('Ignoring full-text term: %s' % cond_str)
    return None

  # Flag a potential user misunderstanding.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])


def _ParseStructuredTerm(prefix, op_str, value, fields, now=None):
  # type: (str, str, str, Mapping[str, proto.tracker_pb2.FieldDef]) ->
  #     proto.ast_pb2.Condition
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name. E.g., summary. It can
        also be special operators like "is" to test boolean fields.
    op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.
    now: optional timestamp for tests, otherwise time.time() is used.

  Returns:
    A Condition PB.
  """
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one
  # of multiple values, like set membership. E.g., [Priority=High,Critical].
  # Ignore empty values caused by duplicated or trailing commas. E.g.,
  # [Priority=High,,Critical,] is equivalent to [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',') if v.strip()]

  op = OPS[op_str]
  negate = False
  if prefix.startswith('-'):
    negate = True
    op = NEGATED_OPS.get(op, op)
    prefix = prefix[1:]

  if prefix == 'is' and unquoted_value in [
      'open', 'blocked', 'spam', 'ownerbouncing']:
    return ast_pb2.MakeCond(
        NE if negate else EQ, fields[unquoted_value], [], [])

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if '.' in unquoted_value:  # Possible search for phase field with any value.
      phase_name, possible_field = unquoted_value.split('.', 1)
      if possible_field in fields:
        return ast_pb2.MakeCond(
            op, fields[possible_field], [], [], phase_name=phase_name)
    elif unquoted_value in fields:  # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else:  # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  # Search entries with certain gates.
  if prefix == 'gate':
    return ast_pb2.MakeCond(op, fields['gate'], quick_or_vals, [])

  # Determine hotlist query type. is_fields stays True unless the prefix is
  # 'hotlist' and the first value is missing or contains no ':'.
  is_fields = True
  if prefix == 'hotlist':
    try:
      if ':' not in quick_or_vals[0]:
        is_fields = False
    except IndexError:
      is_fields = False

  phase_name = None
  if '.' in prefix and is_fields:
    split_prefix = prefix.split('.', 1)
    if split_prefix[1] in fields:
      phase_name, prefix = split_prefix

  # Search built-in and custom fields. E.g., summary.
  if prefix in fields and is_fields:
    # Note: if the first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for the rare case where multiple
    # projects define the same custom field name, and one is a date and
    # another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
      return ast_pb2.MakeCond(op, fields[prefix], [], date_values)
    elif first_field.field_type == APPROVAL and prefix.endswith(SET_ON_SUFFIX):
      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
      return ast_pb2.MakeCond(
          op,
          fields[prefix], [],
          date_values,
          key_suffix=SET_ON_SUFFIX,
          phase_name=phase_name)
    else:
      quick_or_ints = []
      for qov in quick_or_vals:
        try:
          quick_or_ints.append(int(qov))
        except ValueError:
          pass
      if first_field.field_type == APPROVAL:
        for approval_suffix in _APPROVAL_SUFFIXES:
          if prefix.endswith(approval_suffix):
            return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
                                    quick_or_ints, key_suffix=approval_suffix,
                                    phase_name=phase_name)
      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
                              quick_or_ints, phase_name=phase_name)

  # Since it is not a field, treat it as labels. E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if the user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
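
# Illustrative sketch of _ParseStructuredTerm (comments only; `fields` is
# assumed to be the combined dict built in ParseUserQuery, mapping lowercase
# names to lists of FieldDefs):
#   _ParseStructuredTerm('stars', '>', '100', fields)
#     -> Condition(GT, [stars_fd], ['100'], [100])
#   _ParseStructuredTerm('priority', '=', 'high,critical', fields)
#     -> Condition(EQ, [label_fd], ['priority-high', 'priority-critical'], [])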


def _ExtractConds(query, warnings):
  # type: (str, Sequence[str]) -> Sequence[str]
  """Parse a query string into a list of individual condition strings.

  Args:
    query: UTF-8 encoded search query string.
    warnings: list to accumulate warning messages.

  Returns:
    A list of query condition strings.
  """
  # Find all the distinct search terms.
  term_matches = TERM_RE.findall(query)

  terms = []
  for (phrase, word_label, _op1, phrase_label, _op2,
       word) in term_matches:
    # Case 1: Quoted phrases, e.g., ["hot dog"].
    if phrase_label or phrase:
      terms.append(phrase_label or phrase)

    # Case 2: Comparisons
    elif word_label:
      special_prefixes_match = any(
          word_label.startswith(p) for p in NON_OP_PREFIXES)
      match = OP_RE.match(word_label)
      if match and not special_prefixes_match:
        label = match.group('prefix')
        op = match.group('op')
        word = match.group('value')
        terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as fulltext search. It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    # Case 3: Simple words.
    elif word:
      terms.append(word)

    else:  # pragma: no coverage
      warnings.append('Unparsable search term')

  return terms
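
# Illustrative sketch of _ExtractConds (comments only, not executed):
#   _ExtractConds('priority=high "memory leak" -crash', [])
#     -> ['priority="high"', '"memory leak"', '-crash']
# Comparison values are re-quoted so that later parsing can strip the
# quotes uniformly.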


def _ParseDateValue(val, now=None):
  # type: (str, int) -> int
  """Convert the user-entered date into a timestamp."""
  # Support timestamp values such as opened>1437671476.
  try:
    return int(val)
  except ValueError:
    pass

  # TODO(jrobbins): future: take timezones into account.
  # TODO(jrobbins): for now, explain to users that "today" is
  # actually now: the current time, not 12:01am in their timezone.
  # In fact, it is not very useful because everything in the system
  # happened before the current time.
  if val == 'today':
    return _CalculatePastDate(0, now=now)
  elif val.startswith('today-'):
    try:
      days_ago = int(val.split('-')[1])
    except ValueError:
      raise InvalidQueryError('Could not parse date: ' + val)
    return _CalculatePastDate(days_ago, now=now)

  try:
    if '/' in val:
      year, month, day = [int(x) for x in val.split('/')]
    elif '-' in val:
      year, month, day = [int(x) for x in val.split('-')]
    else:
      raise InvalidQueryError('Could not parse date: ' + val)
  except ValueError:
    raise InvalidQueryError('Could not parse date: ' + val)

  try:
    return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
  except ValueError:
    raise InvalidQueryError('Could not parse date: ' + val)


def _CalculatePastDate(days_ago, now=None):
  # type: (int, int) -> int
  """Calculates the timestamp N days ago from now."""
  if now is None:
    now = int(time.time())
  ts = now - days_ago * 24 * 60 * 60
  return ts
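
# Illustrative sketch of the date helpers (comments only; the `now` value is
# an arbitrary example timestamp):
#   _ParseDateValue('1437671476')            # -> 1437671476 (raw timestamp)
#   _ParseDateValue('today-3', now=1000000)  # -> 1000000 - 3 * 86400 = 740800
#   _ParseDateValue('2016/1/1')              # -> local-midnight timestamp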


def QueryToSubqueries(query):
  # type: (str) -> Sequence[str]
  """Splits a query into smaller queries based on Monorail's search syntax.

  This function handles parsing parentheses and OR statements in Monorail's
  search syntax. By doing this parsing for OR statements and parentheses up
  front in FrontendSearchPipeline, we are able to convert complex queries
  with lots of ORs into smaller, more easily cacheable query terms.

  Together, the output subqueries should return the same results as the
  initial input query while containing no ORs or parentheses, allowing later
  search layers to parse each subquery without worrying about either.

  Some examples of possible queries and their expected output:

  - '(A OR B) (C OR D) OR (E OR F)' -> ['A C', 'A D', 'B C', 'B D', 'E', 'F']
  - '(A) OR (B)' -> ['A', 'B']
  - '(A ((C) OR (D OR (E OR F))))' -> ['A C', 'A D', 'A E', 'A F']

  Here A, B, C, D, etc. can be any list of conjunctions, e.g., "owner:me",
  "Pri=1", "hello world Hotlist=test", or "label!=a11y".

  Note: Monorail implicitly ANDs any query terms separated by a space. For
  the most part, AND functionality is handled at a later layer in search
  processing. However, this case becomes important here when considering
  that a parentheses group can either be ANDed or ORed with the terms that
  surround it.

  The _MultiplySubqueries helper is used to AND the results of different
  groups together, whereas concatenating lists is used to OR subqueries
  together.

  Args:
    query: The initial query that was sent to the search.

  Returns:
    List of query fragments to be independently processed as search terms.

  Raises:
    InvalidQueryError if parentheses are unmatched.
  """
  tokens = _ValidateAndTokenizeQuery(query)

  # Using an iterator allows us to keep our current loop position across
  # helpers. This makes recursion a lot easier.
  token_iterator = PeekIterator(tokens)

  subqueries = _ParseQuery(token_iterator)

  if not len(subqueries):
    # Several cases, such as an empty query or a query with only parentheses,
    # will result in an empty set of subqueries. In these cases, we still want
    # to give the search pipeline a single empty query to process.
    return ['']

  return subqueries
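
# Illustrative sketch of QueryToSubqueries (comments only, not executed):
#   QueryToSubqueries('(A OR B) C')  # -> ['A C', 'B C']
#   QueryToSubqueries('')            # -> ['']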


def _ParseQuery(token_iterator):
  # type: (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert query tokens into a list of subqueries.

  Parses a Query based on the following grammar (EBNF):

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  An important nuance is that two groups can be next to each other, separated
  only by a word boundary (i.e., a space or parenthesis). In this case, they
  are implicitly ANDed. In practice, because unparenthesized fragments ANDed
  by spaces are stored as single tokens, we only need to handle the AND case
  when a parentheses group is implicitly ANDed with an adjacent group.

  Order of precedence is implemented by recursing through OR groups before
  recursing through AND groups.

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError if the tokens do not follow our search grammar.
  """
  subqueries = []
  try:
    if token_iterator.peek().token_type == OR:
      # Edge case: Ignore an empty OR group at the start of a
      # ParenthesesGroup. E.g., "(OR A)" is processed as "A".
      next(token_iterator)

    subqueries = _ParseOrGroup(token_iterator)

    while token_iterator.peek().token_type == OR:
      # Consume the OR token without doing anything with it.
      next(token_iterator)

      next_token = token_iterator.peek()
      if next_token.token_type == RIGHT_PAREN:
        # Edge case: Ignore an empty OR group at the end of a
        # ParenthesesGroup. E.g., "(A OR)" is processed as "A".
        return subqueries

      next_subqueries = _ParseOrGroup(token_iterator)

      # Concatenate results of OR groups together.
      subqueries = subqueries + next_subqueries

  except StopIteration:
    pass
  # Return when we've reached the end of the string.
  return subqueries


def _ParseOrGroup(token_iterator):
  # type: (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert a single "OrGroup" into subqueries.

  An OrGroup here is based on the following grammar:

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError if the tokens do not follow our search grammar.
  """
  subqueries = _ParseAndGroup(token_iterator)

  try:
    # Iterate until there are no more AND groups left to see.
    # A subquery or left parenthesis is the possible start of an AndGroup.
    while (token_iterator.peek().token_type == SUBQUERY or
           token_iterator.peek().token_type == LEFT_PAREN):

      # Find subqueries from the next AND group.
      next_subqueries = _ParseAndGroup(token_iterator)

      # Multiply all results across AND groups together.
      subqueries = _MultiplySubqueries(subqueries, next_subqueries)
  except StopIteration:
    pass

  return subqueries


def _ParseAndGroup(token_iterator):
  # type: (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert a single "AndGroup" into subqueries.

  An AndGroup here is based on the following grammar:

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError if the tokens do not follow our search grammar.
  """
  try:
    token = next(token_iterator)
    if token.token_type == LEFT_PAREN:
      if token_iterator.peek().token_type == RIGHT_PAREN:
        # Don't recurse into the ParenthesesGroup if there's nothing inside.
        next(token_iterator)
        return []

      # Recurse into the ParenthesesGroup.
      subqueries = _ParseQuery(token_iterator)

      # The next token should be a right parenthesis.
      next(token_iterator)

      return subqueries
    elif token.token_type == SUBQUERY:
      return [token.value]
    else:
      # This should not happen if the other QueryToSubqueries helpers are
      # working properly.
      raise InvalidQueryError('Inputted tokens do not follow grammar.')
  except StopIteration:
    pass
  return []


def _ValidateAndTokenizeQuery(query):
  # type: (str) -> Sequence[proto.ast_pb2.QueryToken]
  """Converts the input query into a set of tokens for easier parsing.

  Tokenizing the query string before parsing lets us avoid extra string
  manipulation while parsing, which simplifies the later code.

  Args:
    query: Query to tokenize.

  Returns:
    List of Token objects for use in query processing.

  Raises:
    InvalidQueryError if parentheses are unmatched.
  """
  tokens = []  # Function result
  count = 0  # Used for checking if parentheses are balanced
  s = ''  # Records current string fragment. Cleared when a token is added.

  for ch in query:
    if ch == '(':
      count += 1

      # Add subquery from before we hit this parenthesis.
      tokens.extend(_TokenizeSubqueryOnOr(s))
      s = ''

      tokens.append(ast_pb2.QueryToken(token_type=LEFT_PAREN))
    elif ch == ')':
      count -= 1

      if count < 0:
        # More closing parentheses than open parentheses.
        raise InvalidQueryError('Search query has unbalanced parentheses.')

      # Add subquery from before we hit this parenthesis.
      tokens.extend(_TokenizeSubqueryOnOr(s))
      s = ''

      tokens.append(ast_pb2.QueryToken(token_type=RIGHT_PAREN))
    else:
      s += ch

  if count != 0:
    raise InvalidQueryError('Search query has unbalanced parentheses.')

  # Add any trailing tokens.
  tokens.extend(_TokenizeSubqueryOnOr(s))

  return tokens
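
# Illustrative sketch of _ValidateAndTokenizeQuery (comments only):
#   _ValidateAndTokenizeQuery('(A) OR B')
#     -> [LEFT_PAREN, SUBQUERY('A'), RIGHT_PAREN, OR, SUBQUERY('B')]
#   _ValidateAndTokenizeQuery('(A')  # raises InvalidQueryError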


def _TokenizeSubqueryOnOr(subquery):
  # type: (str) -> Sequence[proto.ast_pb2.QueryToken]
  """Helper to split a subquery by OR and convert the result into tokens.

  Args:
    subquery: A string without parentheses to tokenize.

  Returns:
    Tokens for the subquery, with OR tokens separating query strings where
    applicable.
  """
  if len(subquery) == 0:
    return []

  result = []
  fragments = subquery.split(OR_SYMBOL)
  for f in fragments:
    # Interleave the string fragments with OR tokens.
    result.append(ast_pb2.QueryToken(token_type=SUBQUERY, value=f.strip()))
    result.append(ast_pb2.QueryToken(token_type=OR))

  # Remove the trailing OR.
  result.pop()

  # Trim empty strings at the beginning or end. E.g., if subquery is ' OR ',
  # we want the list to be ['OR'], not ['', 'OR', ''].
  if len(result) > 1 and result[0].value == '':
    result.pop(0)
  if len(result) > 1 and result[-1].value == '':
    result.pop()
  return result
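
# Illustrative sketch of _TokenizeSubqueryOnOr (comments only):
#   _TokenizeSubqueryOnOr('A OR B')  # -> [SUBQUERY('A'), OR, SUBQUERY('B')]
#   _TokenizeSubqueryOnOr(' OR ')    # -> [OR] (empty fragments are trimmed)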


def _MultiplySubqueries(a, b):
  # type: (Sequence[str], Sequence[str]) -> Sequence[str]
  """Helper to AND subqueries from two separate lists.

  Args:
    a: First list of subqueries.
    b: Second list of subqueries.

  Returns:
    List with n x m subqueries.
  """
  if not len(a):
    return b
  if not len(b):
    return a
  res = []
  for q1 in a:
    for q2 in b:
      # AND two subqueries together by concatenating them.
      query = (q1.strip() + ' ' + q2.strip()).strip()
      res.append(query)
  return res
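
# Illustrative sketch of _MultiplySubqueries (comments only):
#   _MultiplySubqueries(['A', 'B'], ['C', 'D'])
#     -> ['A C', 'A D', 'B C', 'B D']
#   _MultiplySubqueries([], ['C'])  # -> ['C']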


class PeekIterator:
  """Simple iterator with peek() functionality.

  Used by QueryToSubqueries to maintain state easily across recursive calls.
  """

  def __init__(self, source):
    # type: (Sequence[Any])
    self.__source = source
    self.__i = 0

  def peek(self):
    # type: () -> Any
    """Gets the next value in the iterator without side effects.

    Returns:
      Next value in iterator.

    Raises:
      StopIteration if you're at the end of the iterator.
    """
    if self.__i >= len(self.__source):
      raise StopIteration
    return self.__source[self.__i]

  def __iter__(self):
    # type: () -> Sequence[Any]
    """Return self to make iterator iterable."""
    return self

  def __repr__(self):
    # type: () -> str
    """Allow logging current iterator value for debugging."""
    try:
      return str(self.peek())
    except StopIteration:
      pass
    return 'End of PeekIterator'

  def next(self):
    # type: () -> Any
    """Gets the next value in the iterator and increments pointer.

    Returns:
      Next value in iterator.

    Raises:
      StopIteration if you're at the end of the iterator.
    """
    if self.__i >= len(self.__source):
      raise StopIteration
    value = self.__source[self.__i]
    self.__i += 1
    return value
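
# Illustrative usage of PeekIterator (comments only; note that this class
# implements the Python 2 iterator protocol via next()):
#   it = PeekIterator(['A', 'B'])
#   it.peek()  # -> 'A' (no state change)
#   it.next()  # -> 'A'
#   it.next()  # -> 'B'
#   it.peek()  # raises StopIteration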


class Error(Exception):
  """Base exception class for this package."""
  pass


class InvalidQueryError(Error):
  """Error raised when an invalid query is requested."""
  pass