| # Copyright 2016 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """A set of functions that integrate the GAE search index with Monorail.""" |
| from __future__ import print_function |
| from __future__ import division |
| from __future__ import absolute_import |
| |
| import collections |
| import datetime |
| import logging |
| import re |
| import time |
| |
| from google.appengine.api import search |
| |
| from mrproto import ast_pb2 |
| from mrproto import tracker_pb2 |
| |
| |
| # TODO(jrobbins): Consider re-implementing this whole file by using a |
| # BNF syntax specification and a parser generator or library. |
| |
| # encodings |
| UTF8 = 'utf-8' |
| |
| # Operator used for OR statements. |
| OR_SYMBOL = ' OR ' |
| |
| # Token types used for parentheses parsing. |
| SUBQUERY = ast_pb2.TokenType.SUBQUERY |
| LEFT_PAREN = ast_pb2.TokenType.LEFT_PAREN |
| RIGHT_PAREN = ast_pb2.TokenType.RIGHT_PAREN |
| OR = ast_pb2.TokenType.OR |
| |
| # Field types and operators |
| BOOL = tracker_pb2.FieldTypes.BOOL_TYPE |
| DATE = tracker_pb2.FieldTypes.DATE_TYPE |
| NUM = tracker_pb2.FieldTypes.INT_TYPE |
| TXT = tracker_pb2.FieldTypes.STR_TYPE |
| APPROVAL = tracker_pb2.FieldTypes.APPROVAL_TYPE |
| |
| EQ = ast_pb2.QueryOp.EQ |
| NE = ast_pb2.QueryOp.NE |
| LT = ast_pb2.QueryOp.LT |
| GT = ast_pb2.QueryOp.GT |
| LE = ast_pb2.QueryOp.LE |
| GE = ast_pb2.QueryOp.GE |
| TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS |
| NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS |
| IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED |
| IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED |
| KEY_HAS = ast_pb2.QueryOp.KEY_HAS |
| |
| # Mapping from user query comparison operators to our internal representation. |
| OPS = { |
| ':': TEXT_HAS, |
| '=': EQ, |
| '!=': NE, |
| '<': LT, |
| '>': GT, |
| '<=': LE, |
| '>=': GE, |
| } |
| |
| # When the query has a leading minus, switch the operator for its opposite. |
| NEGATED_OPS = { |
| EQ: NE, |
| NE: EQ, |
| LT: GE, |
| GT: LE, |
| LE: GT, |
| GE: LT, |
| TEXT_HAS: NOT_TEXT_HAS, |
| # IS_DEFINED is handled separately. |
| } |
| |
# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones are listed first so that
# the shorter ones don't cause premature matches.
| OPS_PATTERN = '|'.join( |
| map(re.escape, sorted(list(OPS.keys()), key=lambda op: -len(op)))) |
| |
| # This RE extracts search terms from a subquery string. |
| TERM_RE = re.compile( |
| r'(-?"[^"]+")|' # E.g., ["division by zero"] |
| r'(\S+(%s)[^ "]+)|' # E.g., [stars>10] |
| r'(\w+(%s)"[^"]+")|' # E.g., [summary:"memory leak"] |
| r'(-?[._\*\w][-._\*\w]+)' # E.g., [-workaround] |
| % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE) |
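
# Each TERM_RE.findall() result is a 6-tuple (phrase, word_label, op1,
# phrase_label, op2, word); the nonempty slots identify which branch
# matched, as unpacked by _ExtractConds() below.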
| |
| # This RE is used to further decompose a comparison term into prefix, op, and |
| # value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix |
| # can include a leading "-" to negate the comparison. |
| OP_RE = re.compile( |
| r'^(?P<prefix>[-_.\w]*?)' |
| r'(?P<op>%s)' |
| r'(?P<value>([-@\w][-\*,./:<=>@\w]*|"[^"]*"))$' % |
| OPS_PATTERN, |
| flags=re.UNICODE) |
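
# For example, OP_RE.match('stars>10') yields prefix='stars', op='>',
# and value='10'.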
| |
| |
| # Predefined issue fields passed to the query parser. |
| _ISSUE_FIELDS_LIST = [ |
| (ast_pb2.ANY_FIELD, TXT), |
| ('attachment', TXT), # attachment file names |
| ('attachments', NUM), # number of attachment files |
| ('blocked', BOOL), |
| ('blockedon', TXT), |
| ('blockedon_id', NUM), |
| ('blocking', TXT), |
| ('blocking_id', NUM), |
| ('cc', TXT), |
| ('cc_id', NUM), |
| ('comment', TXT), |
| ('commentby', TXT), |
| ('commentby_id', NUM), |
| ('component', TXT), |
| ('component_id', NUM), |
| ('description', TXT), |
| ('gate', TXT), |
| ('hotlist', TXT), |
| ('hotlist_id', NUM), |
| ('id', NUM), |
| ('is_spam', BOOL), |
| ('label', TXT), |
| ('label_id', NUM), |
| ('mergedinto', NUM), |
| ('mergedinto_id', NUM), |
| ('open', BOOL), |
| ('owner', TXT), |
| ('ownerbouncing', BOOL), |
| ('owner_id', NUM), |
| ('project', TXT), |
| ('reporter', TXT), |
| ('reporter_id', NUM), |
| ('spam', BOOL), |
| ('stars', NUM), |
| ('starredby', TXT), |
| ('starredby_id', NUM), |
| ('status', TXT), |
| ('status_id', NUM), |
| ('summary', TXT), |
| ] |
| |
| _DATE_FIELDS = ( |
| 'closed', |
| 'modified', |
| 'opened', |
| 'ownermodified', |
| 'ownerlastvisit', |
| 'statusmodified', |
| 'componentmodified', |
| ) |
| |
| # Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST. |
| _ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS) |
| |
| _DATE_FIELD_SUFFIX_TO_OP = { |
| '-after': '>', |
| '-before': '<', |
| } |
| |
| SET_BY_SUFFIX = '-by' |
| SET_ON_SUFFIX = '-on' |
| APPROVER_SUFFIX = '-approver' |
| STATUS_SUFFIX = '-status' |
| |
| _APPROVAL_SUFFIXES = ( |
| SET_BY_SUFFIX, |
| SET_ON_SUFFIX, |
| APPROVER_SUFFIX, |
| STATUS_SUFFIX, |
| ) |
| |
| BUILTIN_ISSUE_FIELDS = { |
| f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type) |
| for f_name, f_type in _ISSUE_FIELDS_LIST} |
| |
| |
| # Do not treat strings that start with the below as key:value search terms. |
| # See bugs.chromium.org/p/monorail/issues/detail?id=419 for more detail. |
| NON_OP_PREFIXES = ( |
| 'http:', |
| 'https:', |
| ) |
| |
| |
| def ParseUserQuery( |
| query, scope, builtin_fields, harmonized_config, warnings=None, |
| now=None): |
| # type: (str, str, Mapping[str, mrproto.tracker_pb2.FieldDef], |
| # mrproto.tracker_pb2.ProjectIssueConfig, Sequence[str], int) -> |
| # mrproto.ast_pb2.QueryAST |
| """Parse a user query and return a set of structure terms. |
| |
| Args: |
| query: string with user's query. E.g., 'Priority=High'. |
    scope: string search terms that define the scope in which the
        query should be executed. They are expressed in the same
        user query language. E.g., the terms of the selected canned query.
| builtin_fields: dict {field_name: FieldDef(field_name, type)} |
| mapping field names to FieldDef objects for built-in fields. |
| harmonized_config: config for all the projects being searched. |
| @@@ custom field name is not unique in cross project search. |
| - custom_fields = {field_name: [fd, ...]} |
| - query build needs to OR each possible interpretation |
| - could be label in one project and field in another project. |
| @@@ what about searching across all projects? |
| warnings: optional list to accumulate warning messages. |
| now: optional timestamp for tests, otherwise time.time() is used. |
| |
| Returns: |
| A QueryAST with conjunctions (usually just one), where each has a list of |
| Condition PBs with op, fields, str_values and int_values. E.g., the query |
| [priority=high leak OR stars>100] over open issues would return |
| QueryAST( |
| Conjunction(Condition(EQ, [open_fd], [], [1]), |
| Condition(EQ, [label_fd], ['priority-high'], []), |
| Condition(TEXT_HAS, any_field_fd, ['leak'], [])), |
| Conjunction(Condition(EQ, [open_fd], [], [1]), |
| Condition(GT, [stars_fd], [], [100]))) |
| |
| Raises: |
| InvalidQueryError: If a problem was detected in the user's query. |
| """ |
| if warnings is None: |
| warnings = [] |
| |
| # Convert the overall query into one or more OR'd subqueries. |
| subqueries = QueryToSubqueries(query) |
| |
| # Make a dictionary of all fields: built-in + custom in each project. |
| combined_fields = collections.defaultdict( |
| list, {field_name: [field_def] |
| for field_name, field_def in builtin_fields.items()}) |
| for fd in harmonized_config.field_defs: |
| if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE: |
| # Only do non-enum fields because enums are stored as labels |
| combined_fields[fd.field_name.lower()].append(fd) |
| if fd.field_type == APPROVAL: |
| for approval_suffix in _APPROVAL_SUFFIXES: |
| combined_fields[fd.field_name.lower() + approval_suffix].append(fd) |
| |
| conjunctions = [ |
| _ParseConjunction(sq, scope, combined_fields, warnings, now=now) |
| for sq in subqueries] |
| return ast_pb2.QueryAST(conjunctions=conjunctions) |
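

# Illustrative usage of ParseUserQuery (a sketch; assumes a hypothetical
# empty project config with no custom fields):
#
#   config = tracker_pb2.ProjectIssueConfig()
#   query_ast = ParseUserQuery(
#       'priority=high leak OR stars>100', 'is:open',
#       BUILTIN_ISSUE_FIELDS, config)
#   # query_ast.conjunctions has one entry per OR'd subquery: here, two.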
| |
| |
| def _ParseConjunction(subquery, scope, fields, warnings, now=None): |
| # type: (str, str, Mapping[str, mrproto.tracker_pb2.FieldDef], Sequence[str], |
| # int) -> mrproto.ast_pb2.Condition |
| """Parse part of a user query into a Conjunction PB.""" |
| scoped_query = ('%s %s' % (scope, subquery)).lower() |
| cond_strs = _ExtractConds(scoped_query, warnings) |
| conds = [_ParseCond(cond_str, fields, warnings, now=now) |
| for cond_str in cond_strs] |
| conds = [cond for cond in conds if cond] |
| return ast_pb2.Conjunction(conds=conds) |
| |
| |
| def _ParseCond(cond_str, fields, warnings, now=None): |
| # type: (str, Mapping[str, mrproto.tracker_pb2.FieldDef], Sequence[str], |
| # int) -> mrproto.ast_pb2.Condition |
| """Parse one user query condition string into a Condition PB.""" |
| op_match = OP_RE.match(cond_str) |
| # Do not treat as key:value search terms if any of the special prefixes match. |
| special_prefixes_match = any( |
| cond_str.startswith(p) for p in NON_OP_PREFIXES) |
| if op_match and not special_prefixes_match: |
| prefix = op_match.group('prefix') |
| op = op_match.group('op') |
| val = op_match.group('value') |
| # Special case handling to continue to support old date query terms from |
| # code.google.com. See monorail:151 for more details. |
| if prefix.startswith(_DATE_FIELDS): |
| for date_suffix in _DATE_FIELD_SUFFIX_TO_OP: |
| if prefix.endswith(date_suffix): |
          # Note: use slicing, not rstrip(), because rstrip() strips any of
          # the suffix's characters rather than the suffix itself.
          prefix = prefix[:-len(date_suffix)]
| op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix] |
| return _ParseStructuredTerm(prefix, op, val, fields, now=now) |
| |
| # Treat the cond as a full-text search term, which might be negated. |
| if cond_str.startswith('-'): |
| op = NOT_TEXT_HAS |
| cond_str = cond_str[1:] |
| else: |
| op = TEXT_HAS |
| |
| # Construct a full-text Query object as a dry-run to validate that |
| # the syntax is acceptable. |
| try: |
| _fts_query = search.Query(cond_str) |
| except search.QueryError: |
| warnings.append('Ignoring full-text term: %s' % cond_str) |
| return None |
| |
| # Flag a potential user misunderstanding. |
| if cond_str.lower() in ('and', 'or', 'not'): |
| warnings.append( |
| 'The only supported boolean operator is OR (all capitals).') |
| |
| return ast_pb2.MakeCond( |
| op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], []) |
| |
| |
| def _ParseStructuredTerm(prefix, op_str, value, fields, now=None): |
| # type: (str, str, str, Mapping[str, mrproto.tracker_pb2.FieldDef]) -> |
| # mrproto.ast_pb2.Condition |
| """Parse one user structured query term into an internal representation. |
| |
| Args: |
| prefix: The query operator, usually a field name. E.g., summary. It can |
| also be special operators like "is" to test boolean fields. |
| op_str: the comparison operator. Usually ":" or "=", but can be any OPS. |
| value: the value to compare against, e.g., term to find in that field. |
| fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields. |
| now: optional timestamp for tests, otherwise time.time() is used. |
| |
| Returns: |
| A Condition PB. |
| """ |
| unquoted_value = value.strip('"') |
| # Quick-OR is a convenient way to write one condition that matches any one of |
| # multiple values, like set membership. E.g., [Priority=High,Critical]. |
| # Ignore empty values caused by duplicated or trailing commas. E.g., |
| # [Priority=High,,Critical,] is equivalent to [Priority=High,Critical]. |
| quick_or_vals = [v.strip() for v in unquoted_value.split(',') if v.strip()] |
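  # E.g., an unquoted_value of 'high,,critical,' yields ['high', 'critical'].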
| |
| op = OPS[op_str] |
| negate = False |
| if prefix.startswith('-'): |
| negate = True |
| op = NEGATED_OPS.get(op, op) |
| prefix = prefix[1:] |
| |
| if prefix == 'is' and unquoted_value in [ |
| 'open', 'blocked', 'spam', 'ownerbouncing']: |
| return ast_pb2.MakeCond( |
| NE if negate else EQ, fields[unquoted_value], [], []) |
| |
| # Search entries with or without any value in the specified field. |
| if prefix == 'has': |
| op = IS_NOT_DEFINED if negate else IS_DEFINED |
| if '.' in unquoted_value: # Possible search for phase field with any value. |
| phase_name, possible_field = unquoted_value.split('.', 1) |
| if possible_field in fields: |
| return ast_pb2.MakeCond( |
| op, fields[possible_field], [], [], phase_name=phase_name) |
| elif unquoted_value in fields: # Look for that field with any value. |
| return ast_pb2.MakeCond(op, fields[unquoted_value], [], []) |
| else: # Look for any label with that prefix. |
| return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], []) |
| |
| # Search entries with certain gates. |
| if prefix == 'gate': |
| return ast_pb2.MakeCond(op, fields['gate'], quick_or_vals, []) |
| |
  # Determine the hotlist query type. is_fields stays True unless the
  # prefix is 'hotlist' and the first value is missing or lacks a ':'
  # (i.e., it is a bare hotlist name rather than a user:hotlist-name pair).
| is_fields = True |
| if prefix == 'hotlist': |
| try: |
| if ':' not in quick_or_vals[0]: |
| is_fields = False |
| except IndexError: |
| is_fields = False |
| |
| phase_name = None |
| if '.' in prefix and is_fields: |
| split_prefix = prefix.split('.', 1) |
| if split_prefix[1] in fields: |
| phase_name, prefix = split_prefix |
| |
  # Search built-in and custom fields. E.g., summary.
| if prefix in fields and is_fields: |
| # Note: if first matching field is date-type, we assume they all are. |
| # TODO(jrobbins): better handling for rare case where multiple projects |
| # define the same custom field name, and one is a date and another is not. |
| first_field = fields[prefix][0] |
| if first_field.field_type == DATE: |
| date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals] |
| return ast_pb2.MakeCond(op, fields[prefix], [], date_values) |
| elif first_field.field_type == APPROVAL and prefix.endswith(SET_ON_SUFFIX): |
| date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals] |
| return ast_pb2.MakeCond( |
| op, |
| fields[prefix], [], |
| date_values, |
| key_suffix=SET_ON_SUFFIX, |
| phase_name=phase_name) |
| else: |
| quick_or_ints = [] |
| for qov in quick_or_vals: |
| try: |
| quick_or_ints.append(int(qov)) |
| except ValueError: |
| pass |
| if first_field.field_type == APPROVAL: |
| for approval_suffix in _APPROVAL_SUFFIXES: |
| if prefix.endswith(approval_suffix): |
| return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals, |
| quick_or_ints, key_suffix=approval_suffix, |
| phase_name=phase_name) |
| return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals, |
| quick_or_ints, phase_name=phase_name) |
| |
  # Since it is not a field, treat it as a label prefix. E.g., Priority.
| quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals] |
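  # E.g., prefix 'priority' with values ['high', 'critical'] yields
  # ['priority-high', 'priority-critical'].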
| # Convert substring match to key-value match if user typed 'foo:bar'. |
| if op == TEXT_HAS: |
| op = KEY_HAS |
| return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, []) |
| |
| |
| def _ExtractConds(query, warnings): |
| # type: (str, Sequence[str]) -> Sequence[str] |
| """Parse a query string into a list of individual condition strings. |
| |
| Args: |
| query: UTF-8 encoded search query string. |
| warnings: list to accumulate warning messages. |
| |
| Returns: |
| A list of query condition strings. |
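
  Example (illustrative doctest; callers pass an already-lowercased query):
    >>> _ExtractConds('priority=high "memory leak" -foo', [])
    ['priority="high"', '"memory leak"', '-foo']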
| """ |
  # Search for distinct terms.
| term_matches = TERM_RE.findall(query) |
| |
| terms = [] |
| for (phrase, word_label, _op1, phrase_label, _op2, |
| word) in term_matches: |
| # Case 1: Quoted phrases, e.g., ["hot dog"]. |
| if phrase_label or phrase: |
| terms.append(phrase_label or phrase) |
| |
| # Case 2: Comparisons |
| elif word_label: |
| special_prefixes_match = any( |
| word_label.startswith(p) for p in NON_OP_PREFIXES) |
| match = OP_RE.match(word_label) |
| if match and not special_prefixes_match: |
| label = match.group('prefix') |
| op = match.group('op') |
| word = match.group('value') |
| terms.append('%s%s"%s"' % (label, op, word)) |
| else: |
| # It looked like a key:value cond, but not exactly, so treat it |
| # as fulltext search. It is probably a tiny bit of source code. |
| terms.append('"%s"' % word_label) |
| |
| # Case 3: Simple words. |
| elif word: |
| terms.append(word) |
| |
    else:  # pragma: no cover
| warnings.append('Unparsable search term') |
| |
| return terms |
| |
| |
| def _ParseDateValue(val, now=None): |
| # type: (str, int) -> int |
| """Convert the user-entered date into timestamp.""" |
| # Support timestamp value such as opened>1437671476 |
| try: |
| return int(val) |
| except ValueError: |
| pass |
| |
| # TODO(jrobbins): future: take timezones into account. |
| # TODO(jrobbins): for now, explain to users that "today" is |
| # actually now: the current time, not 12:01am in their timezone. |
| # In fact, it is not very useful because everything in the system |
| # happened before the current time. |
| if val == 'today': |
| return _CalculatePastDate(0, now=now) |
| elif val.startswith('today-'): |
| try: |
| days_ago = int(val.split('-')[1]) |
| except ValueError: |
| raise InvalidQueryError('Could not parse date: ' + val) |
| return _CalculatePastDate(days_ago, now=now) |
| |
| try: |
| if '/' in val: |
| year, month, day = [int(x) for x in val.split('/')] |
| elif '-' in val: |
| year, month, day = [int(x) for x in val.split('-')] |
| else: |
| raise InvalidQueryError('Could not parse date: ' + val) |
| except ValueError: |
| raise InvalidQueryError('Could not parse date: ' + val) |
| |
| try: |
| return int(time.mktime(datetime.datetime(year, month, day).timetuple())) |
| except ValueError: |
| raise InvalidQueryError('Could not parse date: ' + val) |
| |
| |
| def _CalculatePastDate(days_ago, now=None): |
| # type: (int, int) -> int |
| """Calculates the timestamp N days ago from now.""" |
| if now is None: |
| now = int(time.time()) |
| ts = now - days_ago * 24 * 60 * 60 |
| return ts |
| |
| |
| def QueryToSubqueries(query): |
  # type: (str) -> Sequence[str]
| """Splits a query into smaller queries based on Monorail's search syntax. |
| |
| This function handles parsing parentheses and OR statements in Monorail's |
| search syntax. By doing this parsing for OR statements and parentheses up |
| front in FrontendSearchPipeline, we are able to convert complex queries |
| with lots of ORs into smaller, more easily cacheable query terms. |
| |
  The resulting subqueries should collectively return the same results
  as the initial input query while containing no ORs or parentheses,
  allowing later search layers to parse each subquery without worrying
  about either.
| |
| Some examples of possible queries and their expected output: |
| |
| - '(A OR B) (C OR D) OR (E OR F)' -> ['A C', 'A D', 'B C', 'B D', 'E', 'F'] |
| - '(A) OR (B)' -> ['A', 'B'] |
| - '(A ((C) OR (D OR (E OR F))))' -> ['A C', 'A D', 'A E', 'A F'] |
| |
  Here A, B, C, D, etc. could be any list of conjunctions, e.g., "owner:me",
  "Pri=1", "hello world Hotlist=test", or "label!=a11y".
| |
| Note: Monorail implicitly ANDs any query terms separated by a space. For |
| the most part, AND functionality is handled at a later layer in search |
| processing. However, this case becomes important here when considering the |
  fact that a parentheses group can either be ANDed or ORed with terms that
| surround it. |
| |
| The _MultiplySubqueries helper is used to AND the results of different |
| groups together whereas concatenating lists is used to OR subqueries |
| together. |
| |
| Args: |
| query: The initial query that was sent to the search. |
| |
| Returns: |
| List of query fragments to be independently processed as search terms. |
| |
| Raises: |
| InvalidQueryError if parentheses are unmatched. |
| """ |
| tokens = _ValidateAndTokenizeQuery(query) |
| |
| # Using an iterator allows us to keep our current loop position across |
| # helpers. This makes recursion a lot easier. |
| token_iterator = PeekIterator(tokens) |
| |
| subqueries = _ParseQuery(token_iterator) |
| |
  if not subqueries:
| # Several cases, such as an empty query or a query with only parentheses |
| # will result in an empty set of subqueries. In these cases, we still want |
| # to give the search pipeline a single empty query to process. |
| return [''] |
| |
| return subqueries |
| |
| |
| def _ParseQuery(token_iterator): |
  # type: (PeekIterator) -> Sequence[str]
| """Recursive helper to convert query tokens into a list of subqueries. |
| |
| Parses a Query based on the following grammar (EBNF): |
| |
| Query := OrGroup { [OrOperator] OrGroup } |
| OrGroup := AndGroup { AndGroup } |
| AndGroup := Subquery | ParenthesesGroup |
| ParenthesesGroup := "(" Query ")" |
| Subquery := /.+/ |
| OrOperator := " OR " |
| |
| An important nuance is that two groups can be next to each other, separated |
  only by a word boundary (i.e., a space or parenthesis). In this case, they
  are implicitly ANDed. In practice, because unparenthesized fragments ANDed by
| implicitly ANDed. In practice, because unparenthesized fragments ANDed by |
| spaces are stored as single tokens, we only need to handle the AND case when |
| a parentheses group is implicitly ANDed with an adjacent group. |
| |
| Order of precedence is implemented by recursing through OR groups before |
| recursing through AND groups. |
| |
| Args: |
| token_iterator: Iterator over a list of query tokens. |
| |
| Returns: |
| List of query fragments to be processed as search terms. |
| |
| Raises: |
    InvalidQueryError if the tokens do not follow our search grammar.
| """ |
| subqueries = [] |
| try: |
| if token_iterator.peek().token_type == OR: |
      # Edge case: Ignore empty OR groups at the start of a ParenthesesGroup.
      # E.g., "(OR A)" is processed as "A".
| next(token_iterator) |
| |
| subqueries = _ParseOrGroup(token_iterator) |
| |
| while token_iterator.peek().token_type == OR: |
      # Consume the OR token without doing anything with it.
| next(token_iterator) |
| |
| next_token = token_iterator.peek() |
| if next_token.token_type == RIGHT_PAREN: |
| # Edge case: Ignore empty OR groups at the end of a ParenthesesGroup. |
| # ie: "(A OR)" will be processed as "A" |
| return subqueries |
| |
| next_subqueries = _ParseOrGroup(token_iterator) |
| |
| # Concatenate results of OR groups together. |
| subqueries = subqueries + next_subqueries |
| |
| except StopIteration: |
| pass |
| # Return when we've reached the end of the string. |
| return subqueries |
| |
| |
| def _ParseOrGroup(token_iterator): |
  # type: (PeekIterator) -> Sequence[str]
| """Recursive helper to convert a single "OrGroup" into subqueries. |
| |
| An OrGroup here is based on the following grammar: |
| |
| Query := OrGroup { [OrOperator] OrGroup } |
| OrGroup := AndGroup { AndGroup } |
| AndGroup := Subquery | ParenthesesGroup |
| ParenthesesGroup := "(" Query ")" |
| Subquery := /.+/ |
| OrOperator := " OR " |
| |
| Args: |
| token_iterator: Iterator over a list of query tokens. |
| |
| Returns: |
| List of query fragments to be processed as search terms. |
| |
| Raises: |
    InvalidQueryError if the tokens do not follow our search grammar.
| """ |
| subqueries = _ParseAndGroup(token_iterator) |
| |
| try: |
| # Iterate until there are no more AND groups left to see. |
| # Subquery or left parentheses are the possible starts of an AndGroup. |
| while (token_iterator.peek().token_type == SUBQUERY or |
| token_iterator.peek().token_type == LEFT_PAREN): |
| |
| # Find subqueries from the next AND group. |
| next_subqueries = _ParseAndGroup(token_iterator) |
| |
| # Multiply all results across AND groups together. |
| subqueries = _MultiplySubqueries(subqueries, next_subqueries) |
| except StopIteration: |
| pass |
| |
| return subqueries |
| |
| |
| def _ParseAndGroup(token_iterator): |
  # type: (PeekIterator) -> Sequence[str]
| """Recursive helper to convert a single "AndGroup" into subqueries. |
| |
  An AndGroup here is based on the following grammar:
| |
| Query := OrGroup { [OrOperator] OrGroup } |
| OrGroup := AndGroup { AndGroup } |
| AndGroup := Subquery | ParenthesesGroup |
| ParenthesesGroup := "(" Query ")" |
| Subquery := /.+/ |
| OrOperator := " OR " |
| |
| Args: |
| token_iterator: Iterator over a list of query tokens. |
| |
| Returns: |
| List of query fragments to be processed as search terms. |
| |
| Raises: |
    InvalidQueryError if the tokens do not follow our search grammar.
| """ |
| try: |
| token = next(token_iterator) |
| if token.token_type == LEFT_PAREN: |
| if token_iterator.peek().token_type == RIGHT_PAREN: |
| # Don't recurse into the ParenthesesGroup if there's nothing inside. |
| next(token_iterator) |
| return [] |
| |
| # Recurse into the ParenthesesGroup. |
| subqueries = _ParseQuery(token_iterator) |
| |
      # Consume the matching right parenthesis; the balance check in
      # _ValidateAndTokenizeQuery guarantees it is next.
      next(token_iterator)
| |
| return subqueries |
| elif token.token_type == SUBQUERY: |
| return [token.value] |
| else: |
| # This should not happen if other QueryToSubqueries helpers are working |
| # properly. |
      raise InvalidQueryError('Input tokens do not follow the search grammar.')
| except StopIteration: |
| pass |
| return [] |
| |
| |
| def _ValidateAndTokenizeQuery(query): |
| # type: (str) -> Sequence[mrproto.ast_pb2.QueryToken] |
| """Converts the input query into a set of tokens for easier parsing. |
| |
  Tokenizing the query string before parsing lets us avoid repeated string
  manipulations while parsing, which simplifies the later code.
| |
| Args: |
| query: Query to tokenize. |
| |
| Returns: |
| List of Token objects for use in query processing. |
| |
| Raises: |
| InvalidQueryError if parentheses are unmatched. |
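
  Example:
    '(A OR B) C' is tokenized as
    [LEFT_PAREN, SUBQUERY('A'), OR, SUBQUERY('B'), RIGHT_PAREN, SUBQUERY('C')].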
| """ |
| tokens = [] # Function result |
| count = 0 # Used for checking if parentheses are balanced |
| s = '' # Records current string fragment. Cleared when a token is added. |
| |
| for ch in query: |
| if ch == '(': |
| count += 1 |
| |
| # Add subquery from before we hit this parenthesis. |
| tokens.extend(_TokenizeSubqueryOnOr(s)) |
| s = '' |
| |
| tokens.append(ast_pb2.QueryToken(token_type=LEFT_PAREN)) |
| elif ch == ')': |
| count -= 1 |
| |
| if count < 0: |
        # More closing parentheses than open parentheses.
| raise InvalidQueryError('Search query has unbalanced parentheses.') |
| |
| # Add subquery from before we hit this parenthesis. |
| tokens.extend(_TokenizeSubqueryOnOr(s)) |
| s = '' |
| |
| tokens.append(ast_pb2.QueryToken(token_type=RIGHT_PAREN)) |
| else: |
| s += ch |
| |
| if count != 0: |
| raise InvalidQueryError('Search query has unbalanced parentheses.') |
| |
| # Add any trailing tokens. |
| tokens.extend(_TokenizeSubqueryOnOr(s)) |
| |
| return tokens |
| |
| |
| def _TokenizeSubqueryOnOr(subquery): |
| # type: (str) -> Sequence[mrproto.ast_pb2.QueryToken] |
| """Helper to split a subquery by OR and convert the result into tokens. |
| |
| Args: |
| subquery: A string without parentheses to tokenize. |
| |
| Returns: |
| Tokens for the subquery with OR tokens separating query strings if |
| applicable. |
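
  Example:
    'A OR B' yields [SUBQUERY('A'), OR, SUBQUERY('B')], and ' OR ' alone
    yields [OR].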
| """ |
| if len(subquery) == 0: |
| return [] |
| |
| result = [] |
| fragments = subquery.split(OR_SYMBOL) |
| for f in fragments: |
| # Interleave the string fragments with OR tokens. |
| result.append(ast_pb2.QueryToken(token_type=SUBQUERY, value=f.strip())) |
| result.append(ast_pb2.QueryToken(token_type=OR)) |
| |
| # Remove trailing OR. |
| result.pop() |
| |
  # Trim empty strings at the beginning or end. E.g., if subquery is ' OR ',
  # we want the token list to be [OR], not ['', OR, ''].
| if len(result) > 1 and result[0].value == '': |
| result.pop(0) |
| if len(result) > 1 and result[-1].value == '': |
| result.pop() |
| return result |
| |
| |
| def _MultiplySubqueries(a, b): |
| # type: (Sequence[str], Sequence[str]) -> Sequence[str] |
| """Helper to AND subqueries from two separate lists. |
| |
| Args: |
| a: First list of subqueries. |
| b: Second list of subqueries. |
| |
| Returns: |
| List with n x m subqueries. |
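
  Example (illustrative doctest):
    >>> _MultiplySubqueries(['A', 'B'], ['C', 'D'])
    ['A C', 'A D', 'B C', 'B D']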
| """ |
  if not a:
    return b
  if not b:
    return a
| res = [] |
| for q1 in a: |
| for q2 in b: |
| # AND two subqueries together by concatenating them. |
| query = (q1.strip() + ' ' + q2.strip()).strip() |
| res.append(query) |
| return res |
| |
| |
| class PeekIterator: |
| """Simple iterator with peek() functionality. |
| |
| Used by QueryToSubqueries to maintain state easily across recursive calls. |
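
  Example (illustrative doctest):
    >>> it = PeekIterator(['a', 'b'])
    >>> it.peek()
    'a'
    >>> next(it)
    'a'
    >>> it.peek()
    'b'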
| """ |
| |
| def __init__(self, source): |
    # type: (Sequence[Any]) -> None
| self.__source = source |
| self.__i = 0 |
| |
| def peek(self): |
| # type: () -> Any |
| """Gets the next value in the iterator without side effects. |
| |
| Returns: |
| Next value in iterator. |
| |
| Raises: |
| StopIteration if you're at the end of the iterator. |
| """ |
| if self.__i >= len(self.__source): |
| raise StopIteration |
| return self.__source[self.__i] |
| |
| def __iter__(self): |
    # type: () -> PeekIterator
| """Return self to make iterator iterable.""" |
| return self |
| |
| def __repr__(self): |
| # type: () -> str |
| """Allow logging current iterator value for debugging.""" |
| try: |
| return str(self.peek()) |
| except StopIteration: |
| pass |
| return 'End of PeekIterator' |
| |
| def __next__(self): |
| # type: () -> Any |
| """Gets the next value in the iterator and increments pointer. |
| |
| Returns: |
| Next value in iterator. |
| |
| Raises: |
| StopIteration if you're at the end of the iterator. |
| """ |
| return self.next() |
| |
| def next(self): |
| # type: () -> Any |
| """Gets the next value in the iterator and increments pointer. |
| |
| For backwards compatibility with Python 2. |
| |
| Returns: |
| Next value in iterator. |
| |
| Raises: |
| StopIteration if you're at the end of the iterator. |
| """ |
| if self.__i >= len(self.__source): |
| raise StopIteration |
| value = self.__source[self.__i] |
| self.__i += 1 |
| return value |
| |
| |
| class Error(Exception): |
| """Base exception class for this package.""" |
| pass |
| |
| |
| class InvalidQueryError(Error): |
| """Error raised when an invalid query is requested.""" |
| pass |