Project import generated by Copybara.

GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/search/query2ast.py b/search/query2ast.py
new file mode 100644
index 0000000..235f9b3
--- /dev/null
+++ b/search/query2ast.py
@@ -0,0 +1,899 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+"""A set of functions that integrate the GAE search index with Monorail."""
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+import collections
+import datetime
+import logging
+import re
+import time
+
+from google.appengine.api import search
+
+from proto import ast_pb2
+from proto import tracker_pb2
+
+
+# TODO(jrobbins): Consider re-implementing this whole file by using a
+# BNF syntax specification and a parser generator or library.
+
+# encodings
+UTF8 = 'utf-8'
+
+# Operator used for OR statements.
+OR_SYMBOL = ' OR '
+
+# Token types used for parentheses parsing.
+SUBQUERY = ast_pb2.TokenType.SUBQUERY
+LEFT_PAREN = ast_pb2.TokenType.LEFT_PAREN
+RIGHT_PAREN = ast_pb2.TokenType.RIGHT_PAREN
+OR = ast_pb2.TokenType.OR
+
+# Field types and operators
+BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
+DATE = tracker_pb2.FieldTypes.DATE_TYPE
+NUM = tracker_pb2.FieldTypes.INT_TYPE
+TXT = tracker_pb2.FieldTypes.STR_TYPE
+APPROVAL = tracker_pb2.FieldTypes.APPROVAL_TYPE
+
+EQ = ast_pb2.QueryOp.EQ
+NE = ast_pb2.QueryOp.NE
+LT = ast_pb2.QueryOp.LT
+GT = ast_pb2.QueryOp.GT
+LE = ast_pb2.QueryOp.LE
+GE = ast_pb2.QueryOp.GE
+TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
+NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
+IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
+IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
+KEY_HAS = ast_pb2.QueryOp.KEY_HAS
+
+# Mapping from user query comparison operators to our internal representation.
+OPS = {
+    ':': TEXT_HAS,
+    '=': EQ,
+    '!=': NE,
+    '<': LT,
+    '>': GT,
+    '<=': LE,
+    '>=': GE,
+}
+
+# When the query has a leading minus, switch the operator for its opposite.
+NEGATED_OPS = {
+    EQ: NE,
+    NE: EQ,
+    LT: GE,
+    GT: LE,
+    LE: GT,
+    GE: LT,
+    TEXT_HAS: NOT_TEXT_HAS,
+    # IS_DEFINED is handled separately.
+    }
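+# E.g., a leading minus flips [stars>10] into stars <= 10 (GT becomes LE).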
+
+# This is a partial regular expression that matches all of our comparison
+# operators, such as =, !=, >, and <.  Longer ones are listed first so that the
+# shorter ones don't cause premature matches.
+OPS_PATTERN = '|'.join(
+    map(re.escape, sorted(list(OPS.keys()), key=lambda op: -len(op))))
+
+# This RE extracts search terms from a subquery string.
+TERM_RE = re.compile(
+    r'(-?"[^"]+")|'  # E.g., ["division by zero"]
+    r'(\S+(%s)[^ "]+)|'  # E.g., [stars>10]
+    r'(\w+(%s)"[^"]+")|'  # E.g., [summary:"memory leak"]
+    r'(-?[._\*\w][-._\*\w]+)'  # E.g., [-workaround]
+    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)
+
+# This RE is used to further decompose a comparison term into prefix, op, and
+# value.  E.g., [stars>10] or [is:open] or [summary:"memory leak"].  The prefix
+# can include a leading "-" to negate the comparison.
+OP_RE = re.compile(
+    r'^(?P<prefix>[-_.\w]*?)'
+    r'(?P<op>%s)'
+    r'(?P<value>([-@\w][-\*,./:<=>@\w]*|"[^"]*"))$' %
+    OPS_PATTERN,
+    flags=re.UNICODE)
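+
+# For illustration: OP_RE.match('stars>10') yields prefix='stars', op='>',
+# and value='10', while OP_RE.match('summary:"memory leak"') yields
+# prefix='summary', op=':', and value='"memory leak"'.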
+
+
+# Predefined issue fields passed to the query parser.
+_ISSUE_FIELDS_LIST = [
+    (ast_pb2.ANY_FIELD, TXT),
+    ('attachment', TXT),  # attachment file names
+    ('attachments', NUM),  # number of attachment files
+    ('blocked', BOOL),
+    ('blockedon', TXT),
+    ('blockedon_id', NUM),
+    ('blocking', TXT),
+    ('blocking_id', NUM),
+    ('cc', TXT),
+    ('cc_id', NUM),
+    ('comment', TXT),
+    ('commentby', TXT),
+    ('commentby_id', NUM),
+    ('component', TXT),
+    ('component_id', NUM),
+    ('description', TXT),
+    ('gate', TXT),
+    ('hotlist', TXT),
+    ('hotlist_id', NUM),
+    ('id', NUM),
+    ('is_spam', BOOL),
+    ('label', TXT),
+    ('label_id', NUM),
+    ('mergedinto', NUM),
+    ('mergedinto_id', NUM),
+    ('open', BOOL),
+    ('owner', TXT),
+    ('ownerbouncing', BOOL),
+    ('owner_id', NUM),
+    ('project', TXT),
+    ('reporter', TXT),
+    ('reporter_id', NUM),
+    ('spam', BOOL),
+    ('stars', NUM),
+    ('starredby', TXT),
+    ('starredby_id', NUM),
+    ('status', TXT),
+    ('status_id', NUM),
+    ('summary', TXT),
+    ]
+
+_DATE_FIELDS = (
+    'closed',
+    'modified',
+    'opened',
+    'ownermodified',
+    'ownerlastvisit',
+    'statusmodified',
+    'componentmodified',
+    )
+
+# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
+_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)
+
+_DATE_FIELD_SUFFIX_TO_OP = {
+    '-after': '>',
+    '-before': '<',
+}
+
+SET_BY_SUFFIX = '-by'
+SET_ON_SUFFIX = '-on'
+APPROVER_SUFFIX = '-approver'
+STATUS_SUFFIX = '-status'
+
+_APPROVAL_SUFFIXES = (
+    SET_BY_SUFFIX,
+    SET_ON_SUFFIX,
+    APPROVER_SUFFIX,
+    STATUS_SUFFIX,
+)
+
+BUILTIN_ISSUE_FIELDS = {
+    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
+    for f_name, f_type in _ISSUE_FIELDS_LIST}
+
+
+# Do not treat strings that start with the below as key:value search terms.
+# See bugs.chromium.org/p/monorail/issues/detail?id=419 for more detail.
+NON_OP_PREFIXES = (
+    'http:',
+    'https:',
+)
+
+
+def ParseUserQuery(
+    query, scope, builtin_fields, harmonized_config, warnings=None,
+    now=None):
+  # type: (str, str, Mapping[str, proto.tracker_pb2.FieldDef],
+  #   proto.tracker_pb2.ProjectIssueConfig, Sequence[str], int) ->
+  #     proto.ast_pb2.QueryAST
+  """Parse a user query and return a set of structure terms.
+
+  Args:
+    query: string with user's query.  E.g., 'Priority=High'.
+    scope: string search terms that define the scope in which the
+        query should be executed.  They are expressed in the same
+        user query language.  E.g., the selected canned query.
+    builtin_fields: dict {field_name: FieldDef(field_name, type)}
+        mapping field names to FieldDef objects for built-in fields.
+    harmonized_config: config for all the projects being searched.
+        @@@ custom field name is not unique in cross project search.
+         - custom_fields = {field_name: [fd, ...]}
+         - query build needs to OR each possible interpretation
+         - could be label in one project and field in another project.
+        @@@ what about searching across all projects?
+    warnings: optional list to accumulate warning messages.
+    now: optional timestamp for tests, otherwise time.time() is used.
+
+  Returns:
+    A QueryAST with conjunctions (usually just one), where each has a list of
+    Condition PBs with op, fields, str_values and int_values.  E.g., the query
+    [priority=high leak OR stars>100] over open issues would return
+    QueryAST(
+      Conjunction(Condition(EQ, [open_fd], [], [1]),
+                  Condition(EQ, [label_fd], ['priority-high'], []),
+                  Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
+      Conjunction(Condition(EQ, [open_fd], [], [1]),
+                  Condition(GT, [stars_fd], [], [100])))
+
+  Raises:
+    InvalidQueryError: If a problem was detected in the user's query.
+  """
+  if warnings is None:
+    warnings = []
+
+  # Convert the overall query into one or more OR'd subqueries.
+  subqueries = QueryToSubqueries(query)
+
+  # Make a dictionary of all fields: built-in + custom in each project.
+  combined_fields = collections.defaultdict(
+      list, {field_name: [field_def]
+             for field_name, field_def in builtin_fields.items()})
+  for fd in harmonized_config.field_defs:
+    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
+      # Only do non-enum fields because enums are stored as labels
+      combined_fields[fd.field_name.lower()].append(fd)
+      if fd.field_type == APPROVAL:
+        for approval_suffix in _APPROVAL_SUFFIXES:
+          combined_fields[fd.field_name.lower() + approval_suffix].append(fd)
+
+  conjunctions = [
+      _ParseConjunction(sq, scope, combined_fields, warnings, now=now)
+      for sq in subqueries]
+  return ast_pb2.QueryAST(conjunctions=conjunctions)
+
+
+def _ParseConjunction(subquery, scope, fields, warnings, now=None):
+  # type: (str, str, Mapping[str, proto.tracker_pb2.FieldDef], Sequence[str],
+  #     int) -> proto.ast_pb2.Condition
+  """Parse part of a user query into a Conjunction PB."""
+  scoped_query = ('%s %s' % (scope, subquery)).lower()
+  cond_strs = _ExtractConds(scoped_query, warnings)
+  conds = [_ParseCond(cond_str, fields, warnings, now=now)
+           for cond_str in cond_strs]
+  conds = [cond for cond in conds if cond]
+  return ast_pb2.Conjunction(conds=conds)
+
+
+def _ParseCond(cond_str, fields, warnings, now=None):
+  # type: (str, Mapping[str, proto.tracker_pb2.FieldDef], Sequence[str],
+  #     int) -> proto.ast_pb2.Condition
+  """Parse one user query condition string into a Condition PB."""
+  op_match = OP_RE.match(cond_str)
+  # Do not treat as key:value search terms if any of the special prefixes match.
+  special_prefixes_match = any(
+      cond_str.startswith(p) for p in NON_OP_PREFIXES)
+  if op_match and not special_prefixes_match:
+    prefix = op_match.group('prefix')
+    op = op_match.group('op')
+    val = op_match.group('value')
+    # Special case handling to continue to support old date query terms from
+    # code.google.com. See monorail:151 for more details.
+    if prefix.startswith(_DATE_FIELDS):
+      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
+        if prefix.endswith(date_suffix):
+          # Remove the literal suffix; str.rstrip() strips a character
+          # set, not a suffix, and could mangle some field names.
+          prefix = prefix[:-len(date_suffix)]
+          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
+    return _ParseStructuredTerm(prefix, op, val, fields, now=now)
+
+  # Treat the cond as a full-text search term, which might be negated.
+  if cond_str.startswith('-'):
+    op = NOT_TEXT_HAS
+    cond_str = cond_str[1:]
+  else:
+    op = TEXT_HAS
+
+  # Construct a full-text Query object as a dry-run to validate that
+  # the syntax is acceptable.
+  try:
+    _fts_query = search.Query(cond_str)
+  except search.QueryError:
+    warnings.append('Ignoring full-text term: %s' % cond_str)
+    return None
+
+  # Flag a potential user misunderstanding.
+  if cond_str.lower() in ('and', 'or', 'not'):
+    warnings.append(
+        'The only supported boolean operator is OR (all capitals).')
+
+  return ast_pb2.MakeCond(
+      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
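+
+
+# For illustration: _ParseCond('stars>10', fields, []) returns a structured
+# Condition, while _ParseCond('-leak', fields, []) returns a NOT_TEXT_HAS
+# Condition against the any-field.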
+
+
+def _ParseStructuredTerm(prefix, op_str, value, fields, now=None):
+  # type: (str, str, str, Mapping[str, proto.tracker_pb2.FieldDef]) ->
+  #     proto.ast_pb2.Condition
+  """Parse one user structured query term into an internal representation.
+
+  Args:
+    prefix: The query operator, usually a field name.  E.g., summary. It can
+      also be special operators like "is" to test boolean fields.
+    op_str: the comparison operator.  Usually ":" or "=", but can be any OPS.
+    value: the value to compare against, e.g., term to find in that field.
+    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.
+    now: optional timestamp for tests, otherwise time.time() is used.
+
+  Returns:
+    A Condition PB.
+  """
+  unquoted_value = value.strip('"')
+  # Quick-OR is a convenient way to write one condition that matches any one of
+  # multiple values, like set membership.  E.g., [Priority=High,Critical].
+  # Ignore empty values caused by duplicated or trailing commas. E.g.,
+  # [Priority=High,,Critical,] is equivalent to [Priority=High,Critical].
+  quick_or_vals = [v.strip() for v in unquoted_value.split(',') if v.strip()]
+
+  op = OPS[op_str]
+  negate = False
+  if prefix.startswith('-'):
+    negate = True
+    op = NEGATED_OPS.get(op, op)
+    prefix = prefix[1:]
+
+  if prefix == 'is' and unquoted_value in [
+      'open', 'blocked', 'spam', 'ownerbouncing']:
+    return ast_pb2.MakeCond(
+        NE if negate else EQ, fields[unquoted_value], [], [])
+
+  # Search entries with or without any value in the specified field.
+  if prefix == 'has':
+    op = IS_NOT_DEFINED if negate else IS_DEFINED
+    if '.' in unquoted_value:  # Possible search for phase field with any value.
+      phase_name, possible_field = unquoted_value.split('.', 1)
+      if possible_field in fields:
+        return ast_pb2.MakeCond(
+            op, fields[possible_field], [], [], phase_name=phase_name)
+    elif unquoted_value in fields:  # Look for that field with any value.
+      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
+    else:  # Look for any label with that prefix.
+      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])
+
+  # Search entries with certain gates.
+  if prefix == 'gate':
+    return ast_pb2.MakeCond(op, fields['gate'], quick_or_vals, [])
+
+  # Determine the hotlist query type: is_fields stays True unless the
+  # prefix is 'hotlist' and the first value contains no ':' (or there
+  # are no values at all).
+  is_fields = True
+  if prefix == 'hotlist':
+    try:
+      if ':' not in quick_or_vals[0]:
+        is_fields = False
+    except IndexError:
+      is_fields = False
+
+  phase_name = None
+  if '.' in prefix and is_fields:
+    split_prefix = prefix.split('.', 1)
+    if split_prefix[1] in fields:
+      phase_name, prefix = split_prefix
+
+  # search built-in and custom fields. E.g., summary.
+  if prefix in fields and is_fields:
+    # Note: if first matching field is date-type, we assume they all are.
+    # TODO(jrobbins): better handling for rare case where multiple projects
+    # define the same custom field name, and one is a date and another is not.
+    first_field = fields[prefix][0]
+    if first_field.field_type == DATE:
+      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
+      return ast_pb2.MakeCond(op, fields[prefix], [], date_values)
+    elif first_field.field_type == APPROVAL and prefix.endswith(SET_ON_SUFFIX):
+      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
+      return ast_pb2.MakeCond(
+          op,
+          fields[prefix], [],
+          date_values,
+          key_suffix=SET_ON_SUFFIX,
+          phase_name=phase_name)
+    else:
+      quick_or_ints = []
+      for qov in quick_or_vals:
+        try:
+          quick_or_ints.append(int(qov))
+        except ValueError:
+          pass
+      if first_field.field_type == APPROVAL:
+        for approval_suffix in _APPROVAL_SUFFIXES:
+          if prefix.endswith(approval_suffix):
+            return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
+                                    quick_or_ints, key_suffix=approval_suffix,
+                                    phase_name=phase_name)
+      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
+                              quick_or_ints, phase_name=phase_name)
+
+  # Since it is not a field, treat it as a label prefix, e.g., Priority.
+  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
+  # Convert substring match to key-value match if user typed 'foo:bar'.
+  if op == TEXT_HAS:
+    op = KEY_HAS
+  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
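+
+
+# For illustration, assuming no custom field named 'priority' is defined:
+# [priority=high,critical] falls through to the label case above and becomes
+# MakeCond(EQ, fields['label'], ['priority-high', 'priority-critical'], []),
+# and [priority:high] becomes a KEY_HAS cond on ['priority-high'].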
+
+
+def _ExtractConds(query, warnings):
+  # type: (str, Sequence[str]) -> Sequence[str]
+  """Parse a query string into a list of individual condition strings.
+
+  Args:
+    query: UTF-8 encoded search query string.
+    warnings: list to accumulate warning messages.
+
+  Returns:
+    A list of query condition strings.
+  """
+  # Search the query for distinct terms.
+  term_matches = TERM_RE.findall(query)
+
+  terms = []
+  for (phrase, word_label, _op1, phrase_label, _op2,
+       word) in term_matches:
+    # Case 1: Quoted phrases, e.g., ["hot dog"].
+    if phrase_label or phrase:
+      terms.append(phrase_label or phrase)
+
+    # Case 2: Comparisons
+    elif word_label:
+      special_prefixes_match = any(
+          word_label.startswith(p) for p in NON_OP_PREFIXES)
+      match = OP_RE.match(word_label)
+      if match and not special_prefixes_match:
+        label = match.group('prefix')
+        op = match.group('op')
+        word = match.group('value')
+        terms.append('%s%s"%s"' % (label, op, word))
+      else:
+        # It looked like a key:value cond, but not exactly, so treat it
+        # as fulltext search.  It is probably a tiny bit of source code.
+        terms.append('"%s"' % word_label)
+
+    # Case 3: Simple words.
+    elif word:
+      terms.append(word)
+
+    else:  # pragma: no cover
+      warnings.append('Unparsable search term')
+
+  return terms
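+
+
+# For illustration: _ExtractConds('stars>10 "memory leak"', []) returns
+# ['stars>"10"', '"memory leak"'].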
+
+
+def _ParseDateValue(val, now=None):
+  # type: (str, int) -> int
+  """Convert the user-entered date into timestamp."""
+  # Support timestamp value such as opened>1437671476
+  try:
+    return int(val)
+  except ValueError:
+    pass
+
+  # TODO(jrobbins): future: take timezones into account.
+  # TODO(jrobbins): for now, explain to users that "today" is
+  # actually now: the current time, not 12:01am in their timezone.
+  # In fact, it is not very useful because everything in the system
+  # happened before the current time.
+  if val == 'today':
+    return _CalculatePastDate(0, now=now)
+  elif val.startswith('today-'):
+    try:
+      days_ago = int(val.split('-')[1])
+    except ValueError:
+      raise InvalidQueryError('Could not parse date: ' + val)
+    return _CalculatePastDate(days_ago, now=now)
+
+  try:
+    if '/' in val:
+      year, month, day = [int(x) for x in val.split('/')]
+    elif '-' in val:
+      year, month, day = [int(x) for x in val.split('-')]
+    else:
+      raise InvalidQueryError('Could not parse date: ' + val)
+  except ValueError:
+    raise InvalidQueryError('Could not parse date: ' + val)
+
+  try:
+    return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
+  except ValueError:
+    raise InvalidQueryError('Could not parse date: ' + val)
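+
+
+# For illustration: _ParseDateValue('1437671476') returns the timestamp
+# unchanged, while _ParseDateValue('2015/7/23') converts that local date
+# via time.mktime, so the result depends on the server's timezone.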
+
+
+def _CalculatePastDate(days_ago, now=None):
+  # type: (int, int) -> int
+  """Calculates the timestamp N days ago from now."""
+  if now is None:
+    now = int(time.time())
+  ts = now - days_ago * 24 * 60 * 60
+  return ts
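+
+
+# For illustration: _CalculatePastDate(2, now=1437671476) returns
+# 1437671476 - 2 * 86400 = 1437498676.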
+
+
+def QueryToSubqueries(query):
+  # type: (str) -> Sequence[str]
+  """Splits a query into smaller queries based on Monorail's search syntax.
+
+  This function handles parsing parentheses and OR statements in Monorail's
+  search syntax. By doing this parsing for OR statements and parentheses up
+  front in FrontendSearchPipeline, we are able to convert complex queries
+  with lots of ORs into smaller, more easily cacheable query terms.
+
+  The resulting subqueries should collectively return the same query results
+  as the initial input query without containing any ORs or parentheses,
+  allowing later search layers to parse queries without worrying about ORs
+  or parentheses.
+
+  Some examples of possible queries and their expected output:
+
+  - '(A OR B) (C OR D) OR (E OR F)' -> ['A C', 'A D', 'B C', 'B D', 'E', 'F']
+  - '(A) OR (B)' -> ['A', 'B']
+  - '(A ((C) OR (D OR (E OR F))))' -> ['A C', 'A D', 'A E', 'A F']
+
+  Here A, B, C, D, etc. could be any list of conjunctions, e.g., "owner:me",
+  "Pri=1", "hello world Hotlist=test", or "label!=a11y".
+
+  Note: Monorail implicitly ANDs any query terms separated by a space. For
+  the most part, AND functionality is handled at a later layer in search
+  processing. However, it becomes important here because a parentheses
+  group can be either ANDed or ORed with the terms that surround it.
+
+  The _MultiplySubqueries helper is used to AND the results of different
+  groups together whereas concatenating lists is used to OR subqueries
+  together.
+
+  Args:
+    query: The initial query that was sent to the search.
+
+  Returns:
+    List of query fragments to be independently processed as search terms.
+
+  Raises:
+    InvalidQueryError if parentheses are unmatched.
+  """
+  tokens = _ValidateAndTokenizeQuery(query)
+
+  # Using an iterator allows us to keep our current loop position across
+  # helpers. This makes recursion a lot easier.
+  token_iterator = PeekIterator(tokens)
+
+  subqueries = _ParseQuery(token_iterator)
+
+  if not len(subqueries):
+    # Several cases, such as an empty query or a query with only parentheses
+    # will result in an empty set of subqueries. In these cases, we still want
+    # to give the search pipeline a single empty query to process.
+    return ['']
+
+  return subqueries
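+
+
+# For illustration: QueryToSubqueries('(A OR B) C') returns ['A C', 'B C'],
+# and QueryToSubqueries('') returns [''].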
+
+
+def _ParseQuery(token_iterator):
+  # type: (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
+  """Recursive helper to convert query tokens into a list of subqueries.
+
+  Parses a Query based on the following grammar (EBNF):
+
+    Query             := OrGroup { [OrOperator] OrGroup }
+    OrGroup           := AndGroup { AndGroup }
+    AndGroup          := Subquery | ParenthesesGroup
+    ParenthesesGroup  := "(" Query ")"
+    Subquery          := /.+/
+    OrOperator        := " OR "
+
+  An important nuance is that two groups can be next to each other, separated
+  only by a word boundary (i.e., a space or parenthesis). In this case, they are
+  implicitly ANDed. In practice, because unparenthesized fragments ANDed by
+  spaces are stored as single tokens, we only need to handle the AND case when
+  a parentheses group is implicitly ANDed with an adjacent group.
+
+  Order of precedence is implemented by recursing through OR groups before
+  recursing through AND groups.
+
+  Args:
+    token_iterator: Iterator over a list of query tokens.
+
+  Returns:
+    List of query fragments to be processed as search terms.
+
+  Raises:
+    InvalidQueryError if the given tokens do not follow our search grammar.
+  """
+  subqueries = []
+  try:
+    if token_iterator.peek().token_type == OR:
+      # Edge case: Ignore empty OR groups at the start of a ParenthesesGroup.
+      # E.g., "(OR A)" will be processed as "A".
+      next(token_iterator)
+
+    subqueries = _ParseOrGroup(token_iterator)
+
+    while token_iterator.peek().token_type == OR:
+      # Consume the OR token without doing anything with it.
+      next(token_iterator)
+
+      next_token = token_iterator.peek()
+      if next_token.token_type == RIGHT_PAREN:
+        # Edge case: Ignore empty OR groups at the end of a ParenthesesGroup.
+        # E.g., "(A OR)" will be processed as "A".
+        return subqueries
+
+      next_subqueries = _ParseOrGroup(token_iterator)
+
+      # Concatenate results of OR groups together.
+      subqueries = subqueries + next_subqueries
+
+  except StopIteration:
+    pass
+  # Return when we've reached the end of the string.
+  return subqueries
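+
+
+# For illustration: given the token stream for 'A OR B C', _ParseQuery
+# returns ['A', 'B C']; 'B C' stays a single subquery because terms joined
+# only by spaces are tokenized together and ANDed by later search layers.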
+
+
+def _ParseOrGroup(token_iterator):
+  # type: (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
+  """Recursive helper to convert a single "OrGroup" into subqueries.
+
+  An OrGroup here is based on the following grammar:
+
+    Query             := OrGroup { [OrOperator] OrGroup }
+    OrGroup           := AndGroup { AndGroup }
+    AndGroup          := Subquery | ParenthesesGroup
+    ParenthesesGroup  := "(" Query ")"
+    Subquery          := /.+/
+    OrOperator        := " OR "
+
+  Args:
+    token_iterator: Iterator over a list of query tokens.
+
+  Returns:
+    List of query fragments to be processed as search terms.
+
+  Raises:
+    InvalidQueryError if the given tokens do not follow our search grammar.
+  """
+  subqueries = _ParseAndGroup(token_iterator)
+
+  try:
+    # Iterate until there are no more AND groups left to see.
+    # Subquery or left parentheses are the possible starts of an AndGroup.
+    while (token_iterator.peek().token_type == SUBQUERY or
+           token_iterator.peek().token_type == LEFT_PAREN):
+
+      # Find subqueries from the next AND group.
+      next_subqueries = _ParseAndGroup(token_iterator)
+
+      # Multiply all results across AND groups together.
+      subqueries = _MultiplySubqueries(subqueries, next_subqueries)
+  except StopIteration:
+    pass
+
+  return subqueries
+
+
+def _ParseAndGroup(token_iterator):
+  # type: (Sequence[proto.ast_pb2.QueryToken]) -> Sequence[str]
+  """Recursive helper to convert a single "AndGroup" into subqueries.
+
+  An AndGroup here is based on the following grammar:
+
+    Query             := OrGroup { [OrOperator] OrGroup }
+    OrGroup           := AndGroup { AndGroup }
+    AndGroup          := Subquery | ParenthesesGroup
+    ParenthesesGroup  := "(" Query ")"
+    Subquery          := /.+/
+    OrOperator        := " OR "
+
+  Args:
+    token_iterator: Iterator over a list of query tokens.
+
+  Returns:
+    List of query fragments to be processed as search terms.
+
+  Raises:
+    InvalidQueryError if the given tokens do not follow our search grammar.
+  """
+  try:
+    token = next(token_iterator)
+    if token.token_type == LEFT_PAREN:
+      if token_iterator.peek().token_type == RIGHT_PAREN:
+        # Don't recurse into the ParenthesesGroup if there's nothing inside.
+        next(token_iterator)
+        return []
+
+      # Recurse into the ParenthesesGroup.
+      subqueries = _ParseQuery(token_iterator)
+
+      # Next token should be a right parenthesis.
+      next(token_iterator)
+
+      return subqueries
+    elif token.token_type == SUBQUERY:
+      return [token.value]
+    else:
+      # This should not happen if other QueryToSubqueries helpers are working
+      # properly.
+      raise InvalidQueryError('Inputted tokens do not follow grammar.')
+  except StopIteration:
+    pass
+  return []
+
+
+def _ValidateAndTokenizeQuery(query):
+  # type: (str) -> Sequence[proto.ast_pb2.QueryToken]
+  """Converts the input query into a set of tokens for easier parsing.
+
+  Tokenizing the query string before parsing lets us avoid many
+  string manipulations while parsing, which simplifies our later code.
+
+  Args:
+    query: Query to tokenize.
+
+  Returns:
+    List of Token objects for use in query processing.
+
+  Raises:
+    InvalidQueryError if parentheses are unmatched.
+  """
+  tokens = []  # Function result
+  count = 0  # Used for checking if parentheses are balanced
+  s = ''  # Records current string fragment. Cleared when a token is added.
+
+  for ch in query:
+    if ch == '(':
+      count += 1
+
+      # Add subquery from before we hit this parenthesis.
+      tokens.extend(_TokenizeSubqueryOnOr(s))
+      s = ''
+
+      tokens.append(ast_pb2.QueryToken(token_type=LEFT_PAREN))
+    elif ch == ')':
+      count -= 1
+
+      if count < 0:
+        # More closing parentheses than open parentheses.
+        raise InvalidQueryError('Search query has unbalanced parentheses.')
+
+      # Add subquery from before we hit this parenthesis.
+      tokens.extend(_TokenizeSubqueryOnOr(s))
+      s = ''
+
+      tokens.append(ast_pb2.QueryToken(token_type=RIGHT_PAREN))
+    else:
+      s += ch
+
+  if count != 0:
+    raise InvalidQueryError('Search query has unbalanced parentheses.')
+
+  # Add any trailing tokens.
+  tokens.extend(_TokenizeSubqueryOnOr(s))
+
+  return tokens
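+
+
+# For illustration: _ValidateAndTokenizeQuery('(A OR B) C') returns
+# [LEFT_PAREN, SUBQUERY('A'), OR, SUBQUERY('B'), RIGHT_PAREN, SUBQUERY('C')],
+# while _ValidateAndTokenizeQuery('(A') raises InvalidQueryError.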
+
+
+def _TokenizeSubqueryOnOr(subquery):
+  # type: (str) -> Sequence[proto.ast_pb2.QueryToken]
+  """Helper to split a subquery by OR and convert the result into tokens.
+
+  Args:
+    subquery: A string without parentheses to tokenize.
+
+  Returns:
+    Tokens for the subquery with OR tokens separating query strings if
+    applicable.
+  """
+  if len(subquery) == 0:
+    return []
+
+  result = []
+  fragments = subquery.split(OR_SYMBOL)
+  for f in fragments:
+    # Interleave the string fragments with OR tokens.
+    result.append(ast_pb2.QueryToken(token_type=SUBQUERY, value=f.strip()))
+    result.append(ast_pb2.QueryToken(token_type=OR))
+
+  # Remove trailing OR.
+  result.pop()
+
+  # Trim empty strings at the beginning or end. E.g., if subquery is ' OR ',
+  # we want the list to be ['OR'], not ['', 'OR', ''].
+  if len(result) > 1 and result[0].value == '':
+    result.pop(0)
+  if len(result) > 1 and result[-1].value == '':
+    result.pop()
+  return result
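+
+
+# For illustration: _TokenizeSubqueryOnOr('A OR B') returns
+# [SUBQUERY('A'), OR, SUBQUERY('B')], and _TokenizeSubqueryOnOr(' OR ')
+# returns just [OR].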
+
+
+def _MultiplySubqueries(a, b):
+  # type: (Sequence[str], Sequence[str]) -> Sequence[str]
+  """Helper to AND subqueries from two separate lists.
+
+  Args:
+    a: First list of subqueries.
+    b: Second list of subqueries.
+
+  Returns:
+    List with n x m subqueries.
+  """
+  if not len(a):
+    return b
+  if not len(b):
+    return a
+  res = []
+  for q1 in a:
+    for q2 in b:
+      # AND two subqueries together by concatenating them.
+      query = (q1.strip() + ' ' + q2.strip()).strip()
+      res.append(query)
+  return res
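+
+
+# For illustration: _MultiplySubqueries(['A', 'B'], ['C', 'D']) returns
+# ['A C', 'A D', 'B C', 'B D'], the cross product described in
+# QueryToSubqueries.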
+
+
+class PeekIterator:
+  """Simple iterator with peek() functionality.
+
+  Used by QueryToSubqueries to maintain state easily across recursive calls.
+  """
+
+  def __init__(self, source):
+    # type: (Sequence[Any]) -> None
+    self.__source = source
+    self.__i = 0
+
+  def peek(self):
+    # type: () -> Any
+    """Gets the next value in the iterator without side effects.
+
+    Returns:
+      Next value in iterator.
+
+    Raises:
+      StopIteration if you're at the end of the iterator.
+    """
+    if self.__i >= len(self.__source):
+      raise StopIteration
+    return self.__source[self.__i]
+
+  def __iter__(self):
+    # type: () -> PeekIterator
+    """Return self to make iterator iterable."""
+    return self
+
+  def __repr__(self):
+    # type: () -> str
+    """Allow logging current iterator value for debugging."""
+    try:
+      return str(self.peek())
+    except StopIteration:
+      pass
+    return 'End of PeekIterator'
+
+  def next(self):
+    # type: () -> Any
+    """Gets the next value in the iterator and increments pointer.
+
+    Returns:
+      Next value in iterator.
+
+    Raises:
+      StopIteration if you're at the end of the iterator.
+    """
+    if self.__i >= len(self.__source):
+      raise StopIteration
+    value = self.__source[self.__i]
+    self.__i += 1
+    return value
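+
+
+# For illustration: it = PeekIterator(['a', 'b']); it.peek() returns 'a'
+# without advancing, and it.next() then returns 'a' and advances to 'b'.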
+
+
+class Error(Exception):
+  """Base exception class for this package."""
+  pass
+
+
+class InvalidQueryError(Error):
+  """Error raised when an invalid query is requested."""
+  pass