# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""A set of functions that integrate the GAE search index with Monorail."""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import collections
import datetime
import logging
import re
import time

from google.appengine.api import search

from mrproto import ast_pb2
from mrproto import tracker_pb2


# TODO(jrobbins): Consider re-implementing this whole file by using a
# BNF syntax specification and a parser generator or library.

# encodings
UTF8 = 'utf-8'

# Operator used for OR statements.
OR_SYMBOL = ' OR '

# Token types used for parentheses parsing.
SUBQUERY = ast_pb2.TokenType.SUBQUERY
LEFT_PAREN = ast_pb2.TokenType.LEFT_PAREN
RIGHT_PAREN = ast_pb2.TokenType.RIGHT_PAREN
OR = ast_pb2.TokenType.OR

# Field types and operators
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE
APPROVAL = tracker_pb2.FieldTypes.APPROVAL_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
}

# When the query has a leading minus, switch the operator for its opposite.
NEGATED_OPS = {
    EQ: NE,
    NE: EQ,
    LT: GE,
    GT: LE,
    LE: GT,
    GE: LT,
    TEXT_HAS: NOT_TEXT_HAS,
    # IS_DEFINED is handled separately.
}
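
# For illustration (not part of the original logic): a leading minus on a
# term flips its operator via NEGATED_OPS. E.g., for [-stars>10]:
#
#   op = OPS['>']                 # GT
#   op = NEGATED_OPS.get(op, op)  # LE, i.e., stars<=10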

# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones are listed first so that
# the shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(list(OPS.keys()), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|'  # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|'  # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|'  # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)'  # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)
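
# For illustration: findall() returns one 6-tuple per term, with the matched
# text in whichever group applied. E.g.:
#
#   TERM_RE.findall('stars>10 "hot dog" -foo')
#   # -> [('', 'stars>10', '>', '', '', ''),
#   #     ('"hot dog"', '', '', '', '', ''),
#   #     ('', '', '', '', '', '-foo')]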

# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_.\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-@\w][-\*,./:<=>@\w]*|"[^"]*"))$' %
    OPS_PATTERN,
    flags=re.UNICODE)
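
# For illustration: OP_RE decomposes one comparison term into named groups.
#
#   m = OP_RE.match('-summary:"memory leak"')
#   # m.group('prefix') == '-summary'
#   # m.group('op') == ':'
#   # m.group('value') == '"memory leak"'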


# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT),  # attachment file names
    ('attachments', NUM),  # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('gate', TXT),
    ('hotlist', TXT),
    ('hotlist_id', NUM),
    ('id', NUM),
    ('is_spam', BOOL),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('mergedinto_id', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('ownerbouncing', BOOL),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
]

_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
    'ownermodified',
    'ownerlastvisit',
    'statusmodified',
    'componentmodified',
)

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
}
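
# For illustration: these suffixes let _ParseCond keep supporting old
# code.google.com style date terms. E.g., [modified-after:2020/1/1] is
# rewritten to the equivalent of [modified>2020/1/1] before parsing.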

SET_BY_SUFFIX = '-by'
SET_ON_SUFFIX = '-on'
APPROVER_SUFFIX = '-approver'
STATUS_SUFFIX = '-status'

_APPROVAL_SUFFIXES = (
    SET_BY_SUFFIX,
    SET_ON_SUFFIX,
    APPROVER_SUFFIX,
    STATUS_SUFFIX,
)

BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}


# Do not treat strings that start with the below as key:value search terms.
# See bugs.chromium.org/p/monorail/issues/detail?id=419 for more detail.
NON_OP_PREFIXES = (
    'http:',
    'https:',
)


def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None,
    now=None):
  # type: (str, str, Mapping[str, mrproto.tracker_pb2.FieldDef],
  #     mrproto.tracker_pb2.ProjectIssueConfig, Sequence[str], int) ->
  #     mrproto.ast_pb2.QueryAST
  """Parse a user query and return a set of structured query terms.

  Args:
    query: string with the user's query. E.g., 'Priority=High'.
    scope: string of search terms that define the scope in which the
        query should be executed. They are expressed in the same
        user query language, e.g., the currently selected canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
        mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
        @@@ custom field name is not unique in cross project search.
          - custom_fields = {field_name: [fd, ...]}
          - query build needs to OR each possible interpretation
          - could be label in one project and field in another project.
        @@@ what about searching across all projects?
    warnings: optional list to accumulate warning messages.
    now: optional timestamp for tests, otherwise time.time() is used.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list of
    Condition PBs with op, fields, str_values and int_values. E.g., the query
    [priority=high leak OR stars>100] over open issues would return
    QueryAST(
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(EQ, [label_fd], ['priority-high'], []),
                  Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = QueryToSubqueries(query)

  # Make a dictionary of all fields: built-in + custom in each project.
  combined_fields = collections.defaultdict(
      list, {field_name: [field_def]
             for field_name, field_def in builtin_fields.items()})
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      # Only include non-enum fields because enums are stored as labels.
      combined_fields[fd.field_name.lower()].append(fd)
      if fd.field_type == APPROVAL:
        for approval_suffix in _APPROVAL_SUFFIXES:
          combined_fields[fd.field_name.lower() + approval_suffix].append(fd)

  conjunctions = [
      _ParseConjunction(sq, scope, combined_fields, warnings, now=now)
      for sq in subqueries]
  return ast_pb2.QueryAST(conjunctions=conjunctions)


def _ParseConjunction(subquery, scope, fields, warnings, now=None):
  # type: (str, str, Mapping[str, mrproto.tracker_pb2.FieldDef], Sequence[str],
  #     int) -> mrproto.ast_pb2.Condition
  """Parse part of a user query into a Conjunction PB."""
  scoped_query = ('%s %s' % (scope, subquery)).lower()
  cond_strs = _ExtractConds(scoped_query, warnings)
  conds = [_ParseCond(cond_str, fields, warnings, now=now)
           for cond_str in cond_strs]
  conds = [cond for cond in conds if cond]
  return ast_pb2.Conjunction(conds=conds)


def _ParseCond(cond_str, fields, warnings, now=None):
  # type: (str, Mapping[str, mrproto.tracker_pb2.FieldDef], Sequence[str],
  #     int) -> mrproto.ast_pb2.Condition
  """Parse one user query condition string into a Condition PB."""
  op_match = OP_RE.match(cond_str)
  # Do not treat this as a key:value search term if any of the special
  # prefixes match.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # code.google.com. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
        if prefix.endswith(date_suffix):
          # Slice off the exact suffix. rstrip() would strip by character
          # set and could eat trailing characters of the field name itself.
          prefix = prefix[:-len(date_suffix)]
          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
    return _ParseStructuredTerm(prefix, op, val, fields, now=now)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Construct a full-text Query object as a dry-run to validate that
  # the syntax is acceptable.
  try:
    _fts_query = search.Query(cond_str)
  except search.QueryError:
    warnings.append('Ignoring full-text term: %s' % cond_str)
    return None

  # Flag a potential user misunderstanding.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
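
# For illustration: given the built-in fields above,
#
#   _ParseCond('-has:owner', fields, [])
#   # -> MakeCond(IS_NOT_DEFINED, fields['owner'], [], [])
#
#   _ParseCond('leak', fields, [])
#   # -> MakeCond(TEXT_HAS, [any_field_fd], ['leak'], [])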


def _ParseStructuredTerm(prefix, op_str, value, fields, now=None):
  # type: (str, str, str, Mapping[str, mrproto.tracker_pb2.FieldDef]) ->
  #     mrproto.ast_pb2.Condition
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name. E.g., summary. It can
        also be a special operator like "is" to test boolean fields.
    op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.
    now: optional timestamp for tests, otherwise time.time() is used.

  Returns:
    A Condition PB.
  """
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one of
  # multiple values, like set membership. E.g., [Priority=High,Critical].
  # Ignore empty values caused by duplicated or trailing commas. E.g.,
  # [Priority=High,,Critical,] is equivalent to [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',') if v.strip()]

  op = OPS[op_str]
  negate = False
  if prefix.startswith('-'):
    negate = True
    op = NEGATED_OPS.get(op, op)
    prefix = prefix[1:]

  if prefix == 'is' and unquoted_value in [
      'open', 'blocked', 'spam', 'ownerbouncing']:
    return ast_pb2.MakeCond(
        NE if negate else EQ, fields[unquoted_value], [], [])

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if '.' in unquoted_value:  # Possible search for phase field with any value.
      phase_name, possible_field = unquoted_value.split('.', 1)
      if possible_field in fields:
        return ast_pb2.MakeCond(
            op, fields[possible_field], [], [], phase_name=phase_name)
    elif unquoted_value in fields:  # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else:  # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  # Search entries with certain gates.
  if prefix == 'gate':
    return ast_pb2.MakeCond(op, fields['gate'], quick_or_vals, [])
  # Determine the hotlist query type. is_fields stays True unless the prefix
  # is 'hotlist' and the first value has no ':' (or there are no values).
  is_fields = True
  if prefix == 'hotlist':
    try:
      if ':' not in quick_or_vals[0]:
        is_fields = False
    except IndexError:
      is_fields = False

  phase_name = None
  if '.' in prefix and is_fields:
    split_prefix = prefix.split('.', 1)
    if split_prefix[1] in fields:
      phase_name, prefix = split_prefix

  # Search built-in and custom fields. E.g., summary.
  if prefix in fields and is_fields:
    # Note: if the first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for the rare case where multiple projects
    # define the same custom field name, and one is a date and another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
      return ast_pb2.MakeCond(op, fields[prefix], [], date_values)
    elif first_field.field_type == APPROVAL and prefix.endswith(SET_ON_SUFFIX):
      date_values = [_ParseDateValue(val, now=now) for val in quick_or_vals]
      return ast_pb2.MakeCond(
          op,
          fields[prefix], [],
          date_values,
          key_suffix=SET_ON_SUFFIX,
          phase_name=phase_name)
    else:
      quick_or_ints = []
      for qov in quick_or_vals:
        try:
          quick_or_ints.append(int(qov))
        except ValueError:
          pass
      if first_field.field_type == APPROVAL:
        for approval_suffix in _APPROVAL_SUFFIXES:
          if prefix.endswith(approval_suffix):
            return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
                                    quick_or_ints, key_suffix=approval_suffix,
                                    phase_name=phase_name)
      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals,
                              quick_or_ints, phase_name=phase_name)

  # Since it is not a field, treat it as a label. E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if the user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
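
# For illustration (field names and values are already lowercased by
# _ParseConjunction before reaching this function):
#
#   _ParseStructuredTerm('stars', '>', '100', fields)
#   # -> MakeCond(GT, fields['stars'], ['100'], [100])
#
#   _ParseStructuredTerm('priority', '=', 'high,critical', fields)
#   # 'priority' is not a field, so it becomes a label condition:
#   # -> MakeCond(EQ, fields['label'], ['priority-high', 'priority-critical'],
#   #             [])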


def _ExtractConds(query, warnings):
  # type: (str, Sequence[str]) -> Sequence[str]
  """Parse a query string into a list of individual condition strings.

  Args:
    query: UTF-8 encoded search query string.
    warnings: list to accumulate warning messages.

  Returns:
    A list of query condition strings.
  """
  # Convert to unicode then search for distinct terms.
  term_matches = TERM_RE.findall(query)

  terms = []
  for (phrase, word_label, _op1, phrase_label, _op2,
       word) in term_matches:
    # Case 1: Quoted phrases, e.g., ["hot dog"].
    if phrase_label or phrase:
      terms.append(phrase_label or phrase)

    # Case 2: Comparisons.
    elif word_label:
      special_prefixes_match = any(
          word_label.startswith(p) for p in NON_OP_PREFIXES)
      match = OP_RE.match(word_label)
      if match and not special_prefixes_match:
        label = match.group('prefix')
        op = match.group('op')
        word = match.group('value')
        terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as full-text search. It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    # Case 3: Simple words.
    elif word:
      terms.append(word)

    else:  # pragma: no coverage
      warnings.append('Unparsable search term')

  return terms
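
# For illustration:
#
#   _ExtractConds('stars>10 "hot dog" -foo', warnings=[])
#   # -> ['stars>"10"', '"hot dog"', '-foo']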


def _ParseDateValue(val, now=None):
  # type: (str, int) -> int
  """Convert the user-entered date into a timestamp."""
  # Support timestamp values such as opened>1437671476.
  try:
    return int(val)
  except ValueError:
    pass

  # TODO(jrobbins): future: take timezones into account.
  # TODO(jrobbins): for now, explain to users that "today" is
  # actually now: the current time, not 12:01am in their timezone.
  # In fact, it is not very useful because everything in the system
  # happened before the current time.
  if val == 'today':
    return _CalculatePastDate(0, now=now)
  elif val.startswith('today-'):
    try:
      days_ago = int(val.split('-')[1])
    except ValueError:
      raise InvalidQueryError('Could not parse date: ' + val)
    return _CalculatePastDate(days_ago, now=now)

  try:
    if '/' in val:
      year, month, day = [int(x) for x in val.split('/')]
    elif '-' in val:
      year, month, day = [int(x) for x in val.split('-')]
    else:
      raise InvalidQueryError('Could not parse date: ' + val)
  except ValueError:
    raise InvalidQueryError('Could not parse date: ' + val)

  try:
    return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
  except ValueError:
    raise InvalidQueryError('Could not parse date: ' + val)


def _CalculatePastDate(days_ago, now=None):
  # type: (int, int) -> int
  """Calculates the timestamp N days ago from now."""
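  # E.g., days_ago=2 with now=1600000000 gives 1600000000 - 172800 = 1599827200.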
  if now is None:
    now = int(time.time())
  ts = now - days_ago * 24 * 60 * 60
  return ts


def QueryToSubqueries(query):
  # type: (str) -> Sequence[str]
  """Splits a query into smaller queries based on Monorail's search syntax.

  This function handles parsing parentheses and OR statements in Monorail's
  search syntax. By doing this parsing for OR statements and parentheses up
  front in FrontendSearchPipeline, we are able to convert complex queries
  with lots of ORs into smaller, more easily cacheable query terms.

  The output subqueries should collectively return the same query results
  as the initial input query without containing any ORs or parentheses,
  allowing later search layers to parse queries without worrying about ORs
  or parentheses.

  Some examples of possible queries and their expected output:

  - '(A OR B) (C OR D) OR (E OR F)' -> ['A C', 'A D', 'B C', 'B D', 'E', 'F']
  - '(A) OR (B)' -> ['A', 'B']
  - '(A ((C) OR (D OR (E OR F))))' -> ['A C', 'A D', 'A E', 'A F']

  where A, B, C, D, etc. could be any list of conjunctions, e.g., "owner:me",
  "Pri=1", "hello world Hotlist=test", "label!=a11y", etc.

  Note: Monorail implicitly ANDs any query terms separated by a space. For
  the most part, AND functionality is handled at a later layer in search
  processing. However, this case becomes important here when considering the
  fact that a parentheses group can either be ANDed or ORed with the terms
  that surround it.

  The _MultiplySubqueries helper is used to AND the results of different
  groups together, whereas concatenating lists is used to OR subqueries
  together.

  Args:
    query: The initial query that was sent to the search.

  Returns:
    List of query fragments to be independently processed as search terms.

  Raises:
    InvalidQueryError: If parentheses are unmatched.
  """
  tokens = _ValidateAndTokenizeQuery(query)

  # Using an iterator allows us to keep our current loop position across
  # helpers. This makes recursion a lot easier.
  token_iterator = PeekIterator(tokens)

  subqueries = _ParseQuery(token_iterator)

  if not len(subqueries):
    # Several cases, such as an empty query or a query with only parentheses,
    # will result in an empty set of subqueries. In these cases, we still want
    # to give the search pipeline a single empty query to process.
    return ['']

  return subqueries
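
# For illustration:
#
#   QueryToSubqueries('(A OR B) C')
#   # -> ['A C', 'B C']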


def _ParseQuery(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert query tokens into a list of subqueries.

  Parses a Query based on the following grammar (EBNF):

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  An important nuance is that two groups can be next to each other, separated
  only by a word boundary (i.e., a space or parentheses). In this case, they
  are implicitly ANDed. In practice, because unparenthesized fragments ANDed
  by spaces are stored as single tokens, we only need to handle the AND case
  when a parentheses group is implicitly ANDed with an adjacent group.

  Order of precedence is implemented by recursing through OR groups before
  recursing through AND groups.

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError: If tokens were input in a format that does not follow
        our search grammar.
  """
  subqueries = []
  try:
    if token_iterator.peek().token_type == OR:
      # Edge case: Ignore empty OR groups at the start of a ParenthesesGroup.
      # E.g., "(OR A)" will be processed as "A".
      next(token_iterator)

    subqueries = _ParseOrGroup(token_iterator)

    while token_iterator.peek().token_type == OR:
      # Consume the OR token without doing anything with it.
      next(token_iterator)

      next_token = token_iterator.peek()
      if next_token.token_type == RIGHT_PAREN:
        # Edge case: Ignore empty OR groups at the end of a ParenthesesGroup.
        # E.g., "(A OR)" will be processed as "A".
        return subqueries

      next_subqueries = _ParseOrGroup(token_iterator)

      # Concatenate the results of OR groups together.
      subqueries = subqueries + next_subqueries

  except StopIteration:
    pass
  # Return when we've reached the end of the string.
  return subqueries


def _ParseOrGroup(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert a single "OrGroup" into subqueries.

  An OrGroup here is based on the following grammar:

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError: If tokens were input in a format that does not follow
        our search grammar.
  """
  subqueries = _ParseAndGroup(token_iterator)

  try:
    # Iterate until there are no more AND groups left to see.
    # A subquery or a left parenthesis marks the possible start of an AndGroup.
    while (token_iterator.peek().token_type == SUBQUERY or
           token_iterator.peek().token_type == LEFT_PAREN):

      # Find subqueries from the next AND group.
      next_subqueries = _ParseAndGroup(token_iterator)

      # Multiply all results across AND groups together.
      subqueries = _MultiplySubqueries(subqueries, next_subqueries)
  except StopIteration:
    pass

  return subqueries


def _ParseAndGroup(token_iterator):
  # type: (Sequence[mrproto.ast_pb2.QueryToken]) -> Sequence[str]
  """Recursive helper to convert a single "AndGroup" into subqueries.

  An AndGroup here is based on the following grammar:

  Query := OrGroup { [OrOperator] OrGroup }
  OrGroup := AndGroup { AndGroup }
  AndGroup := Subquery | ParenthesesGroup
  ParenthesesGroup := "(" Query ")"
  Subquery := /.+/
  OrOperator := " OR "

  Args:
    token_iterator: Iterator over a list of query tokens.

  Returns:
    List of query fragments to be processed as search terms.

  Raises:
    InvalidQueryError: If tokens were input in a format that does not follow
        our search grammar.
  """
  try:
    token = next(token_iterator)
    if token.token_type == LEFT_PAREN:
      if token_iterator.peek().token_type == RIGHT_PAREN:
        # Don't recurse into the ParenthesesGroup if there's nothing inside.
        next(token_iterator)
        return []

      # Recurse into the ParenthesesGroup.
      subqueries = _ParseQuery(token_iterator)

      # The next token should be a right parenthesis.
      next(token_iterator)

      return subqueries
    elif token.token_type == SUBQUERY:
      return [token.value]
    else:
      # This should not happen if the other QueryToSubqueries helpers are
      # working properly.
      raise InvalidQueryError('Inputted tokens do not follow grammar.')
  except StopIteration:
    pass
  return []


def _ValidateAndTokenizeQuery(query):
  # type: (str) -> Sequence[mrproto.ast_pb2.QueryToken]
  """Converts the input query into a set of tokens for easier parsing.

  Tokenizing the query string before parsing means we need fewer string
  manipulations while parsing, which simplifies our later code.

  Args:
    query: Query to tokenize.

  Returns:
    List of Token objects for use in query processing.

  Raises:
    InvalidQueryError: If parentheses are unmatched.
  """
  tokens = []  # Function result.
  count = 0  # Used for checking if parentheses are balanced.
  s = ''  # Records the current string fragment. Cleared when a token is added.

  for ch in query:
    if ch == '(':
      count += 1

      # Add the subquery from before we hit this parenthesis.
      tokens.extend(_TokenizeSubqueryOnOr(s))
      s = ''

      tokens.append(ast_pb2.QueryToken(token_type=LEFT_PAREN))
    elif ch == ')':
      count -= 1

      if count < 0:
        # More closing parentheses than opening parentheses.
        raise InvalidQueryError('Search query has unbalanced parentheses.')

      # Add the subquery from before we hit this parenthesis.
      tokens.extend(_TokenizeSubqueryOnOr(s))
      s = ''

      tokens.append(ast_pb2.QueryToken(token_type=RIGHT_PAREN))
    else:
      s += ch

  if count != 0:
    raise InvalidQueryError('Search query has unbalanced parentheses.')

  # Add any trailing tokens.
  tokens.extend(_TokenizeSubqueryOnOr(s))

  return tokens


def _TokenizeSubqueryOnOr(subquery):
  # type: (str) -> Sequence[mrproto.ast_pb2.QueryToken]
  """Helper to split a subquery by OR and convert the result into tokens.

  Args:
    subquery: A string without parentheses to tokenize.

  Returns:
    Tokens for the subquery, with OR tokens separating query strings where
    applicable.
  """
  if len(subquery) == 0:
    return []

  result = []
  fragments = subquery.split(OR_SYMBOL)
  for f in fragments:
    # Interleave the string fragments with OR tokens.
    result.append(ast_pb2.QueryToken(token_type=SUBQUERY, value=f.strip()))
    result.append(ast_pb2.QueryToken(token_type=OR))

  # Remove the trailing OR.
  result.pop()

  # Trim empty strings at the beginning or end. E.g., if subquery is ' OR ',
  # we want the list to be ['OR'], not ['', 'OR', ''].
  if len(result) > 1 and result[0].value == '':
    result.pop(0)
  if len(result) > 1 and result[-1].value == '':
    result.pop()
  return result
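
# For illustration (using SUBQUERY('A') as shorthand for
# ast_pb2.QueryToken(token_type=SUBQUERY, value='A')):
#
#   _TokenizeSubqueryOnOr('A OR B')
#   # -> [SUBQUERY('A'), OR, SUBQUERY('B')]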


def _MultiplySubqueries(a, b):
  # type: (Sequence[str], Sequence[str]) -> Sequence[str]
  """Helper to AND subqueries from two separate lists.

  Args:
    a: First list of subqueries.
    b: Second list of subqueries.

  Returns:
    List with n x m subqueries.
  """
  if not len(a):
    return b
  if not len(b):
    return a
  res = []
  for q1 in a:
    for q2 in b:
      # AND two subqueries together by concatenating them.
      query = (q1.strip() + ' ' + q2.strip()).strip()
      res.append(query)
  return res
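
# For illustration:
#
#   _MultiplySubqueries(['A', 'B'], ['C', 'D'])
#   # -> ['A C', 'A D', 'B C', 'B D']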


class PeekIterator:
  """Simple iterator with peek() functionality.

  Used by QueryToSubqueries to maintain state easily across recursive calls.
  """

  def __init__(self, source):
    # type: (Sequence[Any]) -> None
    self.__source = source
    self.__i = 0

  def peek(self):
    # type: () -> Any
    """Gets the next value in the iterator without side effects.

    Returns:
      Next value in the iterator.

    Raises:
      StopIteration: If you're at the end of the iterator.
    """
    if self.__i >= len(self.__source):
      raise StopIteration
    return self.__source[self.__i]

  def __iter__(self):
    # type: () -> Sequence[Any]
    """Return self to make the iterator iterable."""
    return self

  def __repr__(self):
    # type: () -> str
    """Allow logging the current iterator value for debugging."""
    try:
      return str(self.peek())
    except StopIteration:
      pass
    return 'End of PeekIterator'

  def __next__(self):
    # type: () -> Any
    """Gets the next value in the iterator and increments the pointer.

    Returns:
      Next value in the iterator.

    Raises:
      StopIteration: If you're at the end of the iterator.
    """
    return self.next()

  def next(self):
    # type: () -> Any
    """Gets the next value in the iterator and increments the pointer.

    Kept for backwards compatibility with Python 2.

    Returns:
      Next value in the iterator.

    Raises:
      StopIteration: If you're at the end of the iterator.
    """
    if self.__i >= len(self.__source):
      raise StopIteration
    value = self.__source[self.__i]
    self.__i += 1
    return value

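# For illustration:
#
#   it = PeekIterator([1, 2])
#   it.peek()  # -> 1, without advancing
#   next(it)   # -> 1
#   next(it)   # -> 2
#   next(it)   # raises StopIteration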


class Error(Exception):
  """Base exception class for this package."""
  pass


class InvalidQueryError(Error):
  """Error raised when an invalid query is requested."""
  pass