# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
| 5 | |
"""A set of helper functions for fulltext search."""
| 7 | |
| 8 | from __future__ import division |
| 9 | from __future__ import print_function |
| 10 | from __future__ import absolute_import |
| 11 | |
| 12 | import logging |
| 13 | |
| 14 | from google.appengine.api import search |
| 15 | |
| 16 | import settings |
| 17 | from proto import ast_pb2 |
| 18 | from proto import tracker_pb2 |
| 19 | from search import query2ast |
| 20 | |
# The GAE search API can only respond with 500 results per call, so larger
# result sets are fetched in chunks of this size.
_SEARCH_RESULT_CHUNK_SIZE = 500
| 23 | |
| 24 | |
def BuildFTSQuery(query_ast_conj, fulltext_fields):
  """Convert a Monorail query AST into a GAE search query string.

  Args:
    query_ast_conj: a Conjunction PB with a list of Comparison PBs that each
        have operator, field definitions, string values, and int values.
        All Conditions should be AND'd together.
    fulltext_fields: a list of string names of fields that may exist in the
        fulltext documents.  E.g., issue fulltext documents have a "summary"
        field.

  Returns:
    A string that can be passed to AppEngine's search API.  Or, None if there
    were no fulltext conditions, so no fulltext search should be done.
  """
  # Conditions that are not fulltext conditions come back as '', so drop them
  # here instead of joining them into the query as stray spaces.
  fulltext_parts = [
      part for part in (
          _BuildFTSCondition(cond, fulltext_fields)
          for cond in query_ast_conj.conds)
      if part]
  if not fulltext_parts:
    return None
  return ' '.join(fulltext_parts)
| 47 | |
| 48 | |
def _BuildFTSCondition(cond, fulltext_fields):
  """Convert one query AST condition into a GAE search query string.

  Args:
    cond: a condition PB; this reads its op, field_defs, and str_values.
    fulltext_fields: a list of string names of fields that may exist in the
        fulltext documents.

  Returns:
    A parenthesized string of field:"value" terms joined with OR, prefixed
    with 'NOT ' for NOT_TEXT_HAS.  Or, '' if the condition is not a
    TEXT_HAS/NOT_TEXT_HAS condition or it produced no fulltext terms.

  Raises:
    AssertionError: a string value still contains a double quote after the
        surrounding quotes were stripped.
  """
  if cond.op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    neg = 'NOT '
  elif cond.op == ast_pb2.QueryOp.TEXT_HAS:
    neg = ''
  else:
    return ''  # FTS only looks at TEXT_HAS and NOT_TEXT_HAS

  # One format pattern per field that is searched via fulltext.  Any other
  # issue field is searched via SQL, so it contributes nothing here.
  patterns = []
  for fd in cond.field_defs:
    if fd.field_name in fulltext_fields:
      patterns.append(fd.field_name + ':"%s"')
    elif fd.field_name == ast_pb2.ANY_FIELD:
      patterns.append('"%s"')
    elif fd.field_id and fd.field_type == tracker_pb2.FieldTypes.STR_TYPE:
      patterns.append('custom_' + str(fd.field_id) + ':"%s"')

  if not patterns:
    return ''  # None of the fields were fulltext fields.

  # The values do not depend on the field, so normalize them once.  This is
  # only reached when at least one field is a fulltext field, so values of
  # SQL-only conditions are never validated here.
  special_prefixes = tuple(query2ast.NON_OP_PREFIXES)
  values = []
  for value in cond.str_values:
    # Strip out quotes around the value.
    value = value.strip('"')
    if not value.startswith(special_prefixes):
      value = value.replace(':', ' ')
    # Raised explicitly rather than via an assert statement so the check is
    # still enforced when Python runs with -O: an embedded quote would break
    # out of the quoted term in the generated query.
    if '"' in value:
      raise AssertionError('Value %r has a quote in it' % value)
    values.append(value)

  parts = [pattern % value for pattern in patterns for value in values]
  if parts:
    return neg + '(%s)' % ' OR '.join(parts)
  else:
    return ''  # There were no string values to search for.
| 85 | |
| 86 | |
def ComprehensiveSearch(fulltext_query, index_name):
  """Call the GAE search API, and keep calling it to get all results.

  Results are fetched in chunks of _SEARCH_RESULT_CHUNK_SIZE.  Fetching stops
  when the API returns no cursor, or after enough chunks to cover
  settings.fulltext_limit_per_shard.

  Args:
    fulltext_query: string in the GAE search API query language.
    index_name: string name of the GAE fulltext index to hit.

  Returns:
    A list of integer issue IIDs or project IDs.

  Raises:
    query2ast.InvalidQueryError: the first search call raised ValueError.
  """
  search_index = search.Index(name=index_name)

  def _FetchChunk(cursor):
    """Run the query once, starting at cursor, and return the response."""
    return search_index.search(search.Query(
        fulltext_query,
        options=search.QueryOptions(
            limit=_SEARCH_RESULT_CHUNK_SIZE, returned_fields=[],
            ids_only=True, cursor=cursor)))

  try:
    response = _FetchChunk(search.Cursor())
  except ValueError as e:
    # Exceptions have no .message attribute in Python 3, so fall back to
    # str(e) instead of failing with AttributeError while reporting the error.
    raise query2ast.InvalidQueryError(getattr(e, 'message', None) or str(e))

  logging.info('got %d initial results', len(response.results))
  ids = [int(result.doc_id) for result in response]

  # The first chunk has already been fetched, so only the chunks needed to
  # reach the per-shard limit remain.
  remaining_iterations = int(
      (settings.fulltext_limit_per_shard - 1) // _SEARCH_RESULT_CHUNK_SIZE)
  for _ in range(remaining_iterations):
    if not response.cursor:
      break  # The API reported no cursor to continue from.
    response = _FetchChunk(response.cursor)
    logging.info(
        'got %d more results: %r', len(response.results), response.results)
    ids.extend(int(result.doc_id) for result in response)

  logging.info('FTS result ids %d', len(ids))
  return ids