Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 1 | # Copyright 2016 The Chromium Authors |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 4 | |
| 5 | """A set of helper functions for fulltext search.""" |
| 6 | |
| 7 | from __future__ import division |
| 8 | from __future__ import print_function |
| 9 | from __future__ import absolute_import |
| 10 | |
| 11 | import logging |
| 12 | |
| 13 | from google.appengine.api import search |
| 14 | |
| 15 | import settings |
Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 16 | from mrproto import ast_pb2 |
| 17 | from mrproto import tracker_pb2 |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 18 | from search import query2ast |
| 19 | |
# GAE search API can only respond with 500 results per call.
# ComprehensiveSearch() passes this as the per-call `limit` and uses it to
# work out how many follow-up calls it may make.
_SEARCH_RESULT_CHUNK_SIZE = 500
| 22 | |
| 23 | |
def BuildFTSQuery(query_ast_conj, fulltext_fields):
  """Translate a Monorail query AST conjunction into a GAE search string.

  Each condition in the conjunction is converted on its own and the
  resulting pieces are joined with single spaces, so they are all AND'd
  together.  Conditions that contribute nothing to the fulltext query are
  still joined in as empty strings.

  Args:
    query_ast_conj: a Conjunction PB whose conds each have an operator,
        field definitions, string values, and int values.
    fulltext_fields: a list of string names of fields that may exist in the
        fulltext documents.  E.g., issue fulltext documents have a "summary"
        field.

  Returns:
    A string that can be passed to AppEngine's search API, or None when no
    condition produced any fulltext query text, meaning that no fulltext
    search should be done.
  """
  condition_strs = []
  for cond in query_ast_conj.conds:
    condition_strs.append(_BuildFTSCondition(cond, fulltext_fields))

  # Nothing to search for unless at least one condition produced text.
  if not any(condition_strs):
    return None
  return ' '.join(condition_strs)
| 46 | |
| 47 | |
def _BuildFTSCondition(cond, fulltext_fields):
  """Convert one query AST condition into a GAE search query string.

  Args:
    cond: one condition from the query AST.  Its op, field_defs, and
        str_values are read.
    fulltext_fields: a list of string names of fields that may exist in the
        fulltext documents.

  Returns:
    A parenthesized, OR-joined group with one term per (field, value) pair,
    e.g. '(summary:"a" OR summary:"b")', prefixed with 'NOT ' when the
    operator is NOT_TEXT_HAS.  Returns '' when the operator is not a text
    operator, when none of the fields are fulltext fields, or when there are
    no string values.

  Raises:
    AssertionError: if a value still contains a double quote after the
        surrounding quotes have been stripped, since that would produce a
        malformed search query.
  """
  if cond.op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    neg = 'NOT '
  elif cond.op == ast_pb2.QueryOp.TEXT_HAS:
    neg = ''
  else:
    return ''  # FTS only looks at TEXT_HAS and NOT_TEXT_HAS

  # Work out the "field:" prefix for every field that fulltext search can
  # handle.  Any other issue field is searched via SQL and is skipped here.
  prefixes = []
  for fd in cond.field_defs:
    if fd.field_name in fulltext_fields:
      prefixes.append(fd.field_name + ':')
    elif fd.field_name == ast_pb2.ANY_FIELD:
      prefixes.append('')  # Unqualified term: no field prefix.
    elif fd.field_id and fd.field_type == tracker_pb2.FieldTypes.STR_TYPE:
      prefixes.append('custom_' + str(fd.field_id) + ':')

  if not prefixes:
    return ''  # None of the fields were fulltext fields.

  # The cleaned values do not depend on the field, so compute them once.
  special_prefixes = tuple(query2ast.NON_OP_PREFIXES)
  values = []
  for value in cond.str_values:
    # Strip out quotes around the value.
    value = value.strip('"')
    # Colons are replaced by spaces unless the value starts with one of the
    # special prefixes, in which case it is kept intact.
    if not value.startswith(special_prefixes):
      value = value.replace(':', ' ')
    # Raised explicitly rather than with an assert statement so that the
    # check still runs when Python is started with -O.
    if '"' in value:
      raise AssertionError('Value %r has a quote in it' % value)
    values.append(value)

  parts = [
      prefix + '"' + value + '"'
      for prefix in prefixes
      for value in values]
  if not parts:
    return ''  # There were no values to search for.
  return neg + '(' + ' OR '.join(parts) + ')'
| 84 | |
| 85 | |
def ComprehensiveSearch(fulltext_query, index_name):
  """Run a fulltext query against GAE search, paging through the results.

  The first call starts from a fresh cursor.  Further calls follow the
  cursor of the previous response until either no cursor is returned or the
  number of follow-up calls allowed by settings.fulltext_limit_per_shard
  has been made.

  Args:
    fulltext_query: string in the GAE search API query language.
    index_name: string name of the GAE fulltext index to hit.

  Returns:
    A list of integer issue IIDs or project IDs.

  Raises:
    query2ast.InvalidQueryError: if the initial search call raises
        ValueError.  Follow-up calls are not wrapped.
  """
  index = search.Index(name=index_name)

  def _FetchChunk(cursor):
    """Ask the index for one chunk of doc IDs starting at the cursor."""
    options = search.QueryOptions(
        limit=_SEARCH_RESULT_CHUNK_SIZE, returned_fields=[], ids_only=True,
        cursor=cursor)
    return index.search(search.Query(fulltext_query, options=options))

  try:
    response = _FetchChunk(search.Cursor())
  except ValueError as e:
    raise query2ast.InvalidQueryError(str(e))

  logging.info('got %d initial results', len(response.results))
  ids = []
  ids.extend(int(result.doc_id) for result in response)

  # The initial call already fetched one chunk; this is how many more calls
  # may be made before the per-shard limit is reached.
  max_followup_calls = int(
      (settings.fulltext_limit_per_shard - 1) // _SEARCH_RESULT_CHUNK_SIZE)
  followup_calls = 0
  while followup_calls < max_followup_calls and response.cursor:
    response = _FetchChunk(response.cursor)
    logging.info(
        'got %d more results: %r', len(response.results), response.results)
    ids.extend(int(result.doc_id) for result in response)
    followup_calls += 1

  logging.info('FTS result ids %d', len(ids))
  return ids