blob: 2da6d68cc322f47b525d68196c92fc6b473914f1 [file] [log] [blame]
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +01001# Copyright 2016 The Chromium Authors
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
Copybara854996b2021-09-07 19:36:02 +00004
"""A set of helper functions for fulltext search."""
6
7from __future__ import division
8from __future__ import print_function
9from __future__ import absolute_import
10
11import logging
12
13from google.appengine.api import search
14
15import settings
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +010016from mrproto import ast_pb2
17from mrproto import tracker_pb2
Copybara854996b2021-09-07 19:36:02 +000018from search import query2ast
19
# The GAE search API can only respond with 500 results per call, so
# ComprehensiveSearch requests results in chunks of this size and follows
# the response cursor to fetch the next chunk.
_SEARCH_RESULT_CHUNK_SIZE = 500
22
23
def BuildFTSQuery(query_ast_conj, fulltext_fields):
  """Translate a Monorail query AST conjunction into a GAE search query.

  Every condition in the conjunction is rendered separately and the rendered
  pieces are joined with single spaces.  Conditions that contribute nothing
  to the fulltext search render as empty strings; they still take part in
  the join, so the returned string may contain extra spaces.

  Args:
    query_ast_conj: a Conjunction PB whose conds are Comparison PBs, each
        with an operator, field definitions, string values, and int values.
        All conditions are meant to be AND'd together.
    fulltext_fields: a list of string names of fields that may exist in the
        fulltext documents.  E.g., issue fulltext documents have a "summary"
        field.

  Returns:
    A string that can be passed to AppEngine's search API, or None when no
    condition produced any fulltext text, in which case no fulltext search
    should be done.
  """
  rendered_conds = [
      _BuildFTSCondition(one_cond, fulltext_fields)
      for one_cond in query_ast_conj.conds]

  # Nothing but empty strings (or no conditions at all): skip fulltext search.
  if not any(rendered_conds):
    return None

  return ' '.join(rendered_conds)
46
47
def _BuildFTSCondition(cond, fulltext_fields):
  """Render a single query AST condition as a GAE search query fragment.

  Args:
    cond: a Comparison PB; its op, field_defs, and str_values are read.
    fulltext_fields: collection of string names of fields that exist in the
        fulltext documents.

  Returns:
    A parenthesized group of quoted terms joined by ' OR ', prefixed with
    'NOT ' when the operator is NOT_TEXT_HAS.  The empty string when the
    operator is neither TEXT_HAS nor NOT_TEXT_HAS, or when none of the
    condition's fields are handled by fulltext search.

  Raises:
    AssertionError: if a value still contains a double quote after the
        surrounding quotes have been stripped.
  """
  # FTS only looks at TEXT_HAS and NOT_TEXT_HAS.
  if cond.op == ast_pb2.QueryOp.TEXT_HAS:
    negation = ''
  elif cond.op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    negation = 'NOT '
  else:
    return ''

  terms = []
  for field_def in cond.field_defs:
    name = field_def.field_name
    if name in fulltext_fields:
      # A named field of the fulltext document.
      template = name + ':"%s"'
    elif name == ast_pb2.ANY_FIELD:
      # No field restriction: match the phrase anywhere.
      template = '"%s"'
    elif (field_def.field_id and
          field_def.field_type == tracker_pb2.FieldTypes.STR_TYPE):
      # String-typed custom fields are addressed by their numeric ID.
      template = 'custom_' + str(field_def.field_id) + ':"%s"'
    else:
      continue  # This issue field is searched via SQL.

    for raw_value in cond.str_values:
      # Strip out quotes around the value.
      text = raw_value.strip('"')
      # Colons are replaced by spaces unless the value begins with one of
      # the special non-operator prefixes.
      if not text.startswith(tuple(query2ast.NON_OP_PREFIXES)):
        text = text.replace(':', ' ')
      assert '"' not in text, 'Value %r has a quote in it' % text
      terms.append(template % text)

  if not terms:
    return ''  # None of the fields were fulltext fields.
  return negation + '(%s)' % ' OR '.join(terms)
84
85
def ComprehensiveSearch(fulltext_query, index_name):
  """Run a fulltext query against a GAE search index and gather all doc IDs.

  The search API is queried one chunk of _SEARCH_RESULT_CHUNK_SIZE results
  at a time, following the cursor of each response, until a response comes
  back without a cursor or the number of calls allowed by
  settings.fulltext_limit_per_shard has been used up.

  Args:
    fulltext_query: string in the GAE search API query language.
    index_name: string name of the GAE fulltext index to hit.

  Returns:
    A list of integer issue IIDs or project IDs.

  Raises:
    query2ast.InvalidQueryError: if the first search request raises
        ValueError.
  """
  index = search.Index(name=index_name)

  def _FetchChunk(cursor):
    """Request one chunk of document IDs starting at the given cursor."""
    options = search.QueryOptions(
        limit=_SEARCH_RESULT_CHUNK_SIZE, returned_fields=[], ids_only=True,
        cursor=cursor)
    return index.search(search.Query(fulltext_query, options=options))

  # Only the first request is guarded: a ValueError raised here is reported
  # to the caller as an invalid query.
  try:
    chunk = _FetchChunk(search.Cursor())
  except ValueError as e:
    raise query2ast.InvalidQueryError(str(e))

  logging.info('got %d initial results', len(chunk.results))
  doc_ids = [int(hit.doc_id) for hit in chunk]

  # The first call is already done; this is how many more calls are allowed.
  extra_calls_allowed = int(
      (settings.fulltext_limit_per_shard - 1) // _SEARCH_RESULT_CHUNK_SIZE)
  extra_calls_made = 0
  while extra_calls_made < extra_calls_allowed and chunk.cursor:
    chunk = _FetchChunk(chunk.cursor)
    extra_calls_made += 1
    logging.info(
        'got %d more results: %r', len(chunk.results), chunk.results)
    doc_ids.extend(int(hit.doc_id) for hit in chunk)

  logging.info('FTS result ids %d', len(doc_ids))
  return doc_ids