blob: 80d4264e8464c150db6d7672b2e81b7be9ae3636 [file] [log] [blame]
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd

"""A set of helper functions for fulltext search."""
7
8from __future__ import division
9from __future__ import print_function
10from __future__ import absolute_import
11
12import logging
13
14from google.appengine.api import search
15
16import settings
17from proto import ast_pb2
18from proto import tracker_pb2
19from search import query2ast
20
# GAE search API can only respond with 500 results per call, so this is used
# as the per-call result limit when fetching results chunk by chunk.
_SEARCH_RESULT_CHUNK_SIZE = 500
23
24
def BuildFTSQuery(query_ast_conj, fulltext_fields):
  """Build the GAE search query string for one conjunction of a query AST.

  Args:
    query_ast_conj: a Conjunction PB whose conds are Comparison PBs, each
        with an operator, field definitions, string values, and int values.
        All of the conditions are AND'd together.
    fulltext_fields: a list of string names of fields that may exist in the
        fulltext documents, e.g., "summary" for issue fulltext documents.

  Returns:
    A string suitable for AppEngine's search API, or None when no condition
    produced any fulltext terms, meaning no fulltext search is needed.
  """
  per_cond_queries = []
  for cond in query_ast_conj.conds:
    per_cond_queries.append(_BuildFTSCondition(cond, fulltext_fields))

  # Conditions that are not fulltext conditions yield '', so the absence of
  # any non-empty entry means there is nothing to search for.
  if not any(per_cond_queries):
    return None

  return ' '.join(per_cond_queries)
47
48
def _BuildFTSCondition(cond, fulltext_fields):
  """Translate a single query AST condition into GAE search query syntax.

  Args:
    cond: a Comparison PB; its op, field_defs, and str_values are used.
    fulltext_fields: names of fields that may exist in fulltext documents.

  Returns:
    A parenthesized expression of OR'd field:"value" terms, prefixed with
    'NOT ' for NOT_TEXT_HAS conditions. Returns '' when the condition is not
    a text condition or when none of its fields yield any fulltext terms.
  """
  # FTS only looks at TEXT_HAS and NOT_TEXT_HAS.
  if cond.op == ast_pb2.QueryOp.TEXT_HAS:
    prefix = ''
  elif cond.op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    prefix = 'NOT '
  else:
    return ''

  # One format template per field that is searched via fulltext.
  templates = []
  for field_def in cond.field_defs:
    if field_def.field_name in fulltext_fields:
      templates.append(field_def.field_name + ':"%s"')
    elif field_def.field_name == ast_pb2.ANY_FIELD:
      templates.append('"%s"')
    elif (field_def.field_id and
          field_def.field_type == tracker_pb2.FieldTypes.STR_TYPE):
      templates.append('custom_' + str(field_def.field_id) + ':"%s"')
    # Any other issue field is searched via SQL, so it adds no template.

  if not templates:
    return ''  # None of the fields were fulltext fields.

  # The cleaned-up values do not depend on the field, so compute them once.
  terms = []
  for raw_value in cond.str_values:
    # Strip out quotes around the value.
    term = raw_value.strip('"')
    keeps_colons = any(
        term.startswith(p) for p in query2ast.NON_OP_PREFIXES)
    if not keeps_colons:
      term = term.replace(':', ' ')
    assert ('"' not in term), 'Value %r has a quote in it' % term
    terms.append(term)

  parts = []
  for template in templates:
    for term in terms:
      parts.append(template % term)

  if not parts:
    return ''

  clause = '(%s)' % ' OR '.join(parts)
  return prefix + clause
85
86
def ComprehensiveSearch(fulltext_query, index_name):
  """Call the GAE search API, and keep calling it to get all results.

  Results are fetched in chunks of at most _SEARCH_RESULT_CHUNK_SIZE. After
  the first chunk, the response cursor is followed until there is no cursor
  or until enough chunks have been requested to cover
  settings.fulltext_limit_per_shard results.

  Args:
    fulltext_query: string in the GAE search API query language.
    index_name: string name of the GAE fulltext index to hit.

  Returns:
    A list of integer issue IIDs or project IDs.

  Raises:
    query2ast.InvalidQueryError: if the initial search call raises a
        ValueError for the given query.
  """
  search_index = search.Index(name=index_name)

  def _FetchChunk(cursor):
    """Run the query once, returning up to one chunk of doc IDs."""
    return search_index.search(search.Query(
        fulltext_query,
        options=search.QueryOptions(
            limit=_SEARCH_RESULT_CHUNK_SIZE, returned_fields=[],
            ids_only=True, cursor=cursor)))

  try:
    response = _FetchChunk(search.Cursor())
  except ValueError as e:
    # Use str(e) rather than e.message: the message attribute does not exist
    # on Python 3 exceptions, which would mask the real error.
    raise query2ast.InvalidQueryError(str(e))

  logging.info('got %d initial results', len(response.results))
  ids = [int(result.doc_id) for result in response]

  # The first chunk is already fetched; this is how many more chunks may be
  # needed to reach the per-shard limit.
  remaining_iterations = int(
      (settings.fulltext_limit_per_shard - 1) // _SEARCH_RESULT_CHUNK_SIZE)
  for _ in range(remaining_iterations):
    if not response.cursor:
      break  # No cursor means there are no further results to fetch.
    response = _FetchChunk(response.cursor)
    logging.info(
        'got %d more results: %r', len(response.results), response.results)
    ids.extend(int(result.doc_id) for result in response)

  logging.info('FTS result ids %d', len(ids))
  return ids