blob: 0e1e4cf20a07087814dba42b9975a04ce43edcad [file] [log] [blame]
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +01001# Copyright 2016 The Chromium Authors
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
Copybara854996b2021-09-07 19:36:02 +00004
5"""Convert a user's issue search AST into a simplified AST.
6
7This phase of query processing simplifies the user's query by looking up
8the int IDs of any labels, statuses, or components that are mentioned by
9name in the original query. The data needed for lookups is typically cached
10in RAM in each backend job, so this will not put much load on the DB. The
11simplified ASTs are later converted into SQL which is simpler and has
12fewer joins.
13
14The simplified main query is better because:
15 + It is clearly faster, especially in the most common case where config
16 data is in RAM.
17 + Since less RAM is used to process the main query on each shard, query
18 execution time is more consistent with less variability under load. Less
19 variability is good because the user must wait for the slowest shard.
20 + The config tables (LabelDef, StatusDef, etc.) exist only on the primary DB,
21 so they cannot be mentioned in a query that runs on a shard.
22 + The query string itself is shorter when numeric IDs are substituted, which
23 means that we can handle user queries with long lists of labels in a
24 reasonable-sized query.
25 + It bisects the complexity of the operation: it's easier to test and debug
26 the lookup and simplification logic plus the main query logic this way
27 than it would be to deal with an even more complex SQL main query.
28"""
29from __future__ import print_function
30from __future__ import division
31from __future__ import absolute_import
32
33import collections
34import logging
35import re
36
37from framework import exceptions
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +010038from mrproto import ast_pb2
39from mrproto import tracker_pb2
Copybara854996b2021-09-07 19:36:02 +000040# TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could
41# remove this dep.
42from search import query2ast
43from tracker import tracker_bizobj
44from features import federated
45
46
def PreprocessAST(
    cnxn, query_ast, project_ids, services, harmonized_config, is_member=True):
  """Build a simplified copy of the query AST by doing lookups up front.

  Every condition of every conjunction in query_ast is passed through
  _PreprocessCond(), and the results are assembled into a new QueryAST with
  the same conjunction structure.

  Args:
    cnxn: connection to SQL database.
    query_ast: user query abstract syntax tree parsed by query2ast.py.
    project_ids: collection of int project IDs to use to look up status values
        and labels.
    services: Connections to persistence layer for users and configs.
    harmonized_config: harmonized config for all projects being searched.
    is_member: True if user is a member of all the projects being searched,
        so they can do user substring searches.

  Returns:
    A new QueryAST PB with simplified conditions. Specifically, string values
    for labels, statuses, and components are replaced with the int IDs of
    those items. Also, is:open is distilled down to
    status_id != closed_status_ids.
  """
  simplified_conjs = []
  for conj in query_ast.conjunctions:
    simplified_conds = []
    for cond in conj.conds:
      simplified_conds.append(_PreprocessCond(
          cnxn, cond, project_ids, services, harmonized_config, is_member))
    simplified_conjs.append(ast_pb2.Conjunction(conds=simplified_conds))

  return ast_pb2.QueryAST(conjunctions=simplified_conjs)
76
77
def _PreprocessIsOpenCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite is:open (or -is:open) as a comparison against closed status IDs.

  Closed status IDs are gathered per project when project_ids is non-empty,
  otherwise across all projects.

  Raises:
    MalformedQuery: the cond's op is neither EQ nor NE.
  """
  if project_ids:
    closed_status_ids = [
        status_id
        for project_id in project_ids
        for status_id in services.config.LookupClosedStatusIDs(
            cnxn, project_id)]
  else:
    closed_status_ids = services.config.LookupClosedStatusIDsAnyProject(cnxn)

  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Open condition got nonsensical op %r' % cond.op)

  # The values are *closed* statuses, so "is open" means "is not one of
  # these", and vice versa.
  if cond.op == ast_pb2.QueryOp.EQ:
    inverted_op = ast_pb2.QueryOp.NE
  else:
    inverted_op = ast_pb2.QueryOp.EQ

  status_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=inverted_op, field_defs=[status_id_fd], int_values=closed_status_ids)
100
101
def _PreprocessIsBlockedCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
  """Rewrite is:blocked as a test for whether blockedon_id is defined.

  EQ becomes IS_DEFINED and NE becomes IS_NOT_DEFINED.

  Raises:
    MalformedQuery: the cond's op is neither EQ nor NE.
  """
  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Blocked condition got nonsensical op %r' % cond.op)

  if cond.op == ast_pb2.QueryOp.EQ:
    defined_op = ast_pb2.QueryOp.IS_DEFINED
  else:
    defined_op = ast_pb2.QueryOp.IS_NOT_DEFINED

  blockedon_fd = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(op=defined_op, field_defs=[blockedon_fd])
114
115
def _PreprocessIsSpamCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
  """Rewrite is:spam as is_spam == 1, and its negation as is_spam == 0.

  The resulting cond always uses the EQ op; only the int value differs.

  Raises:
    MalformedQuery: the cond's op is neither EQ nor NE.
  """
  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Spam condition got nonsensical op %r' % cond.op)

  spam_flag = 1 if cond.op == ast_pb2.QueryOp.EQ else 0
  is_spam_fd = query2ast.BUILTIN_ISSUE_FIELDS['is_spam']
  return ast_pb2.Condition(
      op=ast_pb2.QueryOp.EQ, field_defs=[is_spam_fd], int_values=[spam_flag])
130
131
def _PreprocessBlockedOnCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite a blockedon cond in terms of the blockedon_id field.

  A blockedon=xyz cond becomes blockedon_id:issue_ids, and a has:blockedon
  cond becomes a test for issues that are blocked on other issues.  Values
  that look like external issue shortlinks are passed along as str_values.
  """
  resolved_ids, external_refs = _GetIssueIDsFromLocalIdsCond(
      cnxn, cond, project_ids, services)
  id_op = _TextOpToIntOp(cond.op)
  blockedon_fd = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(
      op=id_op, field_defs=[blockedon_fd], int_values=resolved_ids,
      str_values=external_refs)
146
147
def _PreprocessBlockingCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite a blocking cond in terms of the blocking_id field.

  A blocking=xyz cond becomes blocking_id:issue_ids, and a has:blocking
  cond becomes a test for issues that are blocking other issues.  Values
  that look like external issue shortlinks are passed along as str_values.
  """
  resolved_ids, external_refs = _GetIssueIDsFromLocalIdsCond(
      cnxn, cond, project_ids, services)
  id_op = _TextOpToIntOp(cond.op)
  blocking_fd = query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']
  return ast_pb2.Condition(
      op=id_op, field_defs=[blocking_fd], int_values=resolved_ids,
      str_values=external_refs)
162
163
def _PreprocessMergedIntoCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite a mergedinto cond in terms of the mergedinto_id field.

  A mergedinto=xyz cond becomes mergedinto_id:issue_ids, and a
  has:mergedinto cond becomes has:mergedinto_id.  Values that look like
  external issue shortlinks are passed along as str_values.
  """
  resolved_ids, external_refs = _GetIssueIDsFromLocalIdsCond(
      cnxn, cond, project_ids, services)
  id_op = _TextOpToIntOp(cond.op)
  mergedinto_fd = query2ast.BUILTIN_ISSUE_FIELDS['mergedinto_id']
  return ast_pb2.Condition(
      op=id_op, field_defs=[mergedinto_fd], int_values=resolved_ids,
      str_values=external_refs)
178
179
def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services):
  """Resolve the issue references in cond.str_values.

  Args:
    cnxn: connection to SQL database.
    cond: Condition PB whose str_values are issue references.
    project_ids: collection of int project IDs being searched.
    services: Connections to persistence layer.

  Returns:
    A pair (issue_ids, ext_issue_ids).  issue_ids is what
    services.issue.ResolveIssueRefs() found for the parsed
    (project_name, local_id) refs; ext_issue_ids is the list of values
    that federated.FromShortlink() recognized as external references.

  Raises:
    MalformedQuery: a reference has no project prefix while more than one
        (or no) project is being searched, or a value is neither a
        parsable issue reference nor a recognized external shortlink.
  """
  # Build {project_name: project} for all projects in project_ids.
  projects_by_id = services.project.GetProjects(cnxn, project_ids)
  ref_projects = {
      project.project_name: project for project in projects_by_id.values()}

  # A default project name exists only when exactly one project is searched.
  default_project_name = None
  if len(ref_projects) == 1:
    only_project = next(iter(ref_projects.values()))
    default_project_name = only_project.project_name

  refs = []  # (project_name, local_id) pairs.
  ext_issue_ids = []  # Strings like 'b/1234'.
  for val in cond.str_values:
    try:
      project_name, local_id = tracker_bizobj.ParseIssueRef(val)
      if not project_name:
        if not default_project_name:
          # TODO(rmistry): Support the below.
          raise MalformedQuery(
              'Searching for issues accross multiple/all projects without '
              'project prefixes is ambiguous and is currently not supported.')
        project_name = default_project_name
      refs.append((project_name, int(local_id)))
    except MalformedQuery:
      # MalformedQuery is a ValueError, so it must be let through here
      # before the more general handler below sees it.
      raise
    except ValueError:
      # Not a parsable issue ref, so try the external issue pattern.
      if not federated.FromShortlink(val):
        raise MalformedQuery('Could not parse issue reference: %s' % val)
      ext_issue_ids.append(val)

  issue_ids, _misses = services.issue.ResolveIssueRefs(
      cnxn, ref_projects, default_project_name, refs)
  return issue_ids, ext_issue_ids
217
218
def _PreprocessStatusCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite a status=names cond as status_id=IDs.

  Status names are looked up per project when project_ids is non-empty,
  otherwise across all projects.
  """
  if project_ids:
    status_ids = [
        status_id
        for project_id in project_ids
        for status_id in services.config.LookupStatusIDs(
            cnxn, project_id, cond.str_values)]
  else:
    status_ids = services.config.LookupStatusIDsAnyProject(
        cnxn, cond.str_values)

  id_op = _TextOpToIntOp(cond.op)
  status_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=id_op, field_defs=[status_id_fd], int_values=status_ids)
235
236
def _IsEqualityOp(op):
  """Tell whether op is one of the whole-value comparison ops, EQ or NE."""
  return op == ast_pb2.QueryOp.EQ or op == ast_pb2.QueryOp.NE
240
241
def _IsDefinedOp(op):
  """Tell whether op is IS_DEFINED or IS_NOT_DEFINED."""
  return (op == ast_pb2.QueryOp.IS_DEFINED or
          op == ast_pb2.QueryOp.IS_NOT_DEFINED)
245
246
def _TextOpToIntOp(op):
  """Map a text-matching op to the equality op used for ID matching.

  TEXT_HAS and KEY_HAS become EQ, NOT_TEXT_HAS becomes NE, and any other
  op is returned unchanged.
  """
  query_op = ast_pb2.QueryOp
  if op in (query_op.TEXT_HAS, query_op.KEY_HAS):
    return query_op.EQ
  if op == query_op.NOT_TEXT_HAS:
    return query_op.NE
  return op
254
255
256def _MakePrefixRegex(cond):
257 """Return a regex to match strings that start with cond values."""
258 all_prefixes = '|'.join(map(re.escape, cond.str_values))
259 return re.compile(r'(%s)-.+' % all_prefixes, re.I)
260
261
262def _MakeKeyValueRegex(cond):
263 """Return a regex to match the first token and remaining text separately."""
264 keys, values = list(zip(*[x.split('-', 1) for x in cond.str_values]))
265 if len(set(keys)) != 1:
266 raise MalformedQuery(
267 "KeyValue query with multiple different keys: %r" % cond.str_values)
268 all_values = '|'.join(map(re.escape, values))
269 return re.compile(r'%s-.*\b(%s)\b.*' % (keys[0], all_values), re.I)
270
271
272def _MakeWordBoundaryRegex(cond):
273 """Return a regex to match the cond values as whole words."""
274 all_words = '|'.join(map(re.escape, cond.str_values))
275 return re.compile(r'.*\b(%s)\b.*' % all_words, re.I)
276
277
def _PreprocessLabelCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Preprocess a label=names cond into label_id=IDs.

  How labels are matched depends on the cond's op:
    + EQ / NE: exact lookup of the names in cond.str_values.
    + IS_DEFINED / IS_NOT_DEFINED: labels that start with one of the values
      followed by a dash (see _MakePrefixRegex).
    + KEY_HAS: labels with the given key whose remainder contains one of
      the values as a whole word (see _MakeKeyValueRegex).
    + anything else: labels containing one of the values as a whole word
      (see _MakeWordBoundaryRegex).

  Lookups are done per project when project_ids is non-empty, otherwise
  across all projects.

  Args:
    cnxn: connection to SQL database.
    cond: original parsed query Condition PB.
    project_ids: collection of int project IDs being searched.
    services: Connections to persistence layer for configs.

  Returns:
    A new Condition PB on the label_id field with the matching label IDs.
  """
  # The matching strategy depends only on the cond, so pick it (and build
  # any regex) once rather than once per project.  None means exact lookup.
  if _IsEqualityOp(cond.op):
    label_regex = None
  elif _IsDefinedOp(cond.op):
    label_regex = _MakePrefixRegex(cond)
  elif cond.op == ast_pb2.QueryOp.KEY_HAS:
    label_regex = _MakeKeyValueRegex(cond)
  else:
    label_regex = _MakeWordBoundaryRegex(cond)

  config_service = services.config
  if project_ids:
    label_ids = []
    for project_id in project_ids:
      if label_regex is None:
        label_ids.extend(config_service.LookupLabelIDs(
            cnxn, project_id, cond.str_values))
      else:
        label_ids.extend(config_service.LookupIDsOfLabelsMatching(
            cnxn, project_id, label_regex))
  elif label_regex is None:
    label_ids = config_service.LookupLabelIDsAnyProject(
        cnxn, cond.str_values)
  else:
    label_ids = config_service.LookupIDsOfLabelsMatchingAnyProject(
        cnxn, label_regex)

  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
      int_values=label_ids)
314
315
def _PreprocessComponentCond(
    cnxn, cond, project_ids, services, harmonized_config, _is_member):
  """Rewrite a component= or component:name cond as component_id=IDs.

  Matching is exact when the cond's op is EQ or NE.
  """
  exact = _IsEqualityOp(cond.op)
  if project_ids:
    # Searching within specific projects: harmonized_config holds the
    # config data for all of those projects.
    component_ids = [
        component_id
        for comp_path in cond.str_values
        for component_id in tracker_bizobj.FindMatchingComponentIDs(
            comp_path, harmonized_config, exact=exact)]
  else:
    # Searching across the whole site: there is no harmonized_config to
    # use, so ask the config service.
    component_ids = services.config.FindMatchingComponentIDsAnyProject(
        cnxn, cond.str_values, exact=exact)

  id_op = _TextOpToIntOp(cond.op)
  component_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['component_id']
  return ast_pb2.Condition(
      op=id_op, field_defs=[component_id_fd], int_values=component_ids)
337
338
def _PreprocessExactUsers(
    cnxn, cond, user_service, id_fields, is_member):
  """Rewrite a foo=emails cond as foo_id=IDs when users match exactly.

  This preprocessing step converts string conditions to int ID conditions.
  E.g., [owner=email] to [owner_id=ID].  It only does it in cases
  where (a) the email was "me", so it was already converted to a string of
  digits in the search pipeline, or (b) it is "user@domain" which resolves to
  a known Monorail user.  It is also possible to search for, e.g.,
  [owner:substring], but such searches remain 'owner' field searches rather
  than 'owner_id', and they cannot be combined with the "me" keyword.

  Args:
    cnxn: connection to the DB.
    cond: original parsed query Condition PB.
    user_service: connection to user persistence layer.
    id_fields: list of the search fields to use if the conversion to IDs
        succeeds.
    is_member: True if user is a member of all the projects being searched,
        so they can do user substring searches.

  Returns:
    A new Condition PB that checks the id_fields.  Or, the original cond.

  Raises:
    MalformedQuery: A non-member used a query term that could be used to
        guess full user email addresses.
  """
  id_op = _TextOpToIntOp(cond.op)

  if _IsDefinedOp(id_op):
    # Testing for any defined value needs no ID lookups at all.
    return ast_pb2.Condition(
        op=id_op, field_defs=id_fields, key_suffix=cond.key_suffix,
        phase_name=cond.phase_name)

  if not _IsEqualityOp(id_op):
    # Only ops that compare whole values can be converted; substring ops
    # are left alone, and are allowed for project members only.
    logging.info('could not convert to IDs because op is %r', id_op)
    if is_member:
      return cond
    raise MalformedQuery('Only project members may compare user strings')

  resolved_ids = []
  for user_str in cond.str_values:
    try:
      resolved_ids.append(int(user_str))
    except ValueError:
      try:
        resolved_ids.append(user_service.LookupUserID(cnxn, user_str))
      except exceptions.NoSuchUserException:
        if is_member or user_str == 'me' or user_str.startswith('@'):
          return cond  # preprocessing failed, stick with the original cond.
        logging.info('could not convert user %r to int ID', user_str)
        if '@' in user_str:
          raise MalformedQuery('User email address not found')
        raise MalformedQuery(
            'Only project members may search for user substrings')

  return ast_pb2.MakeCond(
      id_op, id_fields, [], resolved_ids, key_suffix=cond.key_suffix,
      phase_name=cond.phase_name)
402
403
def _PreprocessOwnerCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite an owner=emails cond as owner_id=IDs, if exact user match."""
  user_service = services.user
  owner_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['owner_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, owner_id_fields, is_member)
410
411
def _PreprocessCcCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a cc=emails cond as cc_id=IDs, if exact user match."""
  user_service = services.user
  cc_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['cc_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, cc_id_fields, is_member)
418
419
def _PreprocessReporterCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a reporter=emails cond as reporter_id=IDs, if exact."""
  user_service = services.user
  reporter_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, reporter_id_fields, is_member)
426
427
def _PreprocessStarredByCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a starredby=emails cond as starredby_id=IDs, if exact."""
  user_service = services.user
  starredby_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, starredby_id_fields, is_member)
434
435
def _PreprocessCommentByCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a commentby=emails cond as commentby_id=IDs, if exact."""
  user_service = services.user
  commentby_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, commentby_id_fields, is_member)
442
443
def _PreprocessHotlistCond(
    cnxn, cond, _project_ids, services, _harmonized_config, _is_member):
  """Rewrite a hotlist cond as hotlist_id=IDs, if exact.

  The values of the cond have the form:
    <user_email>:<hotlist-name>, <hotlist-name>, <user2_email>:..., ...
  A value without a colon names another hotlist of the most recently
  mentioned user.  If the first hotlist name recorded for a user is empty,
  all hotlists owned by that user are used.

  Returns:
    A new Condition PB on the hotlist_id field, or the original cond if
    some user could not be resolved to an ID.
  """
  # TODO(jojwang): add support for searches that don't contain domain names.
  # eg jojwang:hotlist-name
  names_by_user_id = collections.defaultdict(list)
  cur_user = ''
  for val in cond.str_values:
    if ':' in val:
      cur_user, _colon, hotlists_str = val.partition(':')
    else:
      hotlists_str = val

    # cur_user may already be a string of digits; otherwise look it up.
    try:
      user_id = int(cur_user)
    except ValueError:
      try:
        user_id = services.user.LookupUserID(cnxn, cur_user)
      except exceptions.NoSuchUserException:
        logging.info('could not convert user %r to int ID', val)
        return cond
    names_by_user_id[user_id].append(hotlists_str)

  hotlist_ids = set()
  for user_id, hotlist_names in names_by_user_id.items():
    if hotlist_names[0]:
      found_ids_by_key = services.features.LookupHotlistIDs(
          cnxn, hotlist_names, [user_id])
      hotlist_ids.update(list(found_ids_by_key.values()))
    else:
      candidate_hotlists = services.features.GetHotlistsByUserID(
          cnxn, user_id)
      hotlist_ids.update([
          hotlist.hotlist_id for hotlist in candidate_hotlists
          if user_id in hotlist.owner_ids])

  id_op = _TextOpToIntOp(cond.op)
  hotlist_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['hotlist_id']
  return ast_pb2.Condition(
      op=id_op, field_defs=[hotlist_id_fd], int_values=list(hotlist_ids))
485
486
def _PreprocessCustomCond(cnxn, cond, services, is_member):
  """Rewrite a custom_user_field=emails cond into IDs, if exact matches.

  User-typed field defs take priority.  Approval-typed field defs are
  converted only when the cond's key_suffix is the approver or set-by
  suffix.  Any other cond is returned unchanged.
  """
  # TODO(jrobbins): better support for ambiguous fields.
  # For now, if any field is USER_TYPE and the value being searched
  # for is the email address of an existing account, it will convert
  # to a user ID and we go with exact ID matching. Otherwise, we
  # leave the cond as-is for ast2select to do string matching on.
  user_fds = []
  for fd in cond.field_defs:
    if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE:
      user_fds.append(fd)
  if user_fds:
    return _PreprocessExactUsers(
        cnxn, cond, services.user, user_fds, is_member)

  approval_fds = []
  for fd in cond.field_defs:
    if fd.field_type == tracker_pb2.FieldTypes.APPROVAL_TYPE:
      approval_fds.append(fd)
  user_suffixes = [query2ast.APPROVER_SUFFIX, query2ast.SET_BY_SUFFIX]
  if approval_fds and cond.key_suffix in user_suffixes:
    return _PreprocessExactUsers(
        cnxn, cond, services.user, approval_fds, is_member)

  return cond
509
510
# Maps the name of a built-in search field to the function that simplifies
# conditions on that field.  Each function is called by _PreprocessCond() as
# f(cnxn, cond, project_ids, services, harmonized_config, is_member) and
# returns a Condition PB.  Built-in fields that are not listed here are
# passed through unchanged.
_PREPROCESSORS = {
    'open': _PreprocessIsOpenCond,
    'blocked': _PreprocessIsBlockedCond,
    'spam': _PreprocessIsSpamCond,
    'blockedon': _PreprocessBlockedOnCond,
    'blocking': _PreprocessBlockingCond,
    'mergedinto': _PreprocessMergedIntoCond,
    'status': _PreprocessStatusCond,
    'label': _PreprocessLabelCond,
    'component': _PreprocessComponentCond,
    'owner': _PreprocessOwnerCond,
    'cc': _PreprocessCcCond,
    'reporter': _PreprocessReporterCond,
    'starredby': _PreprocessStarredByCond,
    'commentby': _PreprocessCommentByCond,
    'hotlist': _PreprocessHotlistCond,
    }
528
529
def _PreprocessCond(
    cnxn, cond, project_ids, services, harmonized_config, is_member):
  """Simplify one cond by dispatching on the kind of field it searches.

  Custom-field conds go to _PreprocessCustomCond().  Built-in field conds
  go to the matching entry of _PREPROCESSORS, if there is one.  Otherwise
  the cond is returned unchanged.
  """
  # All the fields in a cond share the same name because they are parsed
  # from a user query term, and the term syntax allows just one field name.
  field_defs = cond.field_defs
  field_name = field_defs[0].field_name
  assert all(fd.field_name == field_name for fd in field_defs)

  # Case 1: The user is searching custom fields.
  if any(fd.field_id for fd in field_defs):
    # There can't be a mix of custom and built-in fields because built-in
    # field names are reserved and take priority over any conflicting ones.
    assert all(fd.field_id for fd in field_defs)
    return _PreprocessCustomCond(cnxn, cond, services, is_member)

  # Case 2: The user is searching a built-in field.
  handler = _PREPROCESSORS.get(field_name)
  if not handler:
    # No preprocessor exists for this built-in field.
    return cond
  return handler(
      cnxn, cond, project_ids, services, harmonized_config, is_member)
554
555
class MalformedQuery(ValueError):
  """Raised when a user query cannot be preprocessed as written."""