# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
5
"""Convert a user's issue search AST into a simplified AST.

This phase of query processing simplifies the user's query by looking up
the int IDs of any labels, statuses, or components that are mentioned by
name in the original query.  The data needed for lookups is typically cached
in RAM in each backend job, so this will not put much load on the DB.  The
simplified ASTs are later converted into SQL which is simpler and has
fewer joins.

The simplified main query is better because:
 + It is clearly faster, especially in the most common case where config
   data is in RAM.
 + Since less RAM is used to process the main query on each shard, query
   execution time is more consistent with less variability under load.  Less
   variability is good because the user must wait for the slowest shard.
 + The config tables (LabelDef, StatusDef, etc.) exist only on the primary DB,
   so they cannot be mentioned in a query that runs on a shard.
 + The query string itself is shorter when numeric IDs are substituted, which
   means that we can handle user queries with long lists of labels in a
   reasonable-sized query.
 + It bisects the complexity of the operation: it's easier to test and debug
   the lookup and simplification logic plus the main query logic this way
   than it would be to deal with an even more complex SQL main query.
"""
30from __future__ import print_function
31from __future__ import division
32from __future__ import absolute_import
33
34import collections
35import logging
36import re
37
38from framework import exceptions
39from proto import ast_pb2
40from proto import tracker_pb2
41# TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could
42# remove this dep.
43from search import query2ast
44from tracker import tracker_bizobj
45from features import federated
46
47
def PreprocessAST(
    cnxn, query_ast, project_ids, services, harmonized_config, is_member=True):
  """Build a simplified QueryAST by preprocessing every cond of the query.

  Args:
    cnxn: connection to SQL database.
    query_ast: user query abstract syntax tree parsed by query2ast.py.
    project_ids: collection of int project IDs to use to look up status values
        and labels.
    services: Connections to persistence layer for users and configs.
    harmonized_config: harmonized config for all projects being searched.
    is_member: True if user is a member of all the projects being searched,
        so they can do user substring searches.

  Returns:
    A new QueryAST PB with simplified conditions.  Specifically, string values
    for labels, statuses, and components are replaced with the int IDs of
    those items.  Also, is:open is distilled down to
    status_id != closed_status_ids.
  """
  def _SimplifyConjunction(conj):
    """Return a new Conjunction holding the preprocessed conds of conj."""
    simplified_conds = [
        _PreprocessCond(
            cnxn, cond, project_ids, services, harmonized_config, is_member)
        for cond in conj.conds]
    return ast_pb2.Conjunction(conds=simplified_conds)

  # One output conjunction per input conjunction, in the same order.
  return ast_pb2.QueryAST(
      conjunctions=[
          _SimplifyConjunction(conj) for conj in query_ast.conjunctions])
77
78
def _PreprocessIsOpenCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite an is:open cond as a status_id test against closed status IDs.

  Closed status IDs are gathered per project when project_ids is given, or
  across all projects otherwise.

  Raises:
    MalformedQuery: the cond uses an operator other than EQ or NE.
  """
  config_service = services.config
  if project_ids:
    closed_status_ids = [
        status_id
        for project_id in project_ids
        for status_id in config_service.LookupClosedStatusIDs(
            cnxn, project_id)]
  else:
    closed_status_ids = config_service.LookupClosedStatusIDsAnyProject(cnxn)

  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Open condition got nonsensical op %r' % cond.op)

  # The values are *closed* statuses, so "is open" means "status not in the
  # list" and "is not open" means "status in the list".
  if cond.op == ast_pb2.QueryOp.EQ:
    inverted_op = ast_pb2.QueryOp.NE
  else:
    inverted_op = ast_pb2.QueryOp.EQ

  status_id_field = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=inverted_op, field_defs=[status_id_field],
      int_values=closed_status_ids)
101
102
def _PreprocessIsBlockedCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
  """Rewrite an is:blocked cond as a defined/undefined test on blockedon_id.

  Raises:
    MalformedQuery: the cond uses an operator other than EQ or NE.
  """
  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Blocked condition got nonsensical op %r' % cond.op)

  # EQ asks for blocked issues (blockedon_id defined); NE asks for the rest.
  if cond.op == ast_pb2.QueryOp.EQ:
    defined_op = ast_pb2.QueryOp.IS_DEFINED
  else:
    defined_op = ast_pb2.QueryOp.IS_NOT_DEFINED

  blockedon_field = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(op=defined_op, field_defs=[blockedon_field])
115
116
def _PreprocessIsSpamCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config, _is_member):
  """Rewrite an is:spam cond as is_spam == 1, or is_spam == 0 when negated.

  Raises:
    MalformedQuery: the cond uses an operator other than EQ or NE.
  """
  if cond.op not in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE):
    raise MalformedQuery('Spam condition got nonsensical op %r' % cond.op)

  # The output op is always EQ; negation is expressed through the value.
  spam_flag = 1 if cond.op == ast_pb2.QueryOp.EQ else 0
  return ast_pb2.Condition(
      op=ast_pb2.QueryOp.EQ,
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['is_spam']],
      int_values=[spam_flag])
131
132
def _PreprocessBlockedOnCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite blockedon=xyz and has:blockedon conds to use blockedon_id.

  Issue refs in the cond are resolved to global issue IDs (int_values);
  external issue shortlinks are passed along unchanged (str_values).
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  local_issue_ids, external_refs = resolved
  blockedon_field = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[blockedon_field],
      int_values=local_issue_ids,
      str_values=external_refs)
147
148
def _PreprocessBlockingCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite blocking=xyz and has:blocking conds to use blocking_id.

  Issue refs in the cond are resolved to global issue IDs (int_values);
  external issue shortlinks are passed along unchanged (str_values).
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  local_issue_ids, external_refs = resolved
  blocking_field = query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[blocking_field],
      int_values=local_issue_ids,
      str_values=external_refs)
163
164
def _PreprocessMergedIntoCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite mergedinto=xyz and has:mergedinto conds to use mergedinto_id.

  Issue refs in the cond are resolved to global issue IDs (int_values);
  external issue shortlinks are passed along unchanged (str_values).
  """
  resolved = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  local_issue_ids, external_refs = resolved
  mergedinto_field = query2ast.BUILTIN_ISSUE_FIELDS['mergedinto_id']
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[mergedinto_field],
      int_values=local_issue_ids,
      str_values=external_refs)
179
180
def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services):
  """Resolve the issue refs in cond.str_values to global issue IDs.

  Args:
    cnxn: connection to SQL database.
    cond: Condition PB whose str_values hold issue references.
    project_ids: int project IDs that the refs may belong to.
    services: connections to the persistence layer.

  Returns:
    A pair (issue_ids, ext_issue_ids): the IDs returned by ResolveIssueRefs
    for the parsed refs, and the values accepted as external issue
    shortlinks (strings like 'b/1234').

  Raises:
    MalformedQuery: a ref has no project prefix while more than one (or no)
        project is being searched, or a value is neither a parsable issue
        ref nor an external shortlink.
  """
  # Index the searched projects by name.
  projects_by_id = services.project.GetProjects(cnxn, project_ids)
  ref_projects = {}
  for project in projects_by_id.values():
    ref_projects[project.project_name] = project

  # A bare local ID is only unambiguous when exactly one project is searched.
  if len(ref_projects) == 1:
    default_project_name = next(iter(ref_projects))
  else:
    default_project_name = None

  refs = []  # (project_name, local_id) pairs.
  ext_issue_ids = []  # Strings like 'b/1234'.
  for val in cond.str_values:
    try:
      project_name, local_id = tracker_bizobj.ParseIssueRef(val)
      if not project_name:
        if not default_project_name:
          # TODO(rmistry): Support the below.
          raise MalformedQuery(
              'Searching for issues accross multiple/all projects without '
              'project prefixes is ambiguous and is currently not supported.')
        project_name = default_project_name
      refs.append((project_name, int(local_id)))
    except MalformedQuery:
      # MalformedQuery is a ValueError, so it must be let through before the
      # generic ValueError handler below gets a chance to reinterpret it.
      raise
    except ValueError:
      # Not a local issue ref; accept it only if it is an external shortlink.
      if not federated.FromShortlink(val):
        raise MalformedQuery('Could not parse issue reference: %s' % val)
      ext_issue_ids.append(val)

  issue_ids, _misses = services.issue.ResolveIssueRefs(
      cnxn, ref_projects, default_project_name, refs)
  return issue_ids, ext_issue_ids
218
219
def _PreprocessStatusCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite a status=names cond as a status_id cond over the matching IDs.

  Status IDs are looked up per project when project_ids is given, or across
  all projects otherwise.
  """
  config_service = services.config
  if project_ids:
    status_ids = [
        status_id
        for project_id in project_ids
        for status_id in config_service.LookupStatusIDs(
            cnxn, project_id, cond.str_values)]
  else:
    status_ids = config_service.LookupStatusIDsAnyProject(
        cnxn, cond.str_values)

  status_id_field = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[status_id_field],
      int_values=status_ids)
236
237
def _IsEqualityOp(op):
  """Tell whether op is one of the two equality operators, EQ or NE."""
  equality_ops = (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE)
  return op in equality_ops
241
242
def _IsDefinedOp(op):
  """Tell whether op is IS_DEFINED or IS_NOT_DEFINED."""
  defined_ops = (ast_pb2.QueryOp.IS_DEFINED, ast_pb2.QueryOp.IS_NOT_DEFINED)
  return op in defined_ops
246
247
def _TextOpToIntOp(op):
  """Map a text-matching op to the equality op to use once values are IDs.

  TEXT_HAS and KEY_HAS become EQ, NOT_TEXT_HAS becomes NE, and any other op
  is returned unchanged.
  """
  if op in (ast_pb2.QueryOp.TEXT_HAS, ast_pb2.QueryOp.KEY_HAS):
    return ast_pb2.QueryOp.EQ
  if op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    return ast_pb2.QueryOp.NE
  return op
255
256
def _MakePrefixRegex(cond):
  """Compile a case-insensitive regex for '<cond value>-<something>' strings.

  Each cond value is escaped, so it is matched literally as the part of the
  string before the dash.
  """
  escaped_prefixes = [re.escape(prefix) for prefix in cond.str_values]
  pattern = r'(%s)-.+' % '|'.join(escaped_prefixes)
  return re.compile(pattern, re.I)
261
262
def _MakeKeyValueRegex(cond):
  """Return a regex to match the first token and remaining text separately.

  Every cond value must look like 'key-value' and all values must share the
  same key.  The resulting case-insensitive regex matches strings that start
  with 'key-' and contain any of the values as a whole word after the dash.

  Args:
    cond: Condition PB whose str_values are 'key-value' strings.

  Returns:
    A compiled regular expression object.

  Raises:
    MalformedQuery: there are no values, a value has no '-' separator, or the
        values do not all share one key.
  """
  pairs = [val.split('-', 1) for val in cond.str_values]
  # Report unparsable input as a query error rather than letting tuple
  # unpacking fail with an opaque ValueError.
  if not pairs or any(len(pair) != 2 for pair in pairs):
    raise MalformedQuery(
        'KeyValue query needs values of the form key-value: %r' %
        cond.str_values)

  keys = [key for key, _value in pairs]
  values = [value for _key, value in pairs]
  if len(set(keys)) != 1:
    raise MalformedQuery(
        "KeyValue query with multiple different keys: %r" % cond.str_values)

  all_values = '|'.join(map(re.escape, values))
  # The key comes from the user's query just like the values do, so it must
  # be escaped too: a key such as 'c++' would otherwise be an invalid pattern
  # and 'a.b' would match more than the literal key.
  return re.compile(
      r'%s-.*\b(%s)\b.*' % (re.escape(keys[0]), all_values), re.I)
271
272
def _MakeWordBoundaryRegex(cond):
  """Compile a case-insensitive regex finding any cond value as a whole word.

  Each cond value is escaped, so it is matched literally.
  """
  escaped_words = [re.escape(word) for word in cond.str_values]
  pattern = r'.*\b(%s)\b.*' % '|'.join(escaped_words)
  return re.compile(pattern, re.I)
277
278
def _PreprocessLabelCond(
    cnxn, cond, project_ids, services, _harmonized_config, _is_member):
  """Rewrite a label cond as a label_id cond over the matching label IDs.

  How labels are matched depends on the cond's operator:
    - EQ / NE: exact lookup of the cond values.
    - IS_DEFINED / IS_NOT_DEFINED: labels matching the prefix regex.
    - KEY_HAS: labels matching the key-value regex.
    - anything else: labels matching the whole-word regex.

  Lookups are done per project when project_ids is given, or across all
  projects otherwise.
  """
  # Choose how to build the matching regex; None means exact-name lookup.
  if _IsEqualityOp(cond.op):
    make_regex = None
  elif _IsDefinedOp(cond.op):
    make_regex = _MakePrefixRegex
  elif cond.op == ast_pb2.QueryOp.KEY_HAS:
    make_regex = _MakeKeyValueRegex
  else:
    make_regex = _MakeWordBoundaryRegex

  config_service = services.config
  if project_ids:
    label_ids = []
    for project_id in project_ids:
      if make_regex is None:
        found_ids = config_service.LookupLabelIDs(
            cnxn, project_id, cond.str_values)
      else:
        found_ids = config_service.LookupIDsOfLabelsMatching(
            cnxn, project_id, make_regex(cond))
      label_ids.extend(found_ids)
  elif make_regex is None:
    label_ids = config_service.LookupLabelIDsAnyProject(
        cnxn, cond.str_values)
  else:
    label_ids = config_service.LookupIDsOfLabelsMatchingAnyProject(
        cnxn, make_regex(cond))

  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
      int_values=label_ids)
315
316
def _PreprocessComponentCond(
    cnxn, cond, project_ids, services, harmonized_config, _is_member):
  """Rewrite a component= or component:name cond as a component_id cond.

  Matching is exact for EQ/NE ops and non-exact for all other ops.
  """
  exact = _IsEqualityOp(cond.op)
  if project_ids:
    # Searching within specific projects: harmonized_config holds the config
    # data for all of them, so match component paths against it.
    component_ids = [
        component_id
        for comp_path in cond.str_values
        for component_id in tracker_bizobj.FindMatchingComponentIDs(
            comp_path, harmonized_config, exact=exact)]
  else:
    # Searching the whole site: there is no harmonized_config to use, so ask
    # the config service instead.
    component_ids = services.config.FindMatchingComponentIDsAnyProject(
        cnxn, cond.str_values, exact=exact)

  component_id_field = query2ast.BUILTIN_ISSUE_FIELDS['component_id']
  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[component_id_field],
      int_values=component_ids)
338
339
def _PreprocessExactUsers(
    cnxn, cond, user_service, id_fields, is_member):
  """Preprocess a foo=emails cond into foo_id=IDs, if exact user match.

  This preprocessing step converts string conditions to int ID conditions.
  E.g., [owner=email] to [owner_id=ID].  It only does it in cases
  where (a) the email was "me", so it was already converted to a string of
  digits in the search pipeline, or (b) it is "user@domain" which resolves to
  a known Monorail user.  It is also possible to search for, e.g.,
  [owner:substring], but such searches remain 'owner' field searches rather
  than 'owner_id', and they cannot be combined with the "me" keyword.

  Args:
    cnxn: connection to the DB.
    cond: original parsed query Condition PB.
    user_service: connection to user persistence layer.
    id_fields: list of the search fields to use if the conversion to IDs
        succeeds.
    is_member: True if user is a member of all the projects being searched,
        so they can do user substring searches.

  Returns:
    A new Condition PB that checks the id_fields.  Or, the original cond.

  Raises:
    MalformedQuery: A non-member used a query term that could be used to
        guess full user email addresses.
  """
  op = _TextOpToIntOp(cond.op)

  # Testing for any defined value needs no ID lookups at all.
  if _IsDefinedOp(op):
    return ast_pb2.Condition(op=op, field_defs=id_fields,
                             key_suffix=cond.key_suffix,
                             phase_name=cond.phase_name)

  # Only whole-value comparisons can be converted to IDs; substring
  # comparisons stay as string conds and are restricted to members.
  if not _IsEqualityOp(op):
    logging.info('could not convert to IDs because op is %r', op)
    if not is_member:
      raise MalformedQuery('Only project members may compare user strings')
    return cond

  user_ids = []
  for val in cond.str_values:
    try:
      user_id = int(val)
    except ValueError:
      try:
        user_id = user_service.LookupUserID(cnxn, val)
      except exceptions.NoSuchUserException:
        if is_member or val == 'me' or val.startswith('@'):
          return cond  # preprocessing failed, stick with the original cond.
        logging.info('could not convert user %r to int ID', val)
        if '@' in val:
          message = 'User email address not found'
        else:
          message = 'Only project members may search for user substrings'
        raise MalformedQuery(message)
    user_ids.append(user_id)

  return ast_pb2.MakeCond(
      op, id_fields, [], user_ids, key_suffix=cond.key_suffix,
      phase_name=cond.phase_name)
403
404
def _PreprocessOwnerCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite an owner=emails cond as owner_id=IDs, if exact user match."""
  user_service = services.user
  owner_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['owner_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, owner_id_fields, is_member)
411
412
def _PreprocessCcCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a cc=emails cond as cc_id=IDs, if exact user match."""
  user_service = services.user
  cc_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['cc_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, cc_id_fields, is_member)
419
420
def _PreprocessReporterCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a reporter=emails cond as reporter_id=IDs, if exact."""
  user_service = services.user
  reporter_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, reporter_id_fields, is_member)
427
428
def _PreprocessStarredByCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a starredby=emails cond as starredby_id=IDs, if exact."""
  user_service = services.user
  starredby_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, starredby_id_fields, is_member)
435
436
def _PreprocessCommentByCond(
    cnxn, cond, _project_ids, services, _harmonized_config, is_member):
  """Rewrite a commentby=emails cond as commentby_id=IDs, if exact."""
  user_service = services.user
  commentby_id_fields = [query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']]
  return _PreprocessExactUsers(
      cnxn, cond, user_service, commentby_id_fields, is_member)
443
444
def _PreprocessHotlistCond(
    cnxn, cond, _project_ids, services, _harmonized_config, _is_member):
  """Preprocess a hotlist query into hotlist_id=IDs.

  Handles a hotlist query in the form:
    'hotlist=<user_email>:<hotlist-name>,<hotlist-name>,<user2_email>:...
  A value without a ':' is taken as another hotlist name for the most recent
  user seen.  If a user cannot be resolved to an ID, the original cond is
  returned unchanged.
  """
  # TODO(jojwang): add support for searches that don't contain domain names.
  # eg jojwang:hotlist-name
  names_by_user_id = collections.defaultdict(list)
  cur_user = ''
  for val in cond.str_values:
    if ':' in val:
      cur_user, hotlist_name = val.split(':', 1)
    else:
      hotlist_name = val

    # The user part is either a numeric user ID already, or an email address
    # that has to be looked up.
    try:
      owner_id = int(cur_user)
    except ValueError:
      try:
        owner_id = services.user.LookupUserID(cnxn, cur_user)
      except exceptions.NoSuchUserException:
        logging.info('could not convert user %r to int ID', val)
        return cond
    names_by_user_id[owner_id].append(hotlist_name)

  hotlist_ids = set()
  for user_id, hotlist_names in names_by_user_id.items():
    if hotlist_names[0]:
      found_ids = services.features.LookupHotlistIDs(
          cnxn, hotlist_names, [user_id]).values()
    else:
      # An empty first name selects every hotlist that this user owns.
      user_hotlists = services.features.GetHotlistsByUserID(cnxn, user_id)
      found_ids = [
          hotlist.hotlist_id for hotlist in user_hotlists
          if user_id in hotlist.owner_ids]
    hotlist_ids.update(found_ids)

  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['hotlist_id']],
      int_values=list(hotlist_ids))
486
487
def _PreprocessCustomCond(cnxn, cond, services, is_member):
  """Rewrite a custom user-valued field cond into IDs, if exact matches.

  User-type fields, and approval-type fields queried with the approver or
  set-by suffix, are handed to _PreprocessExactUsers.  Any other cond is
  returned unchanged.
  """
  # TODO(jrobbins): better support for ambiguous fields.
  # For now, if any field is USER_TYPE and the value being searched
  # for is the email address of an existing account, it will convert
  # to a user ID and we go with exact ID matching.  Otherwise, we
  # leave the cond as-is for ast2select to do string matching on.
  user_fds = [
      fd for fd in cond.field_defs
      if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE]
  if user_fds:
    return _PreprocessExactUsers(
        cnxn, cond, services.user, user_fds, is_member)

  approval_fds = [
      fd for fd in cond.field_defs
      if fd.field_type == tracker_pb2.FieldTypes.APPROVAL_TYPE]
  if not approval_fds:
    return cond

  # Only these approval sub-fields hold users.
  user_suffixes = [query2ast.APPROVER_SUFFIX, query2ast.SET_BY_SUFFIX]
  if cond.key_suffix not in user_suffixes:
    return cond
  return _PreprocessExactUsers(
      cnxn, cond, services.user, approval_fds, is_member)
510
511
# Dispatch table used by _PreprocessCond: built-in field name -> the function
# that preprocesses conds on that field.
_PREPROCESSORS = dict(
    # is:<keyword> terms.
    open=_PreprocessIsOpenCond,
    blocked=_PreprocessIsBlockedCond,
    spam=_PreprocessIsSpamCond,
    # Issue reference fields.
    blockedon=_PreprocessBlockedOnCond,
    blocking=_PreprocessBlockingCond,
    mergedinto=_PreprocessMergedIntoCond,
    # Project config lookups.
    status=_PreprocessStatusCond,
    label=_PreprocessLabelCond,
    component=_PreprocessComponentCond,
    # User fields.
    owner=_PreprocessOwnerCond,
    cc=_PreprocessCcCond,
    reporter=_PreprocessReporterCond,
    starredby=_PreprocessStarredByCond,
    commentby=_PreprocessCommentByCond,
    # Hotlists.
    hotlist=_PreprocessHotlistCond,
)
529
530
def _PreprocessCond(
    cnxn, cond, project_ids, services, harmonized_config, is_member):
  """Preprocess one cond, dispatching on the field that it searches.

  Custom-field conds go to _PreprocessCustomCond.  Built-in field conds go to
  the matching entry of _PREPROCESSORS, and are returned unchanged when there
  is no such entry.
  """
  field_defs = cond.field_defs
  # All the fields in a cond share the same name because they are parsed
  # from a user query term, and the term syntax allows just one field name.
  field_name = field_defs[0].field_name
  assert all(fd.field_name == field_name for fd in field_defs)

  # Case 1: The user is searching custom fields.
  if any(fd.field_id for fd in field_defs):
    # There can't be a mix of custom and built-in fields because built-in
    # field names are reserved and take priority over any conflicting ones.
    assert all(fd.field_id for fd in field_defs)
    return _PreprocessCustomCond(cnxn, cond, services, is_member)

  # Case 2: The user is searching a built-in field.
  preproc = _PREPROCESSORS.get(field_name)
  if not preproc:
    # We don't have a preprocessor for it.
    return cond
  # We have a preprocessor for that built-in field.
  return preproc(
      cnxn, cond, project_ids, services, harmonized_config, is_member)
555
556
class MalformedQuery(ValueError):
  """Raised when a user's query cannot be preprocessed as written."""