# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd

| 6 | """ Set of functions for detaling with spam reports. |
| 7 | """ |
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import collections
import logging
import settings
import sys

from collections import defaultdict
from features import filterrules_helpers
from framework import sql
from framework import framework_constants
from infra_libs import ts_mon
from services import ml_helpers


SPAMREPORT_TABLE_NAME = 'SpamReport'
SPAMVERDICT_TABLE_NAME = 'SpamVerdict'
ISSUE_TABLE = 'Issue'

REASON_MANUAL = 'manual'
REASON_THRESHOLD = 'threshold'
REASON_CLASSIFIER = 'classifier'
REASON_FAIL_OPEN = 'fail_open'
SPAM_CLASS_LABEL = '1'

SPAMREPORT_ISSUE_COLS = ['issue_id', 'reported_user_id', 'user_id']
SPAMVERDICT_ISSUE_COL = ['created', 'content_created', 'user_id',
                         'reported_user_id', 'comment_id', 'issue_id']
MANUALVERDICT_ISSUE_COLS = ['user_id', 'issue_id', 'is_spam', 'reason',
                            'project_id']
THRESHVERDICT_ISSUE_COLS = ['issue_id', 'is_spam', 'reason', 'project_id']

SPAMREPORT_COMMENT_COLS = ['comment_id', 'reported_user_id', 'user_id']
MANUALVERDICT_COMMENT_COLS = ['user_id', 'comment_id', 'is_spam', 'reason',
                              'project_id']
THRESHVERDICT_COMMENT_COLS = ['comment_id', 'is_spam', 'reason', 'project_id']
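
# These column lists line up positionally with the row tuples handed to
# InsertRows in the methods below; e.g. a THRESHVERDICT_ISSUE_COLS row, as
# built in FlagIssues, is the tuple
#
#   (issue_id, flagged_spam, REASON_THRESHOLD, project_id)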


class SpamService(object):
  """The persistence layer for spam reports."""
  issue_actions = ts_mon.CounterMetric(
      'monorail/spam_svc/issue', 'Count of things that happen to issues.', [
          ts_mon.StringField('type'),
          ts_mon.StringField('reporter_id'),
          ts_mon.StringField('issue')
      ])
  comment_actions = ts_mon.CounterMetric(
      'monorail/spam_svc/comment', 'Count of things that happen to comments.', [
          ts_mon.StringField('type'),
          ts_mon.StringField('reporter_id'),
          ts_mon.StringField('issue'),
          ts_mon.StringField('comment_id')
      ])
  ml_engine_failures = ts_mon.CounterMetric(
      'monorail/spam_svc/ml_engine_failure',
      'Failures calling the ML Engine API',
      None)

  def __init__(self):
    self.report_tbl = sql.SQLTableManager(SPAMREPORT_TABLE_NAME)
    self.verdict_tbl = sql.SQLTableManager(SPAMVERDICT_TABLE_NAME)
    self.issue_tbl = sql.SQLTableManager(ISSUE_TABLE)

    # The ML Engine client library is lazily loaded on first use.
    self.ml_engine = None

  def LookupIssuesFlaggers(self, cnxn, issue_ids):
    """Returns users who've reported the issues or their comments as spam.

    Returns a dictionary {issue_id: (issue_reporters, comment_reporters)},
    where issue_reporters is a list of users who flagged the issue and
    comment_reporters is a dictionary {comment_id: [user_ids]} of the users
    who flagged each comment.
    """
    rows = self.report_tbl.Select(
        cnxn, cols=['issue_id', 'user_id', 'comment_id'],
        issue_id=issue_ids)

    reporters = collections.defaultdict(
        # Return a tuple of (issue_reporters, comment_reporters) as described
        # above.
        lambda: ([], collections.defaultdict(list)))

    for row in rows:
      issue_id = int(row[0])
      user_id = row[1]
      if row[2]:
        comment_id = row[2]
        reporters[issue_id][1][comment_id].append(user_id)
      else:
        reporters[issue_id][0].append(user_id)

    return reporters
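
  # Illustrative use of the returned structure (ids are hypothetical):
  #
  #   flaggers = spam_service.LookupIssuesFlaggers(cnxn, [78901])
  #   issue_reporters, comment_reporters = flaggers[78901]
  #   # issue_reporters   -> [111, 222]     users who flagged the issue
  #   # comment_reporters -> {3001: [111]}  comment_id -> flagger user_ids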

  def LookupIssueFlaggers(self, cnxn, issue_id):
    """Returns users who've reported the issue or its comments as spam.

    Returns a tuple. First element is a list of users who flagged the issue;
    second element is a dictionary of comment id to a list of users who
    flagged that comment.
    """
    return self.LookupIssuesFlaggers(cnxn, [issue_id])[issue_id]

  def LookupIssueFlagCounts(self, cnxn, issue_ids):
    """Returns a map of issue_id to the number of spam flags on the issue."""
    rows = self.report_tbl.Select(cnxn, cols=['issue_id', 'COUNT(*)'],
        issue_id=issue_ids, group_by=['issue_id'])
    counts = {}
    for row in rows:
      counts[int(row[0])] = row[1]
    return counts

  def LookupIssueVerdicts(self, cnxn, issue_ids):
    """Returns a map of issue_id to the reason of its most recent verdict."""
    rows = self.verdict_tbl.Select(cnxn,
        cols=['issue_id', 'reason', 'MAX(created)'],
        issue_id=issue_ids, comment_id=None,
        group_by=['issue_id'])
    verdicts = {}
    for row in rows:
      verdicts[int(row[0])] = row[1]
    return verdicts
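
  # Illustrative: the returned map carries the reason string of each issue's
  # most recent verdict, e.g. {78901: REASON_MANUAL} for an issue last judged
  # by an admin; FlagIssues relies on exactly this to keep user flags from
  # overriding a manual verdict.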

  def LookupIssueVerdictHistory(self, cnxn, issue_ids):
    """Returns a list of verdict dicts for the given issues, oldest first."""
    rows = self.verdict_tbl.Select(cnxn, cols=[
        'issue_id', 'reason', 'created', 'is_spam', 'classifier_confidence',
        'user_id', 'overruled'],
        issue_id=issue_ids, order_by=[('issue_id', []), ('created', [])])

    # TODO: group by issue_id, make class instead of dict for verdict.
    verdicts = []
    for row in rows:
      verdicts.append({
          'issue_id': row[0],
          'reason': row[1],
          'created': row[2],
          'is_spam': row[3],
          'classifier_confidence': row[4],
          'user_id': row[5],
          'overruled': row[6],
      })

    return verdicts

  def LookupCommentVerdictHistory(self, cnxn, comment_ids):
    """Returns a list of verdict dicts for the given comments, oldest first."""
    rows = self.verdict_tbl.Select(cnxn, cols=[
        'comment_id', 'reason', 'created', 'is_spam', 'classifier_confidence',
        'user_id', 'overruled'],
        comment_id=comment_ids, order_by=[('comment_id', []), ('created', [])])

    # TODO: group by comment_id, make class instead of dict for verdict.
    verdicts = []
    for row in rows:
      verdicts.append({
          'comment_id': row[0],
          'reason': row[1],
          'created': row[2],
          'is_spam': row[3],
          'classifier_confidence': row[4],
          'user_id': row[5],
          'overruled': row[6],
      })

    return verdicts

  def FlagIssues(self, cnxn, issue_service, issues, reporting_user_id,
                 flagged_spam):
    """Creates or deletes a spam report on an issue."""
    verdict_updates = []
    if flagged_spam:
      rows = [(issue.issue_id, issue.reporter_id, reporting_user_id)
              for issue in issues]
      self.report_tbl.InsertRows(cnxn, SPAMREPORT_ISSUE_COLS, rows,
                                 ignore=True)
    else:
      issue_ids = [issue.issue_id for issue in issues]
      self.report_tbl.Delete(
          cnxn, issue_id=issue_ids, user_id=reporting_user_id,
          comment_id=None)

    # All issues are assumed to belong to the same project.
    project_id = issues[0].project_id

    # Now record new verdicts and update issue.is_spam, if they've changed.
    ids = [issue.issue_id for issue in issues]
    counts = self.LookupIssueFlagCounts(cnxn, ids)
    previous_verdicts = self.LookupIssueVerdicts(cnxn, ids)

    for issue_id in counts:
      # If the flag count changed enough to toggle the is_spam bit, we need
      # to record a new verdict and update the Issue.

      # No number of user spam flags can overturn an admin's verdict.
      if previous_verdicts.get(issue_id) == REASON_MANUAL:
        continue

      # If enough spam flags come in, mark the issue as spam.
      if flagged_spam and counts[issue_id] >= settings.spam_flag_thresh:
        verdict_updates.append(issue_id)

    if not verdict_updates:
      return

    # Some of the issues may have exceeded the flag threshold, so record
    # verdicts and mark them as spam in those cases.
    rows = [(issue_id, flagged_spam, REASON_THRESHOLD, project_id)
            for issue_id in verdict_updates]
    self.verdict_tbl.InsertRows(cnxn, THRESHVERDICT_ISSUE_COLS, rows,
                                ignore=True)
    update_issues = []
    for issue in issues:
      if issue.issue_id in verdict_updates:
        issue.is_spam = flagged_spam
        update_issues.append(issue)

    if flagged_spam:
      for issue in update_issues:
        issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
        self.issue_actions.increment(
            {
                'type': 'flag',
                'reporter_id': str(reporting_user_id),
                'issue': issue_ref
            })

    issue_service.UpdateIssues(cnxn, update_issues, update_cols=['is_spam'])
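
  # Sketch of the threshold flow (hypothetical values): with
  # settings.spam_flag_thresh == 3, three distinct users each calling
  #
  #   spam_service.FlagIssues(cnxn, issue_service, [issue], user_id, True)
  #
  # insert one SpamReport row apiece; on the third call the flag count
  # reaches the threshold, a REASON_THRESHOLD verdict row is written, and
  # issue.is_spam is persisted via issue_service.UpdateIssues.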

  def FlagComment(
      self, cnxn, issue, comment_id, reported_user_id, reporting_user_id,
      flagged_spam):
    """Creates or deletes a spam report on a comment."""
    # TODO(seanmccullough): Bulk comment flagging? There's no UI for that.
    if flagged_spam:
      self.report_tbl.InsertRow(
          cnxn,
          ignore=True,
          issue_id=issue.issue_id,
          comment_id=comment_id,
          reported_user_id=reported_user_id,
          user_id=reporting_user_id)
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.comment_actions.increment(
          {
              'type': 'flag',
              'reporter_id': str(reporting_user_id),
              'issue': issue_ref,
              'comment_id': str(comment_id)
          })
    else:
      self.report_tbl.Delete(
          cnxn,
          issue_id=issue.issue_id,
          comment_id=comment_id,
          user_id=reporting_user_id)

  def RecordClassifierIssueVerdict(self, cnxn, issue, is_spam, confidence,
                                   fail_open):
    """Records the classifier's verdict on a newly created issue."""
    reason = REASON_FAIL_OPEN if fail_open else REASON_CLASSIFIER
    self.verdict_tbl.InsertRow(cnxn, issue_id=issue.issue_id, is_spam=is_spam,
        reason=reason, classifier_confidence=confidence,
        project_id=issue.project_id)
    if is_spam:
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.issue_actions.increment(
          {
              'type': 'classifier',
              'reporter_id': 'classifier',
              'issue': issue_ref
          })
    # This is called at issue creation time, so there's nothing else to do
    # here.

  def RecordManualIssueVerdicts(self, cnxn, issue_service, issues, user_id,
                                is_spam):
    """Records admin verdicts on issues, overruling all prior verdicts."""
    rows = [(user_id, issue.issue_id, is_spam, REASON_MANUAL, issue.project_id)
            for issue in issues]
    issue_ids = [issue.issue_id for issue in issues]

    # Overrule all previous verdicts.
    self.verdict_tbl.Update(cnxn, {'overruled': True}, [
        ('issue_id IN (%s)' % sql.PlaceHolders(issue_ids), issue_ids)
        ], commit=False)

    self.verdict_tbl.InsertRows(cnxn, MANUALVERDICT_ISSUE_COLS, rows,
                                ignore=True)

    for issue in issues:
      issue.is_spam = is_spam

    if is_spam:
      for issue in issues:
        issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
        self.issue_actions.increment(
            {
                'type': 'manual',
                'reporter_id': str(user_id),
                'issue': issue_ref
            })
    else:
      issue_service.AllocateNewLocalIDs(cnxn, issues)

    # This will commit the transaction.
    issue_service.UpdateIssues(cnxn, issues, update_cols=['is_spam'])

  def RecordManualCommentVerdict(self, cnxn, issue_service, user_service,
                                 comment_id, user_id, is_spam):
    """Records an admin verdict on a comment and soft-deletes it if spam."""
    # TODO(seanmccullough): Bulk comment verdicts? There's no UI for that.
    self.verdict_tbl.InsertRow(cnxn, ignore=True,
        user_id=user_id, comment_id=comment_id, is_spam=is_spam,
        reason=REASON_MANUAL)
    comment = issue_service.GetComment(cnxn, comment_id)
    comment.is_spam = is_spam
    issue = issue_service.GetIssue(cnxn, comment.issue_id, use_cache=False)
    issue_service.SoftDeleteComment(
        cnxn, issue, comment, user_id, user_service, is_spam, True, is_spam)
    if is_spam:
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.comment_actions.increment(
          {
              'type': 'manual',
              'reporter_id': str(user_id),
              'issue': issue_ref,
              'comment_id': str(comment_id)
          })

  def RecordClassifierCommentVerdict(
      self, cnxn, issue_service, comment, is_spam, confidence, fail_open):
    """Records the classifier's verdict on a newly created comment."""
    reason = REASON_FAIL_OPEN if fail_open else REASON_CLASSIFIER
    self.verdict_tbl.InsertRow(cnxn, comment_id=comment.id, is_spam=is_spam,
        reason=reason, classifier_confidence=confidence,
        project_id=comment.project_id)
    if is_spam:
      issue = issue_service.GetIssue(cnxn, comment.issue_id, use_cache=False)
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.comment_actions.increment(
          {
              'type': 'classifier',
              'reporter_id': 'classifier',
              'issue': issue_ref,
              'comment_id': str(comment.id)
          })

  def _predict(self, instance):
    """Requests a prediction from the ML Engine API.

    Sample API response:
      {'predictions': [{
        'classes': ['0', '1'],
        'scores': [0.4986788034439087, 0.5013211965560913]
      }]}

    This hits the default model.

    Returns:
      A floating point number representing the confidence
      the instance is spam.
    """
    model_name = 'projects/%s/models/%s' % (
        settings.classifier_project_id, settings.spam_model_name)
    body = {'instances': [{'inputs': instance['word_hashes']}]}

    if not self.ml_engine:
      self.ml_engine = ml_helpers.setup_ml_engine()

    request = self.ml_engine.projects().predict(name=model_name, body=body)
    response = request.execute()
    logging.info('ML Engine API response: %r', response)
    prediction = response['predictions'][0]

    # Ensure the class confidence we return is for the spam label, not the
    # ham label. The spam label, '1', is usually at index 1, but the API does
    # not guarantee label order.
    if prediction['classes'][1] == SPAM_CLASS_LABEL:
      return prediction['scores'][1]
    elif prediction['classes'][0] == SPAM_CLASS_LABEL:
      return prediction['scores'][0]
    else:
      raise Exception('No predicted classes found.')
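
  # Tracing the sample response in the docstring through the label check:
  # classes is ['0', '1'], so classes[1] == SPAM_CLASS_LABEL and the spam
  # confidence returned is scores[1] (0.5013...). If a model listed the
  # labels in the opposite order, the elif branch would return scores[0].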

  def _IsExempt(self, author, is_project_member):
    """Return True if the user is exempt from spam checking."""
    if author.email is not None and author.email.endswith(
        settings.spam_allowlisted_suffixes):
      logging.info('%s allowlisted from spam filtering', author.email)
      return True

    if is_project_member:
      logging.info('%s is a project member, assuming ham', author.email)
      return True

    return False
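
  # Illustrative: if settings.spam_allowlisted_suffixes were
  # ('@chromium.org',) (a hypothetical value), 'dev@chromium.org' would be
  # exempt; str.endswith accepts a tuple of suffixes, which is what lets a
  # single call check the whole allowlist.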

  def ClassifyIssue(self, issue, firstComment, reporter, is_project_member):
    """Classify an issue as either spam or ham.

    Args:
      issue: the Issue.
      firstComment: the first Comment on issue.
      reporter: User PB for the Issue reporter.
      is_project_member: True if reporter is a member of issue's project.

    Returns:
      A dict with a 'confidence_is_spam' score and a 'failed_open' flag,
      as produced by _classify.
    """
    instance = ml_helpers.GenerateFeaturesRaw(
        [issue.summary, firstComment.content],
        settings.spam_feature_hashes)
    return self._classify(instance, reporter, is_project_member)

  def ClassifyComment(self, comment_content, commenter, is_project_member=True):
    """Classify a comment as either spam or ham.

    Args:
      comment_content: the comment text.
      commenter: User PB for the user who authored the comment.
      is_project_member: True if commenter is a member of the project.

    Returns:
      A dict with a 'confidence_is_spam' score and a 'failed_open' flag,
      as produced by _classify.
    """
    instance = ml_helpers.GenerateFeaturesRaw(
        ['', comment_content],
        settings.spam_feature_hashes)
    return self._classify(instance, commenter, is_project_member)
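
  # In both Classify* methods, `instance` is a dict whose 'word_hashes'
  # entry is the feature vector that _predict sends to the model; a sketch
  # of the shape, with made-up values:
  #
  #   instance = {'word_hashes': [132, 7, 94, 7]}  # hypothetical hashes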

  def _classify(self, instance, author, is_project_member):
    # Fail-safe: not spam.
    result = self.ham_classification()

    if self._IsExempt(author, is_project_member):
      return result

    if not self.ml_engine:
      self.ml_engine = ml_helpers.setup_ml_engine()

    # If setup_ml_engine returns None, it failed to init.
    if not self.ml_engine:
      logging.error('ML Engine not initialized.')
      self.ml_engine_failures.increment()
      result['failed_open'] = True
      return result

    remaining_retries = 3
    while remaining_retries > 0:
      try:
        result['confidence_is_spam'] = self._predict(instance)
        result['failed_open'] = False
        return result
      except Exception as ex:
        remaining_retries -= 1
        self.ml_engine_failures.increment()
        logging.error('Error calling ML Engine API: %s', ex)

    result['failed_open'] = True
    return result
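
  # A caller-side sketch (the threshold name here is hypothetical): the
  # result dict is consumed by honoring fail-open separately from the
  # confidence score, e.g.
  #
  #   result = self._classify(instance, author, is_project_member)
  #   is_spam = (not result['failed_open']
  #              and result['confidence_is_spam'] >= some_thresh)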

  def ham_classification(self):
    return {'confidence_is_spam': 0.0,
            'failed_open': False}

  def GetIssueClassifierQueue(
      self, cnxn, _issue_service, project_id, offset=0, limit=10):
    """Returns list of recent issues with spam verdicts,
    ranked in ascending order of confidence (so uncertain items are first).
    """
    # TODO(seanmccullough): Optimize pagination. This query probably gets
    # slower as the number of SpamVerdicts grows, regardless of offset
    # and limit values used here. Using offset,limit in general may not
    # be the best way to do this.
    issue_results = self.verdict_tbl.Select(
        cnxn,
        cols=[
            'issue_id', 'is_spam', 'reason', 'classifier_confidence', 'created'
        ],
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('issue_id IS NOT NULL', []),
        ],
        order_by=[
            ('classifier_confidence ASC', []),
            ('created ASC', []),
        ],
        group_by=['issue_id'],
        offset=offset,
        limit=limit,
    )

    ret = []
    for row in issue_results:
      ret.append(
          ModerationItem(
              issue_id=int(row[0]),
              is_spam=row[1] == 1,
              reason=row[2],
              classifier_confidence=row[3],
              verdict_time='%s' % row[4],
          ))

    count = self.verdict_tbl.SelectValue(
        cnxn,
        col='COUNT(*)',
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('issue_id IS NOT NULL', []),
        ])

    return ret, count
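
  # One keyset-style alternative to the offset pagination flagged in the
  # TODO above (a raw-SQL sketch, independent of the sql helper layer):
  # carry the last (classifier_confidence, created, issue_id) seen and
  # filter with
  #
  #   WHERE (classifier_confidence, created, issue_id) > (%s, %s, %s)
  #   ORDER BY classifier_confidence, created, issue_id LIMIT %s
  #
  # so each page resumes from its predecessor's endpoint rather than
  # rescanning from the start.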

  def GetIssueFlagQueue(
      self, cnxn, _issue_service, project_id, offset=0, limit=10):
    """Returns list of recent issues that have been flagged by users."""
    issue_flags = self.report_tbl.Select(
        cnxn,
        cols=[
            'Issue.project_id', 'SpamReport.issue_id', 'COUNT(*) AS count',
            'MAX(SpamReport.created) AS latest',
            'COUNT(DISTINCT SpamReport.user_id) AS users'
        ],
        left_joins=['Issue ON Issue.id = SpamReport.issue_id'],
        where=[
            ('SpamReport.issue_id IS NOT NULL', []),
            ('Issue.project_id = %s', [project_id])
        ],
        order_by=[('count DESC', [])],
        group_by=['SpamReport.issue_id'],
        offset=offset,
        limit=limit)
    ret = []
    for row in issue_flags:
      ret.append(
          ModerationItem(
              project_id=row[0],
              issue_id=row[1],
              count=row[2],
              latest_report=row[3],
              num_users=row[4],
          ))

    count = self.report_tbl.SelectValue(
        cnxn,
        col='COUNT(DISTINCT SpamReport.issue_id)',
        where=[('Issue.project_id = %s', [project_id])],
        left_joins=['Issue ON Issue.id = SpamReport.issue_id'])
    return ret, count

  def GetCommentClassifierQueue(
      self, cnxn, _issue_service, project_id, offset=0, limit=10):
    """Returns list of recent comments with spam verdicts,
    ranked in ascending order of confidence (so uncertain items are first).
    """
    # TODO(seanmccullough): Optimize pagination. This query probably gets
    # slower as the number of SpamVerdicts grows, regardless of offset
    # and limit values used here. Using offset,limit in general may not
    # be the best way to do this.
    comment_results = self.verdict_tbl.Select(
        cnxn,
        cols=[
            'comment_id', 'is_spam', 'reason', 'classifier_confidence',
            'created'
        ],
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('comment_id IS NOT NULL', []),
        ],
        order_by=[
            ('classifier_confidence ASC', []),
            ('created ASC', []),
        ],
        group_by=['comment_id'],
        offset=offset,
        limit=limit,
    )

    ret = []
    for row in comment_results:
      ret.append(
          ModerationItem(
              comment_id=int(row[0]),
              is_spam=row[1] == 1,
              reason=row[2],
              classifier_confidence=row[3],
              verdict_time='%s' % row[4],
          ))

    count = self.verdict_tbl.SelectValue(
        cnxn,
        col='COUNT(*)',
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('comment_id IS NOT NULL', []),
        ])

    return ret, count

  def GetTrainingIssues(self, cnxn, issue_service, since, offset=0, limit=100):
    """Returns list of recent issues with human-labeled spam/ham verdicts."""

    # Get all of the manual verdicts created after the `since` timestamp.
    results = self.verdict_tbl.Select(cnxn,
        cols=['issue_id'],
        where=[
            ('overruled = %s', [False]),
            ('reason = %s', ['manual']),
            ('issue_id IS NOT NULL', []),
            ('created > %s', [since.isoformat()]),
        ],
        offset=offset,
        limit=limit,
    )

    issue_ids = [int(row[0]) for row in results if row[0]]
    issues = issue_service.GetIssues(cnxn, issue_ids)
    comments = issue_service.GetCommentsForIssues(cnxn, issue_ids)
    first_comments = {}
    for issue in issues:
      first_comments[issue.issue_id] = (comments[issue.issue_id][0].content
          if issue.issue_id in comments else '[Empty]')

    count = self.verdict_tbl.SelectValue(cnxn,
        col='COUNT(*)',
        where=[
            ('overruled = %s', [False]),
            ('reason = %s', ['manual']),
            ('issue_id IS NOT NULL', []),
            ('created > %s', [since.isoformat()]),
        ])

    return issues, first_comments, count

  def GetTrainingComments(self, cnxn, issue_service, since, offset=0,
                          limit=100):
    """Returns list of recent comments with human-labeled spam/ham verdicts."""

    # Get all of the manual verdicts created after the `since` timestamp.
    results = self.verdict_tbl.Select(
        cnxn,
        distinct=True,
        cols=['comment_id'],
        where=[
            ('overruled = %s', [False]),
            ('reason = %s', ['manual']),
            ('comment_id IS NOT NULL', []),
            ('created > %s', [since.isoformat()]),
        ],
        offset=offset,
        limit=limit,
    )

    comment_ids = [int(row[0]) for row in results if row[0]]
    # We don't care about comment sequence numbers in this context yet, so
    # pass zeroes for all of them.
    comments = issue_service.GetCommentsByID(cnxn, comment_ids,
        defaultdict(int))
    return comments

  def ExpungeUsersInSpam(self, cnxn, user_ids):
    """Removes all references to the given users from spam DB tables.

    This method does not commit the operations and does not make changes
    to in-memory data.
    """
    commit = False
    self.report_tbl.Delete(cnxn, reported_user_id=user_ids, commit=commit)
    self.report_tbl.Delete(cnxn, user_id=user_ids, commit=commit)
    self.verdict_tbl.Delete(cnxn, user_id=user_ids, commit=commit)
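
  # Because every Delete above passes commit=False, the caller owns the
  # transaction boundary; an illustrative call sequence (assuming the
  # connection object exposes Commit()):
  #
  #   spam_service.ExpungeUsersInSpam(cnxn, user_ids=[111, 222])
  #   cnxn.Commit()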


class ModerationItem(object):
  def __init__(self, **kwargs):
    self.__dict__ = kwargs
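
# ModerationItem is a thin attribute bag: whatever keyword arguments the
# constructor receives become instance attributes. Illustrative:
#
#   item = ModerationItem(issue_id=42, is_spam=True, reason='manual')
#   item.issue_id  # -> 42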