Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 1 | # Copyright 2016 The Chromium Authors |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 4 | |
| 5 | """ Set of functions for detaling with spam reports. |
| 6 | """ |
| 7 | from __future__ import print_function |
| 8 | from __future__ import division |
| 9 | from __future__ import absolute_import |
| 10 | |
| 11 | import collections |
| 12 | import logging |
| 13 | import settings |
Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 14 | import time |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 15 | |
| 16 | from collections import defaultdict |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 17 | from framework import sql |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 18 | from infra_libs import ts_mon |
| 19 | from services import ml_helpers |
| 20 | |
| 21 | |
| 22 | SPAMREPORT_TABLE_NAME = 'SpamReport' |
| 23 | SPAMVERDICT_TABLE_NAME = 'SpamVerdict' |
| 24 | ISSUE_TABLE = 'Issue' |
| 25 | |
| 26 | REASON_MANUAL = 'manual' |
| 27 | REASON_THRESHOLD = 'threshold' |
| 28 | REASON_CLASSIFIER = 'classifier' |
| 29 | REASON_FAIL_OPEN = 'fail_open' |
| 30 | SPAM_CLASS_LABEL = '1' |
| 31 | |
| 32 | SPAMREPORT_ISSUE_COLS = ['issue_id', 'reported_user_id', 'user_id'] |
| 33 | SPAMVERDICT_ISSUE_COL = ['created', 'content_created', 'user_id', |
| 34 | 'reported_user_id', 'comment_id', 'issue_id'] |
| 35 | MANUALVERDICT_ISSUE_COLS = ['user_id', 'issue_id', 'is_spam', 'reason', |
| 36 | 'project_id'] |
| 37 | THRESHVERDICT_ISSUE_COLS = ['issue_id', 'is_spam', 'reason', 'project_id'] |
| 38 | |
| 39 | SPAMREPORT_COMMENT_COLS = ['comment_id', 'reported_user_id', 'user_id'] |
| 40 | MANUALVERDICT_COMMENT_COLS = ['user_id', 'comment_id', 'is_spam', 'reason', |
| 41 | 'project_id'] |
| 42 | THRESHVERDICT_COMMENT_COLS = ['comment_id', 'is_spam', 'reason', 'project_id'] |
| 43 | |
| 44 | |
class SpamService(object):
  """The persistence layer for spam reports.

  Wraps the SpamReport and SpamVerdict tables and the ML Engine spam
  classifier, and emits ts_mon metrics for moderation actions.
  """
  # Counts actions (flag/classifier/manual) taken on issues.
  issue_actions = ts_mon.CounterMetric(
      'monorail/spam_svc/issue', 'Count of things that happen to issues.', [
          ts_mon.StringField('type'),
          ts_mon.StringField('reporter_id'),
          ts_mon.StringField('issue')
      ])
  # Counts actions (flag/classifier/manual) taken on comments.
  comment_actions = ts_mon.CounterMetric(
      'monorail/spam_svc/comment', 'Count of things that happen to comments.', [
          ts_mon.StringField('type'),
          ts_mon.StringField('reporter_id'),
          ts_mon.StringField('issue'),
          ts_mon.StringField('comment_id')
      ])
  # Counts failed calls to the ML Engine prediction API.
  ml_engine_failures = ts_mon.CounterMetric(
      'monorail/spam_svc/ml_engine_failure',
      'Failures calling the ML Engine API',
      None)
| 64 | |
| 65 | def __init__(self): |
| 66 | self.report_tbl = sql.SQLTableManager(SPAMREPORT_TABLE_NAME) |
| 67 | self.verdict_tbl = sql.SQLTableManager(SPAMVERDICT_TABLE_NAME) |
| 68 | self.issue_tbl = sql.SQLTableManager(ISSUE_TABLE) |
| 69 | |
| 70 | # ML Engine library is lazy loaded below. |
| 71 | self.ml_engine = None |
| 72 | |
| 73 | def LookupIssuesFlaggers(self, cnxn, issue_ids): |
| 74 | """Returns users who've reported the issues or their comments as spam. |
| 75 | |
| 76 | Returns a dictionary {issue_id: (issue_reporters, comment_reporters)} |
| 77 | issue_reportes is a list of users who flagged the issue; |
| 78 | comment_reporters element is a dictionary {comment_id: [user_ids]} where |
| 79 | user_ids are the users who flagged that comment. |
| 80 | """ |
| 81 | rows = self.report_tbl.Select( |
| 82 | cnxn, cols=['issue_id', 'user_id', 'comment_id'], |
| 83 | issue_id=issue_ids) |
| 84 | |
| 85 | reporters = collections.defaultdict( |
| 86 | # Return a tuple of (issue_reporters, comment_reporters) as described |
| 87 | # above. |
| 88 | lambda: ([], collections.defaultdict(list))) |
| 89 | |
| 90 | for row in rows: |
| 91 | issue_id = int(row[0]) |
| 92 | user_id = row[1] |
| 93 | if row[2]: |
| 94 | comment_id = row[2] |
| 95 | reporters[issue_id][1][comment_id].append(user_id) |
| 96 | else: |
| 97 | reporters[issue_id][0].append(user_id) |
| 98 | |
| 99 | return reporters |
| 100 | |
| 101 | def LookupIssueFlaggers(self, cnxn, issue_id): |
| 102 | """Returns users who've reported the issue or its comments as spam. |
| 103 | |
| 104 | Returns a tuple. First element is a list of users who flagged the issue; |
| 105 | second element is a dictionary of comment id to a list of users who flagged |
| 106 | that comment. |
| 107 | """ |
| 108 | return self.LookupIssuesFlaggers(cnxn, [issue_id])[issue_id] |
| 109 | |
Adrià Vilanova Martínez | de94280 | 2022-07-15 14:06:55 +0200 | [diff] [blame] | 110 | def _LookupIssueFlagCounts(self, cnxn, issue_ids): |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 111 | """Returns a map of issue_id to flag counts""" |
| 112 | rows = self.report_tbl.Select(cnxn, cols=['issue_id', 'COUNT(*)'], |
| 113 | issue_id=issue_ids, group_by=['issue_id']) |
| 114 | counts = {} |
| 115 | for row in rows: |
| 116 | counts[int(row[0])] = row[1] |
| 117 | return counts |
| 118 | |
| 119 | def LookupIssueVerdicts(self, cnxn, issue_ids): |
| 120 | """Returns a map of issue_id to most recent spam verdicts""" |
| 121 | rows = self.verdict_tbl.Select(cnxn, |
| 122 | cols=['issue_id', 'reason', 'MAX(created)'], |
| 123 | issue_id=issue_ids, comment_id=None, |
| 124 | group_by=['issue_id']) |
| 125 | counts = {} |
| 126 | for row in rows: |
| 127 | counts[int(row[0])] = row[1] |
| 128 | return counts |
| 129 | |
| 130 | def LookupIssueVerdictHistory(self, cnxn, issue_ids): |
| 131 | """Returns a map of issue_id to most recent spam verdicts""" |
| 132 | rows = self.verdict_tbl.Select(cnxn, cols=[ |
| 133 | 'issue_id', 'reason', 'created', 'is_spam', 'classifier_confidence', |
| 134 | 'user_id', 'overruled'], |
| 135 | issue_id=issue_ids, order_by=[('issue_id', []), ('created', [])]) |
| 136 | |
| 137 | # TODO: group by issue_id, make class instead of dict for verdict. |
| 138 | verdicts = [] |
| 139 | for row in rows: |
| 140 | verdicts.append({ |
| 141 | 'issue_id': row[0], |
| 142 | 'reason': row[1], |
| 143 | 'created': row[2], |
| 144 | 'is_spam': row[3], |
| 145 | 'classifier_confidence': row[4], |
| 146 | 'user_id': row[5], |
| 147 | 'overruled': row[6], |
| 148 | }) |
| 149 | |
| 150 | return verdicts |
| 151 | |
| 152 | def LookupCommentVerdictHistory(self, cnxn, comment_ids): |
| 153 | """Returns a map of issue_id to most recent spam verdicts""" |
| 154 | rows = self.verdict_tbl.Select(cnxn, cols=[ |
| 155 | 'comment_id', 'reason', 'created', 'is_spam', 'classifier_confidence', |
| 156 | 'user_id', 'overruled'], |
| 157 | comment_id=comment_ids, order_by=[('comment_id', []), ('created', [])]) |
| 158 | |
| 159 | # TODO: group by comment_id, make class instead of dict for verdict. |
| 160 | verdicts = [] |
| 161 | for row in rows: |
| 162 | verdicts.append({ |
| 163 | 'comment_id': row[0], |
| 164 | 'reason': row[1], |
| 165 | 'created': row[2], |
| 166 | 'is_spam': row[3], |
| 167 | 'classifier_confidence': row[4], |
| 168 | 'user_id': row[5], |
| 169 | 'overruled': row[6], |
| 170 | }) |
| 171 | |
| 172 | return verdicts |
| 173 | |
  def FlagIssues(self, cnxn, issue_service, issues, reporting_user_id,
                 flagged_spam):
    """Creates or deletes a spam report on an issue.

    This function is run when a user flags an issue as spam but does not
    have 'VerdictSpam' permission.

    Args:
      cnxn: connection to SQL database.
      issue_service: used to re-persist issues whose is_spam bit flips.
      issues: list of Issue PBs to flag or un-flag; assumes a non-empty list
        all in one project (project_id is read from issues[0]) — TODO confirm
        callers guarantee this.
      reporting_user_id: the user doing the flagging/un-flagging.
      flagged_spam: True to add spam reports, False to retract this user's
        reports.
    """
    verdict_updates = []
    if flagged_spam:
      rows = [(issue.issue_id, issue.reporter_id, reporting_user_id)
              for issue in issues]
      # ignore=True so re-flagging an already-flagged issue is a no-op.
      self.report_tbl.InsertRows(cnxn, SPAMREPORT_ISSUE_COLS, rows,
          ignore=True)
    else:
      issue_ids = [issue.issue_id for issue in issues]
      # comment_id=None: only remove issue-level reports, not comment flags.
      self.report_tbl.Delete(
          cnxn, issue_id=issue_ids, user_id=reporting_user_id,
          comment_id=None)

    project_id = issues[0].project_id

    # Now record new verdicts and update issue.is_spam, if they've changed.
    ids = [issue.issue_id for issue in issues]
    counts = self._LookupIssueFlagCounts(cnxn, ids)
    previous_verdicts = self.LookupIssueVerdicts(cnxn, ids)

    for issue_id in counts:
      # If the flag counts changed enough to toggle the is_spam bit, need to
      # record a new verdict and update the Issue.

      # No number of user spam flags can overturn an admin's verdict.
      if previous_verdicts.get(issue_id) == REASON_MANUAL:
        continue

      # If enough spam flags come in, mark the issue as spam.
      if (flagged_spam and counts[issue_id] >= settings.spam_flag_thresh):
        verdict_updates.append(issue_id)

    if len(verdict_updates) == 0:
      return

    # Some of the issues may have exceeded the flag threshold, so issue
    # verdicts and mark as spam in those cases.
    rows = [(issue_id, flagged_spam, REASON_THRESHOLD, project_id)
            for issue_id in verdict_updates]
    self.verdict_tbl.InsertRows(cnxn, THRESHVERDICT_ISSUE_COLS, rows,
        ignore=True)
    update_issues = []
    current_time = int(time.time())
    for issue in issues:
      if issue.issue_id in verdict_updates:
        issue.is_spam = flagged_spam
        # Record when this migration-relevant field last changed.
        issue.migration_modified_timestamp = current_time
        update_issues.append(issue)

    if flagged_spam:
      for issue in update_issues:
        issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
        self.issue_actions.increment(
            {
                'type': 'flag',
                'reporter_id': str(reporting_user_id),
                'issue': issue_ref
            })

    issue_service.UpdateIssues(
        cnxn, update_issues, update_cols=['is_spam', 'migration_modified'])
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 240 | |
| 241 | def FlagComment( |
| 242 | self, cnxn, issue, comment_id, reported_user_id, reporting_user_id, |
| 243 | flagged_spam): |
| 244 | """Creates or deletes a spam report on a comment.""" |
| 245 | # TODO(seanmccullough): Bulk comment flagging? There's no UI for that. |
| 246 | if flagged_spam: |
| 247 | self.report_tbl.InsertRow( |
| 248 | cnxn, |
| 249 | ignore=True, |
| 250 | issue_id=issue.issue_id, |
| 251 | comment_id=comment_id, |
| 252 | reported_user_id=reported_user_id, |
| 253 | user_id=reporting_user_id) |
| 254 | issue_ref = '%s:%s' % (issue.project_name, issue.local_id) |
| 255 | self.comment_actions.increment( |
| 256 | { |
| 257 | 'type': 'flag', |
| 258 | 'reporter_id': str(reporting_user_id), |
| 259 | 'issue': issue_ref, |
| 260 | 'comment_id': str(comment_id) |
| 261 | }) |
| 262 | else: |
| 263 | self.report_tbl.Delete( |
| 264 | cnxn, |
| 265 | issue_id=issue.issue_id, |
| 266 | comment_id=comment_id, |
| 267 | user_id=reporting_user_id) |
| 268 | |
| 269 | def RecordClassifierIssueVerdict(self, cnxn, issue, is_spam, confidence, |
| 270 | fail_open): |
Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 271 | """Records a judgment call on whether a new issue is spam. |
| 272 | |
| 273 | Only run when an issue is newly filed. If the issue is determined to be |
| 274 | likely spam, the code increments a counter.""" |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 275 | reason = REASON_FAIL_OPEN if fail_open else REASON_CLASSIFIER |
Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 276 | self.verdict_tbl.InsertRow( |
| 277 | cnxn, |
| 278 | issue_id=issue.issue_id, |
| 279 | is_spam=is_spam, |
| 280 | reason=reason, |
| 281 | classifier_confidence=confidence, |
| 282 | project_id=issue.project_id, |
| 283 | overruled=False) |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 284 | if is_spam: |
| 285 | issue_ref = '%s:%s' % (issue.project_name, issue.local_id) |
| 286 | self.issue_actions.increment( |
| 287 | { |
| 288 | 'type': 'classifier', |
| 289 | 'reporter_id': 'classifier', |
| 290 | 'issue': issue_ref |
| 291 | }) |
| 292 | # This is called at issue creation time, so there's nothing else to do here. |
| 293 | |
  def RecordManualIssueVerdicts(self, cnxn, issue_service, issues, user_id,
                                is_spam):
    """Bypasses the classifier to manually classify an issue as spam.

    This code can only be run by users with the 'VerdictSpam' permission.

    Args:
      cnxn: connection to SQL database.
      issue_service: used to re-persist the updated issues.
      issues: list of Issue PBs to mark or un-mark as spam.
      user_id: the admin issuing the verdict.
      is_spam: True to mark as spam, False to mark as ham.
    """
    rows = [(user_id, issue.issue_id, is_spam, REASON_MANUAL, issue.project_id)
            for issue in issues]
    issue_ids = [issue.issue_id for issue in issues]

    # Overrule all previous verdicts.
    self.verdict_tbl.Update(cnxn, {'overruled': True}, [
      ('issue_id IN (%s)' % sql.PlaceHolders(issue_ids), issue_ids)
      ], commit=False)

    # ignore=True: duplicate manual verdict rows are harmless.
    self.verdict_tbl.InsertRows(cnxn, MANUALVERDICT_ISSUE_COLS, rows,
        ignore=True)

    current_time = int(time.time())
    for issue in issues:
      issue.is_spam = is_spam
      # Record when this migration-relevant field last changed.
      issue.migration_modified_timestamp = current_time

    if is_spam:
      for issue in issues:
        issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
        self.issue_actions.increment(
            {
                'type': 'manual',
                'reporter_id': str(user_id),
                'issue': issue_ref
            })
    else:
      # Issues coming back from spam get fresh local IDs in their project.
      issue_service.AllocateNewLocalIDs(cnxn, issues)

    # This will commit the transaction.
    issue_service.UpdateIssues(
        cnxn, issues, update_cols=['is_spam', 'migration_modified'])
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 331 | |
  def RecordManualCommentVerdict(self, cnxn, issue_service, user_service,
                                 comment_id, user_id, is_spam):
    """Bypasses the classifier to manually classify a comment as spam.

    This code can only be run by users with the 'VerdictSpam' permission.

    Args:
      cnxn: connection to SQL database.
      issue_service: used to load the comment/issue and soft-delete.
      user_service: passed through to SoftDeleteComment.
      comment_id: the comment being judged.
      user_id: the admin issuing the verdict.
      is_spam: True to mark as spam, False to mark as ham.
    """
    # TODO(seanmccullough): Bulk comment verdicts? There's no UI for that.
    self.verdict_tbl.InsertRow(cnxn, ignore=True,
        user_id=user_id, comment_id=comment_id, is_spam=is_spam,
        reason=REASON_MANUAL)
    comment = issue_service.GetComment(cnxn, comment_id)
    comment.is_spam = is_spam
    issue = issue_service.GetIssue(cnxn, comment.issue_id, use_cache=False)
    # NOTE(review): the trailing positional args (is_spam, True, is_spam)
    # are opaque here — confirm against SoftDeleteComment's signature.
    issue_service.SoftDeleteComment(
        cnxn, issue, comment, user_id, user_service, is_spam, True, is_spam)
    if is_spam:
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.comment_actions.increment(
          {
              'type': 'manual',
              'reporter_id': str(user_id),
              'issue': issue_ref,
              'comment_id': str(comment_id)
          })
| 355 | |
| 356 | def RecordClassifierCommentVerdict( |
| 357 | self, cnxn, issue_service, comment, is_spam, confidence, fail_open): |
| 358 | reason = REASON_FAIL_OPEN if fail_open else REASON_CLASSIFIER |
| 359 | self.verdict_tbl.InsertRow(cnxn, comment_id=comment.id, is_spam=is_spam, |
| 360 | reason=reason, classifier_confidence=confidence, |
| 361 | project_id=comment.project_id) |
| 362 | if is_spam: |
| 363 | issue = issue_service.GetIssue(cnxn, comment.issue_id, use_cache=False) |
| 364 | issue_ref = '%s:%s' % (issue.project_name, issue.local_id) |
| 365 | self.comment_actions.increment( |
| 366 | { |
| 367 | 'type': 'classifier', |
| 368 | 'reporter_id': 'classifier', |
| 369 | 'issue': issue_ref, |
| 370 | 'comment_id': str(comment.id) |
| 371 | }) |
| 372 | |
| 373 | def _predict(self, instance): |
| 374 | """Requests a prediction from the ML Engine API. |
| 375 | |
| 376 | Sample API response: |
| 377 | {'predictions': [{ |
| 378 | 'classes': ['0', '1'], |
| 379 | 'scores': [0.4986788034439087, 0.5013211965560913] |
| 380 | }]} |
| 381 | |
| 382 | This hits the default model. |
| 383 | |
| 384 | Returns: |
| 385 | A floating point number representing the confidence |
| 386 | the instance is spam. |
| 387 | """ |
| 388 | model_name = 'projects/%s/models/%s' % ( |
| 389 | settings.classifier_project_id, settings.spam_model_name) |
| 390 | body = {'instances': [{"inputs": instance["word_hashes"]}]} |
| 391 | |
| 392 | if not self.ml_engine: |
| 393 | self.ml_engine = ml_helpers.setup_ml_engine() |
| 394 | |
| 395 | request = self.ml_engine.projects().predict(name=model_name, body=body) |
| 396 | response = request.execute() |
| 397 | logging.info('ML Engine API response: %r' % response) |
| 398 | prediction = response['predictions'][0] |
| 399 | |
| 400 | # Ensure the class confidence we return is for the spam, not the ham label. |
| 401 | # The spam label, '1', is usually at index 1 but I'm not sure of any |
| 402 | # guarantees around label order. |
| 403 | if prediction['classes'][1] == SPAM_CLASS_LABEL: |
| 404 | return prediction['scores'][1] |
| 405 | elif prediction['classes'][0] == SPAM_CLASS_LABEL: |
| 406 | return prediction['scores'][0] |
| 407 | else: |
| 408 | raise Exception('No predicted classes found.') |
| 409 | |
| 410 | def _IsExempt(self, author, is_project_member): |
| 411 | """Return True if the user is exempt from spam checking.""" |
| 412 | if author.email is not None and author.email.endswith( |
| 413 | settings.spam_allowlisted_suffixes): |
| 414 | logging.info('%s allowlisted from spam filtering', author.email) |
| 415 | return True |
| 416 | |
| 417 | if is_project_member: |
| 418 | logging.info('%s is a project member, assuming ham', author.email) |
| 419 | return True |
| 420 | |
| 421 | return False |
| 422 | |
| 423 | def ClassifyIssue(self, issue, firstComment, reporter, is_project_member): |
| 424 | """Classify an issue as either spam or ham. |
| 425 | |
| 426 | Args: |
| 427 | issue: the Issue. |
| 428 | firstComment: the first Comment on issue. |
| 429 | reporter: User PB for the Issue reporter. |
| 430 | is_project_member: True if reporter is a member of issue's project. |
| 431 | |
| 432 | Returns a JSON dict of classifier prediction results from |
| 433 | the ML Engine API. |
| 434 | """ |
| 435 | instance = ml_helpers.GenerateFeaturesRaw( |
| 436 | [issue.summary, firstComment.content], |
| 437 | settings.spam_feature_hashes) |
| 438 | return self._classify(instance, reporter, is_project_member) |
| 439 | |
| 440 | def ClassifyComment(self, comment_content, commenter, is_project_member=True): |
| 441 | """Classify a comment as either spam or ham. |
| 442 | |
| 443 | Args: |
| 444 | comment: the comment text. |
| 445 | commenter: User PB for the user who authored the comment. |
| 446 | |
| 447 | Returns a JSON dict of classifier prediction results from |
| 448 | the ML Engine API. |
| 449 | """ |
| 450 | instance = ml_helpers.GenerateFeaturesRaw( |
| 451 | ['', comment_content], |
| 452 | settings.spam_feature_hashes) |
| 453 | return self._classify(instance, commenter, is_project_member) |
| 454 | |
| 455 | |
| 456 | def _classify(self, instance, author, is_project_member): |
| 457 | # Fail-safe: not spam. |
| 458 | result = self.ham_classification() |
| 459 | |
| 460 | if self._IsExempt(author, is_project_member): |
| 461 | return result |
| 462 | |
| 463 | if not self.ml_engine: |
| 464 | self.ml_engine = ml_helpers.setup_ml_engine() |
| 465 | |
| 466 | # If setup_ml_engine returns None, it failed to init. |
| 467 | if not self.ml_engine: |
| 468 | logging.error("ML Engine not initialized.") |
| 469 | self.ml_engine_failures.increment() |
| 470 | result['failed_open'] = True |
| 471 | return result |
| 472 | |
| 473 | remaining_retries = 3 |
| 474 | while remaining_retries > 0: |
| 475 | try: |
| 476 | result['confidence_is_spam'] = self._predict(instance) |
| 477 | result['failed_open'] = False |
| 478 | return result |
| 479 | except Exception as ex: |
| 480 | remaining_retries = remaining_retries - 1 |
| 481 | self.ml_engine_failures.increment() |
| 482 | logging.error('Error calling ML Engine API: %s' % ex) |
| 483 | |
| 484 | result['failed_open'] = True |
| 485 | return result |
| 486 | |
| 487 | def ham_classification(self): |
| 488 | return {'confidence_is_spam': 0.0, |
| 489 | 'failed_open': False} |
| 490 | |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 491 | def GetIssueFlagQueue( |
| 492 | self, cnxn, _issue_service, project_id, offset=0, limit=10): |
| 493 | """Returns list of recent issues that have been flagged by users""" |
| 494 | issue_flags = self.report_tbl.Select( |
| 495 | cnxn, |
| 496 | cols=[ |
| 497 | "Issue.project_id", "Report.issue_id", "count(*) as count", |
| 498 | "max(Report.created) as latest", |
| 499 | "count(distinct Report.user_id) as users" |
| 500 | ], |
| 501 | left_joins=["Issue ON Issue.id = Report.issue_id"], |
| 502 | where=[ |
| 503 | ('Report.issue_id IS NOT NULL', []), |
| 504 | ("Issue.project_id == %v", [project_id]) |
| 505 | ], |
| 506 | order_by=[('count DESC', [])], |
| 507 | group_by=['Report.issue_id'], |
| 508 | offset=offset, |
| 509 | limit=limit) |
| 510 | ret = [] |
| 511 | for row in issue_flags: |
| 512 | ret.append( |
| 513 | ModerationItem( |
| 514 | project_id=row[0], |
| 515 | issue_id=row[1], |
| 516 | count=row[2], |
| 517 | latest_report=row[3], |
| 518 | num_users=row[4], |
| 519 | )) |
| 520 | |
| 521 | count = self.verdict_tbl.SelectValue( |
| 522 | cnxn, |
| 523 | col='COUNT(DISTINCT Report.issue_id)', |
| 524 | where=[('Issue.project_id = %s', [project_id])], |
| 525 | left_joins=["Issue ON Issue.id = SpamReport.issue_id"]) |
| 526 | return ret, count |
| 527 | |
| 528 | |
  def GetCommentClassifierQueue(
      self, cnxn, _issue_service, project_id, offset=0, limit=10):
    """Returns list of recent comments with spam verdicts,
    ranked in ascending order of confidence (so uncertain items are first).

    Args:
      cnxn: connection to SQL database.
      _issue_service: unused.
      project_id: the project whose moderation queue to page through.
      offset: number of rows to skip for pagination.
      limit: maximum number of rows to return.

    Returns:
      A (items, count) pair: ModerationItems for this page, and the total
      number of matching verdict rows.
    """
    # TODO(seanmccullough): Optimize pagination. This query probably gets
    # slower as the number of SpamVerdicts grows, regardless of offset
    # and limit values used here. Using offset,limit in general may not
    # be the best way to do this.
    # NOTE(review): cols selects 'issue_id' but row[0] below is stored as
    # comment_id — looks like this should select 'comment_id'; verify.
    comment_results = self.verdict_tbl.Select(
        cnxn,
        cols=[
            'issue_id', 'is_spam', 'reason', 'classifier_confidence', 'created'
        ],
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('comment_id IS NOT NULL', []),
        ],
        order_by=[
            ('classifier_confidence ASC', []),
            ('created ASC', []),
        ],
        group_by=['comment_id'],
        offset=offset,
        limit=limit,
    )

    ret = []
    for row in comment_results:
      ret.append(
          ModerationItem(
              comment_id=int(row[0]),
              is_spam=row[1] == 1,
              reason=row[2],
              classifier_confidence=row[3],
              verdict_time='%s' % row[4],
          ))

    # Total count applies the same filters, without grouping or pagination.
    count = self.verdict_tbl.SelectValue(
        cnxn,
        col='COUNT(*)',
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('comment_id IS NOT NULL', []),
        ])

    return ret, count
| 584 | |
| 585 | |
| 586 | def GetTrainingIssues(self, cnxn, issue_service, since, offset=0, limit=100): |
| 587 | """Returns list of recent issues with human-labeled spam/ham verdicts. |
| 588 | """ |
| 589 | |
| 590 | # get all of the manual verdicts in the past day. |
| 591 | results = self.verdict_tbl.Select(cnxn, |
| 592 | cols=['issue_id'], |
| 593 | where=[ |
| 594 | ('overruled = %s', [False]), |
| 595 | ('reason = %s', ['manual']), |
| 596 | ('issue_id IS NOT NULL', []), |
| 597 | ('created > %s', [since.isoformat()]), |
| 598 | ], |
| 599 | offset=offset, |
| 600 | limit=limit, |
| 601 | ) |
| 602 | |
| 603 | issue_ids = [int(row[0]) for row in results if row[0]] |
| 604 | issues = issue_service.GetIssues(cnxn, issue_ids) |
| 605 | comments = issue_service.GetCommentsForIssues(cnxn, issue_ids) |
| 606 | first_comments = {} |
| 607 | for issue in issues: |
| 608 | first_comments[issue.issue_id] = (comments[issue.issue_id][0].content |
| 609 | if issue.issue_id in comments else "[Empty]") |
| 610 | |
| 611 | count = self.verdict_tbl.SelectValue(cnxn, |
| 612 | col='COUNT(*)', |
| 613 | where=[ |
| 614 | ('overruled = %s', [False]), |
| 615 | ('reason = %s', ['manual']), |
| 616 | ('issue_id IS NOT NULL', []), |
| 617 | ('created > %s', [since.isoformat()]), |
| 618 | ]) |
| 619 | |
| 620 | return issues, first_comments, count |
| 621 | |
| 622 | def GetTrainingComments(self, cnxn, issue_service, since, offset=0, |
| 623 | limit=100): |
| 624 | """Returns list of recent comments with human-labeled spam/ham verdicts. |
| 625 | """ |
| 626 | |
| 627 | # get all of the manual verdicts in the past day. |
| 628 | results = self.verdict_tbl.Select( |
| 629 | cnxn, |
| 630 | distinct=True, |
| 631 | cols=['comment_id'], |
| 632 | where=[ |
| 633 | ('overruled = %s', [False]), |
| 634 | ('reason = %s', ['manual']), |
| 635 | ('comment_id IS NOT NULL', []), |
| 636 | ('created > %s', [since.isoformat()]), |
| 637 | ], |
| 638 | offset=offset, |
| 639 | limit=limit, |
| 640 | ) |
| 641 | |
| 642 | comment_ids = [int(row[0]) for row in results if row[0]] |
| 643 | # Don't care about sequence numbers in this context yet. |
| 644 | comments = issue_service.GetCommentsByID(cnxn, comment_ids, |
| 645 | defaultdict(int)) |
| 646 | return comments |
| 647 | |
| 648 | def ExpungeUsersInSpam(self, cnxn, user_ids): |
| 649 | """Removes all references to given users from Spam DB tables. |
| 650 | |
| 651 | This method will not commit the operations. This method will |
| 652 | not make changes to in-memory data. |
| 653 | """ |
| 654 | commit = False |
| 655 | self.report_tbl.Delete(cnxn, reported_user_id=user_ids, commit=commit) |
| 656 | self.report_tbl.Delete(cnxn, user_id=user_ids, commit=commit) |
| 657 | self.verdict_tbl.Delete(cnxn, user_id=user_ids, commit=commit) |
| 658 | |
| 659 | |
class ModerationItem:
  """Generic bag of attributes describing one moderation-queue entry."""

  def __init__(self, **kwargs):
    # Fields are whatever keyword arguments the caller supplies
    # (e.g. issue_id, comment_id, count, classifier_confidence).
    self.__dict__.update(kwargs)