# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd

""" Set of functions for dealing with spam reports.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import collections
import logging
import settings
import sys

from collections import defaultdict
from features import filterrules_helpers
from framework import sql
from framework import framework_constants
from infra_libs import ts_mon
from services import ml_helpers


SPAMREPORT_TABLE_NAME = 'SpamReport'
SPAMVERDICT_TABLE_NAME = 'SpamVerdict'
ISSUE_TABLE = 'Issue'

REASON_MANUAL = 'manual'
REASON_THRESHOLD = 'threshold'
REASON_CLASSIFIER = 'classifier'
REASON_FAIL_OPEN = 'fail_open'
SPAM_CLASS_LABEL = '1'
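# SPAM_CLASS_LABEL is the class name the trained model emits for spam;
# _predict() below uses it to pick the spam score out of the API response.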

SPAMREPORT_ISSUE_COLS = ['issue_id', 'reported_user_id', 'user_id']
SPAMVERDICT_ISSUE_COL = ['created', 'content_created', 'user_id',
                         'reported_user_id', 'comment_id', 'issue_id']
MANUALVERDICT_ISSUE_COLS = ['user_id', 'issue_id', 'is_spam', 'reason',
                            'project_id']
THRESHVERDICT_ISSUE_COLS = ['issue_id', 'is_spam', 'reason', 'project_id']

SPAMREPORT_COMMENT_COLS = ['comment_id', 'reported_user_id', 'user_id']
MANUALVERDICT_COMMENT_COLS = ['user_id', 'comment_id', 'is_spam', 'reason',
                              'project_id']
THRESHVERDICT_COMMENT_COLS = ['comment_id', 'is_spam', 'reason', 'project_id']


class SpamService(object):
  """The persistence layer for spam reports."""
  issue_actions = ts_mon.CounterMetric(
      'monorail/spam_svc/issue', 'Count of things that happen to issues.', [
          ts_mon.StringField('type'),
          ts_mon.StringField('reporter_id'),
          ts_mon.StringField('issue')
      ])
  comment_actions = ts_mon.CounterMetric(
      'monorail/spam_svc/comment', 'Count of things that happen to comments.', [
          ts_mon.StringField('type'),
          ts_mon.StringField('reporter_id'),
          ts_mon.StringField('issue'),
          ts_mon.StringField('comment_id')
      ])
  ml_engine_failures = ts_mon.CounterMetric(
      'monorail/spam_svc/ml_engine_failure',
      'Failures calling the ML Engine API',
      None)

  def __init__(self):
    self.report_tbl = sql.SQLTableManager(SPAMREPORT_TABLE_NAME)
    self.verdict_tbl = sql.SQLTableManager(SPAMVERDICT_TABLE_NAME)
    self.issue_tbl = sql.SQLTableManager(ISSUE_TABLE)

    # ML Engine library is lazy loaded below.
    self.ml_engine = None

  def LookupIssuesFlaggers(self, cnxn, issue_ids):
77 """Returns users who've reported the issues or their comments as spam.
78
79 Returns a dictionary {issue_id: (issue_reporters, comment_reporters)}
80 issue_reportes is a list of users who flagged the issue;
81 comment_reporters element is a dictionary {comment_id: [user_ids]} where
82 user_ids are the users who flagged that comment.
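
    For example, with hypothetical IDs:
      {78901: ([111, 222], {9001: [333]})}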
83 """
84 rows = self.report_tbl.Select(
85 cnxn, cols=['issue_id', 'user_id', 'comment_id'],
86 issue_id=issue_ids)
87
88 reporters = collections.defaultdict(
89 # Return a tuple of (issue_reporters, comment_reporters) as described
90 # above.
91 lambda: ([], collections.defaultdict(list)))
92
93 for row in rows:
94 issue_id = int(row[0])
95 user_id = row[1]
96 if row[2]:
97 comment_id = row[2]
98 reporters[issue_id][1][comment_id].append(user_id)
99 else:
100 reporters[issue_id][0].append(user_id)
101
102 return reporters
103
104 def LookupIssueFlaggers(self, cnxn, issue_id):
105 """Returns users who've reported the issue or its comments as spam.
106
107 Returns a tuple. First element is a list of users who flagged the issue;
108 second element is a dictionary of comment id to a list of users who flagged
109 that comment.
110 """
111 return self.LookupIssuesFlaggers(cnxn, [issue_id])[issue_id]
112
113 def LookupIssueFlagCounts(self, cnxn, issue_ids):
114 """Returns a map of issue_id to flag counts"""
115 rows = self.report_tbl.Select(cnxn, cols=['issue_id', 'COUNT(*)'],
116 issue_id=issue_ids, group_by=['issue_id'])
117 counts = {}
118 for row in rows:
119 counts[int(row[0])] = row[1]
120 return counts
121
122 def LookupIssueVerdicts(self, cnxn, issue_ids):
123 """Returns a map of issue_id to most recent spam verdicts"""
    rows = self.verdict_tbl.Select(cnxn,
        cols=['issue_id', 'reason', 'MAX(created)'],
        issue_id=issue_ids, comment_id=None,
        group_by=['issue_id'])
    counts = {}
    for row in rows:
      counts[int(row[0])] = row[1]
    return counts

  def LookupIssueVerdictHistory(self, cnxn, issue_ids):
134 """Returns a map of issue_id to most recent spam verdicts"""
    rows = self.verdict_tbl.Select(cnxn, cols=[
        'issue_id', 'reason', 'created', 'is_spam', 'classifier_confidence',
        'user_id', 'overruled'],
        issue_id=issue_ids, order_by=[('issue_id', []), ('created', [])])

    # TODO: group by issue_id, make class instead of dict for verdict.
    verdicts = []
    for row in rows:
      verdicts.append({
          'issue_id': row[0],
          'reason': row[1],
          'created': row[2],
          'is_spam': row[3],
          'classifier_confidence': row[4],
          'user_id': row[5],
          'overruled': row[6],
      })

    return verdicts

  def LookupCommentVerdictHistory(self, cnxn, comment_ids):
156 """Returns a map of issue_id to most recent spam verdicts"""
    rows = self.verdict_tbl.Select(cnxn, cols=[
        'comment_id', 'reason', 'created', 'is_spam', 'classifier_confidence',
        'user_id', 'overruled'],
        comment_id=comment_ids, order_by=[('comment_id', []), ('created', [])])

    # TODO: group by comment_id, make class instead of dict for verdict.
    verdicts = []
    for row in rows:
      verdicts.append({
          'comment_id': row[0],
          'reason': row[1],
          'created': row[2],
          'is_spam': row[3],
          'classifier_confidence': row[4],
          'user_id': row[5],
          'overruled': row[6],
      })

    return verdicts

  def FlagIssues(self, cnxn, issue_service, issues, reporting_user_id,
                 flagged_spam):
    """Creates or deletes a spam report on an issue."""
    verdict_updates = []
    if flagged_spam:
      rows = [(issue.issue_id, issue.reporter_id, reporting_user_id)
              for issue in issues]
      self.report_tbl.InsertRows(cnxn, SPAMREPORT_ISSUE_COLS, rows,
                                 ignore=True)
    else:
      issue_ids = [issue.issue_id for issue in issues]
      self.report_tbl.Delete(
          cnxn, issue_id=issue_ids, user_id=reporting_user_id,
          comment_id=None)

    project_id = issues[0].project_id

    # Now record new verdicts and update issue.is_spam, if they've changed.
    ids = [issue.issue_id for issue in issues]
    counts = self.LookupIssueFlagCounts(cnxn, ids)
    previous_verdicts = self.LookupIssueVerdicts(cnxn, ids)

    for issue_id in counts:
      # If the flag counts changed enough to toggle the is_spam bit, we need to
      # record a new verdict and update the Issue.

      # No number of user spam flags can overturn an admin's verdict.
      if previous_verdicts.get(issue_id) == REASON_MANUAL:
        continue

      # If enough spam flags come in, mark the issue as spam.
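      # (Illustration only; the real limit comes from config: if
      # settings.spam_flag_thresh were 3, the third user flag would tip this.)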
      if (flagged_spam and counts[issue_id] >= settings.spam_flag_thresh):
        verdict_updates.append(issue_id)

    if len(verdict_updates) == 0:
      return

    # Some of the issues may have exceeded the flag threshold, so record
    # verdicts and mark them as spam in those cases.
    rows = [(issue_id, flagged_spam, REASON_THRESHOLD, project_id)
            for issue_id in verdict_updates]
    self.verdict_tbl.InsertRows(cnxn, THRESHVERDICT_ISSUE_COLS, rows,
                                ignore=True)
    update_issues = []
    for issue in issues:
      if issue.issue_id in verdict_updates:
        issue.is_spam = flagged_spam
        update_issues.append(issue)

    if flagged_spam:
      for issue in update_issues:
        issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
        self.issue_actions.increment(
            {
                'type': 'flag',
                'reporter_id': str(reporting_user_id),
                'issue': issue_ref
            })

    issue_service.UpdateIssues(cnxn, update_issues, update_cols=['is_spam'])

  def FlagComment(
      self, cnxn, issue, comment_id, reported_user_id, reporting_user_id,
      flagged_spam):
    """Creates or deletes a spam report on a comment."""
    # TODO(seanmccullough): Bulk comment flagging? There's no UI for that.
    if flagged_spam:
      self.report_tbl.InsertRow(
          cnxn,
          ignore=True,
          issue_id=issue.issue_id,
          comment_id=comment_id,
          reported_user_id=reported_user_id,
          user_id=reporting_user_id)
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.comment_actions.increment(
          {
              'type': 'flag',
              'reporter_id': str(reporting_user_id),
              'issue': issue_ref,
              'comment_id': str(comment_id)
          })
    else:
      self.report_tbl.Delete(
          cnxn,
          issue_id=issue.issue_id,
          comment_id=comment_id,
          user_id=reporting_user_id)

  def RecordClassifierIssueVerdict(self, cnxn, issue, is_spam, confidence,
      fail_open):
    reason = REASON_FAIL_OPEN if fail_open else REASON_CLASSIFIER
    self.verdict_tbl.InsertRow(cnxn, issue_id=issue.issue_id, is_spam=is_spam,
        reason=reason, classifier_confidence=confidence,
        project_id=issue.project_id)
    if is_spam:
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.issue_actions.increment(
          {
              'type': 'classifier',
              'reporter_id': 'classifier',
              'issue': issue_ref
          })
    # This is called at issue creation time, so there's nothing else to do here.

  def RecordManualIssueVerdicts(self, cnxn, issue_service, issues, user_id,
      is_spam):
    rows = [(user_id, issue.issue_id, is_spam, REASON_MANUAL, issue.project_id)
        for issue in issues]
    issue_ids = [issue.issue_id for issue in issues]

    # Overrule all previous verdicts.
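    # sql.PlaceHolders(issue_ids) expands to one '%s' per id, so every issue
    # id below is bound as its own query parameter.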
    self.verdict_tbl.Update(cnxn, {'overruled': True}, [
        ('issue_id IN (%s)' % sql.PlaceHolders(issue_ids), issue_ids)
        ], commit=False)

    self.verdict_tbl.InsertRows(cnxn, MANUALVERDICT_ISSUE_COLS, rows,
        ignore=True)

    for issue in issues:
      issue.is_spam = is_spam

    if is_spam:
      for issue in issues:
        issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
        self.issue_actions.increment(
            {
                'type': 'manual',
                'reporter_id': str(user_id),
                'issue': issue_ref
            })
    else:
      issue_service.AllocateNewLocalIDs(cnxn, issues)

    # This will commit the transaction.
    issue_service.UpdateIssues(cnxn, issues, update_cols=['is_spam'])

  def RecordManualCommentVerdict(self, cnxn, issue_service, user_service,
      comment_id, user_id, is_spam):
    # TODO(seanmccullough): Bulk comment verdicts? There's no UI for that.
    self.verdict_tbl.InsertRow(cnxn, ignore=True,
        user_id=user_id, comment_id=comment_id, is_spam=is_spam,
        reason=REASON_MANUAL)
    comment = issue_service.GetComment(cnxn, comment_id)
    comment.is_spam = is_spam
    issue = issue_service.GetIssue(cnxn, comment.issue_id, use_cache=False)
    issue_service.SoftDeleteComment(
        cnxn, issue, comment, user_id, user_service, is_spam, True, is_spam)
    if is_spam:
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.comment_actions.increment(
          {
              'type': 'manual',
              'reporter_id': str(user_id),
              'issue': issue_ref,
              'comment_id': str(comment_id)
          })

  def RecordClassifierCommentVerdict(
      self, cnxn, issue_service, comment, is_spam, confidence, fail_open):
    reason = REASON_FAIL_OPEN if fail_open else REASON_CLASSIFIER
    self.verdict_tbl.InsertRow(cnxn, comment_id=comment.id, is_spam=is_spam,
        reason=reason, classifier_confidence=confidence,
        project_id=comment.project_id)
    if is_spam:
      issue = issue_service.GetIssue(cnxn, comment.issue_id, use_cache=False)
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.comment_actions.increment(
          {
              'type': 'classifier',
              'reporter_id': 'classifier',
              'issue': issue_ref,
              'comment_id': str(comment.id)
          })

  def _predict(self, instance):
    """Requests a prediction from the ML Engine API.

    Sample API response:
      {'predictions': [{
        'classes': ['0', '1'],
        'scores': [0.4986788034439087, 0.5013211965560913]
      }]}

    This hits the default model.

    Returns:
      A floating point number representing the confidence
      the instance is spam.
    """
    model_name = 'projects/%s/models/%s' % (
        settings.classifier_project_id, settings.spam_model_name)
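    # ML Engine online prediction expects a body of the form
    # {'instances': [...]}; each instance carries the hashed word features
    # produced by ml_helpers.GenerateFeaturesRaw() (see ClassifyIssue below).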
    body = {'instances': [{"inputs": instance["word_hashes"]}]}

    if not self.ml_engine:
      self.ml_engine = ml_helpers.setup_ml_engine()

    request = self.ml_engine.projects().predict(name=model_name, body=body)
    response = request.execute()
    logging.info('ML Engine API response: %r' % response)
    prediction = response['predictions'][0]

    # Ensure the class confidence we return is for the spam, not the ham label.
    # The spam label, '1', is usually at index 1 but I'm not sure of any
    # guarantees around label order.
    if prediction['classes'][1] == SPAM_CLASS_LABEL:
      return prediction['scores'][1]
    elif prediction['classes'][0] == SPAM_CLASS_LABEL:
      return prediction['scores'][0]
    else:
      raise Exception('No predicted classes found.')

  def _IsExempt(self, author, is_project_member):
    """Return True if the user is exempt from spam checking."""
    if author.email is not None and author.email.endswith(
        settings.spam_allowlisted_suffixes):
      logging.info('%s allowlisted from spam filtering', author.email)
      return True

    if is_project_member:
      logging.info('%s is a project member, assuming ham', author.email)
      return True

    return False

  def ClassifyIssue(self, issue, firstComment, reporter, is_project_member):
    """Classify an issue as either spam or ham.

    Args:
      issue: the Issue.
      firstComment: the first Comment on issue.
      reporter: User PB for the Issue reporter.
      is_project_member: True if reporter is a member of issue's project.

    Returns a JSON dict of classifier prediction results from
    the ML Engine API.
    """
    instance = ml_helpers.GenerateFeaturesRaw(
        [issue.summary, firstComment.content],
        settings.spam_feature_hashes)
    return self._classify(instance, reporter, is_project_member)

  def ClassifyComment(self, comment_content, commenter, is_project_member=True):
    """Classify a comment as either spam or ham.

    Args:
      comment_content: the comment text.
      commenter: User PB for the user who authored the comment.
      is_project_member: True if commenter is a member of the project.

    Returns a JSON dict of classifier prediction results from
    the ML Engine API.
    """
    instance = ml_helpers.GenerateFeaturesRaw(
        ['', comment_content],
        settings.spam_feature_hashes)
    return self._classify(instance, commenter, is_project_member)
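
  # Rough usage sketch (hypothetical caller code, not part of this module):
  # callers typically compare the returned confidence against a threshold
  # from settings before recording a classifier verdict, e.g.:
  #
  #   prediction = spam_svc.ClassifyComment(comment_text, commenter_user_pb)
  #   if prediction['confidence_is_spam'] > some_configured_spam_threshold:
  #     spam_svc.RecordClassifierCommentVerdict(
  #         cnxn, issue_service, comment, True,
  #         prediction['confidence_is_spam'], prediction['failed_open'])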


  def _classify(self, instance, author, is_project_member):
    # Fail-safe: not spam.
    result = self.ham_classification()

    if self._IsExempt(author, is_project_member):
      return result

    if not self.ml_engine:
      self.ml_engine = ml_helpers.setup_ml_engine()

    # If setup_ml_engine returns None, it failed to init.
    if not self.ml_engine:
      logging.error("ML Engine not initialized.")
      self.ml_engine_failures.increment()
      result['failed_open'] = True
      return result

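    # Try the prediction a few times; if every attempt raises we fall through
    # with failed_open=True, so the content is treated as ham rather than
    # being blocked by an ML Engine outage.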
    remaining_retries = 3
    while remaining_retries > 0:
      try:
        result['confidence_is_spam'] = self._predict(instance)
        result['failed_open'] = False
        return result
      except Exception as ex:
        remaining_retries = remaining_retries - 1
        self.ml_engine_failures.increment()
        logging.error('Error calling ML Engine API: %s' % ex)

      result['failed_open'] = True
    return result

  def ham_classification(self):
    return {'confidence_is_spam': 0.0,
            'failed_open': False}

  def GetIssueClassifierQueue(
      self, cnxn, _issue_service, project_id, offset=0, limit=10):
    """Returns list of recent issues with spam verdicts,
    ranked in ascending order of confidence (so uncertain items are first).
    """
    # TODO(seanmccullough): Optimize pagination. This query probably gets
    # slower as the number of SpamVerdicts grows, regardless of offset
    # and limit values used here. Using offset,limit in general may not
    # be the best way to do this.
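    # Approximate shape of the query this builds (the exact SQL is generated
    # by framework.sql):
    #   SELECT issue_id, is_spam, reason, classifier_confidence, created
    #   FROM SpamVerdict
    #   WHERE project_id = ? AND classifier_confidence <= ?
    #     AND overruled = FALSE AND issue_id IS NOT NULL
    #   GROUP BY issue_id
    #   ORDER BY classifier_confidence ASC, created ASC
    #   LIMIT ? OFFSET ?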
    issue_results = self.verdict_tbl.Select(
        cnxn,
        cols=[
            'issue_id', 'is_spam', 'reason', 'classifier_confidence', 'created'
        ],
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('issue_id IS NOT NULL', []),
        ],
        order_by=[
            ('classifier_confidence ASC', []),
            ('created ASC', []),
        ],
        group_by=['issue_id'],
        offset=offset,
        limit=limit,
    )

    ret = []
    for row in issue_results:
      ret.append(
          ModerationItem(
              issue_id=int(row[0]),
              is_spam=row[1] == 1,
              reason=row[2],
              classifier_confidence=row[3],
              verdict_time='%s' % row[4],
          ))

    count = self.verdict_tbl.SelectValue(
        cnxn,
        col='COUNT(*)',
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('issue_id IS NOT NULL', []),
        ])

    return ret, count

  def GetIssueFlagQueue(
      self, cnxn, _issue_service, project_id, offset=0, limit=10):
    """Returns list of recent issues that have been flagged by users"""
    issue_flags = self.report_tbl.Select(
        cnxn,
        cols=[
            "Issue.project_id", "SpamReport.issue_id", "count(*) as count",
            "max(SpamReport.created) as latest",
            "count(distinct SpamReport.user_id) as users"
        ],
        left_joins=["Issue ON Issue.id = SpamReport.issue_id"],
        where=[
            ('SpamReport.issue_id IS NOT NULL', []),
            ("Issue.project_id = %s", [project_id])
        ],
        order_by=[('count DESC', [])],
        group_by=['SpamReport.issue_id'],
        offset=offset,
        limit=limit)
    ret = []
    for row in issue_flags:
      ret.append(
          ModerationItem(
              project_id=row[0],
              issue_id=row[1],
              count=row[2],
              latest_report=row[3],
              num_users=row[4],
          ))

    count = self.report_tbl.SelectValue(
        cnxn,
        col='COUNT(DISTINCT SpamReport.issue_id)',
        where=[('Issue.project_id = %s', [project_id])],
        left_joins=["Issue ON Issue.id = SpamReport.issue_id"])
    return ret, count


  def GetCommentClassifierQueue(
      self, cnxn, _issue_service, project_id, offset=0, limit=10):
    """Returns list of recent comments with spam verdicts,
    ranked in ascending order of confidence (so uncertain items are first).
    """
    # TODO(seanmccullough): Optimize pagination. This query probably gets
    # slower as the number of SpamVerdicts grows, regardless of offset
    # and limit values used here. Using offset,limit in general may not
    # be the best way to do this.
    comment_results = self.verdict_tbl.Select(
        cnxn,
        cols=[
            'comment_id', 'is_spam', 'reason', 'classifier_confidence', 'created'
        ],
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('comment_id IS NOT NULL', []),
        ],
        order_by=[
            ('classifier_confidence ASC', []),
            ('created ASC', []),
        ],
        group_by=['comment_id'],
        offset=offset,
        limit=limit,
    )

    ret = []
    for row in comment_results:
      ret.append(
          ModerationItem(
              comment_id=int(row[0]),
              is_spam=row[1] == 1,
              reason=row[2],
              classifier_confidence=row[3],
              verdict_time='%s' % row[4],
          ))

    count = self.verdict_tbl.SelectValue(
        cnxn,
        col='COUNT(*)',
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('comment_id IS NOT NULL', []),
        ])

    return ret, count


  def GetTrainingIssues(self, cnxn, issue_service, since, offset=0, limit=100):
    """Returns list of recent issues with human-labeled spam/ham verdicts.
    """

    # Get all of the manual verdicts since the given time (typically a day).
    results = self.verdict_tbl.Select(cnxn,
        cols=['issue_id'],
        where=[
            ('overruled = %s', [False]),
            ('reason = %s', ['manual']),
            ('issue_id IS NOT NULL', []),
            ('created > %s', [since.isoformat()]),
        ],
        offset=offset,
        limit=limit,
        )

    issue_ids = [int(row[0]) for row in results if row[0]]
    issues = issue_service.GetIssues(cnxn, issue_ids)
    comments = issue_service.GetCommentsForIssues(cnxn, issue_ids)
    first_comments = {}
    for issue in issues:
      first_comments[issue.issue_id] = (comments[issue.issue_id][0].content
          if issue.issue_id in comments else "[Empty]")

    count = self.verdict_tbl.SelectValue(cnxn,
        col='COUNT(*)',
        where=[
            ('overruled = %s', [False]),
            ('reason = %s', ['manual']),
            ('issue_id IS NOT NULL', []),
            ('created > %s', [since.isoformat()]),
        ])

    return issues, first_comments, count

  def GetTrainingComments(self, cnxn, issue_service, since, offset=0,
      limit=100):
    """Returns list of recent comments with human-labeled spam/ham verdicts.
    """

    # Get all of the manual verdicts since the given time (typically a day).
    results = self.verdict_tbl.Select(
        cnxn,
        distinct=True,
        cols=['comment_id'],
        where=[
            ('overruled = %s', [False]),
            ('reason = %s', ['manual']),
            ('comment_id IS NOT NULL', []),
            ('created > %s', [since.isoformat()]),
        ],
        offset=offset,
        limit=limit,
        )

    comment_ids = [int(row[0]) for row in results if row[0]]
    # Don't care about sequence numbers in this context yet.
    comments = issue_service.GetCommentsByID(cnxn, comment_ids,
        defaultdict(int))
    return comments

  def ExpungeUsersInSpam(self, cnxn, user_ids):
    """Removes all references to given users from Spam DB tables.

    This method will not commit the operations. This method will
    not make changes to in-memory data.
    """
    commit = False
    self.report_tbl.Delete(cnxn, reported_user_id=user_ids, commit=commit)
    self.report_tbl.Delete(cnxn, user_id=user_ids, commit=commit)
    self.verdict_tbl.Delete(cnxn, user_id=user_ids, commit=commit)


class ModerationItem:
  def __init__(self, **kwargs):
    self.__dict__ = kwargs
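

# ModerationItem is just a bag of keyword attributes; a minimal sketch of how
# the queue methods above use it (values here are hypothetical):
#
#   item = ModerationItem(issue_id=78901, is_spam=True, reason='classifier',
#                         classifier_confidence=0.42, verdict_time='2021-09-07')
#   item.issue_id  # -> 78901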