Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 1 | # Copyright 2016 The Chromium Authors |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 4 | |
| 5 | """ Set of functions for detaling with spam reports. |
| 6 | """ |
| 7 | from __future__ import print_function |
| 8 | from __future__ import division |
| 9 | from __future__ import absolute_import |
| 10 | |
| 11 | import collections |
| 12 | import logging |
| 13 | import settings |
Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 14 | import time |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 15 | |
| 16 | from collections import defaultdict |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 17 | from framework import sql |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 18 | from infra_libs import ts_mon |
| 19 | from services import ml_helpers |
| 20 | |
| 21 | |
| 22 | SPAMREPORT_TABLE_NAME = 'SpamReport' |
| 23 | SPAMVERDICT_TABLE_NAME = 'SpamVerdict' |
| 24 | ISSUE_TABLE = 'Issue' |
| 25 | |
| 26 | REASON_MANUAL = 'manual' |
| 27 | REASON_THRESHOLD = 'threshold' |
| 28 | REASON_CLASSIFIER = 'classifier' |
| 29 | REASON_FAIL_OPEN = 'fail_open' |
| 30 | SPAM_CLASS_LABEL = '1' |
| 31 | |
| 32 | SPAMREPORT_ISSUE_COLS = ['issue_id', 'reported_user_id', 'user_id'] |
| 33 | SPAMVERDICT_ISSUE_COL = ['created', 'content_created', 'user_id', |
| 34 | 'reported_user_id', 'comment_id', 'issue_id'] |
| 35 | MANUALVERDICT_ISSUE_COLS = ['user_id', 'issue_id', 'is_spam', 'reason', |
| 36 | 'project_id'] |
| 37 | THRESHVERDICT_ISSUE_COLS = ['issue_id', 'is_spam', 'reason', 'project_id'] |
| 38 | |
| 39 | SPAMREPORT_COMMENT_COLS = ['comment_id', 'reported_user_id', 'user_id'] |
| 40 | MANUALVERDICT_COMMENT_COLS = ['user_id', 'comment_id', 'is_spam', 'reason', |
| 41 | 'project_id'] |
| 42 | THRESHVERDICT_COMMENT_COLS = ['comment_id', 'is_spam', 'reason', 'project_id'] |
| 43 | |
| 44 | |
class SpamService(object):
  """The persistence layer for spam reports.

  Wraps the SpamReport and SpamVerdict tables and the ML Engine spam
  classifier, and emits ts_mon metrics for moderation actions.
  """
  # Counts actions (flag/classifier/manual) taken on issues.
  issue_actions = ts_mon.CounterMetric(
      'monorail/spam_svc/issue', 'Count of things that happen to issues.', [
          ts_mon.StringField('type'),
          ts_mon.StringField('reporter_id'),
          ts_mon.StringField('issue')
      ])
  # Counts actions (flag/classifier/manual) taken on comments.
  comment_actions = ts_mon.CounterMetric(
      'monorail/spam_svc/comment', 'Count of things that happen to comments.', [
          ts_mon.StringField('type'),
          ts_mon.StringField('reporter_id'),
          ts_mon.StringField('issue'),
          ts_mon.StringField('comment_id')
      ])
  # Counts failed calls to the ML Engine prediction API.
  ml_engine_failures = ts_mon.CounterMetric(
      'monorail/spam_svc/ml_engine_failure',
      'Failures calling the ML Engine API',
      None)
| 64 | |
| 65 | def __init__(self): |
| 66 | self.report_tbl = sql.SQLTableManager(SPAMREPORT_TABLE_NAME) |
| 67 | self.verdict_tbl = sql.SQLTableManager(SPAMVERDICT_TABLE_NAME) |
| 68 | self.issue_tbl = sql.SQLTableManager(ISSUE_TABLE) |
| 69 | |
| 70 | # ML Engine library is lazy loaded below. |
| 71 | self.ml_engine = None |
| 72 | |
| 73 | def LookupIssuesFlaggers(self, cnxn, issue_ids): |
| 74 | """Returns users who've reported the issues or their comments as spam. |
| 75 | |
| 76 | Returns a dictionary {issue_id: (issue_reporters, comment_reporters)} |
| 77 | issue_reportes is a list of users who flagged the issue; |
| 78 | comment_reporters element is a dictionary {comment_id: [user_ids]} where |
| 79 | user_ids are the users who flagged that comment. |
| 80 | """ |
| 81 | rows = self.report_tbl.Select( |
| 82 | cnxn, cols=['issue_id', 'user_id', 'comment_id'], |
| 83 | issue_id=issue_ids) |
| 84 | |
| 85 | reporters = collections.defaultdict( |
| 86 | # Return a tuple of (issue_reporters, comment_reporters) as described |
| 87 | # above. |
| 88 | lambda: ([], collections.defaultdict(list))) |
| 89 | |
| 90 | for row in rows: |
| 91 | issue_id = int(row[0]) |
| 92 | user_id = row[1] |
| 93 | if row[2]: |
| 94 | comment_id = row[2] |
| 95 | reporters[issue_id][1][comment_id].append(user_id) |
| 96 | else: |
| 97 | reporters[issue_id][0].append(user_id) |
| 98 | |
| 99 | return reporters |
| 100 | |
| 101 | def LookupIssueFlaggers(self, cnxn, issue_id): |
| 102 | """Returns users who've reported the issue or its comments as spam. |
| 103 | |
| 104 | Returns a tuple. First element is a list of users who flagged the issue; |
| 105 | second element is a dictionary of comment id to a list of users who flagged |
| 106 | that comment. |
| 107 | """ |
| 108 | return self.LookupIssuesFlaggers(cnxn, [issue_id])[issue_id] |
| 109 | |
Adrià Vilanova Martínez | de94280 | 2022-07-15 14:06:55 +0200 | [diff] [blame] | 110 | def _LookupIssueFlagCounts(self, cnxn, issue_ids): |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 111 | """Returns a map of issue_id to flag counts""" |
| 112 | rows = self.report_tbl.Select(cnxn, cols=['issue_id', 'COUNT(*)'], |
| 113 | issue_id=issue_ids, group_by=['issue_id']) |
| 114 | counts = {} |
| 115 | for row in rows: |
| 116 | counts[int(row[0])] = row[1] |
| 117 | return counts |
| 118 | |
| 119 | def LookupIssueVerdicts(self, cnxn, issue_ids): |
| 120 | """Returns a map of issue_id to most recent spam verdicts""" |
| 121 | rows = self.verdict_tbl.Select(cnxn, |
| 122 | cols=['issue_id', 'reason', 'MAX(created)'], |
| 123 | issue_id=issue_ids, comment_id=None, |
| 124 | group_by=['issue_id']) |
| 125 | counts = {} |
| 126 | for row in rows: |
| 127 | counts[int(row[0])] = row[1] |
| 128 | return counts |
| 129 | |
| 130 | def LookupIssueVerdictHistory(self, cnxn, issue_ids): |
| 131 | """Returns a map of issue_id to most recent spam verdicts""" |
| 132 | rows = self.verdict_tbl.Select(cnxn, cols=[ |
| 133 | 'issue_id', 'reason', 'created', 'is_spam', 'classifier_confidence', |
| 134 | 'user_id', 'overruled'], |
| 135 | issue_id=issue_ids, order_by=[('issue_id', []), ('created', [])]) |
| 136 | |
| 137 | # TODO: group by issue_id, make class instead of dict for verdict. |
| 138 | verdicts = [] |
| 139 | for row in rows: |
| 140 | verdicts.append({ |
| 141 | 'issue_id': row[0], |
| 142 | 'reason': row[1], |
| 143 | 'created': row[2], |
| 144 | 'is_spam': row[3], |
| 145 | 'classifier_confidence': row[4], |
| 146 | 'user_id': row[5], |
| 147 | 'overruled': row[6], |
| 148 | }) |
| 149 | |
| 150 | return verdicts |
| 151 | |
| 152 | def LookupCommentVerdictHistory(self, cnxn, comment_ids): |
| 153 | """Returns a map of issue_id to most recent spam verdicts""" |
| 154 | rows = self.verdict_tbl.Select(cnxn, cols=[ |
| 155 | 'comment_id', 'reason', 'created', 'is_spam', 'classifier_confidence', |
| 156 | 'user_id', 'overruled'], |
| 157 | comment_id=comment_ids, order_by=[('comment_id', []), ('created', [])]) |
| 158 | |
| 159 | # TODO: group by comment_id, make class instead of dict for verdict. |
| 160 | verdicts = [] |
| 161 | for row in rows: |
| 162 | verdicts.append({ |
| 163 | 'comment_id': row[0], |
| 164 | 'reason': row[1], |
| 165 | 'created': row[2], |
| 166 | 'is_spam': row[3], |
| 167 | 'classifier_confidence': row[4], |
| 168 | 'user_id': row[5], |
| 169 | 'overruled': row[6], |
| 170 | }) |
| 171 | |
| 172 | return verdicts |
| 173 | |
  def FlagIssues(self, cnxn, issue_service, issues, reporting_user_id,
                 flagged_spam):
    """Creates or deletes a spam report on an issue.

    This function is run when a user flags an issue as spam but does not
    have 'VerdictSpam' permission.

    Args:
      cnxn: connection to SQL database.
      issue_service: used to re-persist issues whose is_spam bit flips.
      issues: list of Issue PBs to flag or un-flag; assumes a non-empty list
        all in one project (project_id is read from issues[0]) — TODO confirm
        callers guarantee this.
      reporting_user_id: the user doing the flagging/un-flagging.
      flagged_spam: True to add spam reports, False to retract this user's
        reports.
    """
    verdict_updates = []
    if flagged_spam:
      rows = [(issue.issue_id, issue.reporter_id, reporting_user_id)
              for issue in issues]
      # ignore=True so re-flagging an already-flagged issue is a no-op.
      self.report_tbl.InsertRows(cnxn, SPAMREPORT_ISSUE_COLS, rows,
          ignore=True)
    else:
      issue_ids = [issue.issue_id for issue in issues]
      # comment_id=None: only remove issue-level reports, not comment flags.
      self.report_tbl.Delete(
          cnxn, issue_id=issue_ids, user_id=reporting_user_id,
          comment_id=None)

    project_id = issues[0].project_id

    # Now record new verdicts and update issue.is_spam, if they've changed.
    ids = [issue.issue_id for issue in issues]
    counts = self._LookupIssueFlagCounts(cnxn, ids)
    previous_verdicts = self.LookupIssueVerdicts(cnxn, ids)

    for issue_id in counts:
      # If the flag counts changed enough to toggle the is_spam bit, need to
      # record a new verdict and update the Issue.

      # No number of user spam flags can overturn an admin's verdict.
      if previous_verdicts.get(issue_id) == REASON_MANUAL:
        continue

      # If enough spam flags come in, mark the issue as spam.
      if (flagged_spam and counts[issue_id] >= settings.spam_flag_thresh):
        verdict_updates.append(issue_id)

    if len(verdict_updates) == 0:
      return

    # Some of the issues may have exceeded the flag threshold, so issue
    # verdicts and mark as spam in those cases.
    rows = [(issue_id, flagged_spam, REASON_THRESHOLD, project_id)
            for issue_id in verdict_updates]
    self.verdict_tbl.InsertRows(cnxn, THRESHVERDICT_ISSUE_COLS, rows,
        ignore=True)
    update_issues = []
    current_time = int(time.time())
    for issue in issues:
      if issue.issue_id in verdict_updates:
        issue.is_spam = flagged_spam
        # Record when this migration-relevant field last changed.
        issue.migration_modified_timestamp = current_time
        update_issues.append(issue)

    if flagged_spam:
      for issue in update_issues:
        issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
        self.issue_actions.increment(
            {
                'type': 'flag',
                'reporter_id': str(reporting_user_id),
                'issue': issue_ref
            })

    issue_service.UpdateIssues(
        cnxn, update_issues, update_cols=['is_spam', 'migration_modified'])
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 240 | |
| 241 | def FlagComment( |
| 242 | self, cnxn, issue, comment_id, reported_user_id, reporting_user_id, |
| 243 | flagged_spam): |
| 244 | """Creates or deletes a spam report on a comment.""" |
| 245 | # TODO(seanmccullough): Bulk comment flagging? There's no UI for that. |
| 246 | if flagged_spam: |
| 247 | self.report_tbl.InsertRow( |
| 248 | cnxn, |
| 249 | ignore=True, |
| 250 | issue_id=issue.issue_id, |
| 251 | comment_id=comment_id, |
| 252 | reported_user_id=reported_user_id, |
| 253 | user_id=reporting_user_id) |
| 254 | issue_ref = '%s:%s' % (issue.project_name, issue.local_id) |
| 255 | self.comment_actions.increment( |
| 256 | { |
| 257 | 'type': 'flag', |
| 258 | 'reporter_id': str(reporting_user_id), |
| 259 | 'issue': issue_ref, |
| 260 | 'comment_id': str(comment_id) |
| 261 | }) |
| 262 | else: |
| 263 | self.report_tbl.Delete( |
| 264 | cnxn, |
| 265 | issue_id=issue.issue_id, |
| 266 | comment_id=comment_id, |
| 267 | user_id=reporting_user_id) |
| 268 | |
| 269 | def RecordClassifierIssueVerdict(self, cnxn, issue, is_spam, confidence, |
| 270 | fail_open): |
Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 271 | """Records a judgment call on whether a new issue is spam. |
| 272 | |
| 273 | Only run when an issue is newly filed. If the issue is determined to be |
| 274 | likely spam, the code increments a counter.""" |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 275 | reason = REASON_FAIL_OPEN if fail_open else REASON_CLASSIFIER |
Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 276 | self.verdict_tbl.InsertRow( |
| 277 | cnxn, |
| 278 | issue_id=issue.issue_id, |
| 279 | is_spam=is_spam, |
| 280 | reason=reason, |
| 281 | classifier_confidence=confidence, |
| 282 | project_id=issue.project_id, |
| 283 | overruled=False) |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 284 | if is_spam: |
| 285 | issue_ref = '%s:%s' % (issue.project_name, issue.local_id) |
| 286 | self.issue_actions.increment( |
| 287 | { |
| 288 | 'type': 'classifier', |
| 289 | 'reporter_id': 'classifier', |
| 290 | 'issue': issue_ref |
| 291 | }) |
| 292 | # This is called at issue creation time, so there's nothing else to do here. |
| 293 | |
  def RecordManualIssueVerdicts(self, cnxn, issue_service, issues, user_id,
                                is_spam):
    """Bypasses the classifier to manually classify an issue as spam.

    This code can only be run by users with the 'VerdictSpam' permission.

    Args:
      cnxn: connection to SQL database.
      issue_service: used to re-persist the updated issues.
      issues: list of Issue PBs to mark or un-mark as spam.
      user_id: the admin issuing the verdict.
      is_spam: True to mark as spam, False to mark as ham.
    """
    rows = [(user_id, issue.issue_id, is_spam, REASON_MANUAL, issue.project_id)
            for issue in issues]
    issue_ids = [issue.issue_id for issue in issues]

    # Overrule all previous verdicts.
    self.verdict_tbl.Update(cnxn, {'overruled': True}, [
      ('issue_id IN (%s)' % sql.PlaceHolders(issue_ids), issue_ids)
      ], commit=False)

    # ignore=True: duplicate manual verdict rows are harmless.
    self.verdict_tbl.InsertRows(cnxn, MANUALVERDICT_ISSUE_COLS, rows,
        ignore=True)

    current_time = int(time.time())
    for issue in issues:
      issue.is_spam = is_spam
      # Record when this migration-relevant field last changed.
      issue.migration_modified_timestamp = current_time

    if is_spam:
      for issue in issues:
        issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
        self.issue_actions.increment(
            {
                'type': 'manual',
                'reporter_id': str(user_id),
                'issue': issue_ref
            })
    else:
      # Issues coming back from spam get fresh local IDs in their project.
      issue_service.AllocateNewLocalIDs(cnxn, issues)

    # This will commit the transaction.
    issue_service.UpdateIssues(
        cnxn, issues, update_cols=['is_spam', 'migration_modified'])
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 331 | |
  def RecordManualCommentVerdict(self, cnxn, issue_service, user_service,
                                 comment_id, user_id, is_spam):
    """Bypasses the classifier to manually classify a comment as spam.

    This code can only be run by users with the 'VerdictSpam' permission.

    Args:
      cnxn: connection to SQL database.
      issue_service: used to load the comment/issue and soft-delete.
      user_service: passed through to SoftDeleteComment.
      comment_id: the comment being judged.
      user_id: the admin issuing the verdict.
      is_spam: True to mark as spam, False to mark as ham.
    """
    # TODO(seanmccullough): Bulk comment verdicts? There's no UI for that.
    self.verdict_tbl.InsertRow(cnxn, ignore=True,
        user_id=user_id, comment_id=comment_id, is_spam=is_spam,
        reason=REASON_MANUAL)
    comment = issue_service.GetComment(cnxn, comment_id)
    comment.is_spam = is_spam
    issue = issue_service.GetIssue(cnxn, comment.issue_id, use_cache=False)
    # NOTE(review): the trailing positional args (is_spam, True, is_spam)
    # are opaque here — confirm against SoftDeleteComment's signature.
    issue_service.SoftDeleteComment(
        cnxn, issue, comment, user_id, user_service, is_spam, True, is_spam)
    if is_spam:
      issue_ref = '%s:%s' % (issue.project_name, issue.local_id)
      self.comment_actions.increment(
          {
              'type': 'manual',
              'reporter_id': str(user_id),
              'issue': issue_ref,
              'comment_id': str(comment_id)
          })
| 355 | |
| 356 | def RecordClassifierCommentVerdict( |
| 357 | self, cnxn, issue_service, comment, is_spam, confidence, fail_open): |
| 358 | reason = REASON_FAIL_OPEN if fail_open else REASON_CLASSIFIER |
| 359 | self.verdict_tbl.InsertRow(cnxn, comment_id=comment.id, is_spam=is_spam, |
| 360 | reason=reason, classifier_confidence=confidence, |
| 361 | project_id=comment.project_id) |
| 362 | if is_spam: |
| 363 | issue = issue_service.GetIssue(cnxn, comment.issue_id, use_cache=False) |
| 364 | issue_ref = '%s:%s' % (issue.project_name, issue.local_id) |
| 365 | self.comment_actions.increment( |
| 366 | { |
| 367 | 'type': 'classifier', |
| 368 | 'reporter_id': 'classifier', |
| 369 | 'issue': issue_ref, |
| 370 | 'comment_id': str(comment.id) |
| 371 | }) |
| 372 | |
| 373 | def _predict(self, instance): |
| 374 | """Requests a prediction from the ML Engine API. |
| 375 | |
| 376 | Sample API response: |
| 377 | {'predictions': [{ |
| 378 | 'classes': ['0', '1'], |
| 379 | 'scores': [0.4986788034439087, 0.5013211965560913] |
| 380 | }]} |
| 381 | |
| 382 | This hits the default model. |
| 383 | |
| 384 | Returns: |
| 385 | A floating point number representing the confidence |
| 386 | the instance is spam. |
| 387 | """ |
| 388 | model_name = 'projects/%s/models/%s' % ( |
| 389 | settings.classifier_project_id, settings.spam_model_name) |
| 390 | body = {'instances': [{"inputs": instance["word_hashes"]}]} |
| 391 | |
| 392 | if not self.ml_engine: |
| 393 | self.ml_engine = ml_helpers.setup_ml_engine() |
| 394 | |
| 395 | request = self.ml_engine.projects().predict(name=model_name, body=body) |
| 396 | response = request.execute() |
| 397 | logging.info('ML Engine API response: %r' % response) |
| 398 | prediction = response['predictions'][0] |
| 399 | |
| 400 | # Ensure the class confidence we return is for the spam, not the ham label. |
| 401 | # The spam label, '1', is usually at index 1 but I'm not sure of any |
| 402 | # guarantees around label order. |
| 403 | if prediction['classes'][1] == SPAM_CLASS_LABEL: |
| 404 | return prediction['scores'][1] |
| 405 | elif prediction['classes'][0] == SPAM_CLASS_LABEL: |
| 406 | return prediction['scores'][0] |
| 407 | else: |
| 408 | raise Exception('No predicted classes found.') |
| 409 | |
| 410 | def _IsExempt(self, author, is_project_member): |
| 411 | """Return True if the user is exempt from spam checking.""" |
| 412 | if author.email is not None and author.email.endswith( |
| 413 | settings.spam_allowlisted_suffixes): |
| 414 | logging.info('%s allowlisted from spam filtering', author.email) |
| 415 | return True |
| 416 | |
| 417 | if is_project_member: |
| 418 | logging.info('%s is a project member, assuming ham', author.email) |
| 419 | return True |
| 420 | |
| 421 | return False |
| 422 | |
| 423 | def ClassifyIssue(self, issue, firstComment, reporter, is_project_member): |
| 424 | """Classify an issue as either spam or ham. |
| 425 | |
| 426 | Args: |
| 427 | issue: the Issue. |
| 428 | firstComment: the first Comment on issue. |
| 429 | reporter: User PB for the Issue reporter. |
| 430 | is_project_member: True if reporter is a member of issue's project. |
| 431 | |
| 432 | Returns a JSON dict of classifier prediction results from |
| 433 | the ML Engine API. |
| 434 | """ |
| 435 | instance = ml_helpers.GenerateFeaturesRaw( |
| 436 | [issue.summary, firstComment.content], |
| 437 | settings.spam_feature_hashes) |
| 438 | return self._classify(instance, reporter, is_project_member) |
| 439 | |
| 440 | def ClassifyComment(self, comment_content, commenter, is_project_member=True): |
| 441 | """Classify a comment as either spam or ham. |
| 442 | |
| 443 | Args: |
| 444 | comment: the comment text. |
| 445 | commenter: User PB for the user who authored the comment. |
| 446 | |
| 447 | Returns a JSON dict of classifier prediction results from |
| 448 | the ML Engine API. |
| 449 | """ |
| 450 | instance = ml_helpers.GenerateFeaturesRaw( |
| 451 | ['', comment_content], |
| 452 | settings.spam_feature_hashes) |
| 453 | return self._classify(instance, commenter, is_project_member) |
| 454 | |
| 455 | |
| 456 | def _classify(self, instance, author, is_project_member): |
| 457 | # Fail-safe: not spam. |
| 458 | result = self.ham_classification() |
| 459 | |
| 460 | if self._IsExempt(author, is_project_member): |
| 461 | return result |
| 462 | |
| 463 | if not self.ml_engine: |
| 464 | self.ml_engine = ml_helpers.setup_ml_engine() |
| 465 | |
| 466 | # If setup_ml_engine returns None, it failed to init. |
| 467 | if not self.ml_engine: |
| 468 | logging.error("ML Engine not initialized.") |
| 469 | self.ml_engine_failures.increment() |
| 470 | result['failed_open'] = True |
| 471 | return result |
| 472 | |
| 473 | remaining_retries = 3 |
| 474 | while remaining_retries > 0: |
| 475 | try: |
| 476 | result['confidence_is_spam'] = self._predict(instance) |
| 477 | result['failed_open'] = False |
| 478 | return result |
| 479 | except Exception as ex: |
| 480 | remaining_retries = remaining_retries - 1 |
| 481 | self.ml_engine_failures.increment() |
| 482 | logging.error('Error calling ML Engine API: %s' % ex) |
| 483 | |
| 484 | result['failed_open'] = True |
| 485 | return result |
| 486 | |
| 487 | def ham_classification(self): |
| 488 | return {'confidence_is_spam': 0.0, |
| 489 | 'failed_open': False} |
| 490 | |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 491 | def GetIssueFlagQueue( |
| 492 | self, cnxn, _issue_service, project_id, offset=0, limit=10): |
| 493 | """Returns list of recent issues that have been flagged by users""" |
| 494 | issue_flags = self.report_tbl.Select( |
| 495 | cnxn, |
| 496 | cols=[ |
| 497 | "Issue.project_id", "Report.issue_id", "count(*) as count", |
| 498 | "max(Report.created) as latest", |
| 499 | "count(distinct Report.user_id) as users" |
| 500 | ], |
| 501 | left_joins=["Issue ON Issue.id = Report.issue_id"], |
| 502 | where=[ |
| 503 | ('Report.issue_id IS NOT NULL', []), |
| 504 | ("Issue.project_id == %v", [project_id]) |
| 505 | ], |
| 506 | order_by=[('count DESC', [])], |
| 507 | group_by=['Report.issue_id'], |
| 508 | offset=offset, |
| 509 | limit=limit) |
| 510 | ret = [] |
| 511 | for row in issue_flags: |
| 512 | ret.append( |
| 513 | ModerationItem( |
| 514 | project_id=row[0], |
| 515 | issue_id=row[1], |
| 516 | count=row[2], |
| 517 | latest_report=row[3], |
| 518 | num_users=row[4], |
| 519 | )) |
| 520 | |
| 521 | count = self.verdict_tbl.SelectValue( |
| 522 | cnxn, |
| 523 | col='COUNT(DISTINCT Report.issue_id)', |
| 524 | where=[('Issue.project_id = %s', [project_id])], |
| 525 | left_joins=["Issue ON Issue.id = SpamReport.issue_id"]) |
| 526 | return ret, count |
| 527 | |
| 528 | |
  def GetCommentClassifierQueue(
      self, cnxn, _issue_service, project_id, offset=0, limit=10):
    """Returns list of recent comments with spam verdicts,
    ranked in ascending order of confidence (so uncertain items are first).

    Args:
      cnxn: connection to SQL database.
      _issue_service: unused.
      project_id: the project whose moderation queue to page through.
      offset: number of rows to skip for pagination.
      limit: maximum number of rows to return.

    Returns:
      A (items, count) pair: ModerationItems for this page, and the total
      number of matching verdict rows.
    """
    # TODO(seanmccullough): Optimize pagination. This query probably gets
    # slower as the number of SpamVerdicts grows, regardless of offset
    # and limit values used here. Using offset,limit in general may not
    # be the best way to do this.
    # NOTE(review): cols selects 'issue_id' but row[0] below is stored as
    # comment_id — looks like this should select 'comment_id'; verify.
    comment_results = self.verdict_tbl.Select(
        cnxn,
        cols=[
            'issue_id', 'is_spam', 'reason', 'classifier_confidence', 'created'
        ],
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('comment_id IS NOT NULL', []),
        ],
        order_by=[
            ('classifier_confidence ASC', []),
            ('created ASC', []),
        ],
        group_by=['comment_id'],
        offset=offset,
        limit=limit,
    )

    ret = []
    for row in comment_results:
      ret.append(
          ModerationItem(
              comment_id=int(row[0]),
              is_spam=row[1] == 1,
              reason=row[2],
              classifier_confidence=row[3],
              verdict_time='%s' % row[4],
          ))

    # Total count applies the same filters, without grouping or pagination.
    count = self.verdict_tbl.SelectValue(
        cnxn,
        col='COUNT(*)',
        where=[
            ('project_id = %s', [project_id]),
            (
                'classifier_confidence <= %s',
                [settings.classifier_moderation_thresh]),
            ('overruled = %s', [False]),
            ('comment_id IS NOT NULL', []),
        ])

    return ret, count
| 584 | |
| 585 | |
| 586 | def GetTrainingIssues(self, cnxn, issue_service, since, offset=0, limit=100): |
| 587 | """Returns list of recent issues with human-labeled spam/ham verdicts. |
| 588 | """ |
| 589 | |
| 590 | # get all of the manual verdicts in the past day. |
| 591 | results = self.verdict_tbl.Select(cnxn, |
| 592 | cols=['issue_id'], |
| 593 | where=[ |
| 594 | ('overruled = %s', [False]), |
| 595 | ('reason = %s', ['manual']), |
| 596 | ('issue_id IS NOT NULL', []), |
| 597 | ('created > %s', [since.isoformat()]), |
| 598 | ], |
| 599 | offset=offset, |
| 600 | limit=limit, |
| 601 | ) |
| 602 | |
| 603 | issue_ids = [int(row[0]) for row in results if row[0]] |
| 604 | issues = issue_service.GetIssues(cnxn, issue_ids) |
| 605 | comments = issue_service.GetCommentsForIssues(cnxn, issue_ids) |
| 606 | first_comments = {} |
| 607 | for issue in issues: |
| 608 | first_comments[issue.issue_id] = (comments[issue.issue_id][0].content |
| 609 | if issue.issue_id in comments else "[Empty]") |
| 610 | |
| 611 | count = self.verdict_tbl.SelectValue(cnxn, |
| 612 | col='COUNT(*)', |
| 613 | where=[ |
| 614 | ('overruled = %s', [False]), |
| 615 | ('reason = %s', ['manual']), |
| 616 | ('issue_id IS NOT NULL', []), |
| 617 | ('created > %s', [since.isoformat()]), |
| 618 | ]) |
| 619 | |
| 620 | return issues, first_comments, count |
| 621 | |
| 622 | def GetTrainingComments(self, cnxn, issue_service, since, offset=0, |
| 623 | limit=100): |
| 624 | """Returns list of recent comments with human-labeled spam/ham verdicts. |
| 625 | """ |
| 626 | |
| 627 | # get all of the manual verdicts in the past day. |
| 628 | results = self.verdict_tbl.Select( |
| 629 | cnxn, |
| 630 | distinct=True, |
| 631 | cols=['comment_id'], |
| 632 | where=[ |
| 633 | ('overruled = %s', [False]), |
| 634 | ('reason = %s', ['manual']), |
| 635 | ('comment_id IS NOT NULL', []), |
| 636 | ('created > %s', [since.isoformat()]), |
| 637 | ], |
| 638 | offset=offset, |
| 639 | limit=limit, |
| 640 | ) |
| 641 | |
| 642 | comment_ids = [int(row[0]) for row in results if row[0]] |
| 643 | # Don't care about sequence numbers in this context yet. |
| 644 | comments = issue_service.GetCommentsByID(cnxn, comment_ids, |
| 645 | defaultdict(int)) |
| 646 | return comments |
| 647 | |
| 648 | def ExpungeUsersInSpam(self, cnxn, user_ids): |
| 649 | """Removes all references to given users from Spam DB tables. |
| 650 | |
| 651 | This method will not commit the operations. This method will |
| 652 | not make changes to in-memory data. |
| 653 | """ |
| 654 | commit = False |
| 655 | self.report_tbl.Delete(cnxn, reported_user_id=user_ids, commit=commit) |
| 656 | self.report_tbl.Delete(cnxn, user_id=user_ids, commit=commit) |
| 657 | self.verdict_tbl.Delete(cnxn, user_id=user_ids, commit=commit) |
| 658 | |
| 659 | |
class ModerationItem:
  """Generic bag of attributes describing one moderation-queue entry."""

  def __init__(self, **kwargs):
    # Fields are whatever keyword arguments the caller supplies
    # (e.g. issue_id, comment_id, count, classifier_confidence).
    self.__dict__.update(kwargs)