# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
"""Tasks and handlers for maintaining the spam classifier model. These
should be run via cron and task queue rather than manually.
"""
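# A cron job triggers TrainingDataExport below. As a sketch, the cron.yaml
# entry might look like the following (the URL and schedule shown here are
# illustrative only; the real mapping lives in the app's routing and cron
# configuration):
#
#   cron:
#   - description: export spam training data
#     url: /_task/spamDataExport
#     schedule: every 24 hours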
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import csv
import json
import logging

import cloudstorage
import webapp2

from datetime import date
from datetime import datetime
from datetime import timedelta
from google.appengine.api import app_identity

from framework import cloud_tasks_helpers
from framework import gcs_helpers
from framework import servlet
from framework import urls


class TrainingDataExport(webapp2.RequestHandler):
  """Trigger a training data export task."""

  def get(self):
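    # Enqueue the real export work as a task so that this cron-triggered
    # request can return immediately.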
    task = cloud_tasks_helpers.generate_simple_task(
        urls.SPAM_DATA_EXPORT_TASK + '.do', {})
    cloud_tasks_helpers.create_task(task)


# The maximum number of issues or comments to export in a single run; see
# the pagination TODO in ProcessFormData below.
BATCH_SIZE = 1000


class TrainingDataExportTask(servlet.Servlet):
  """Export any human-labeled ham or spam from the past week. These
  records will be used by a subsequent task to create an updated model.
  """
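
  # This task is invoked by cron / the task queue rather than by a signed-in
  # user, so the servlet's security token check is skipped.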
  CHECK_SECURITY_TOKEN = False

  def ProcessFormData(self, mr, post_data):
    logging.info('Training data export initiated.')

    bucket_name = app_identity.get_default_gcs_bucket_name()
    date_str = date.today().isoformat()
    export_target_path = '/' + bucket_name + '/spam_training_data/' + date_str
    total_issues = 0

    with cloudstorage.open(export_target_path, mode='w',
        content_type=None, options=None, retry_params=None) as gcs_file:

      csv_writer = csv.writer(gcs_file, delimiter=',', quotechar='"',
          quoting=csv.QUOTE_ALL, lineterminator='\n')
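
      # Each row written below is: label ('spam' or 'ham'), issue summary,
      # comment text, and author email. Comment-only rows leave the summary
      # column blank.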

      # Gather everything that was manually labeled in the past week.
      since = datetime.now() - timedelta(days=7)

      # TODO: Further pagination.
      issues, first_comments, _count = (
          self.services.spam.GetTrainingIssues(
              mr.cnxn, self.services.issue, since, offset=0, limit=BATCH_SIZE))
      total_issues += len(issues)
      for issue in issues:
        # Cloud Prediction API doesn't allow newlines in the training data.
        fixed_summary = issue.summary.replace('\r\n', ' ')
        fixed_comment = first_comments[issue.issue_id].replace('\r\n', ' ')
        email = self.services.user.LookupUserEmail(mr.cnxn, issue.reporter_id)
        csv_writer.writerow([
            'spam' if issue.is_spam else 'ham',
            fixed_summary.encode('utf-8'), fixed_comment.encode('utf-8'), email,
        ])

      comments = (
          self.services.spam.GetTrainingComments(
              mr.cnxn, self.services.issue, since, offset=0, limit=BATCH_SIZE))
      total_comments = len(comments)
      for comment in comments:
        # Cloud Prediction API doesn't allow newlines in the training data.
        fixed_comment = comment.content.replace('\r\n', ' ')
        email = self.services.user.LookupUserEmail(mr.cnxn, comment.user_id)
        csv_writer.writerow([
            'spam' if comment.is_spam else 'ham',
            # Comments don't have summaries, so that column is left blank:
            '', fixed_comment.encode('utf-8'), email,
        ])
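
    # Report how many rows were exported so the caller can log or monitor it.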
    self.response.body = json.dumps({
        'exported_issue_count': total_issues,
        'exported_comment_count': total_comments,
    })