blob: dc5e7159ee2db270c879a1a7a6c8b27c44039263 [file] [log] [blame]
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
"""Tasks and handlers for maintaining the spam classifier model. These
should be run via cron and task queue rather than manually.
"""
8from __future__ import print_function
9from __future__ import division
10from __future__ import absolute_import
11
12import csv
13import logging
14import webapp2
15import cloudstorage
16import json
17
18from datetime import date
19from datetime import datetime
20from datetime import timedelta
21from google.appengine.api import app_identity
22
23from framework import cloud_tasks_helpers
24from framework import gcs_helpers
25from framework import servlet
26from framework import urls
27
class TrainingDataExport(webapp2.RequestHandler):
  """Cron entry point that enqueues the spam training-data export task."""

  def get(self):
    # Build a task targeting the export handler, then hand it to the
    # Cloud Tasks queue; the heavy lifting happens in that task.
    export_task = cloud_tasks_helpers.generate_simple_task(
        urls.SPAM_DATA_EXPORT_TASK + '.do', {})
    cloud_tasks_helpers.create_task(export_task)
34
35
# Maximum number of issue rows and comment rows fetched per export run;
# anything beyond this is currently dropped (see the pagination TODO below).
BATCH_SIZE = 1000
37
class TrainingDataExportTask(servlet.Servlet):
  """Export recently human-labeled ham and spam to Cloud Storage as CSV.

  The exported rows are consumed by a subsequent task that builds an
  updated spam-classifier model.  NOTE(review): the lookback window used
  below is 7 days, even though this task appears intended to run daily —
  confirm which is intended.
  """
  CHECK_SECURITY_TOKEN = False

  def ProcessFormData(self, mr, post_data):
    """Write one CSV file of labeled issues and comments to GCS.

    Args:
      mr: commonly used request info; supplies the DB connection (mr.cnxn).
      post_data: HTML form data (unused).

    Side effects:
      Writes /<bucket>/spam_training_data/<ISO date> in GCS and sets the
      response body to a JSON object with the exported row counts.
    """
    logging.info("Training data export initiated.")

    bucket = app_identity.get_default_gcs_bucket_name()
    # One object per calendar day, named by ISO date.
    dest_path = '/' + bucket + '/spam_training_data/' + date.today().isoformat()
    issue_count = 0

    with cloudstorage.open(dest_path, mode='w',
        content_type=None, options=None, retry_params=None) as dest_file:

      writer = csv.writer(dest_file, delimiter=',', quotechar='"',
          quoting=csv.QUOTE_ALL, lineterminator='\n')

      cutoff = datetime.now() - timedelta(days=7)

      # TODO: Further pagination.
      issues, first_comments, _count = self.services.spam.GetTrainingIssues(
          mr.cnxn, self.services.issue, cutoff, offset=0, limit=BATCH_SIZE)
      issue_count += len(issues)
      for issue in issues:
        # The training pipeline cannot accept newlines inside a field.
        summary_text = issue.summary.replace('\r\n', ' ')
        comment_text = first_comments[issue.issue_id].replace('\r\n', ' ')
        reporter_email = self.services.user.LookupUserEmail(
            mr.cnxn, issue.reporter_id)
        writer.writerow([
            'spam' if issue.is_spam else 'ham',
            summary_text.encode('utf-8'), comment_text.encode('utf-8'),
            reporter_email,
        ])

      comments = self.services.spam.GetTrainingComments(
          mr.cnxn, self.services.issue, cutoff, offset=0, limit=BATCH_SIZE)
      comment_count = len(comments)
      for comment in comments:
        # Same newline restriction applies to comment text.
        comment_text = comment.content.replace('\r\n', ' ')
        commenter_email = self.services.user.LookupUserEmail(
            mr.cnxn, comment.user_id)
        # Comments don't have summaries, so that column is blank.
        writer.writerow([
            'spam' if comment.is_spam else 'ham',
            '', comment_text.encode('utf-8'), commenter_email
        ])

    self.response.body = json.dumps({
        "exported_issue_count": issue_count,
        "exported_comment_count": comment_count,
    })