# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
"""Tasks and handlers for maintaining the spam classifier model. These
should be run via cron and task queue rather than manually.
"""
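# A cron job triggers TrainingDataExport below. As a sketch, the cron.yaml
# entry might look like the following (the URL and schedule shown here are
# illustrative only; the real mapping lives in the app's routing and cron
# configuration):
#
#   cron:
#   - description: export spam training data
#     url: /_task/spamDataExport
#     schedule: every 24 hours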
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import csv
import json
import logging

import cloudstorage
import webapp2

from datetime import date
from datetime import datetime
from datetime import timedelta
from google.appengine.api import app_identity

from framework import cloud_tasks_helpers
from framework import gcs_helpers
from framework import servlet
from framework import urls


class TrainingDataExport(webapp2.RequestHandler):
  """Trigger a training data export task."""

  def get(self):
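    # Enqueue the real export work as a task so that this cron-triggered
    # request can return immediately.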
    task = cloud_tasks_helpers.generate_simple_task(
        urls.SPAM_DATA_EXPORT_TASK + '.do', {})
    cloud_tasks_helpers.create_task(task)


# The maximum number of issues or comments to export in a single run; see
# the pagination TODO in ProcessFormData below.
BATCH_SIZE = 1000


class TrainingDataExportTask(servlet.Servlet):
  """Export any human-labeled ham or spam from the past week. These
  records will be used by a subsequent task to create an updated model.
  """
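
  # This task is invoked by cron / the task queue rather than by a signed-in
  # user, so the servlet's security token check is skipped.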
  CHECK_SECURITY_TOKEN = False

  def ProcessFormData(self, mr, post_data):
    logging.info('Training data export initiated.')

    bucket_name = app_identity.get_default_gcs_bucket_name()
    date_str = date.today().isoformat()
    export_target_path = '/' + bucket_name + '/spam_training_data/' + date_str
    total_issues = 0

    with cloudstorage.open(export_target_path, mode='w',
        content_type=None, options=None, retry_params=None) as gcs_file:

      csv_writer = csv.writer(gcs_file, delimiter=',', quotechar='"',
          quoting=csv.QUOTE_ALL, lineterminator='\n')
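
      # Each row written below is: label ('spam' or 'ham'), issue summary,
      # comment text, and author email. Comment-only rows leave the summary
      # column blank.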

      # Gather everything that was manually labeled in the past week.
      since = datetime.now() - timedelta(days=7)

      # TODO: Further pagination.
      issues, first_comments, _count = (
          self.services.spam.GetTrainingIssues(
              mr.cnxn, self.services.issue, since, offset=0, limit=BATCH_SIZE))
      total_issues += len(issues)
      for issue in issues:
        # Cloud Prediction API doesn't allow newlines in the training data.
        fixed_summary = issue.summary.replace('\r\n', ' ')
        fixed_comment = first_comments[issue.issue_id].replace('\r\n', ' ')
        email = self.services.user.LookupUserEmail(mr.cnxn, issue.reporter_id)
        csv_writer.writerow([
            'spam' if issue.is_spam else 'ham',
            fixed_summary.encode('utf-8'), fixed_comment.encode('utf-8'), email,
        ])

      comments = (
          self.services.spam.GetTrainingComments(
              mr.cnxn, self.services.issue, since, offset=0, limit=BATCH_SIZE))
      total_comments = len(comments)
      for comment in comments:
        # Cloud Prediction API doesn't allow newlines in the training data.
        fixed_comment = comment.content.replace('\r\n', ' ')
        email = self.services.user.LookupUserEmail(mr.cnxn, comment.user_id)
        csv_writer.writerow([
            'spam' if comment.is_spam else 'ham',
            # Comments don't have summaries, so that column is left blank:
            '', fixed_comment.encode('utf-8'), email,
        ])
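
    # Report how many rows were exported so the caller can log or monitor it.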
    self.response.body = json.dumps({
        'exported_issue_count': total_issues,
        'exported_comment_count': total_comments,
    })