Project import generated by Copybara.
GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/tools/ml/Makefile b/tools/ml/Makefile
new file mode 100644
index 0000000..b0a8684
--- /dev/null
+++ b/tools/ml/Makefile
@@ -0,0 +1,222 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+# Use 'make help' for a list of commands.
+
+OUTPUT_DIR := /tmp/monospam-local-training/
+TIMESTAMP := $(shell date +%s)
+MODEL_DIR := /tmp/monospam-local-training/export/Servo/$(TIMESTAMP)/
+SPAM_JOB_NAME := spam_trainer_$(TIMESTAMP)
+COMP_JOB_NAME := comp_trainer_$(TIMESTAMP)
+
+default: help
+
+help:
+ @echo "Available commands:"
+ @sed -n '/^[a-zA-Z0-9_.]*:/s/:.*//p' <Makefile
+
+train_local_spam:
+ gcloud ai-platform local train \
+ --package-path trainer/ \
+ --module-name trainer.task \
+ --job-dir $(OUTPUT_DIR) \
+ -- \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --train-file $(TRAIN_FILE) \
+ --trainer-type spam
+
+train_local_spam_2:
+ gcloud ai-platform local train \
+ --package-path trainer2/ \
+ --module-name trainer2.task \
+ --job-dir $(OUTPUT_DIR) \
+ -- \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --train-file $(TRAIN_FILE) \
+ --trainer-type spam
+
+predict_local_spam:
+ ./spam.py local-predict
+ gcloud ai-platform local predict \
+ --model-dir $(MODEL_DIR) \
+ --json-instances /tmp/instances.json
+
+train_from_prod_data_spam:
+ gcloud ai-platform local train \
+ --package-path trainer/ \
+ --module-name trainer.task \
+ --job-dir $(OUTPUT_DIR) \
+ -- \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix spam_training_data \
+ --trainer-type spam
+
+train_from_prod_data_spam_2:
+ gcloud ai-platform local train \
+ --package-path trainer2/ \
+ --module-name trainer2.task \
+ --job-dir $(OUTPUT_DIR) \
+ -- \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix spam_training_data \
+ --trainer-type spam
+
+submit_train_job_spam:
+ @echo ${TIMESTAMP}
+ gcloud ai-platform jobs submit training $(SPAM_JOB_NAME) \
+ --package-path trainer/ \
+ --module-name trainer.task \
+ --runtime-version 1.2 \
+ --job-dir gs://monorail-prod-mlengine/$(SPAM_JOB_NAME) \
+ --region us-central1 \
+ -- \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix spam_training_data \
+ --trainer-type spam
+
+submit_train_job_spam_2:
+ @echo ${TIMESTAMP}
+ gcloud ai-platform jobs submit training $(SPAM_JOB_NAME) \
+ --package-path trainer2/ \
+ --module-name trainer2.task \
+ --runtime-version 2.1 \
+ --python-version 3.7 \
+ --job-dir gs://monorail-prod-mlengine/$(SPAM_JOB_NAME) \
+ --region us-central1 \
+ -- \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix spam_training_data \
+ --trainer-type spam
+
+# VERSION of format 'v_TIMESTAMP' should match TIMESTAMP in SPAM_JOB_NAME and MODEL_BINARIES.
+upload_model_prod_spam:
+ifndef MODEL_BINARIES
+ $(error MODEL_BINARIES not set)
+endif
+ifndef VERSION
+ $(error VERSION not set)
+endif
+ gsutil ls -r gs://monorail-prod-mlengine/$(SPAM_JOB_NAME)
+ gcloud ai-platform versions create $(VERSION) \
+ --model spam_only_words \
+ --origin $(MODEL_BINARIES) \
+ --runtime-version 1.2
+ gcloud ai-platform versions set-default $(VERSION) --model spam_only_words
+
+submit_pred_spam:
+ifndef SUMMARY_PATH
+ $(error SUMMARY_PATH not set)
+endif
+ifndef CONTENT_PATH
+ $(error CONTENT_PATH not set)
+endif
+ ./spam.py predict --summary $(SUMMARY_PATH) --content $(CONTENT_PATH)
+
+
+train_from_prod_data_component:
+ gcloud ai-platform local train \
+ --package-path trainer/ \
+ --module-name trainer.task \
+ --job-dir $(OUTPUT_DIR) \
+ -- \
+ --train-steps 10000 \
+ --eval-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix component_training_data \
+ --trainer-type component
+
+submit_train_job_component:
+ gcloud init
+ gcloud ai-platform jobs submit training $(COMP_JOB_NAME) \
+ --package-path trainer/ \
+ --module-name trainer.task \
+ --runtime-version 1.2 \
+ --job-dir gs://monorail-prod-mlengine/$(COMP_JOB_NAME) \
+ --region us-central1 \
+ --scale-tier custom \
+ --config config.json \
+ -- \
+ --train-steps 10000 \
+ --eval-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix component_training_data \
+ --trainer-type component
+
+submit_train_job_component_2:
+ gcloud ai-platform jobs submit training $(COMP_JOB_NAME) \
+ --package-path trainer2/ \
+ --module-name trainer2.task \
+ --runtime-version 2.1 \
+ --python-version 3.7 \
+ --job-dir gs://monorail-prod-mlengine/$(COMP_JOB_NAME) \
+ --region us-central1 \
+ --scale-tier custom \
+ --master-machine-type n1-highmem-8 \
+ -- \
+ --train-steps 10000 \
+ --eval-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix component_training_data \
+ --trainer-type component
+
+# VERSION of format 'v_TIMESTAMP' should match TIMESTAMP in COMP_JOB_NAME and MODEL_BINARIES.
+upload_model_prod_component:
+ifndef MODEL_BINARIES
+ $(error MODEL_BINARIES not set)
+endif
+ifndef VERSION
+ $(error VERSION not set)
+endif
+ gsutil ls -r gs://monorail-prod-mlengine/$(COMP_JOB_NAME)
+ gcloud ai-platform versions create $(VERSION) \
+ --model component_top_words \
+ --origin $(MODEL_BINARIES) \
+ --runtime-version 1.2
+ gcloud ai-platform versions set-default $(VERSION) --model component_top_words
+
+submit_pred_component:
+ifndef CONTENT_PATH
+ $(error CONTENT_PATH not set)
+endif
+ ./component.py --project monorail-prod --content $(CONTENT_PATH)
+
+
+### Local Training in TF 2.0
+
+tf2_train_local_spam:
+ifndef TRAIN_FILE
+ $(error TRAIN_FILE not set)
+endif
+ python3 ./trainer2/task.py \
+ --train-file $(TRAIN_FILE) \
+ --job-dir $(OUTPUT_DIR) \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --trainer-type spam
+
+tf2_train_local_component:
+ifndef TRAIN_FILE
+ $(error TRAIN_FILE not set)
+endif
+ python3 ./trainer2/task.py \
+ --train-file $(TRAIN_FILE) \
+ --job-dir $(OUTPUT_DIR) \
+ --train-steps 10000 \
+ --eval-steps 1000 \
+ --verbosity DEBUG \
+ --trainer-type component
diff --git a/tools/ml/README.md b/tools/ml/README.md
new file mode 100644
index 0000000..01b0702
--- /dev/null
+++ b/tools/ml/README.md
@@ -0,0 +1,222 @@
+# Monorail Machine Learning Classifiers
+
+Monorail has two machine learning classifiers running in ML Engine: a spam classifier and a component predictor.
+
+Whenever a user creates a new issue (or comments on an issue without an assigned component), components are suggested based on the text the user types using Monorail's component predictor.
+
+Monorail also runs each new issue and comment through a spam classifier model.
+
+In order to train a new model locally or in the cloud, follow the instructions below.
+
+> Note: you must be logged into the correct GCP project with `gcloud` in order to run the below commands.
+
+### New model in trainer2/
+
+The new code is used for local training and exporting model using Python3 and TensorFlow 2.0. Future predictor should also be migrated to use the training files in trainer2/.
+
+### Trainer
+
+Both trainers are Python modules that do the following:
+
+1. Download all (spam or component) exported training data from GCS
+2. Define a TensorFlow Estimator and Experiment
+
+ML Engine uses the high-level [`learn_runner`](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/learn_runner/run) API (see [`trainer/task.py`](trainer/task.py)) which allows it to train, evaluate, and predict against a model saved in GCS.
+
+## Monorail Spam Classifier
+
+### Run locally
+
+To run any training jobs locally, you'll need Python 2 and TensorFlow 1.2:
+
+```sh
+pip install -r requirements.txt
+```
+
+Run a local training job with placeholder data:
+
+```sh
+make TRAIN_FILE=./sample_spam_training_data.csv train_local_spam
+```
+
+To have the local trainer download and train on the real training data, you'll
+need to be logged into `gcloud` and have access to the `monorail-prod` project.
+
+```sh
+make train_from_prod_data_spam
+```
+
+<!-- TODO: the below has not been reviewed recently. -->
+
+### Submit a local prediction
+
+```sh
+./spam.py local-predict
+gcloud ml-engine local predict --model-dir $OUTPUT_DIR/export/Servo/{TIMESTAMP}/ --json-instances /tmp/instances.json
+```
+
+### Submitting a training job to ML Engine
+
+This will run a job and output a trained model to GCS. Job names must be unique.
+
+First verify you're in the `monorail-prod` GCP project.
+
+```sh
+gcloud init
+```
+
+To submit a training job manually, run:
+
+```sh
+TIMESTAMP=$(date +%s)
+JOB_NAME=spam_trainer_$TIMESTAMP
+gcloud ml-engine jobs submit training $JOB_NAME \
+ --package-path trainer/ \
+ --module-name trainer.task \
+ --runtime-version 1.2 \
+ --job-dir gs://monorail-prod-mlengine/$JOB_NAME \
+ --region us-central1 \
+ -- \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix spam_training_data \
+ --trainer-type spam
+```
+
+### Uploading a model and promoting it to production
+
+To upload a model you'll need to locate the exported model directory in GCS. To do that, run:
+
+```sh
+gsutil ls -r gs://monorail-prod-mlengine/$JOB_NAME
+
+# Look for a directory that matches the below structure and assign it.
+# It should have the structure $GCS_OUTPUT_LOCATION/export/Servo/$TIMESTAMP/.
+MODEL_BINARIES=gs://monorail-prod-mlengine/spam_trainer_1507059720/export/Servo/1507060043/
+
+VERSION=v_$TIMESTAMP
+gcloud ml-engine versions create $VERSION \
+ --model spam_only_words \
+ --origin $MODEL_BINARIES \
+ --runtime-version 1.2
+```
+
+To promote to production, set that model as default.
+
+```sh
+gcloud ml-engine versions set-default $VERSION --model spam_only_words
+```
+
+### Submit a prediction
+
+Use the script [`spam.py`](spam.py) to make predictions
+from the command line. Files containing text for classification must be provided as summary and content arguments.
+
+```sh
+$ ./spam.py predict --summary summary.txt --content content.txt
+{u'predictions': [{u'classes': [u'0', u'1'], u'scores': [0.4986788034439087, 0.5013211965560913]}]}
+```
+
+A higher probability for class 1 indicates that the text was classified as spam.
+
+### Compare model accuracy
+
+After submitting a job to ML Engine, you can compare the accuracy of two submitted jobs using their trainer names.
+
+```sh
+$ ./spam.py --project monorail-prod compare-accuracy --model1 spam_trainer_1521756634 --model2 spam_trainer_1516759200
+spam_trainer_1521756634:
+AUC: 0.996436 AUC Precision/Recall: 0.997456
+
+spam_trainer_1516759200:
+AUC: 0.982159 AUC Precision/Recall: 0.985069
+```
+
+By default, model1 is the default model running in the specified project. Note that an error will be thrown if the trainer does not contain an eval_data.json file.
+
+## Monorail Component Predictor
+
+### Run locally
+
+To kick off a local training job, run:
+
+```sh
+OUTPUT_DIR=/tmp/monospam-local-training
+rm -rf $OUTPUT_DIR
+gcloud ml-engine local train \
+ --package-path trainer/ \
+ --module-name trainer.task \
+ --job-dir $OUTPUT_DIR \
+ -- \
+ --train-steps 10000 \
+ --eval-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix component_training_data \
+ --trainer-type component
+```
+
+### Submitting a training job to ML Engine
+
+This will run a job and output a trained model to GCS. Job names must be unique.
+
+First verify you're in the `monorail-prod` GCP project.
+
+```sh
+gcloud init
+```
+
+To submit a training job manually, run:
+
+```sh
+TIMESTAMP=$(date +%s)
+JOB_NAME=component_trainer_$TIMESTAMP
+gcloud ml-engine jobs submit training $JOB_NAME \
+ --package-path trainer/ \
+ --module-name trainer.task \
+ --runtime-version 1.2 \
+ --job-dir gs://monorail-prod-mlengine/$JOB_NAME \
+ --region us-central1 \
+ --scale-tier custom \
+ --config config.json \
+ -- \
+ --train-steps 10000 \
+ --eval-steps 1000 \
+ --verbosity DEBUG \
+ --gcs-bucket monorail-prod.appspot.com \
+ --gcs-prefix component_training_data \
+ --trainer-type component
+```
+
+### Uploading a model and promoting it to production
+
+To upload a model you'll need to locate the exported model directory in GCS. To do that, run:
+
+```sh
+gsutil ls -r gs://monorail-prod-mlengine/$JOB_NAME
+
+# Look for a directory that matches the below structure and assign it.
+# It should have the structure $GCS_OUTPUT_LOCATION/export/Servo/$TIMESTAMP/.
+MODEL_BINARIES=gs://monorail-prod-mlengine/component_trainer_1507059720/export/Servo/1507060043/
+
+VERSION=v_$TIMESTAMP
+gcloud ml-engine versions create $VERSION \
+ --model component_top_words \
+ --origin $MODEL_BINARIES \
+ --runtime-version 1.2
+```
+To promote to production, set that model as default.
+
+```sh
+gcloud ml-engine versions set-default $VERSION --model component_top_words
+```
+
+### Submit a prediction
+
+Use the script [`component.py`](component.py) to make predictions from the command line. A file containing text for classification must be provided as the content argument.
+
+```sh
+$ ./component.py --project monorail-prod --content content.txt
+Most likely component: index 108, component id 36250211
+```
diff --git a/tools/ml/comment-training-export.sql b/tools/ml/comment-training-export.sql
new file mode 100644
index 0000000..891ed18
--- /dev/null
+++ b/tools/ml/comment-training-export.sql
@@ -0,0 +1,16 @@
+select
+ IF(v.is_spam, "spam", "ham"),
+ "",
+ REPLACE(cc.content, '\n', '\r'),
+ u.email,
+ CONCAT("https://bugs.chromium.org/p/", p.project_name, "/issues/detail?id=", i.local_id),
+ r.email
+from SpamVerdict v
+ join Comment c on c.id = v.comment_id
+ join CommentContent cc on cc.comment_id = c.id
+ join Project p on p.project_id = c.project_id
+ join Issue i on i.id=c.issue_id
+ join User u on u.user_id = c.commenter_id
+ join User r on r.user_id = v.user_id
+where
+ v.reason='manual' and v.overruled = 0;
diff --git a/tools/ml/component.py b/tools/ml/component.py
new file mode 100755
index 0000000..9b401f3
--- /dev/null
+++ b/tools/ml/component.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+# Copyright 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+"""
+Component classifier command line tools.
+
+Use this command to submit predictions to the model running
+in production.
+
+Note that in order for this command to work, you must be logged into
+gcloud in the project under which you wish to run commands.
+"""
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+import argparse
+import json
+import os
+import re
+import sys
+
+import googleapiclient
+from googleapiclient import discovery
+from googleapiclient import errors
+from google.cloud.storage import client, bucket, blob
+from apiclient.discovery import build
+from oauth2client.client import GoogleCredentials
+
+import ml_helpers
+
+credentials = GoogleCredentials.get_application_default()
+
+# This must be identical with settings.component_features.
+COMPONENT_FEATURES = 5000
+
+MODEL_NAME = 'component_top_words'
+
+
+def Predict(args):
+ ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)
+
+ with open(args.content) as f:
+ content = f.read()
+
+ project_ID = 'projects/%s' % args.project
+ full_model_name = '%s/models/%s' % (project_ID, MODEL_NAME)
+ model_request = ml.projects().models().get(name=full_model_name)
+ model_response = model_request.execute()
+
+ version_name = model_response['defaultVersion']['name']
+
+ model_name = 'component_trainer_' + re.search("v_(\d+)",
+ version_name).group(1)
+
+ client_obj = client.Client(project=args.project)
+ bucket_name = '%s-mlengine' % args.project
+ bucket_obj = bucket.Bucket(client_obj, bucket_name)
+
+ instance = ml_helpers.GenerateFeaturesRaw([content],
+ COMPONENT_FEATURES,
+ getTopWords(bucket_name,
+ model_name))
+
+
+ request = ml.projects().predict(name=full_model_name, body={
+ 'instances': [{'inputs': instance['word_features']}]
+ })
+
+ try:
+ response = request.execute()
+
+
+ bucket_obj.blob = blob.Blob('%s/component_index.json'
+ % model_name, bucket_obj)
+ component_index = bucket_obj.blob.download_as_string()
+ component_index_dict = json.loads(component_index)
+
+ return read_indexes(response, component_index_dict)
+
+ except googleapiclient.errors.HttpError, err:
+ print('There was an error. Check the details:')
+ print(err._get_reason())
+
+
+def getTopWords(bucket_name, model_name):
+ storage = discovery.build('storage', 'v1', credentials=credentials)
+ objects = storage.objects()
+
+ request = objects.get_media(bucket=bucket_name,
+ object=model_name + '/topwords.txt')
+ response = request.execute()
+
+ top_list = response.split()
+ top_words = {}
+ for i in range(len(top_list)):
+ top_words[top_list[i]] = i
+
+ return top_words
+
+
+def read_indexes(response, component_index):
+
+ scores = response['predictions'][0]['scores']
+ highest = scores.index(max(scores))
+
+ component_id = component_index[str(highest)]
+
+ return "Most likely component: index %d, component id %d" % (
+ int(highest), int(component_id))
+
+
+def main():
+ if not credentials and 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
+ print(('GOOGLE_APPLICATION_CREDENTIALS environment variable is not set. '
+ 'Exiting.'))
+ sys.exit(1)
+
+ parser = argparse.ArgumentParser(
+ description='Component classifier utilities.')
+ parser.add_argument('--project', '-p', default='monorail-staging')
+
+ parser.add_argument('--content', '-c', required=True,
+ help='A file containing the content.')
+
+ args = parser.parse_args()
+
+ res = Predict(args)
+
+ print(res)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/ml/config.json b/tools/ml/config.json
new file mode 100644
index 0000000..6c36e3e
--- /dev/null
+++ b/tools/ml/config.json
@@ -0,0 +1,5 @@
+{
+ "trainingInput": {
+ "masterType": "large_model"
+ }
+}
diff --git a/tools/ml/issue-training-export.sql b/tools/ml/issue-training-export.sql
new file mode 100644
index 0000000..73a637b
--- /dev/null
+++ b/tools/ml/issue-training-export.sql
@@ -0,0 +1,17 @@
+select
+ IF(v.is_spam, "spam", "ham"),
+ REPLACE(s.summary, '\n', '\r'),
+ REPLACE(cc.content, '\n', '\r'),
+ u.email,
+ CONCAT("https://bugs.chromium.org/p/", p.project_name, "/issues/detail?id=", i.local_id),
+ r.email
+from SpamVerdict v
+ join Issue i on i.id = v.issue_id
+ join Comment c on c.issue_id = i.id
+ join CommentContent cc on cc.comment_id = c.id
+ join IssueSummary s on s.issue_id = i.id
+ join Project p on p.project_id = i.project_id
+ join User u on u.user_id = c.commenter_id
+ join User r on r.user_id = v.user_id
+where
+ v.reason='manual' and v.overruled = 0;
diff --git a/tools/ml/ml_helpers.py b/tools/ml/ml_helpers.py
new file mode 120000
index 0000000..894569b
--- /dev/null
+++ b/tools/ml/ml_helpers.py
@@ -0,0 +1 @@
+../../services/ml_helpers.py
\ No newline at end of file
diff --git a/tools/ml/requirements.txt b/tools/ml/requirements.txt
new file mode 100644
index 0000000..e0a7166
--- /dev/null
+++ b/tools/ml/requirements.txt
@@ -0,0 +1 @@
+tensorflow==1.2
diff --git a/tools/ml/sample_spam_training_data.csv b/tools/ml/sample_spam_training_data.csv
new file mode 100644
index 0000000..4de2805
--- /dev/null
+++ b/tools/ml/sample_spam_training_data.csv
@@ -0,0 +1,36 @@
+"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
+"ham","","# 1231
+ - sdfsdf","ddoman@google.com"
+"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
+"ham","","# 1231
+ - sdfsdf","ddoman@google.com"
+"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
+"ham","","# 1231
+ - sdfsdf","ddoman@google.com"
+"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
+"ham","","# 1231
+ - sdfsdf","ddoman@google.com"
+"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
+"ham","","# 1231
+ - sdfsdf","ddoman@google.com"
+"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
+"spam","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"spam","test","hmmm","zhangtiff@google.com"
+"spam","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"spam","test","hmmm","zhangtiff@google.com"
+"spam","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"spam","test","hmmm","zhangtiff@google.com"
+"spam","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"spam","test","hmmm","zhangtiff@google.com"
+"spam","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"spam","test","hmmm","zhangtiff@google.com"
+"spam","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"spam","test","hmmm","zhangtiff@google.com"
+"ham","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"spam","test","hmmm","zhangtiff@google.com"
+"ham","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"ham","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"ham","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"ham","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"ham","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
+"ham","Chicken","<b>Feature description:</b> <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b> ","jojwang@google.com"
diff --git a/tools/ml/setup.py b/tools/ml/setup.py
new file mode 100644
index 0000000..728cd55
--- /dev/null
+++ b/tools/ml/setup.py
@@ -0,0 +1,19 @@
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+from setuptools import find_packages
+from setuptools import setup
+
+REQUIRED_PACKAGES = ['google-cloud-storage']
+
+setup(
+ name='trainer',
+ version='0.1',
+ install_requires=REQUIRED_PACKAGES,
+ packages=find_packages(),
+ include_package_data=True,
+ description="""Trainer application package for training a spam classification
+ model in ML Engine and storing the saved model and accuracy
+ results in GCS."""
+)
diff --git a/tools/ml/spam.py b/tools/ml/spam.py
new file mode 100755
index 0000000..afc9d4d
--- /dev/null
+++ b/tools/ml/spam.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+"""
+Spam classifier command line tools.
+
+Use this command to submit predictions locally or to the model running
+in production. See tools/spam/README.md for more context on training
+and model operations.
+
+Note that in order for this command to work, you must be logged into
+gcloud in the project under which you wish to run commands.
+"""
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+import argparse
+import json
+import os
+import re
+import sys
+import googleapiclient
+
+from google.cloud.storage import client, bucket, blob
+import ml_helpers
+from apiclient.discovery import build
+from oauth2client.client import GoogleCredentials
+
+credentials = GoogleCredentials.get_application_default()
+
+# This must be identical with settings.spam_feature_hashes.
+SPAM_FEATURE_HASHES = 500
+
+MODEL_NAME = 'spam_only_words'
+
+
+def Predict(args):
+ ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)
+
+ with open(args.summary) as f:
+ summary = f.read()
+ with open(args.content) as f:
+ content = f.read()
+
+ instance = ml_helpers.GenerateFeaturesRaw([summary, content],
+ SPAM_FEATURE_HASHES)
+
+ project_ID = 'projects/%s' % args.project
+ full_model_name = '%s/models/%s' % (project_ID, MODEL_NAME)
+ request = ml.projects().predict(name=full_model_name, body={
+ 'instances': [{'inputs': instance['word_hashes']}]
+ })
+
+ try:
+ response = request.execute()
+ print(response)
+ except googleapiclient.errors.HttpError, err:
+ print('There was an error. Check the details:')
+ print(err._get_reason())
+
+
+def LocalPredict(_):
+ print('This will write /tmp/instances.json.')
+ print('Then you can call:')
+ print(('gcloud ml-engine local predict --json-instances /tmp/instances.json'
+ ' --model-dir {model_dir}'))
+
+ summary = raw_input('Summary: ')
+ description = raw_input('Description: ')
+ instance = ml_helpers.GenerateFeaturesRaw([summary, description],
+ SPAM_FEATURE_HASHES)
+
+ with open('/tmp/instances.json', 'w') as f:
+ json.dump({'inputs': instance['word_hashes']}, f)
+
+
+def get_auc(model_name, bucket_obj):
+ bucket_obj.blob = blob.Blob('%s/eval_data.json' % model_name, bucket_obj)
+ data = bucket_obj.blob.download_as_string()
+ data_dict = json.loads(data)
+ return data_dict['auc'], data_dict['auc_precision_recall']
+
+
+def CompareAccuracy(args):
+ client_obj = client.Client(project=args.project)
+ bucket_name = '%s-mlengine' % args.project
+ bucket_obj = bucket.Bucket(client_obj, bucket_name)
+
+ model1_auc, model1_auc_pr = get_auc(args.model1, bucket_obj)
+ print('%s:\nAUC: %f\tAUC Precision/Recall: %f\n'
+ % (args.model1, model1_auc, model1_auc_pr))
+
+ model2_auc, model2_auc_pr = get_auc(args.model2, bucket_obj)
+ print('%s:\nAUC: %f\tAUC Precision/Recall: %f'
+ % (args.model2, model2_auc, model2_auc_pr))
+
+
+def main():
+ if not credentials and 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
+ print(('GOOGLE_APPLICATION_CREDENTIALS environment variable is not set. '
+ 'Exiting.'))
+ sys.exit(1)
+
+ parser = argparse.ArgumentParser(description='Spam classifier utilities.')
+ parser.add_argument('--project', '-p', default='monorail-staging')
+
+ project = parser.parse_known_args()
+ subparsers = parser.add_subparsers(dest='command')
+
+ predict = subparsers.add_parser('predict',
+ help='Submit a prediction to the default model in ML Engine.')
+ predict.add_argument('--summary', help='A file containing the summary.')
+ predict.add_argument('--content', help='A file containing the content.')
+
+ subparsers.add_parser('local-predict',
+ help='Create an instance on the local filesystem to use in prediction.')
+
+ ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)
+
+ request = ml.projects().models().get(name='projects/%s/models/%s'
+ % (project[0].project, MODEL_NAME))
+ response = request.execute()
+
+ default_version = re.search(
+ '.*(spam_trainer_\d+).*',
+ response['defaultVersion']['deploymentUri']).group(1)
+
+ compare = subparsers.add_parser('compare-accuracy',
+ help='Compare the accuracy of two models.')
+
+ compare.add_argument('--model1',
+ default=default_version,
+ help='The first model to find the auc values of.')
+
+ # TODO(carapew): Make second default the most recently deployed model
+ compare.add_argument('--model2',
+ default='spam_trainer_1513384515'
+ if project[0].project == 'monorail-staging' else
+ 'spam_trainer_1522141200',
+ help='The second model to find the auc values of.')
+
+ args = parser.parse_args()
+
+ cmds = {
+ 'predict': Predict,
+ 'local-predict': LocalPredict,
+ 'compare-accuracy': CompareAccuracy,
+ }
+ res = cmds[args.command](args)
+
+ print(json.dumps(res, indent=2))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/ml/trainer/__init__.py b/tools/ml/trainer/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tools/ml/trainer/__init__.py
diff --git a/tools/ml/trainer/dataset.py b/tools/ml/trainer/dataset.py
new file mode 100644
index 0000000..0def4b6
--- /dev/null
+++ b/tools/ml/trainer/dataset.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+import StringIO
+import tensorflow as tf
+
+import csv
+import sys
+from googleapiclient import discovery
+from googleapiclient import errors
+from oauth2client.client import GoogleCredentials
+
+import trainer.ml_helpers
+
+
+def fetch_training_data(bucket, prefix, trainer_type):
+
+ credentials = GoogleCredentials.get_application_default()
+ storage = discovery.build('storage', 'v1', credentials=credentials)
+ objects = storage.objects()
+
+ request = objects.list(bucket=bucket, prefix=prefix)
+ response = make_api_request(request)
+ items = response.get('items')
+ csv_filepaths = [blob.get('name') for blob in items]
+
+ if trainer_type == 'spam':
+ return fetch_spam(csv_filepaths, bucket, objects)
+ else:
+ return fetch_component(csv_filepaths, bucket, objects)
+
+
+def fetch_spam(csv_filepaths, bucket, objects):
+
+ training_data = []
+ # Add code
+ csv_filepaths = [
+ 'spam-training-data/full-android.csv',
+ 'spam-training-data/full-support.csv',
+ ] + csv_filepaths
+
+ for filepath in csv_filepaths:
+ media = fetch_training_csv(filepath, objects, bucket)
+ rows, skipped_rows = trainer.ml_helpers.spam_from_file(
+ StringIO.StringIO(media))
+
+ if len(rows):
+ training_data.extend(rows)
+
+ tf.logging.info('{:<40}{:<20}{:<20}'.format(
+ filepath,
+ 'added %d rows' % len(rows),
+ 'skipped %d rows' % skipped_rows))
+
+ return training_data
+
+
+def fetch_component(csv_filepaths, bucket, objects):
+
+ training_data = []
+ for filepath in csv_filepaths:
+ media = fetch_training_csv(filepath, objects, bucket)
+ rows = trainer.ml_helpers.component_from_file(
+ StringIO.StringIO(media))
+
+ if len(rows):
+ training_data.extend(rows)
+
+ tf.logging.info('{:<40}{:<20}'.format(
+ filepath,
+ 'added %d rows' % len(rows)))
+
+ return training_data
+
+
+def fetch_training_csv(filepath, objects, bucket):
+ request = objects.get_media(bucket=bucket, object=filepath)
+ return make_api_request(request)
+
+
+def make_api_request(request):
+ try:
+ return request.execute()
+ except errors.HttpError, err:
+ tf.logging.error('There was an error with the API. Details:')
+ tf.logging.error(err._get_reason())
+ raise
+
+
diff --git a/tools/ml/trainer/ml_helpers.py b/tools/ml/trainer/ml_helpers.py
new file mode 120000
index 0000000..c790a2c
--- /dev/null
+++ b/tools/ml/trainer/ml_helpers.py
@@ -0,0 +1 @@
+../../../services/ml_helpers.py
\ No newline at end of file
diff --git a/tools/ml/trainer/model.py b/tools/ml/trainer/model.py
new file mode 100644
index 0000000..3b627a9
--- /dev/null
+++ b/tools/ml/trainer/model.py
@@ -0,0 +1,109 @@
+# Copyright 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+from trainer.ml_helpers import COMPONENT_FEATURES
+from trainer.ml_helpers import SPAM_FEATURE_HASHES
+
+# Maps trainer type ('component' / 'spam') to the feature columns fed to the
+# estimator. Important: we assume each list mirrors the output of
+# GenerateFeaturesRaw.
+INPUT_COLUMNS = {'component': [
+        tf.feature_column.numeric_column(
+            key='word_features',
+            shape=(COMPONENT_FEATURES,)),
+    ],
+    'spam': [
+        tf.feature_column.numeric_column(
+            key='word_hashes',
+            shape=(SPAM_FEATURE_HASHES,)),
+    ]}
+
+
+def build_estimator(config, trainer_type, class_count):
+  """Returns a tf.Estimator for the requested trainer type.
+
+  Args:
+    config: tf.contrib.learn.RunConfig defining the runtime environment for the
+      estimator (including model_dir).
+    trainer_type: 'spam' or 'component'; selects the feature columns.
+    class_count: number of output classes for the classifier.
+  Returns:
+    A DNNClassifier with three hidden layers.
+  """
+  return tf.contrib.learn.DNNClassifier(
+      config=config,
+      feature_columns=(INPUT_COLUMNS[trainer_type]),
+      hidden_units=[1024, 512, 256],
+      # Adam with its default hyperparameters, spelled out explicitly.
+      optimizer=tf.train.AdamOptimizer(learning_rate=0.001,
+          beta1=0.9,
+          beta2=0.999,
+          epsilon=1e-08,
+          use_locking=False,
+          name='Adam'),
+      n_classes=class_count
+  )
+
+
+def feature_list_to_dict(X, trainer_type):
+  """Converts an array of feature dicts into one dict of
+  {feature_name: [feature_values]}.
+
+  Important: this assumes the ordering of X and INPUT_COLUMNS is the same.
+
+  Args:
+    X: an array of feature dicts
+    trainer_type: 'spam' or 'component'; selects the expected feature names.
+  Returns:
+    A dictionary where each key is a feature name and its value is a numpy
+    array of shape (len(X),).
+  """
+  feature_dict = {}
+
+  # Seed one empty list per expected feature name.
+  for feature_column in INPUT_COLUMNS[trainer_type]:
+    feature_dict[feature_column.name] = []
+
+  for instance in X:
+    for key in instance.keys():
+      feature_dict[key].append(instance[key])
+
+  # Convert the accumulated lists to numpy arrays for the input functions.
+  for key in [f.name for f in INPUT_COLUMNS[trainer_type]]:
+    feature_dict[key] = np.array(feature_dict[key])
+
+  return feature_dict
+
+
+def generate_json_serving_input_fn(trainer_type):
+  """Returns a serving input function for the given trainer type."""
+  def json_serving_input_fn():
+    """Build the serving inputs.
+
+    Returns:
+      An InputFnOps containing features with placeholders.
+    """
+    features_placeholders = {}
+    for column in INPUT_COLUMNS[trainer_type]:
+      name = '%s_placeholder' % column.name
+
+      # Special case non-scalar features.
+      if column.shape[0] > 1:
+        shape = [None, column.shape[0]]
+      else:
+        shape = [None]
+
+      placeholder = tf.placeholder(tf.float32, shape, name=name)
+      features_placeholders[column.name] = placeholder
+
+    labels = None  # Unknown at serving time
+    # The placeholders serve both as features and as default inputs.
+    return tf.contrib.learn.InputFnOps(features_placeholders, labels,
+                                       features_placeholders)
+
+  return json_serving_input_fn
+
+
+# Pre-built serving input functions, keyed by 'JSON-' + trainer type
+# (task.py looks these up as 'JSON-' + args.trainer_type).
+SERVING_FUNCTIONS = {
+    'JSON-component': generate_json_serving_input_fn('component'),
+    'JSON-spam': generate_json_serving_input_fn('spam')
+}
diff --git a/tools/ml/trainer/task.py b/tools/ml/trainer/task.py
new file mode 100644
index 0000000..7416c68
--- /dev/null
+++ b/tools/ml/trainer/task.py
@@ -0,0 +1,284 @@
+# Copyright 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import json
+import os
+import re
+
+import numpy as np
+import tensorflow as tf
+from googleapiclient import discovery
+from googleapiclient import errors
+from oauth2client.client import GoogleCredentials
+from sklearn.model_selection import train_test_split
+from tensorflow.contrib.learn.python.learn import learn_runner
+from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
+from tensorflow.contrib.training.python.training import hparam
+
+from google.cloud.storage import blob, bucket, client
+
+import trainer.dataset
+import trainer.model
+import trainer.ml_helpers
+import trainer.top_words
+
+def generate_experiment_fn(**experiment_args):
+  """Create an experiment function.
+
+  Args:
+    experiment_args: keyword arguments to be passed through to experiment
+      See `tf.contrib.learn.Experiment` for full args.
+  Returns:
+    A function:
+      (tf.contrib.learn.RunConfig, tf.contrib.training.HParams) -> Experiment
+
+    This function is used by learn_runner to create an Experiment which
+    executes model code provided in the form of an Estimator and
+    input functions.
+  """
+  def _experiment_fn(config, hparams):
+    index_to_component = {}
+
+    # Load training data either from a local file (--train-file) or from GCS.
+    if hparams.train_file:
+      with open(hparams.train_file) as f:
+        if hparams.trainer_type == 'spam':
+          training_data = trainer.ml_helpers.spam_from_file(f)
+        else:
+          training_data = trainer.ml_helpers.component_from_file(f)
+    else:
+      training_data = trainer.dataset.fetch_training_data(hparams.gcs_bucket,
+          hparams.gcs_prefix, hparams.trainer_type)
+
+    tf.logging.info('Training data received. Len: %d' % len(training_data))
+
+    # Turn raw rows into (features, labels); component prediction also needs
+    # the top-words vocabulary and an index -> component mapping.
+    if hparams.trainer_type == 'spam':
+      X, y = trainer.ml_helpers.transform_spam_csv_to_features(
+          training_data)
+    else:
+      top_list = trainer.top_words.make_top_words_list(hparams.job_dir)
+      X, y, index_to_component = trainer.ml_helpers \
+          .transform_component_csv_to_features(training_data, top_list)
+
+    tf.logging.info('Features generated')
+    # Hold out 20% for evaluation; fixed seed keeps the split reproducible.
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
+        random_state=42)
+
+    train_input_fn = tf.estimator.inputs.numpy_input_fn(
+        x=trainer.model.feature_list_to_dict(X_train, hparams.trainer_type),
+        y=np.array(y_train),
+        num_epochs=hparams.num_epochs,
+        batch_size=hparams.train_batch_size,
+        shuffle=True
+    )
+    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+        x=trainer.model.feature_list_to_dict(X_test, hparams.trainer_type),
+        y=np.array(y_test),
+        num_epochs=None,
+        batch_size=hparams.eval_batch_size,
+        shuffle=False  # Don't shuffle evaluation data
+    )
+
+    tf.logging.info('Numpy fns created')
+    if hparams.trainer_type == 'component':
+      store_component_conversion(hparams.job_dir, index_to_component)
+
+    return tf.contrib.learn.Experiment(
+        trainer.model.build_estimator(config=config,
+                                      trainer_type=hparams.trainer_type,
+                                      class_count=len(set(y))),
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        **experiment_args
+    )
+  return _experiment_fn
+
+
+def store_component_conversion(job_dir, data):
+  """Stores the component index -> component id mapping as JSON.
+
+  Uploads to GCS when job_dir is a gs://...-mlengine path, otherwise writes
+  component_index.json under the local job_dir.
+  """
+  tf.logging.info('job_dir: %s' % job_dir)
+  job_info = re.search('gs://(monorail-.+)-mlengine/(component_trainer_\d+)',
+                       job_dir)
+
+  # Check if training is being done on GAE or locally.
+  if job_info:
+    project = job_info.group(1)
+    job_name = job_info.group(2)
+
+    client_obj = client.Client(project=project)
+    bucket_name = '%s-mlengine' % project
+    bucket_obj = bucket.Bucket(client_obj, bucket_name)
+
+    bucket_obj.blob = blob.Blob(job_name + '/component_index.json', bucket_obj)
+
+    bucket_obj.blob.upload_from_string(json.dumps(data),
+                                       content_type='application/json')
+
+  else:
+    paths = job_dir.split('/')
+    # Create each missing intermediate directory of job_dir (py2 has no
+    # os.makedirs(..., exist_ok=True)).
+    for y, _ in enumerate(list(range(1, len(paths))), 1):
+      if not os.path.exists("/".join(paths[:y+1])):
+        os.makedirs('/'.join(paths[:y+1]))
+    with open(job_dir + '/component_index.json', 'w') as f:
+      f.write(json.dumps(data))
+
+
+def store_eval(job_dir, results):
+
+ tf.logging.info('job_dir: %s' % job_dir)
+ job_info = re.search('gs://(monorail-.+)-mlengine/(spam_trainer_\d+)',
+ job_dir)
+
+ # Only upload eval data if this is not being run locally.
+ if job_info:
+ project = job_info.group(1)
+ job_name = job_info.group(2)
+
+ tf.logging.info('project: %s' % project)
+ tf.logging.info('job_name: %s' % job_name)
+
+ client_obj = client.Client(project=project)
+ bucket_name = '%s-mlengine' % project
+ bucket_obj = bucket.Bucket(client_obj, bucket_name)
+
+ bucket_obj.blob = blob.Blob(job_name + '/eval_data.json', bucket_obj)
+ for key, value in results[0].items():
+ if isinstance(value, np.float32):
+ results[0][key] = value.item()
+
+ bucket_obj.blob.upload_from_string(json.dumps(results[0]),
+ content_type='application/json')
+
+ else:
+ tf.logging.error('Could not find bucket "%s" to output evalution to.'
+ % job_dir)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+
+ # Input Arguments
+ parser.add_argument(
+ '--train-file',
+ help='GCS or local path to training data',
+ )
+ parser.add_argument(
+ '--gcs-bucket',
+ help='GCS bucket for training data.',
+ )
+ parser.add_argument(
+ '--gcs-prefix',
+ help='Training data path prefix inside GCS bucket.',
+ )
+ parser.add_argument(
+ '--num-epochs',
+ help="""\
+ Maximum number of training data epochs on which to train.
+ If both --max-steps and --num-epochs are specified,
+ the training job will run for --max-steps or --num-epochs,
+ whichever occurs first. If unspecified will run for --max-steps.\
+ """,
+ type=int,
+ )
+ parser.add_argument(
+ '--train-batch-size',
+ help='Batch size for training steps',
+ type=int,
+ default=128
+ )
+ parser.add_argument(
+ '--eval-batch-size',
+ help='Batch size for evaluation steps',
+ type=int,
+ default=128
+ )
+
+ # Training arguments
+ parser.add_argument(
+ '--job-dir',
+ help='GCS location to write checkpoints and export models',
+ required=True
+ )
+
+ # Logging arguments
+ parser.add_argument(
+ '--verbosity',
+ choices=[
+ 'DEBUG',
+ 'ERROR',
+ 'FATAL',
+ 'INFO',
+ 'WARN'
+ ],
+ default='INFO',
+ )
+
+ # Experiment arguments
+ parser.add_argument(
+ '--eval-delay-secs',
+ help='How long to wait before running first evaluation',
+ default=10,
+ type=int
+ )
+ parser.add_argument(
+ '--min-eval-frequency',
+ help='Minimum number of training steps between evaluations',
+ default=None, # Use TensorFlow's default (currently, 1000)
+ type=int
+ )
+ parser.add_argument(
+ '--train-steps',
+ help="""\
+ Steps to run the training job for. If --num-epochs is not specified,
+ this must be. Otherwise the training job will run indefinitely.\
+ """,
+ type=int
+ )
+ parser.add_argument(
+ '--eval-steps',
+ help='Number of steps to run evalution for at each checkpoint',
+ default=100,
+ type=int
+ )
+ parser.add_argument(
+ '--trainer-type',
+ help='Which trainer to use (spam or component)',
+ choices=['spam', 'component'],
+ required=True
+ )
+
+ args = parser.parse_args()
+
+ tf.logging.set_verbosity(args.verbosity)
+
+ # Run the training job
+ # learn_runner pulls configuration information from environment
+ # variables using tf.learn.RunConfig and uses this configuration
+ # to conditionally execute Experiment, or param server code.
+ eval_results = learn_runner.run(
+ generate_experiment_fn(
+ min_eval_frequency=args.min_eval_frequency,
+ eval_delay_secs=args.eval_delay_secs,
+ train_steps=args.train_steps,
+ eval_steps=args.eval_steps,
+ export_strategies=[saved_model_export_utils.make_export_strategy(
+ trainer.model.SERVING_FUNCTIONS['JSON-' + args.trainer_type],
+ exports_to_keep=1,
+ default_output_alternative_key=None,
+ )],
+ ),
+ run_config=run_config.RunConfig(model_dir=args.job_dir),
+ hparams=hparam.HParams(**args.__dict__)
+ )
+
+ # Store a json blob in GCS with the results of training job (AUC of
+ # precision/recall, etc).
+ if args.trainer_type == 'spam':
+ store_eval(args.job_dir, eval_results)
diff --git a/tools/ml/trainer/top_words.py b/tools/ml/trainer/top_words.py
new file mode 100644
index 0000000..26da211
--- /dev/null
+++ b/tools/ml/trainer/top_words.py
@@ -0,0 +1,127 @@
+# Copyright 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import csv
+import os
+import re
+import StringIO
+import sys
+import tensorflow as tf
+import time
+
+from googleapiclient import discovery
+from googleapiclient import errors
+from oauth2client.client import GoogleCredentials
+import google
+from google.cloud.storage import blob, bucket, client
+
+import trainer.ml_helpers
+import trainer.dataset
+
+
+TOP_WORDS = 'topwords.txt'
+STOP_WORDS = 'stopwords.txt'
+
+
+def fetch_stop_words(project_id, objects):
+  """Downloads stopwords.txt from the project's -mlengine bucket and returns
+  it as a list of words."""
+  request = objects.get_media(bucket=project_id + '-mlengine',
+                              object=STOP_WORDS)
+  response = trainer.dataset.make_api_request(request)
+  return response.split()
+
+
+def fetch_training_csv(filepath, objects, b):
+  """Downloads one CSV object from bucket `b` and returns its raw contents."""
+  request = objects.get_media(bucket=b, object=filepath)
+  return trainer.dataset.make_api_request(request)
+
+
+def GenerateTopWords(objects, word_dict, project_id):
+  """Returns the COMPONENT_FEATURES most frequent non-stop words.
+
+  Args:
+    objects: GCS JSON API objects() resource (used to fetch stop words).
+    word_dict: dict of word -> occurrence count.
+    project_id: GCP project whose -mlengine bucket holds stopwords.txt.
+
+  Returns:
+    List of the top words, most frequent first.
+  """
+  stop_words = fetch_stop_words(project_id, objects)
+  sorted_words = sorted(word_dict, key=word_dict.get, reverse=True)
+
+  top_words = []
+  index = 0
+
+  # NOTE(review): raises IndexError if the corpus has fewer than
+  # COMPONENT_FEATURES distinct non-stop words — assumed not to happen on
+  # real training data.
+  while len(top_words) < trainer.ml_helpers.COMPONENT_FEATURES:
+    if sorted_words[index] not in stop_words:
+      top_words.append(sorted_words[index])
+    index += 1
+
+  return top_words
+
+
+def make_top_words_list(job_dir):
+  """Computes the top (most common) words in the entire dataset for component
+  prediction and returns them; when running on ML Engine the list is also
+  uploaded to GCS as <trainer_folder>/topwords.txt.
+
+  NOTE(review): contrary to the original docstring, this function never reads
+  an existing topwords file back — it always recomputes the list from the
+  component training CSVs.
+
+  Returns:
+    A list of the most common words in the dataset (the number of them
+    determined by ml_helpers.COMPONENT_FEATURES).
+  """
+  credentials = GoogleCredentials.get_application_default()
+  storage = discovery.build('storage', 'v1', credentials=credentials)
+  objects = storage.objects()
+
+  # A gs://monorail-*-mlengine job_dir means we are running on ML Engine.
+  subpaths = re.match('gs://(monorail-.*)-mlengine/(component_trainer_\d+)',
+                      job_dir)
+
+  if subpaths:
+    project_id = subpaths.group(1)
+    trainer_folder = subpaths.group(2)
+  else:
+    project_id = 'monorail-prod'
+
+  storage_bucket = project_id + '.appspot.com'
+  request = objects.list(bucket=storage_bucket,
+                         prefix='component_training_data')
+
+  response = trainer.dataset.make_api_request(request)
+
+  items = response.get('items')
+  csv_filepaths = [b.get('name') for b in items]
+
+  final_string = ''
+
+  for word in parse_words(csv_filepaths, objects, storage_bucket, project_id):
+    final_string += word + '\n'
+
+  # Persist the computed vocabulary to GCS for later inspection / reuse.
+  if subpaths:
+    client_obj = client.Client(project=project_id)
+    bucket_obj = bucket.Bucket(client_obj, project_id + '-mlengine')
+
+    bucket_obj.blob = google.cloud.storage.blob.Blob(trainer_folder
+                                                     + '/'
+                                                     + TOP_WORDS,
+                                                     bucket_obj)
+    bucket_obj.blob.upload_from_string(final_string,
+                                       content_type='text/plain')
+  return final_string.split()
+
+
+def parse_words(files, objects, b, project_id):
+  """Counts word occurrences across all CSVs and returns the top words.
+
+  Each CSV row is expected to have exactly two fields: (label, content).
+  """
+  word_dict = {}
+
+  # Training CSV fields can exceed the csv module's default field size limit.
+  csv.field_size_limit(sys.maxsize)
+  for filepath in files:
+    media = fetch_training_csv(filepath, objects, b)
+
+    for row in csv.reader(StringIO.StringIO(media)):
+      _, content = row
+      words = content.split()
+
+      for word in words:
+        if word in word_dict:
+          word_dict[word] += 1
+        else:
+          word_dict[word] = 1
+
+  return GenerateTopWords(objects, word_dict, project_id)
diff --git a/tools/ml/trainer2/README.md b/tools/ml/trainer2/README.md
new file mode 100644
index 0000000..d32c8bf
--- /dev/null
+++ b/tools/ml/trainer2/README.md
@@ -0,0 +1,35 @@
+### Trainer
+
+## Monorail Spam Classifier
+
+To have the trainer run locally, you'll need to supply the
+`--train-file` argument.
+
+```sh
+TRAIN_FILE=./spam_training_examples.csv
+OUTPUT_DIR=/tmp/monospam-local-training/
+rm -rf $OUTPUT_DIR
+python3 ./task.py \
+ --train-file $TRAIN_FILE \
+ --job-dir $OUTPUT_DIR \
+ --train-steps 1000 \
+ --verbosity DEBUG \
+ --trainer-type spam
+```
+## Monorail Component Predictor
+
+To have the trainer run locally, you'll need to supply the
+`--train-file` argument.
+
+```sh
+TRAIN_FILE=./component_training_examples.csv
+OUTPUT_DIR=/tmp/monospam-local-training/
+rm -rf $OUTPUT_DIR
+python3 ./task.py \
+ --train-file $TRAIN_FILE \
+ --job-dir $OUTPUT_DIR \
+ --train-steps 10000 \
+ --eval-steps 1000 \
+ --verbosity DEBUG \
+ --trainer-type component
+```
\ No newline at end of file
diff --git a/tools/ml/trainer2/__init__.py b/tools/ml/trainer2/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tools/ml/trainer2/__init__.py
diff --git a/tools/ml/trainer2/dataset.py b/tools/ml/trainer2/dataset.py
new file mode 100644
index 0000000..9e7ae77
--- /dev/null
+++ b/tools/ml/trainer2/dataset.py
@@ -0,0 +1,95 @@
+# Copyright 2020 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+import io
+import tensorflow as tf
+
+from googleapiclient import discovery
+from googleapiclient import errors
+from oauth2client.client import GoogleCredentials
+
+from trainer2 import train_ml_helpers
+
+
+def fetch_training_data(bucket, prefix, trainer_type):
+  """Lists CSVs under gs://bucket/prefix and fetches their training data.
+
+  Args:
+    bucket: GCS bucket name holding the training data.
+    prefix: object-name prefix to list under the bucket.
+    trainer_type: 'spam' or 'component'; selects the parser.
+
+  Returns:
+    (contents, labels) lists of equal length.
+  """
+  credentials = GoogleCredentials.get_application_default()
+  storage = discovery.build('storage', 'v1', credentials=credentials)
+  objects = storage.objects()
+
+  request = objects.list(bucket=bucket, prefix=prefix)
+  response = make_api_request(request)
+  items = response.get('items')
+  csv_filepaths = [blob.get('name') for blob in items]
+
+  if trainer_type == 'spam':
+    return fetch_spam(csv_filepaths, bucket, objects)
+  else:
+    return fetch_component(csv_filepaths, bucket, objects)
+
+
+def fetch_spam(csv_filepaths, bucket, objects):
+  """Downloads and parses spam training CSVs from GCS.
+
+  Args:
+    csv_filepaths: list of object names (paths) inside the bucket.
+    bucket: name of the GCS bucket to read from.
+    objects: GCS JSON API objects() resource used for downloads.
+
+  Returns:
+    (all_contents, all_labels) matched-pair lists.
+  """
+  all_contents = []
+  all_labels = []
+  # Always include these two curated datasets in addition to whatever was
+  # listed under the GCS prefix.
+  csv_filepaths = [
+      'spam-training-data/full-android.csv',
+      'spam-training-data/full-support.csv',
+  ] + csv_filepaths
+
+  for filepath in csv_filepaths:
+    media = fetch_training_csv(filepath, objects, bucket)
+    contents, labels, skipped_rows = train_ml_helpers.spam_from_file(
+        io.StringIO(media))
+
+    # Sanity check: the contents and labels should be matched pairs.
+    if len(contents) == len(labels) != 0:
+      all_contents.extend(contents)
+      all_labels.extend(labels)
+
+    tf.get_logger().info(
+        '{:<40}{:<20}{:<20}'.format(
+            filepath, 'added %d rows' % len(contents),
+            'skipped %d rows' % skipped_rows))
+
+  return all_contents, all_labels
+
+
+def fetch_component(csv_filepaths, bucket, objects):
+  """Downloads and parses component training CSVs from GCS.
+
+  Args:
+    csv_filepaths: list of object names (paths) inside the bucket.
+    bucket: name of the GCS bucket to read from.
+    objects: GCS JSON API objects() resource used for downloads.
+
+  Returns:
+    (all_contents, all_labels) matched-pair lists.
+  """
+  all_contents = []
+  all_labels = []
+  for filepath in csv_filepaths:
+    media = fetch_training_csv(filepath, objects, bucket)
+    contents, labels = train_ml_helpers.component_from_file(io.StringIO(media))
+
+    # Sanity check: the contents and labels should be matched pairs.
+    if len(contents) == len(labels) != 0:
+      all_contents.extend(contents)
+      all_labels.extend(labels)
+
+    tf.get_logger().info(
+        '{:<40}{:<20}'.format(filepath, 'added %d rows' % len(contents)))
+
+  return all_contents, all_labels
+
+
+def fetch_training_csv(filepath, objects, bucket):
+  """Downloads one CSV object and decodes it from UTF-8 bytes to str."""
+  request = objects.get_media(bucket=bucket, object=filepath)
+  return str(make_api_request(request), 'utf-8')
+
+
+def make_api_request(request):
+  """Executes a googleapiclient request, logging and re-raising HTTP errors."""
+  try:
+    return request.execute()
+  except errors.HttpError as err:
+    tf.get_logger().error('There was an error with the API. Details:')
+    tf.get_logger().error(err._get_reason())
+    raise
diff --git a/tools/ml/trainer2/model.py b/tools/ml/trainer2/model.py
new file mode 100644
index 0000000..823d0d1
--- /dev/null
+++ b/tools/ml/trainer2/model.py
@@ -0,0 +1,45 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+
+import tensorflow as tf
+
+from trainer2.train_ml_helpers import COMPONENT_FEATURES
+from trainer2.train_ml_helpers import SPAM_FEATURE_HASHES
+
+# Maps trainer type ('component' / 'spam') to the feature columns fed to the
+# estimator. Important: we assume each list mirrors the output of
+# GenerateFeaturesRaw.
+INPUT_COLUMNS = {'component': [
+        tf.feature_column.numeric_column(
+            key='word_features',
+            shape=(COMPONENT_FEATURES,)),
+    ],
+    'spam': [
+        tf.feature_column.numeric_column(
+            key='word_hashes',
+            shape=(SPAM_FEATURE_HASHES,)),
+    ]}
+
+def build_estimator(config, job_dir, trainer_type, class_count):
+  """Returns a tf.Estimator for the requested trainer type.
+
+  Args:
+    config: tf.estimator.RunConfig defining the runtime environment for the
+      estimator (including model_dir).
+    job_dir: directory where checkpoints and exports are written.
+    trainer_type: 'spam' or 'component'; selects the feature columns.
+    class_count: number of output classes for the classifier.
+  Returns:
+    A DNNClassifier with three hidden layers.
+  """
+  return tf.estimator.DNNClassifier(
+      config=config,
+      model_dir=job_dir,
+      feature_columns=(INPUT_COLUMNS[trainer_type]),
+      hidden_units=[1024, 512, 256],
+      # Adam with its default hyperparameters, spelled out explicitly.
+      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
+                                         beta_1=0.9,
+                                         beta_2=0.999,
+                                         epsilon=1e-08,
+                                         name='Adam'),
+      n_classes=class_count
+  )
diff --git a/tools/ml/trainer2/requirements.txt b/tools/ml/trainer2/requirements.txt
new file mode 100644
index 0000000..7ff5ef7
--- /dev/null
+++ b/tools/ml/trainer2/requirements.txt
@@ -0,0 +1,3 @@
+google-cloud-storage==1.26.0
+tensorflow==2.1.0
+scikit-learn[alldeps]
diff --git a/tools/ml/trainer2/stopwords.py b/tools/ml/trainer2/stopwords.py
new file mode 100644
index 0000000..c4e4c31
--- /dev/null
+++ b/tools/ml/trainer2/stopwords.py
@@ -0,0 +1,21 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+# A list of stopwords to parse text in component predictor.
+STOP_WORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
+ 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
+ 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
+ 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
+ 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am',
+ 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+ 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but',
+ 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
+ 'with', 'about', 'against', 'between', 'into', 'through', 'during',
+ 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
+ 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
+ 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
+ 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
+ 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
+ 'can', 'will', 'just', 'don', 'should', 'now']
diff --git a/tools/ml/trainer2/task.py b/tools/ml/trainer2/task.py
new file mode 100644
index 0000000..2fa8580
--- /dev/null
+++ b/tools/ml/trainer2/task.py
@@ -0,0 +1,256 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+
+import argparse
+import json
+import logging
+import os
+
+import tensorflow as tf
+from tensorflow.estimator import RunConfig
+from sklearn.model_selection import train_test_split
+
+from trainer2 import dataset
+from trainer2 import model
+from trainer2 import top_words
+from trainer2 import train_ml_helpers
+from trainer2.train_ml_helpers import COMPONENT_FEATURES
+from trainer2.train_ml_helpers import SPAM_FEATURE_HASHES
+
+# Maps trainer type to the estimator input key and feature-vector shape.
+INPUT_TYPE_MAP = {
+    'component': {'key': 'word_features', 'shape': (COMPONENT_FEATURES,)},
+    'spam': {'key': 'word_hashes', 'shape': (SPAM_FEATURE_HASHES,)}
+}
+
+
+def make_input_fn(trainer_type, features, targets,
+      num_epochs=None, shuffle=True, batch_size=128):
+  """Generate input function for training and testing.
+
+  Args:
+    trainer_type: spam / component
+    features: an array of features shape like INPUT_TYPE_MAP
+    targets: an array of labels with the same length of features
+    num_epochs: training epochs
+    shuffle: whether to shuffle examples (disable for evaluation)
+    batch_size: dataset batch size
+
+  Returns:
+    input function to feed into TrainSpec and EvalSpec.
+  """
+  def _input_fn():
+    def gen():
+      """Generator function to format feature and target. """
+      for feature, target in zip(features, targets):
+        yield feature[INPUT_TYPE_MAP[trainer_type]['key']], target
+
+    data = tf.data.Dataset.from_generator(
+        gen, (tf.float64, tf.int32),
+        output_shapes=(INPUT_TYPE_MAP[trainer_type]['shape'], ()))
+    # Re-wrap the raw feature tensor in a dict keyed by the input column name.
+    data = data.map(lambda x, y: ({INPUT_TYPE_MAP[trainer_type]['key']: x}, y))
+    if shuffle:
+      data = data.shuffle(buffer_size=batch_size * 10)
+    data = data.repeat(num_epochs).batch(batch_size)
+    return data
+
+  return _input_fn
+
+
+def generate_json_input_fn(trainer_type):
+  """Generate ServingInputReceiver function for testing.
+
+  Args:
+    trainer_type: spam / component
+
+  Returns:
+    ServingInputReceiver function to feed into exporter.
+  """
+  # One fixed-length feature whose key and shape depend on the trainer type.
+  feature_spec = {
+      INPUT_TYPE_MAP[trainer_type]['key']:
+          tf.io.FixedLenFeature(INPUT_TYPE_MAP[trainer_type]['shape'], tf.float32)
+  }
+  return tf.estimator.export.build_parsing_serving_input_receiver_fn(
+      feature_spec)
+
+
+def train_and_evaluate_model(config, hparams):
+  """Runs the local training job given provided command line arguments.
+
+  Args:
+    config: RunConfig object
+    hparams: dictionary passed by command line arguments
+  """
+  # NOTE(review): `logger` is the module-global bound in the __main__ block;
+  # this function assumes it is called via this script's entry point.
+
+  # Load training data either from a local file (--train-file) or from GCS.
+  if hparams['train_file']:
+    with open(hparams['train_file']) as f:
+      if hparams['trainer_type'] == 'spam':
+        contents, labels, _ = train_ml_helpers.spam_from_file(f)
+      else:
+        contents, labels = train_ml_helpers.component_from_file(f)
+  else:
+    contents, labels = dataset.fetch_training_data(
+        hparams['gcs_bucket'], hparams['gcs_prefix'], hparams['trainer_type'])
+
+  logger.info('Training data received. Len: %d' % len(contents))
+
+  # Generate features and targets from extracted contents and labels.
+  if hparams['trainer_type'] == 'spam':
+    features, targets = train_ml_helpers \
+        .transform_spam_csv_to_features(contents, labels)
+  else:
+    # The job_dir-cached top-words path is intentionally disabled; the
+    # vocabulary is recomputed from the training contents on every run.
+    #top_list = top_words.make_top_words_list(contents, hparams['job_dir'])
+    top_list = top_words.parse_words_from_content(contents)
+    features, targets, index_to_component = train_ml_helpers \
+        .transform_component_csv_to_features(contents, labels, top_list)
+
+  # Split training and testing set.
+  logger.info('Features generated')
+  features_train, features_test, targets_train, targets_test = train_test_split(
+      features, targets, test_size=0.2, random_state=42)
+
+  # Generate TrainSpec and EvalSpec for train and evaluate.
+  estimator = model.build_estimator(config=config,
+      job_dir=hparams['job_dir'],
+      trainer_type=hparams['trainer_type'],
+      class_count=len(set(labels)))
+  exporter = tf.estimator.LatestExporter(name='saved_model',
+      serving_input_receiver_fn=generate_json_input_fn(hparams['trainer_type']))
+
+  train_spec = tf.estimator.TrainSpec(
+      input_fn=make_input_fn(hparams['trainer_type'],
+          features_train, targets_train, num_epochs=hparams['num_epochs'],
+          batch_size=hparams['train_batch_size']),
+      max_steps=hparams['train_steps'])
+  eval_spec = tf.estimator.EvalSpec(
+      input_fn=make_input_fn(hparams['trainer_type'],
+          features_test, targets_test, shuffle=False,
+          batch_size=hparams['eval_batch_size']),
+      exporters=exporter, steps=hparams['eval_steps'])
+
+  if hparams['trainer_type'] == 'component':
+    store_component_conversion(hparams['job_dir'], index_to_component)
+
+  result = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
+  logging.info(result)
+
+  # Also export a SavedModel that parses serialized tf.Example protos.
+  parsing_spec = tf.feature_column.make_parse_example_spec(
+      model.INPUT_COLUMNS[hparams['trainer_type']])
+  serving_input_fn = (
+      tf.estimator.export.build_parsing_serving_input_receiver_fn(parsing_spec))
+  estimator.export_saved_model(hparams['job_dir'], serving_input_fn)
+
+
+def store_component_conversion(job_dir, data):
+  """Writes the component index -> component id mapping to
+  job_dir/component_index.json, creating intermediate directories as needed."""
+  logger.info('job_dir: %s' % job_dir)
+
+  # Store component conversion locally.
+  paths = job_dir.split('/')
+  for y, _ in enumerate(list(range(1, len(paths))), 1):
+    if not os.path.exists("/".join(paths[:y+1])):
+      os.makedirs('/'.join(paths[:y+1]))
+  with open(job_dir + '/component_index.json', 'w') as f:
+    f.write(json.dumps(data))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+
+ # Input Arguments
+ parser.add_argument(
+ '--train-file',
+ help='GCS or local path to training data',
+ )
+ parser.add_argument(
+ '--gcs-bucket',
+ help='GCS bucket for training data.',
+ )
+ parser.add_argument(
+ '--gcs-prefix',
+ help='Training data path prefix inside GCS bucket.',
+ )
+ parser.add_argument(
+ '--num-epochs',
+ help="""\
+ Maximum number of training data epochs on which to train.
+ If both --train-steps and --num-epochs are specified,
+ the training job will run for --num-epochs.
+ If unspecified will run for --train-steps.\
+ """,
+ type=int,
+ )
+ parser.add_argument(
+ '--train-batch-size',
+ help='Batch size for training steps',
+ type=int,
+ default=128
+ )
+ parser.add_argument(
+ '--eval-batch-size',
+ help='Batch size for evaluation steps',
+ type=int,
+ default=128
+ )
+
+ # Training arguments
+ parser.add_argument(
+ '--job-dir',
+ help='GCS location to write checkpoints and export models',
+ required=True
+ )
+
+ # Logging arguments
+ parser.add_argument(
+ '--verbosity',
+ choices=[
+ 'DEBUG',
+ 'ERROR',
+ 'CRITICAL',
+ 'INFO',
+ 'WARNING'
+ ],
+ default='INFO',
+ )
+
+ # Input function arguments
+ parser.add_argument(
+ '--train-steps',
+ help="""\
+ Steps to run the training job for. If --num-epochs is not specified,
+ this must be. Otherwise the training job will run indefinitely.\
+ """,
+ type=int,
+ required=True
+ )
+ parser.add_argument(
+ '--eval-steps',
+ help='Number of steps to run evalution for at each checkpoint',
+ default=100,
+ type=int
+ )
+ parser.add_argument(
+ '--trainer-type',
+ help='Which trainer to use (spam or component)',
+ choices=['spam', 'component'],
+ required=True
+ )
+
+ args = parser.parse_args()
+
+ logger = logging.getLogger()
+ logger.setLevel(getattr(logging, args.verbosity))
+
+ if not args.num_epochs:
+ args.num_epochs = args.train_steps
+
+ # Set C++ Graph Execution level verbosity.
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(
+ getattr(logging, args.verbosity) / 10)
+
+ # Run the training job.
+ train_and_evaluate_model(
+ config=RunConfig(model_dir=args.job_dir),
+ hparams=vars(args))
diff --git a/tools/ml/trainer2/top_words.py b/tools/ml/trainer2/top_words.py
new file mode 100644
index 0000000..bb57699
--- /dev/null
+++ b/tools/ml/trainer2/top_words.py
@@ -0,0 +1,66 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+
+import os
+
+from trainer2 import train_ml_helpers
+from trainer2.stopwords import STOP_WORDS
+
+
+def GenerateTopWords(word_dict):
+  """Returns the COMPONENT_FEATURES most frequent words that are not stop
+  words.
+
+  Stop words come from the imported trainer2.stopwords.STOP_WORDS list (no
+  stopwords.txt file is read, contrary to the original docstring).
+
+  Args:
+    word_dict: dict of utf-8-encoded word (bytes) -> occurrence count.
+  """
+  stop_words = [s.encode('utf-8') for s in STOP_WORDS]
+  sorted_words = sorted(word_dict, key=word_dict.get, reverse=True)
+  top_words = []
+  index = 0
+
+  # NOTE(review): raises IndexError if the corpus has fewer than
+  # COMPONENT_FEATURES distinct non-stop words — assumed not to happen on
+  # real training data.
+  while len(top_words) < train_ml_helpers.COMPONENT_FEATURES:
+    if sorted_words[index] not in stop_words:
+      top_words.append(sorted_words[index])
+    index += 1
+
+  return top_words
+
+
+def parse_words_from_content(contents):
+  """Given a list of strings, extracts the top (most common) words.
+
+  Words are counted as utf-8-encoded bytes so they compare correctly against
+  the encoded stop words in GenerateTopWords.
+  """
+  word_dict = {}
+  for content in contents:
+    words = content.encode('utf-8').split()
+    for word in words:
+      if word in word_dict:
+        word_dict[word] += 1
+      else:
+        word_dict[word] = 1
+
+  return GenerateTopWords(word_dict)
+
+
+def make_top_words_list(contents, job_dir):
+  """Returns the top (most common) words in the entire dataset for component
+  prediction. If a file is already stored in job_dir containing these words, the
+  words from the file are simply returned. Otherwise, the most common words are
+  determined and written to job_dir, before being returned.
+
+  NOTE(review): paths are built as job_dir + 'topwords.txt', so job_dir is
+  assumed to end with '/' — confirm with callers.
+
+  Returns:
+    A list of the most common words in the dataset, as utf-8 bytes (the
+    number of them determined by train_ml_helpers.COMPONENT_FEATURES).
+  """
+  if not os.path.exists(job_dir):
+    os.mkdir(job_dir)
+  if os.access(job_dir + 'topwords.txt', os.R_OK):
+    print("Found topwords.txt")
+    # Read in binary mode so the cached list matches the bytes returned by
+    # parse_words_from_content.
+    with open(job_dir + 'topwords.txt', 'rb') as f:
+      top_words = f.read().split()
+  else:
+    top_words = parse_words_from_content(contents)
+    # Cache the computed list (decoded back to text) for subsequent runs.
+    with open(job_dir + 'topwords.txt', 'w') as f:
+      for word in top_words:
+        f.write('%s\n' % word.decode('utf-8'))
+  return top_words
diff --git a/tools/ml/trainer2/train_ml_helpers.py b/tools/ml/trainer2/train_ml_helpers.py
new file mode 100644
index 0000000..36113a2
--- /dev/null
+++ b/tools/ml/trainer2/train_ml_helpers.py
@@ -0,0 +1,158 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+"""
+Helper functions for spam and component classification. These are mostly for
+feature extraction, so that the serving code and training code both use the same
+set of features.
+TODO(jeffcarp): This file is duplicate of services/ml_helpers.py
+ (with slight difference). Will eventually be merged to one.
+"""
+
+from __future__ import absolute_import
+
+import csv
+import hashlib
+import re
+import sys
+
# Column layouts of the two spam training-data CSV formats (newer files
# carry an extra trailing email column that is discarded on read).
SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
# Regex alternatives joined with '|' to tokenize text for feature hashing.
DELIMITERS = [r'\s', r'\,', r'\.', r'\?', r'!', r'\:', r'\(', r'\)']

# Must be identical to settings.spam_feature_hashes.
SPAM_FEATURE_HASHES = 500
# Must be identical to settings.component_features.
COMPONENT_FEATURES = 5000
+
+
+def _ComponentFeatures(content, num_features, top_words):
+ """
+ This uses the most common words in the entire dataset as features.
+ The count of common words in the issue comments makes up the features.
+ """
+
+ features = [0] * num_features
+ for blob in content:
+ words = blob.split()
+ for word in words:
+ if word in top_words:
+ features[top_words[word]] += 1
+
+ return features
+
+
def _SpamHashFeatures(content, num_features):
  """Turns text into a normalized vector of hashed-token frequencies.

  Feature hashing is a fast and compact way to turn a string of text into a
  vector of feature values for classification and training.
  See also: https://en.wikipedia.org/wiki/Feature_hashing
  This is a simple implementation that doesn't try to minimize collisions
  or anything else fancy.
  """
  features = [0] * num_features
  total = 0.0
  # The split pattern is loop-invariant, so build it once.
  pattern = '|'.join(DELIMITERS).encode('utf-8')

  for blob in content:
    for token in re.split(pattern, blob):
      digest = hashlib.sha1(token).hexdigest()
      features[int(int(digest, 16) % num_features)] += 1.0
      total += 1.0

  if total > 0:
    # Normalize counts to frequencies.
    features = [count / total for count in features]

  return features
+
+
def GenerateFeaturesRaw(content, num_features, top_words=None):
  """Generates a vector of features for a given issue or comment.

  Args:
    content: list of unicode strings (the issue's description and comments).
    num_features: the number of features to generate.
    top_words: optional dict mapping word -> feature index. When provided,
      top-word count features are produced; otherwise hash features.

  Returns:
    {'word_features': [...]} when top_words is given, otherwise
    {'word_hashes': [...]}.
  """
  # Encode into a fresh list instead of mutating the caller's list in place
  # (the original overwrote `content[idx]`, a surprising side effect).
  encoded = [value.encode('utf-8') for value in content]

  if top_words:
    return {'word_features': _ComponentFeatures(encoded,
                                                num_features,
                                                top_words)}

  return {'word_hashes': _SpamHashFeatures(encoded, num_features)}
+
+
def transform_spam_csv_to_features(contents, labels):
  """Generates parallel lists of feature dicts and 0/1 targets for spam.

  A target is 1 when the corresponding label is 'spam', else 0.
  """
  features = []
  targets = []
  for index, (subject, content) in enumerate(contents):
    is_spam = labels[index] == 'spam'
    features.append(GenerateFeaturesRaw([str(subject), str(content)],
                                        SPAM_FEATURE_HASHES))
    targets.append(1 if is_spam else 0)
  return features, targets
+
+
def transform_component_csv_to_features(contents, labels, top_list):
  """Generates features and integer targets for component prediction.

  Returns:
    A tuple (features, targets, index_to_component) where
    index_to_component maps each integer target back to its component name.
  """
  features = []
  targets = []

  # Map each top word to its feature index.
  top_words = {word: position for position, word in enumerate(top_list)}

  component_to_index = {}
  index_to_component = {}

  for i, content in enumerate(contents):
    # A multi-component label is reduced to its first component.
    component = str(labels[i]).split(",")[0]

    if component not in component_to_index:
      # The next free index equals the number of components seen so far.
      fresh_index = len(component_to_index)
      component_to_index[component] = fresh_index
      index_to_component[fresh_index] = component

    features.append(GenerateFeaturesRaw([content],
                                        COMPONENT_FEATURES,
                                        top_words))
    targets.append(component_to_index[component])

  return features, targets, index_to_component
+
+
def spam_from_file(f):
  """Reads a training data file and returns arrays of contents and labels.

  Rows shorter than the legacy column layout are skipped and counted.
  """
  contents = []
  labels = []
  skipped_rows = 0
  for row in csv.reader(f):
    if len(row) < len(LEGACY_CSV_COLUMNS):
      skipped_rows += 1
      continue
    # Columns are verdict, subject, content[, email]; the email field,
    # when present, is thrown out.
    labels.append(row[0])
    contents.append(row[1:3])
  return contents, labels, skipped_rows
+
+
def component_from_file(f):
  """Reads a training data file and returns arrays of contents and labels.

  Each row must have exactly two columns: label, content.
  """
  contents = []
  labels = []
  # Component rows can carry very large content fields; lift the csv limit.
  csv.field_size_limit(sys.maxsize)
  for label, content in csv.reader(f):
    labels.append(label)
    contents.append(content)
  return contents, labels