Merge branch 'main' into avm99963-monorail

Merged commit cd4b3b336f1f14afa02990fdc2eec5d9467a827e

GitOrigin-RevId: e67bbf185d5538e1472bb42e0abb2a141f88bac1
diff --git a/tools/build_release.py b/tools/build_release.py
index c516ec8..02ffcf3 100755
--- a/tools/build_release.py
+++ b/tools/build_release.py
@@ -18,8 +18,8 @@
 import json
 import subprocess
 import sys
-import urllib.error
-import urllib.request
+from six.moves.urllib import error
+from six.moves.urllib import request
 
 
 INFRA_GIT = 'https://chromium.googlesource.com/infra/infra'
@@ -36,8 +36,8 @@
     None if there's no such ref, a gitiles commit URL otherwise.
   """
   try:
-    resp = urllib.request.urlopen('%s/+/%s?format=JSON' % (INFRA_GIT, ref))
-  except urllib.error.HTTPError as exc:
+    resp = request.urlopen('%s/+/%s?format=JSON' % (INFRA_GIT, ref))
+  except error.HTTPError as exc:
     if exc.code == 404:
       return None
     raise
diff --git a/tools/ml/Makefile b/tools/ml/Makefile
deleted file mode 100644
index b0a8684..0000000
--- a/tools/ml/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright 2019 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-# Or at https://developers.google.com/open-source/licenses/bsd
-
-# Use 'make help' for a list of commands.
-
-OUTPUT_DIR := /tmp/monospam-local-training/
-TIMESTAMP := $(shell date +%s)
-MODEL_DIR := /tmp/monospam-local-training/export/Servo/{TIMESTAMP}/
-SPAM_JOB_NAME := spam_trainer_$(TIMESTAMP)
-COMP_JOB_NAME := comp_trainer_$(TIMESTAMP)
-
-default: help
-
-help:
-	@echo "Available commands:"
-	@sed -n '/^[a-zA-Z0-9_.]*:/s/:.*//p' <Makefile
-
-train_local_spam:
-	gcloud ai-platform local train \
-		--package-path trainer/ \
-		--module-name trainer.task \
-		--job-dir $(OUTPUT_DIR) \
-		-- \
-		--train-steps 1000 \
-		--verbosity DEBUG \
-		--train-file $(TRAIN_FILE) \
-		--trainer-type spam
-
-train_local_spam_2:
-	gcloud ai-platform local train \
-		--package-path trainer2/ \
-		--module-name trainer2.task \
-		--job-dir $(OUTPUT_DIR) \
-		-- \
-		--train-steps 1000 \
-		--verbosity DEBUG \
-		--train-file $(TRAIN_FILE) \
-		--trainer-type spam
-
-predict_local_spam:
-	./spam.py local-predict
-	gcloud ai-platform local predict \
-		--model-dir $(MODEL_DIR) \
-		--json-instances /tmp/instances.json
-
-train_from_prod_data_spam:
-	gcloud ai-platform local train \
-		--package-path trainer/ \
-		--module-name trainer.task \
-		--job-dir $(OUTPUT_DIR) \
-		-- \
-		--train-steps 1000 \
-		--verbosity DEBUG \
-		--gcs-bucket monorail-prod.appspot.com \
-		--gcs-prefix spam_training_data \
-		--trainer-type spam
-
-train_from_prod_data_spam_2:
-	gcloud ai-platform local train \
-		--package-path trainer2/ \
-		--module-name trainer2.task \
-		--job-dir $(OUTPUT_DIR) \
-		-- \
-		--train-steps 1000 \
-		--verbosity DEBUG \
-		--gcs-bucket monorail-prod.appspot.com \
-		--gcs-prefix spam_training_data \
-		--trainer-type spam
-
-submit_train_job_spam:
-	@echo ${TIMESTAMP}
-	gcloud ai-platform jobs submit training $(SPAM_JOB_NAME) \
-		--package-path trainer/ \
-		--module-name trainer.task \
-		--runtime-version 1.2 \
-		--job-dir gs://monorail-prod-mlengine/$(SPAM_JOB_NAME) \
-		--region us-central1 \
-		-- \
-		--train-steps 1000 \
-		--verbosity DEBUG \
-		--gcs-bucket monorail-prod.appspot.com \
-		--gcs-prefix spam_training_data \
-		--trainer-type spam
-
-submit_train_job_spam_2:
-	@echo ${TIMESTAMP}
-	gcloud ai-platform jobs submit training $(SPAM_JOB_NAME) \
-		--package-path trainer2/ \
-		--module-name trainer2.task \
-		--runtime-version 2.1 \
-		--python-version 3.7 \
-		--job-dir gs://monorail-prod-mlengine/$(SPAM_JOB_NAME) \
-		--region us-central1 \
-		-- \
-		--train-steps 1000 \
-		--verbosity DEBUG \
-		--gcs-bucket monorail-prod.appspot.com \
-		--gcs-prefix spam_training_data \
-		--trainer-type spam
-
-# VERSION of format 'v_TIMESTAMP' should match TIMESTAMP in SPAM_JOB_NAME and MODEL_BINARIES.
-upload_model_prod_spam:
-ifndef MODEL_BINARIES
-	$(error MODEL_BINARIES not set)
-endif
-ifndef VERSION
-	$(error VERSION not set)
-endif
-	gsutil ls -r gs://monorail-prod-mlengine/$(SPAM_JOB_NAME)
-	gcloud ai-platform versions create $(VERSION) \
-		--model spam_only_words \
-		--origin $(MODEL_BINARIES) \
-		--runtime-version 1.2
-	gcloud ai-platform versions set-default $(VERSION) --model spam_only_words
-
-submit_pred_spam:
-ifndef SUMMARY_PATH
-	$(error SUMMARY_PATH not set)
-endif
-ifndef CONTENT_PATH
-	$(error CONTENT_PATH not set)
-endif
-	./spam.py predict --summary $(SUMMARY_PATH) --content $(CONTENT_PATH)
-
-
-train_from_prod_data_component:
-	gcloud ai-platform local train \
-		--package-path trainer/ \
-		--module-name trainer.task \
-		--job-dir $(OUTPUT_DIR) \
-		-- \
-		--train-steps 10000 \
-		--eval-steps 1000 \
-		--verbosity DEBUG \
-		--gcs-bucket monorail-prod.appspot.com \
-		--gcs-prefix component_training_data \
-		--trainer-type component
-
-submit_train_job_component:
-	gcloud init
-	gcloud ai-platform jobs submit training $(COMP_JOB_NAME) \
-		--package-path trainer/ \
-		--module-name trainer.task \
-		--runtime-version 1.2 \
-		--job-dir gs://monorail-prod-mlengine/$(COMP_JOB_NAME) \
-		--region us-central1 \
-		--scale-tier custom \
-		--config config.json \
-		-- \
-		--train-steps 10000 \
-		--eval-steps 1000 \
-		--verbosity DEBUG \
-		--gcs-bucket monorail-prod.appspot.com \
-		--gcs-prefix component_training_data \
-		--trainer-type component
-
-submit_train_job_component_2:
-	gcloud ai-platform jobs submit training $(COMP_JOB_NAME) \
-		--package-path trainer2/ \
-		--module-name trainer2.task \
-		--runtime-version 2.1 \
-		--python-version 3.7 \
-		--job-dir gs://monorail-prod-mlengine/$(COMP_JOB_NAME) \
-		--region us-central1 \
-		--scale-tier custom \
-		--master-machine-type n1-highmem-8 \
-		-- \
-		--train-steps 10000 \
-		--eval-steps 1000 \
-		--verbosity DEBUG \
-		--gcs-bucket monorail-prod.appspot.com \
-		--gcs-prefix component_training_data \
-		--trainer-type component
-
-# VERSION of format 'v_TIMESTAMP' should match TIMESTAMP in COMP_JOB_NAME and MODEL_BINARIES.
-upload_model_prod_component:
-ifndef MODEL_BINARIES
-	$(error MODEL_BINARIES not set)
-endif
-ifndef VERSION
-	$(error VERSION not set)
-endif
-	gsutil ls -r gs://monorail-prod-mlengine/$(COMP_JOB_NAME)
-	gcloud ai-platform versions create $(VERSION) \
-		--model component_top_words \
-		--origin $(MODEL_BINARIES) \
-		--runtime-version 1.2
-	gcloud ai-platform versions set-default $(VERSION) --model component_top_words
-
-submit_pred_component:
-ifndef CONTENT_PATH
-	$(error CONTENT_PATH not set)
-endif
-	./component.py --project monorail-prod --content $(CONTENT_PATH)
-
-
-### Local Training in TF 2.0
-
-tf2_train_local_spam:
-ifndef TRAIN_FILE
-	$(error TRAIN_FILE not set)
-endif
-	python3 ./trainer2/task.py \
-		--train-file $(TRAIN_FILE) \
-		--job-dir $(OUTPUT_DIR) \
-		--train-steps 1000 \
-		--verbosity DEBUG \
-		--trainer-type spam
-
-tf2_train_local_component:
-ifndef TRAIN_FILE
-	$(error TRAIN_FILE not set)
-endif
-	python3 ./trainer2/task.py \
-		--train-file $(TRAIN_FILE) \
-		--job-dir $(OUTPUT_DIR) \
-		--train-steps 10000 \
-		--eval-steps 1000 \
-		--verbosity DEBUG \
-		--trainer-type component
diff --git a/tools/ml/README.md b/tools/ml/README.md
deleted file mode 100644
index 01b0702..0000000
--- a/tools/ml/README.md
+++ /dev/null
@@ -1,222 +0,0 @@
-# Monorail Machine Learning Classifiers
-
-Monorail has two machine learning classifiers running in ML Engine: a spam classifier and a component predictor.
-
-Whenever a user creates a new issue (or comments on an issue without an assigned component), Monorail's component predictor suggests components based on the text the user types.
-
-Monorail also runs each new issue and comment through a spam classifier model.
-
-In order to train a new model locally or in the cloud, follow the instructions below.
-
-> Note: you must be logged into the correct GCP project with `gcloud` in order to run the below commands.
-
-### New model in trainer2/
-
-The new code is used for local training and exporting the model using Python 3 and TensorFlow 2.0. Future predictors should also be migrated to use the training files in trainer2/.
-
-### Trainer
-
-Both trainers are Python modules that do the following:
-
-1. Download all (spam or component) exported training data from GCS
-2. Define a TensorFlow Estimator and Experiment
-
-ML Engine uses the high-level [`learn_runner`](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/learn_runner/run) API (see [`trainer/task.py`](trainer/task.py)) which allows it to train, evaluate, and predict against a model saved in GCS.
-
-## Monorail Spam Classifier
-
-### Run locally
-
-To run any training jobs locally, you'll need Python 2 and TensorFlow 1.2:
-
-```sh
-pip install -r requirements.txt
-```
-
-Run a local training job with placeholder data:
-
-```sh
-make TRAIN_FILE=./sample_spam_training_data.csv train_local_spam
-```
-
-To have the local trainer download and train on the real training data, you'll
-need to be logged into `gcloud` and have access to the `monorail-prod` project.
-
-```sh
-make train_from_prod_data_spam
-```
-
-<!-- TODO: the below has not been reviewed recently. -->
-
-### Submit a local prediction
-
-```sh
-./spam.py local-predict
-gcloud ml-engine local predict --model-dir $OUTPUT_DIR/export/Servo/{TIMESTAMP}/ --json-instances /tmp/instances.json
-```
-
-### Submitting a training job to ML Engine
-
-This will run a job and output a trained model to GCS. Job names must be unique.
-
-First verify you're in the `monorail-prod` GCP project.
-
-```sh
-gcloud init
-```
-
-To submit a training job manually, run:
-
-```sh
-TIMESTAMP=$(date +%s)
-JOB_NAME=spam_trainer_$TIMESTAMP
-gcloud ml-engine jobs submit training $JOB_NAME \
-    --package-path trainer/ \
-    --module-name trainer.task \
-    --runtime-version 1.2 \
-    --job-dir gs://monorail-prod-mlengine/$JOB_NAME \
-    --region us-central1 \
-    -- \
-    --train-steps 1000 \
-    --verbosity DEBUG \
-    --gcs-bucket monorail-prod.appspot.com \
-    --gcs-prefix spam_training_data \
-    --trainer-type spam
-```
-
-### Uploading a model and promoting it to production
-
-To upload a model you'll need to locate the exported model directory in GCS. To do that, run:
-
-```sh
-gsutil ls -r gs://monorail-prod-mlengine/$JOB_NAME
-
-# Look for a directory that matches the below structure and assign it.
-# It should have the structure $GCS_OUTPUT_LOCATION/export/Servo/$TIMESTAMP/.
-MODEL_BINARIES=gs://monorail-prod-mlengine/spam_trainer_1507059720/export/Servo/1507060043/
-
-VERSION=v_$TIMESTAMP
-gcloud ml-engine versions create $VERSION \
-    --model spam_only_words \
-    --origin $MODEL_BINARIES \
-    --runtime-version 1.2
-```
-
-To promote to production, set that model as default.
-
-```sh
-gcloud ml-engine versions set-default $VERSION --model spam_only_words
-```
-
-### Submit a prediction
-
-Use the script [`spam.py`](spam.py) to make predictions
-from the command line. Files containing text for classification must be provided as summary and content arguments.
-
-```sh
-$ ./spam.py predict --summary summary.txt --content content.txt
-{u'predictions': [{u'classes': [u'0', u'1'], u'scores': [0.4986788034439087, 0.5013211965560913]}]}
-```
-
-A higher probability for class 1 indicates that the text was classified as spam.
-
-### Compare model accuracy
-
-After submitting a job to ML Engine, you can compare the accuracy of two submitted jobs using their trainer names.
-
-```sh
-$ ./spam.py --project monorail-prod compare-accuracy --model1 spam_trainer_1521756634 --model2 spam_trainer_1516759200
-spam_trainer_1521756634:
-AUC: 0.996436  AUC Precision/Recall: 0.997456
-
-spam_trainer_1516759200:
-AUC: 0.982159  AUC Precision/Recall: 0.985069
-```
-
-By default, model1 is the default model running in the specified project. Note that an error will be thrown if the trainer does not contain an eval_data.json file.
-
-## Monorail Component Predictor
-
-### Run locally
-
-To kick off a local training job, run:
-
-```sh
-OUTPUT_DIR=/tmp/monospam-local-training
-rm -rf $OUTPUT_DIR
-gcloud ml-engine local train \
-    --package-path trainer/ \
-    --module-name trainer.task \
-    --job-dir $OUTPUT_DIR \
-    -- \
-    --train-steps 10000 \
-    --eval-steps 1000 \
-    --verbosity DEBUG \
-    --gcs-bucket monorail-prod.appspot.com \
-    --gcs-prefix component_training_data \
-    --trainer-type component
-```
-
-### Submitting a training job to ML Engine
-
-This will run a job and output a trained model to GCS. Job names must be unique.
-
-First verify you're in the `monorail-prod` GCP project.
-
-```sh
-gcloud init
-```
-
-To submit a training job manually, run:
-
-```sh
-TIMESTAMP=$(date +%s)
-JOB_NAME=component_trainer_$TIMESTAMP
-gcloud ml-engine jobs submit training $JOB_NAME \
-    --package-path trainer/ \
-    --module-name trainer.task \
-    --runtime-version 1.2 \
-    --job-dir gs://monorail-prod-mlengine/$JOB_NAME \
-    --region us-central1 \
-    --scale-tier custom \
-    --config config.json \
-    -- \
-    --train-steps 10000 \
-    --eval-steps 1000 \
-    --verbosity DEBUG \
-    --gcs-bucket monorail-prod.appspot.com \
-    --gcs-prefix component_training_data \
-    --trainer-type component
-```
-
-### Uploading a model and promoting it to production
-
-To upload a model you'll need to locate the exported model directory in GCS. To do that, run:
-
-```sh
-gsutil ls -r gs://monorail-prod-mlengine/$JOB_NAME
-
-# Look for a directory that matches the below structure and assign it.
-# It should have the structure $GCS_OUTPUT_LOCATION/export/Servo/$TIMESTAMP/.
-MODEL_BINARIES=gs://monorail-prod-mlengine/component_trainer_1507059720/export/Servo/1507060043/
-
-VERSION=v_$TIMESTAMP
-gcloud ml-engine versions create $VERSION \
-    --model component_top_words \
-    --origin $MODEL_BINARIES \
-    --runtime-version 1.2
-```
-To promote to production, set that model as default.
-
-```sh
-gcloud ml-engine versions set-default $VERSION --model component_top_words
-```
-
-### Submit a prediction
-
-Use the script [`component.py`](component.py) to make predictions from the command line. A file containing text for classification must be provided as the content argument.
-
-```sh
-$ ./component.py --project monorail-prod --content content.txt
-Most likely component: index 108, component id 36250211
-```
diff --git a/tools/ml/comment-training-export.sql b/tools/ml/comment-training-export.sql
deleted file mode 100644
index 891ed18..0000000
--- a/tools/ml/comment-training-export.sql
+++ /dev/null
@@ -1,16 +0,0 @@
-select
-  IF(v.is_spam, "spam", "ham"),
-  "",
-  REPLACE(cc.content, '\n', '\r'),
-  u.email,
-  CONCAT("https://bugs.chromium.org/p/", p.project_name, "/issues/detail?id=", i.local_id),
-  r.email
-from SpamVerdict v
-  join Comment c on c.id = v.comment_id
-  join CommentContent cc on cc.comment_id = c.id
-  join Project p on p.project_id = c.project_id
-  join Issue i on i.id=c.issue_id
-  join User u on u.user_id = c.commenter_id
-  join User r on r.user_id = v.user_id
-where
-  v.reason='manual' and v.overruled = 0;
diff --git a/tools/ml/component.py b/tools/ml/component.py
deleted file mode 100755
index 9b401f3..0000000
--- a/tools/ml/component.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2018 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file or at
-# https://developers.google.com/open-source/licenses/bsd
-
-"""
-Component classifier command line tools.
-
-Use this command to submit predictions to the model running
-in production.
-
-Note that in order for this command to work, you must be logged into
-gcloud in the project under which you wish to run commands.
-"""
-from __future__ import print_function
-from __future__ import division
-from __future__ import absolute_import
-
-import argparse
-import json
-import os
-import re
-import sys
-
-import googleapiclient
-from googleapiclient import discovery
-from googleapiclient import errors
-from google.cloud.storage import client, bucket, blob
-from apiclient.discovery import build
-from oauth2client.client import GoogleCredentials
-
-import ml_helpers
-
-credentials = GoogleCredentials.get_application_default()
-
-# This must be identical with settings.component_features.
-COMPONENT_FEATURES = 5000
-
-MODEL_NAME = 'component_top_words'
-
-
-def Predict(args):
-  ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)
-
-  with open(args.content) as f:
-    content = f.read()
-
-  project_ID = 'projects/%s' % args.project
-  full_model_name = '%s/models/%s' % (project_ID, MODEL_NAME)
-  model_request = ml.projects().models().get(name=full_model_name)
-  model_response = model_request.execute()
-
-  version_name = model_response['defaultVersion']['name']
-
-  model_name = 'component_trainer_' + re.search("v_(\d+)",
-                                                version_name).group(1)
-
-  client_obj = client.Client(project=args.project)
-  bucket_name = '%s-mlengine' % args.project
-  bucket_obj = bucket.Bucket(client_obj, bucket_name)
-
-  instance = ml_helpers.GenerateFeaturesRaw([content],
-                                            COMPONENT_FEATURES,
-                                            getTopWords(bucket_name,
-                                                        model_name))
-
-
-  request = ml.projects().predict(name=full_model_name, body={
-    'instances': [{'inputs': instance['word_features']}]
-  })
-
-  try:
-    response = request.execute()
-
-
-    bucket_obj.blob = blob.Blob('%s/component_index.json'
-                                % model_name, bucket_obj)
-    component_index = bucket_obj.blob.download_as_string()
-    component_index_dict = json.loads(component_index)
-
-    return read_indexes(response, component_index_dict)
-
-  except googleapiclient.errors.HttpError, err:
-    print('There was an error. Check the details:')
-    print(err._get_reason())
-
-
-def getTopWords(bucket_name, model_name):
-  storage = discovery.build('storage', 'v1', credentials=credentials)
-  objects = storage.objects()
-
-  request = objects.get_media(bucket=bucket_name,
-                              object=model_name + '/topwords.txt')
-  response = request.execute()
-
-  top_list = response.split()
-  top_words = {}
-  for i in range(len(top_list)):
-    top_words[top_list[i]] = i
-
-  return top_words
-
-
-def read_indexes(response, component_index):
-
-  scores = response['predictions'][0]['scores']
-  highest = scores.index(max(scores))
-
-  component_id = component_index[str(highest)]
-
-  return "Most likely component: index %d, component id %d" % (
-      int(highest), int(component_id))
-
-
-def main():
-  if not credentials and 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
-    print(('GOOGLE_APPLICATION_CREDENTIALS environment variable is not set. '
-          'Exiting.'))
-    sys.exit(1)
-
-  parser = argparse.ArgumentParser(
-      description='Component classifier utilities.')
-  parser.add_argument('--project', '-p', default='monorail-staging')
-
-  parser.add_argument('--content', '-c', required=True,
-                      help='A file containing the content.')
-
-  args = parser.parse_args()
-
-  res = Predict(args)
-
-  print(res)
-
-
-if __name__ == '__main__':
-  main()
diff --git a/tools/ml/config.json b/tools/ml/config.json
deleted file mode 100644
index 6c36e3e..0000000
--- a/tools/ml/config.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-    "trainingInput": {
-        "masterType": "large_model"
-    }
-}
diff --git a/tools/ml/issue-training-export.sql b/tools/ml/issue-training-export.sql
deleted file mode 100644
index 73a637b..0000000
--- a/tools/ml/issue-training-export.sql
+++ /dev/null
@@ -1,17 +0,0 @@
-select
-  IF(v.is_spam, "spam", "ham"),
-  REPLACE(s.summary, '\n', '\r'),
-  REPLACE(cc.content, '\n', '\r'),
-  u.email,
-  CONCAT("https://bugs.chromium.org/p/", p.project_name, "/issues/detail?id=", i.local_id),
-  r.email
-from SpamVerdict v
-  join Issue i on i.id = v.issue_id
-  join Comment c on c.issue_id = i.id
-  join CommentContent cc on cc.comment_id = c.id
-  join IssueSummary s on s.issue_id = i.id
-  join Project p on p.project_id = i.project_id
-  join User u on u.user_id = c.commenter_id
-  join User r on r.user_id = v.user_id
-where
-  v.reason='manual' and v.overruled = 0;
diff --git a/tools/ml/ml_helpers.py b/tools/ml/ml_helpers.py
deleted file mode 120000
index 894569b..0000000
--- a/tools/ml/ml_helpers.py
+++ /dev/null
@@ -1 +0,0 @@
-../../services/ml_helpers.py
\ No newline at end of file
diff --git a/tools/ml/requirements.txt b/tools/ml/requirements.txt
deleted file mode 100644
index e0a7166..0000000
--- a/tools/ml/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-tensorflow==1.2
diff --git a/tools/ml/sample_spam_training_data.csv b/tools/ml/sample_spam_training_data.csv
deleted file mode 100644
index 4de2805..0000000
--- a/tools/ml/sample_spam_training_data.csv
+++ /dev/null
@@ -1,36 +0,0 @@
-"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
-"ham","","# 1231
- - sdfsdf","ddoman@google.com"
-"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
-"ham","","# 1231
- - sdfsdf","ddoman@google.com"
-"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
-"ham","","# 1231
- - sdfsdf","ddoman@google.com"
-"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
-"ham","","# 1231
- - sdfsdf","ddoman@google.com"
-"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
-"ham","","# 1231
- - sdfsdf","ddoman@google.com"
-"ham","","Okay. I think we've found another way to do what we need - thanks, though!","wscalf@gmail.com"
-"spam","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"spam","test","hmmm","zhangtiff@google.com"
-"spam","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"spam","test","hmmm","zhangtiff@google.com"
-"spam","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"spam","test","hmmm","zhangtiff@google.com"
-"spam","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"spam","test","hmmm","zhangtiff@google.com"
-"spam","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"spam","test","hmmm","zhangtiff@google.com"
-"spam","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"spam","test","hmmm","zhangtiff@google.com"
-"ham","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"spam","test","hmmm","zhangtiff@google.com"
-"ham","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"ham","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"ham","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"ham","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"ham","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
-"ham","Chicken","<b>Feature description:</b>  <b>--</b> test <b>PRD:</b>ewre <b>Mocks:</b> <b>Design doc:</b> <b>Test Plan:</b> <b>Metrics (go/CrOSlaunchMetrics):</b>  ","jojwang@google.com"
diff --git a/tools/ml/setup.py b/tools/ml/setup.py
deleted file mode 100644
index 728cd55..0000000
--- a/tools/ml/setup.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from __future__ import print_function
-from __future__ import division
-from __future__ import absolute_import
-
-from setuptools import find_packages
-from setuptools import setup
-
-REQUIRED_PACKAGES = ['google-cloud-storage']
-
-setup(
-  name='trainer',
-  version='0.1',
-  install_requires=REQUIRED_PACKAGES,
-  packages=find_packages(),
-  include_package_data=True,
-  description="""Trainer application package for training a spam classification
-                 model in ML Engine and storing the saved model and accuracy
-                 results in GCS."""
-)
diff --git a/tools/ml/spam.py b/tools/ml/spam.py
deleted file mode 100755
index afc9d4d..0000000
--- a/tools/ml/spam.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2016 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file or at
-# https://developers.google.com/open-source/licenses/bsd
-
-"""
-Spam classifier command line tools.
-
-Use this command to submit predictions locally or to the model running
-in production. See tools/spam/README.md for more context on training
-and model operations.
-
-Note that in order for this command to work, you must be logged into
-gcloud in the project under which you wish to run commands.
-"""
-from __future__ import print_function
-from __future__ import division
-from __future__ import absolute_import
-
-import argparse
-import json
-import os
-import re
-import sys
-import googleapiclient
-
-from google.cloud.storage import client, bucket, blob
-import ml_helpers
-from apiclient.discovery import build
-from oauth2client.client import GoogleCredentials
-
-credentials = GoogleCredentials.get_application_default()
-
-# This must be identical with settings.spam_feature_hashes.
-SPAM_FEATURE_HASHES = 500
-
-MODEL_NAME = 'spam_only_words'
-
-
-def Predict(args):
-  ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)
-
-  with open(args.summary) as f:
-    summary = f.read()
-  with open(args.content) as f:
-    content = f.read()
-
-  instance = ml_helpers.GenerateFeaturesRaw([summary, content],
-    SPAM_FEATURE_HASHES)
-
-  project_ID = 'projects/%s' % args.project
-  full_model_name = '%s/models/%s' % (project_ID, MODEL_NAME)
-  request = ml.projects().predict(name=full_model_name, body={
-    'instances': [{'inputs': instance['word_hashes']}]
-  })
-
-  try:
-    response = request.execute()
-    print(response)
-  except googleapiclient.errors.HttpError, err:
-    print('There was an error. Check the details:')
-    print(err._get_reason())
-
-
-def LocalPredict(_):
-  print('This will write /tmp/instances.json.')
-  print('Then you can call:')
-  print(('gcloud ml-engine local predict --json-instances /tmp/instances.json'
-    ' --model-dir {model_dir}'))
-
-  summary = raw_input('Summary: ')
-  description = raw_input('Description: ')
-  instance = ml_helpers.GenerateFeaturesRaw([summary, description],
-    SPAM_FEATURE_HASHES)
-
-  with open('/tmp/instances.json', 'w') as f:
-    json.dump({'inputs': instance['word_hashes']}, f)
-
-
-def get_auc(model_name, bucket_obj):
-  bucket_obj.blob = blob.Blob('%s/eval_data.json' % model_name, bucket_obj)
-  data = bucket_obj.blob.download_as_string()
-  data_dict = json.loads(data)
-  return data_dict['auc'], data_dict['auc_precision_recall']
-
-
-def CompareAccuracy(args):
-  client_obj = client.Client(project=args.project)
-  bucket_name = '%s-mlengine' % args.project
-  bucket_obj = bucket.Bucket(client_obj, bucket_name)
-
-  model1_auc, model1_auc_pr = get_auc(args.model1, bucket_obj)
-  print('%s:\nAUC: %f\tAUC Precision/Recall: %f\n'
-        % (args.model1, model1_auc, model1_auc_pr))
-
-  model2_auc, model2_auc_pr = get_auc(args.model2, bucket_obj)
-  print('%s:\nAUC: %f\tAUC Precision/Recall: %f'
-        % (args.model2, model2_auc, model2_auc_pr))
-
-
-def main():
-  if not credentials and 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
-    print(('GOOGLE_APPLICATION_CREDENTIALS environment variable is not set. '
-          'Exiting.'))
-    sys.exit(1)
-
-  parser = argparse.ArgumentParser(description='Spam classifier utilities.')
-  parser.add_argument('--project', '-p', default='monorail-staging')
-
-  project = parser.parse_known_args()
-  subparsers = parser.add_subparsers(dest='command')
-
-  predict = subparsers.add_parser('predict',
-    help='Submit a prediction to the default model in ML Engine.')
-  predict.add_argument('--summary', help='A file containing the summary.')
-  predict.add_argument('--content', help='A file containing the content.')
-
-  subparsers.add_parser('local-predict',
-    help='Create an instance on the local filesystem to use in prediction.')
-
-  ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)
-
-  request = ml.projects().models().get(name='projects/%s/models/%s'
-                                       % (project[0].project, MODEL_NAME))
-  response = request.execute()
-
-  default_version = re.search(
-      '.*(spam_trainer_\d+).*',
-      response['defaultVersion']['deploymentUri']).group(1)
-
-  compare = subparsers.add_parser('compare-accuracy',
-                                  help='Compare the accuracy of two models.')
-
-  compare.add_argument('--model1',
-                       default=default_version,
-                       help='The first model to find the auc values of.')
-
-  # TODO(carapew): Make second default the most recently deployed model
-  compare.add_argument('--model2',
-                       default='spam_trainer_1513384515'
-                       if project[0].project == 'monorail-staging' else
-                       'spam_trainer_1522141200',
-                       help='The second model to find the auc values of.')
-
-  args = parser.parse_args()
-
-  cmds = {
-    'predict':  Predict,
-    'local-predict':  LocalPredict,
-    'compare-accuracy': CompareAccuracy,
-  }
-  res = cmds[args.command](args)
-
-  print(json.dumps(res, indent=2))
-
-
-if __name__ == '__main__':
-  main()
diff --git a/tools/ml/trainer/__init__.py b/tools/ml/trainer/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/tools/ml/trainer/__init__.py
+++ /dev/null
diff --git a/tools/ml/trainer/dataset.py b/tools/ml/trainer/dataset.py
deleted file mode 100644
index 0def4b6..0000000
--- a/tools/ml/trainer/dataset.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2018 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file or at
-# https://developers.google.com/open-source/licenses/bsd
-
-from __future__ import print_function
-from __future__ import division
-from __future__ import absolute_import
-
-import StringIO
-import tensorflow as tf
-
-import csv
-import sys
-from googleapiclient import discovery
-from googleapiclient import errors
-from oauth2client.client import GoogleCredentials
-
-import trainer.ml_helpers
-
-
-def fetch_training_data(bucket, prefix, trainer_type):
-
-  credentials = GoogleCredentials.get_application_default()
-  storage = discovery.build('storage', 'v1', credentials=credentials)
-  objects = storage.objects()
-
-  request = objects.list(bucket=bucket, prefix=prefix)
-  response = make_api_request(request)
-  items = response.get('items')
-  csv_filepaths = [blob.get('name') for blob in items]
-
-  if trainer_type == 'spam':
-    return fetch_spam(csv_filepaths, bucket, objects)
-  else:
-    return fetch_component(csv_filepaths, bucket, objects)
-
-
-def fetch_spam(csv_filepaths, bucket, objects):
-
-  training_data = []
-  # Add code
-  csv_filepaths = [
-    'spam-training-data/full-android.csv',
-    'spam-training-data/full-support.csv',
-  ] + csv_filepaths
-
-  for filepath in csv_filepaths:
-    media = fetch_training_csv(filepath, objects, bucket)
-    rows, skipped_rows = trainer.ml_helpers.spam_from_file(
-        StringIO.StringIO(media))
-
-    if len(rows):
-      training_data.extend(rows)
-
-    tf.logging.info('{:<40}{:<20}{:<20}'.format(
-        filepath,
-        'added %d rows' % len(rows),
-        'skipped %d rows' % skipped_rows))
-
-  return training_data
-
-
-def fetch_component(csv_filepaths, bucket, objects):
-
-  training_data = []
-  for filepath in csv_filepaths:
-    media = fetch_training_csv(filepath, objects, bucket)
-    rows = trainer.ml_helpers.component_from_file(
-        StringIO.StringIO(media))
-
-    if len(rows):
-      training_data.extend(rows)
-
-    tf.logging.info('{:<40}{:<20}'.format(
-        filepath,
-        'added %d rows' % len(rows)))
-
-  return training_data
-
-
-def fetch_training_csv(filepath, objects, bucket):
-  request = objects.get_media(bucket=bucket, object=filepath)
-  return make_api_request(request)
-
-
-def make_api_request(request):
-  try:
-    return request.execute()
-  except errors.HttpError, err:
-    tf.logging.error('There was an error with the API. Details:')
-    tf.logging.error(err._get_reason())
-    raise
-
-
diff --git a/tools/ml/trainer/ml_helpers.py b/tools/ml/trainer/ml_helpers.py
deleted file mode 120000
index c790a2c..0000000
--- a/tools/ml/trainer/ml_helpers.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../services/ml_helpers.py
\ No newline at end of file
diff --git a/tools/ml/trainer/model.py b/tools/ml/trainer/model.py
deleted file mode 100644
index 3b627a9..0000000
--- a/tools/ml/trainer/model.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright 2018 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file or at
-# https://developers.google.com/open-source/licenses/bsd
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from trainer.ml_helpers import COMPONENT_FEATURES
-from trainer.ml_helpers import SPAM_FEATURE_HASHES
-
-# Important: we assume this list mirrors the output of GenerateFeaturesRaw.
-INPUT_COLUMNS = {'component': [
-                     tf.feature_column.numeric_column(
-                         key='word_features',
-                         shape=(COMPONENT_FEATURES,)),
-                 ],
-                 'spam': [
-                     tf.feature_column.numeric_column(
-                         key='word_hashes',
-                         shape=(SPAM_FEATURE_HASHES,)),
-                 ]}
-
-
-def build_estimator(config, trainer_type, class_count):
-  """Returns a tf.Estimator.
-
-  Args:
-    config: tf.contrib.learn.RunConfig defining the runtime environment for the
-      estimator (including model_dir).
-  Returns:
-    A DNNClassifier
-  """
-  return tf.contrib.learn.DNNClassifier(
-    config=config,
-    feature_columns=(INPUT_COLUMNS[trainer_type]),
-    hidden_units=[1024, 512, 256],
-    optimizer=tf.train.AdamOptimizer(learning_rate=0.001,
-      beta1=0.9,
-      beta2=0.999,
-      epsilon=1e-08,
-      use_locking=False,
-      name='Adam'),
-    n_classes=class_count
-  )
-
-
-def feature_list_to_dict(X, trainer_type):
-  """Converts an array of feature dicts into to one dict of
-    {feature_name: [feature_values]}.
-
-  Important: this assumes the ordering of X and INPUT_COLUMNS is the same.
-
-  Args:
-    X: an array of feature dicts
-  Returns:
-    A dictionary where each key is a feature name and its value is a numpy array of
-    shape (len(X),).
-  """
-  feature_dict = {}
-
-  for feature_column in INPUT_COLUMNS[trainer_type]:
-    feature_dict[feature_column.name] = []
-
-  for instance in X:
-    for key in instance.keys():
-      feature_dict[key].append(instance[key])
-
-  for key in [f.name for f in INPUT_COLUMNS[trainer_type]]:
-    feature_dict[key] = np.array(feature_dict[key])
-
-  return feature_dict
-
-
-def generate_json_serving_input_fn(trainer_type):
-  def json_serving_input_fn():
-    """Build the serving inputs.
-
-    Returns:
-      An InputFnOps containing features with placeholders.
-    """
-    features_placeholders = {}
-    for column in INPUT_COLUMNS[trainer_type]:
-      name = '%s_placeholder' % column.name
-
-      # Special case non-scalar features.
-      if column.shape[0] > 1:
-        shape = [None, column.shape[0]]
-      else:
-        shape = [None]
-
-      placeholder = tf.placeholder(tf.float32, shape, name=name)
-      features_placeholders[column.name] = placeholder
-
-    labels = None # Unknown at serving time
-    return tf.contrib.learn.InputFnOps(features_placeholders, labels,
-      features_placeholders)
-
-  return json_serving_input_fn
-
-
-SERVING_FUNCTIONS = {
-    'JSON-component': generate_json_serving_input_fn('component'),
-    'JSON-spam':  generate_json_serving_input_fn('spam')
-}
diff --git a/tools/ml/trainer/task.py b/tools/ml/trainer/task.py
deleted file mode 100644
index 7416c68..0000000
--- a/tools/ml/trainer/task.py
+++ /dev/null
@@ -1,284 +0,0 @@
-# Copyright 2018 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file or at
-# https://developers.google.com/open-source/licenses/bsd
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import json
-import os
-import re
-
-import numpy as np
-import tensorflow as tf
-from googleapiclient import discovery
-from googleapiclient import errors
-from oauth2client.client import GoogleCredentials
-from sklearn.model_selection import train_test_split
-from tensorflow.contrib.learn.python.learn import learn_runner
-from tensorflow.contrib.learn.python.learn.estimators import run_config
-from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
-from tensorflow.contrib.training.python.training import hparam
-
-from google.cloud.storage import blob, bucket, client
-
-import trainer.dataset
-import trainer.model
-import trainer.ml_helpers
-import trainer.top_words
-
-def generate_experiment_fn(**experiment_args):
-  """Create an experiment function.
-
-  Args:
-    experiment_args: keyword arguments to be passed through to experiment
-      See `tf.contrib.learn.Experiment` for full args.
-  Returns:
-    A function:
-      (tf.contrib.learn.RunConfig, tf.contrib.training.HParams) -> Experiment
-
-    This function is used by learn_runner to create an Experiment which
-    executes model code provided in the form of an Estimator and
-    input functions.
-  """
-  def _experiment_fn(config, hparams):
-    index_to_component = {}
-
-    if hparams.train_file:
-      with open(hparams.train_file) as f:
-        if hparams.trainer_type == 'spam':
-          training_data = trainer.ml_helpers.spam_from_file(f)
-        else:
-          training_data = trainer.ml_helpers.component_from_file(f)
-    else:
-      training_data = trainer.dataset.fetch_training_data(hparams.gcs_bucket,
-        hparams.gcs_prefix, hparams.trainer_type)
-
-    tf.logging.info('Training data received. Len: %d' % len(training_data))
-
-    if hparams.trainer_type == 'spam':
-      X, y = trainer.ml_helpers.transform_spam_csv_to_features(
-          training_data)
-    else:
-      top_list = trainer.top_words.make_top_words_list(hparams.job_dir)
-      X, y, index_to_component = trainer.ml_helpers \
-          .transform_component_csv_to_features(training_data, top_list)
-
-    tf.logging.info('Features generated')
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
-      random_state=42)
-
-    train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x=trainer.model.feature_list_to_dict(X_train, hparams.trainer_type),
-      y=np.array(y_train),
-      num_epochs=hparams.num_epochs,
-      batch_size=hparams.train_batch_size,
-      shuffle=True
-    )
-    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x=trainer.model.feature_list_to_dict(X_test, hparams.trainer_type),
-      y=np.array(y_test),
-      num_epochs=None,
-      batch_size=hparams.eval_batch_size,
-      shuffle=False # Don't shuffle evaluation data
-    )
-
-    tf.logging.info('Numpy fns created')
-    if hparams.trainer_type == 'component':
-      store_component_conversion(hparams.job_dir, index_to_component)
-
-    return tf.contrib.learn.Experiment(
-      trainer.model.build_estimator(config=config,
-                                    trainer_type=hparams.trainer_type,
-                                    class_count=len(set(y))),
-      train_input_fn=train_input_fn,
-      eval_input_fn=eval_input_fn,
-      **experiment_args
-    )
-  return _experiment_fn
-
-
-def store_component_conversion(job_dir, data):
-
-  tf.logging.info('job_dir: %s' % job_dir)
-  job_info = re.search('gs://(monorail-.+)-mlengine/(component_trainer_\d+)',
-                       job_dir)
-
-  # Check if training is being done on GAE or locally.
-  if job_info:
-    project = job_info.group(1)
-    job_name = job_info.group(2)
-
-    client_obj = client.Client(project=project)
-    bucket_name = '%s-mlengine' % project
-    bucket_obj = bucket.Bucket(client_obj, bucket_name)
-
-    bucket_obj.blob = blob.Blob(job_name + '/component_index.json', bucket_obj)
-
-    bucket_obj.blob.upload_from_string(json.dumps(data),
-                                       content_type='application/json')
-
-  else:
-    paths = job_dir.split('/')
-    for y, _ in enumerate(list(range(1, len(paths))), 1):
-      if not os.path.exists("/".join(paths[:y+1])):
-        os.makedirs('/'.join(paths[:y+1]))
-    with open(job_dir + '/component_index.json', 'w') as f:
-      f.write(json.dumps(data))
-
-
-def store_eval(job_dir, results):
-
-  tf.logging.info('job_dir: %s' % job_dir)
-  job_info = re.search('gs://(monorail-.+)-mlengine/(spam_trainer_\d+)',
-                       job_dir)
-
-  # Only upload eval data if this is not being run locally.
-  if job_info:
-    project = job_info.group(1)
-    job_name = job_info.group(2)
-
-    tf.logging.info('project: %s' % project)
-    tf.logging.info('job_name: %s' % job_name)
-
-    client_obj = client.Client(project=project)
-    bucket_name = '%s-mlengine' % project
-    bucket_obj = bucket.Bucket(client_obj, bucket_name)
-
-    bucket_obj.blob = blob.Blob(job_name + '/eval_data.json', bucket_obj)
-    for key, value in results[0].items():
-      if isinstance(value, np.float32):
-        results[0][key] = value.item()
-
-    bucket_obj.blob.upload_from_string(json.dumps(results[0]),
-                                       content_type='application/json')
-
-  else:
-    tf.logging.error('Could not find bucket "%s" to output evaluation to.'
-                     % job_dir)
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-
-  # Input Arguments
-  parser.add_argument(
-    '--train-file',
-    help='GCS or local path to training data',
-  )
-  parser.add_argument(
-    '--gcs-bucket',
-    help='GCS bucket for training data.',
-  )
-  parser.add_argument(
-    '--gcs-prefix',
-    help='Training data path prefix inside GCS bucket.',
-  )
-  parser.add_argument(
-    '--num-epochs',
-    help="""\
-    Maximum number of training data epochs on which to train.
-    If both --train-steps and --num-epochs are specified,
-    the training job will run for --train-steps or --num-epochs,
-    whichever occurs first. If unspecified, it will run for --train-steps.\
-    """,
-    type=int,
-  )
-  parser.add_argument(
-    '--train-batch-size',
-    help='Batch size for training steps',
-    type=int,
-    default=128
-  )
-  parser.add_argument(
-    '--eval-batch-size',
-    help='Batch size for evaluation steps',
-    type=int,
-    default=128
-  )
-
-  # Training arguments
-  parser.add_argument(
-    '--job-dir',
-    help='GCS location to write checkpoints and export models',
-    required=True
-  )
-
-  # Logging arguments
-  parser.add_argument(
-    '--verbosity',
-    choices=[
-        'DEBUG',
-        'ERROR',
-        'FATAL',
-        'INFO',
-        'WARN'
-    ],
-    default='INFO',
-  )
-
-  # Experiment arguments
-  parser.add_argument(
-    '--eval-delay-secs',
-    help='How long to wait before running first evaluation',
-    default=10,
-    type=int
-  )
-  parser.add_argument(
-    '--min-eval-frequency',
-    help='Minimum number of training steps between evaluations',
-    default=None,  # Use TensorFlow's default (currently, 1000)
-    type=int
-  )
-  parser.add_argument(
-    '--train-steps',
-    help="""\
-    Steps to run the training job for. If --num-epochs is not specified,
-    this must be set; otherwise the training job will run indefinitely.\
-    """,
-    type=int
-  )
-  parser.add_argument(
-    '--eval-steps',
-    help='Number of steps to run evaluation for at each checkpoint',
-    default=100,
-    type=int
-  )
-  parser.add_argument(
-    '--trainer-type',
-    help='Which trainer to use (spam or component)',
-    choices=['spam', 'component'],
-    required=True
-  )
-
-  args = parser.parse_args()
-
-  tf.logging.set_verbosity(args.verbosity)
-
-  # Run the training job
-  # learn_runner pulls configuration information from environment
-  # variables using tf.learn.RunConfig and uses this configuration
-  # to conditionally execute Experiment, or param server code.
-  eval_results = learn_runner.run(
-    generate_experiment_fn(
-      min_eval_frequency=args.min_eval_frequency,
-      eval_delay_secs=args.eval_delay_secs,
-      train_steps=args.train_steps,
-      eval_steps=args.eval_steps,
-      export_strategies=[saved_model_export_utils.make_export_strategy(
-        trainer.model.SERVING_FUNCTIONS['JSON-' + args.trainer_type],
-        exports_to_keep=1,
-        default_output_alternative_key=None,
-      )],
-    ),
-    run_config=run_config.RunConfig(model_dir=args.job_dir),
-    hparams=hparam.HParams(**args.__dict__)
-  )
-
-  # Store a json blob in GCS with the results of training job (AUC of
-  # precision/recall, etc).
-  if args.trainer_type == 'spam':
-    store_eval(args.job_dir, eval_results)
diff --git a/tools/ml/trainer/top_words.py b/tools/ml/trainer/top_words.py
deleted file mode 100644
index 26da211..0000000
--- a/tools/ml/trainer/top_words.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2018 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file or at
-# https://developers.google.com/open-source/licenses/bsd
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import csv
-import os
-import re
-import StringIO
-import sys
-import tensorflow as tf
-import time
-
-from googleapiclient import discovery
-from googleapiclient import errors
-from oauth2client.client import GoogleCredentials
-import google
-from google.cloud.storage import blob, bucket, client
-
-import trainer.ml_helpers
-import trainer.dataset
-
-
-TOP_WORDS = 'topwords.txt'
-STOP_WORDS = 'stopwords.txt'
-
-
-def fetch_stop_words(project_id, objects):
-  request = objects.get_media(bucket=project_id + '-mlengine',
-                              object=STOP_WORDS)
-  response = trainer.dataset.make_api_request(request)
-  return response.split()
-
-
-def fetch_training_csv(filepath, objects, b):
-  request = objects.get_media(bucket=b, object=filepath)
-  return trainer.dataset.make_api_request(request)
-
-
-def GenerateTopWords(objects, word_dict, project_id):
-  stop_words = fetch_stop_words(project_id, objects)
-  sorted_words = sorted(word_dict, key=word_dict.get, reverse=True)
-
-  top_words = []
-  index = 0
-
-  while len(top_words) < trainer.ml_helpers.COMPONENT_FEATURES:
-    if sorted_words[index] not in stop_words:
-      top_words.append(sorted_words[index])
-    index += 1
-
-  return top_words
-
-
-def make_top_words_list(job_dir):
-  """Returns the top (most common) words in the entire dataset for component
-  prediction. If a file is already stored in GCS containing these words, the
-  words from the file are simply returned. Otherwise, the most common words are
-  determined and written to GCS, before being returned.
-
-  Returns:
-    A list of the most common words in the dataset (the number of them
-    determined by ml_helpers.COMPONENT_FEATURES).
-  """
-
-  credentials = GoogleCredentials.get_application_default()
-  storage = discovery.build('storage', 'v1', credentials=credentials)
-  objects = storage.objects()
-
-  subpaths = re.match('gs://(monorail-.*)-mlengine/(component_trainer_\d+)',
-                      job_dir)
-
-  if subpaths:
-    project_id = subpaths.group(1)
-    trainer_folder = subpaths.group(2)
-  else:
-    project_id = 'monorail-prod'
-
-  storage_bucket = project_id + '.appspot.com'
-  request = objects.list(bucket=storage_bucket,
-                         prefix='component_training_data')
-
-  response = trainer.dataset.make_api_request(request)
-
-  items = response.get('items')
-  csv_filepaths = [b.get('name') for b in items]
-
-  final_string = ''
-
-  for word in parse_words(csv_filepaths, objects, storage_bucket, project_id):
-    final_string += word + '\n'
-
-  if subpaths:
-    client_obj = client.Client(project=project_id)
-    bucket_obj = bucket.Bucket(client_obj, project_id + '-mlengine')
-
-    bucket_obj.blob = google.cloud.storage.blob.Blob(trainer_folder
-                                                   + '/'
-                                                   + TOP_WORDS,
-                                                   bucket_obj)
-    bucket_obj.blob.upload_from_string(final_string,
-                                       content_type='text/plain')
-  return final_string.split()
-
-
-def parse_words(files, objects, b, project_id):
-  word_dict = {}
-
-  csv.field_size_limit(sys.maxsize)
-  for filepath in files:
-    media = fetch_training_csv(filepath, objects, b)
-
-    for row in csv.reader(StringIO.StringIO(media)):
-      _, content = row
-      words = content.split()
-
-      for word in words:
-        if word in word_dict:
-          word_dict[word] += 1
-        else:
-          word_dict[word] = 1
-
-  return GenerateTopWords(objects, word_dict, project_id)
diff --git a/tools/ml/trainer2/README.md b/tools/ml/trainer2/README.md
deleted file mode 100644
index d32c8bf..0000000
--- a/tools/ml/trainer2/README.md
+++ /dev/null
@@ -1,35 +0,0 @@
-### Trainer
-
-## Monorail Spam Classifier
-
-To have the trainer run locally, you'll need to supply the
-`--train-file` argument.
-
-```sh
-TRAIN_FILE=./spam_training_examples.csv
-OUTPUT_DIR=/tmp/monospam-local-training/
-rm -rf $OUTPUT_DIR
-python3 ./task.py \
-    --train-file $TRAIN_FILE \
-    --job-dir $OUTPUT_DIR \
-    --train-steps 1000 \
-    --verbosity DEBUG \
-    --trainer-type spam
-```
-## Monorail Component Predictor
-
-To have the trainer run locally, you'll need to supply the
-`--train-file` argument.
-
-```sh
-TRAIN_FILE=./component_training_examples.csv
-OUTPUT_DIR=/tmp/monospam-local-training/
-rm -rf $OUTPUT_DIR
-python3 ./task.py \
-    --train-file $TRAIN_FILE \
-    --job-dir $OUTPUT_DIR \
-    --train-steps 10000 \
-    --eval-steps 1000 \
-    --verbosity DEBUG \
-    --trainer-type component
-```
\ No newline at end of file
diff --git a/tools/ml/trainer2/__init__.py b/tools/ml/trainer2/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/tools/ml/trainer2/__init__.py
+++ /dev/null
diff --git a/tools/ml/trainer2/dataset.py b/tools/ml/trainer2/dataset.py
deleted file mode 100644
index 9e7ae77..0000000
--- a/tools/ml/trainer2/dataset.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2020 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-# Or at https://developers.google.com/open-source/licenses/bsd
-
-from __future__ import print_function
-from __future__ import division
-from __future__ import absolute_import
-
-import io
-import tensorflow as tf
-
-from googleapiclient import discovery
-from googleapiclient import errors
-from oauth2client.client import GoogleCredentials
-
-from trainer2 import train_ml_helpers
-
-
-def fetch_training_data(bucket, prefix, trainer_type):
-
-  credentials = GoogleCredentials.get_application_default()
-  storage = discovery.build('storage', 'v1', credentials=credentials)
-  objects = storage.objects()
-
-  request = objects.list(bucket=bucket, prefix=prefix)
-  response = make_api_request(request)
-  items = response.get('items')
-  csv_filepaths = [blob.get('name') for blob in items]
-
-  if trainer_type == 'spam':
-    return fetch_spam(csv_filepaths, bucket, objects)
-  else:
-    return fetch_component(csv_filepaths, bucket, objects)
-
-
-def fetch_spam(csv_filepaths, bucket, objects):
-
-  all_contents = []
-  all_labels = []
-  # Add code
-  csv_filepaths = [
-      'spam-training-data/full-android.csv',
-      'spam-training-data/full-support.csv',
-  ] + csv_filepaths
-
-  for filepath in csv_filepaths:
-    media = fetch_training_csv(filepath, objects, bucket)
-    contents, labels, skipped_rows = train_ml_helpers.spam_from_file(
-        io.StringIO(media))
-
-    # Sanity check: the contents and labels should be matched pairs.
-    if len(contents) == len(labels) != 0:
-      all_contents.extend(contents)
-      all_labels.extend(labels)
-
-    tf.get_logger().info(
-        '{:<40}{:<20}{:<20}'.format(
-            filepath, 'added %d rows' % len(contents),
-            'skipped %d rows' % skipped_rows))
-
-  return all_contents, all_labels
-
-
-def fetch_component(csv_filepaths, bucket, objects):
-
-  all_contents = []
-  all_labels = []
-  for filepath in csv_filepaths:
-    media = fetch_training_csv(filepath, objects, bucket)
-    contents, labels = train_ml_helpers.component_from_file(io.StringIO(media))
-
-    # Sanity check: the contents and labels should be matched pairs.
-    if len(contents) == len(labels) != 0:
-      all_contents.extend(contents)
-      all_labels.extend(labels)
-
-    tf.get_logger().info(
-        '{:<40}{:<20}'.format(filepath, 'added %d rows' % len(contents)))
-
-  return all_contents, all_labels
-
-
-def fetch_training_csv(filepath, objects, bucket):
-  request = objects.get_media(bucket=bucket, object=filepath)
-  return str(make_api_request(request), 'utf-8')
-
-
-def make_api_request(request):
-  try:
-    return request.execute()
-  except errors.HttpError as err:
-    tf.get_logger().error('There was an error with the API. Details:')
-    tf.get_logger().error(err._get_reason())
-    raise
diff --git a/tools/ml/trainer2/model.py b/tools/ml/trainer2/model.py
deleted file mode 100644
index 823d0d1..0000000
--- a/tools/ml/trainer2/model.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright 2019 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-# Or at https://developers.google.com/open-source/licenses/bsd
-
-from __future__ import absolute_import
-
-import tensorflow as tf
-
-from trainer2.train_ml_helpers import COMPONENT_FEATURES
-from trainer2.train_ml_helpers import SPAM_FEATURE_HASHES
-
-# Important: we assume this list mirrors the output of GenerateFeaturesRaw.
-INPUT_COLUMNS = {'component': [
-                     tf.feature_column.numeric_column(
-                         key='word_features',
-                         shape=(COMPONENT_FEATURES,)),
-                 ],
-                 'spam': [
-                     tf.feature_column.numeric_column(
-                         key='word_hashes',
-                         shape=(SPAM_FEATURE_HASHES,)),
-                 ]}
-
-def build_estimator(config, job_dir, trainer_type, class_count):
-  """Returns a tf.Estimator.
-
-  Args:
-    config: tf.contrib.learn.RunConfig defining the runtime environment for the
-      estimator (including model_dir).
-  Returns:
-    A LinearClassifier
-  """
-  return tf.estimator.DNNClassifier(
-    config=config,
-    model_dir=job_dir,
-    feature_columns=(INPUT_COLUMNS[trainer_type]),
-    hidden_units=[1024, 512, 256],
-    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
-      beta_1=0.9,
-      beta_2=0.999,
-      epsilon=1e-08,
-      name='Adam'),
-    n_classes=class_count
-  )
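-
-# Illustrative usage sketch (hypothetical paths and values, not part of the
-# original module):
-#   config = tf.estimator.RunConfig(model_dir='/tmp/spam-model')
-#   estimator = build_estimator(config, '/tmp/spam-model', 'spam', class_count=2)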
diff --git a/tools/ml/trainer2/requirements.txt b/tools/ml/trainer2/requirements.txt
deleted file mode 100644
index 7ff5ef7..0000000
--- a/tools/ml/trainer2/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-google-cloud-storage==1.26.0
-tensorflow==2.1.0
-scikit-learn[alldeps]
diff --git a/tools/ml/trainer2/stopwords.py b/tools/ml/trainer2/stopwords.py
deleted file mode 100644
index c4e4c31..0000000
--- a/tools/ml/trainer2/stopwords.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright 2019 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-# Or at https://developers.google.com/open-source/licenses/bsd
-
-# A list of stopwords to parse text in component predictor.
-STOP_WORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
-  'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
-  'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
-  'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
-  'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am',
-  'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
-  'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but',
-  'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
-  'with', 'about', 'against', 'between', 'into', 'through', 'during',
-  'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
-  'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
-  'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
-  'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
-  'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
-  'can', 'will', 'just', 'don', 'should', 'now']
diff --git a/tools/ml/trainer2/task.py b/tools/ml/trainer2/task.py
deleted file mode 100644
index 2fa8580..0000000
--- a/tools/ml/trainer2/task.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright 2019 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-# Or at https://developers.google.com/open-source/licenses/bsd
-
-from __future__ import absolute_import
-
-import argparse
-import json
-import logging
-import os
-
-import tensorflow as tf
-from tensorflow.estimator import RunConfig
-from sklearn.model_selection import train_test_split
-
-from trainer2 import dataset
-from trainer2 import model
-from trainer2 import top_words
-from trainer2 import train_ml_helpers
-from trainer2.train_ml_helpers import COMPONENT_FEATURES
-from trainer2.train_ml_helpers import SPAM_FEATURE_HASHES
-
-INPUT_TYPE_MAP = {
-  'component': {'key': 'word_features', 'shape': (COMPONENT_FEATURES,)},
-  'spam': {'key': 'word_hashes', 'shape': (SPAM_FEATURE_HASHES,)}
-}
-
-
-def make_input_fn(trainer_type, features, targets,
-  num_epochs=None, shuffle=True, batch_size=128):
-  """Generate input function for training and testing.
-
-  Args:
-    trainer_type: spam / component
-    features: an array of features shape like INPUT_TYPE_MAP
-    targets: an array of labels with the same length of features
-    num_epochs: training epochs
-    batch_size: dataset batch size
-
-  Returns:
-    input function to feed into TrainSpec and EvalSpec.
-  """
-  def _input_fn():
-    def gen():
-      """Generator function to format feature and target. """
-      for feature, target in zip(features, targets):
-        yield feature[INPUT_TYPE_MAP[trainer_type]['key']], target
-
-    data = tf.data.Dataset.from_generator(
-        gen, (tf.float64, tf.int32),
-        output_shapes=(INPUT_TYPE_MAP[trainer_type]['shape'], ()))
-    data = data.map(lambda x, y: ({INPUT_TYPE_MAP[trainer_type]['key']: x}, y))
-    if shuffle:
-      data = data.shuffle(buffer_size=batch_size * 10)
-    data = data.repeat(num_epochs).batch(batch_size)
-    return data
-
-  return _input_fn
-
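-# Illustrative usage sketch (hypothetical variables, not part of the original
-# module): the returned callable builds a tf.data.Dataset of
-# ({feature_key: tensor}, label) batches for TrainSpec/EvalSpec, e.g.
-#   train_input_fn = make_input_fn('spam', features_train, targets_train,
-#                                  num_epochs=1, batch_size=32)
-#   batched_dataset = train_input_fn()
-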
-
-def generate_json_input_fn(trainer_type):
-  """Generate ServingInputReceiver function for testing.
-
-  Args:
-    trainer_type: spam / component
-
-  Returns:
-    ServingInputReceiver function to feed into exporter.
-  """
-  feature_spec = {
-    INPUT_TYPE_MAP[trainer_type]['key']:
-    tf.io.FixedLenFeature(INPUT_TYPE_MAP[trainer_type]['shape'], tf.float32)
-  }
-  return tf.estimator.export.build_parsing_serving_input_receiver_fn(
-    feature_spec)
-
-
-def train_and_evaluate_model(config, hparams):
-  """Runs the local training job given provided command line arguments.
-
-  Args:
-    config: RunConfig object
-    hparams: dictionary passed by command line arguments
-
-  """
-
-  if hparams['train_file']:
-    with open(hparams['train_file']) as f:
-      if hparams['trainer_type'] == 'spam':
-        contents, labels, _ = train_ml_helpers.spam_from_file(f)
-      else:
-        contents, labels = train_ml_helpers.component_from_file(f)
-  else:
-    contents, labels = dataset.fetch_training_data(
-        hparams['gcs_bucket'], hparams['gcs_prefix'], hparams['trainer_type'])
-
-  logger.info('Training data received. Len: %d' % len(contents))
-
-  # Generate features and targets from extracted contents and labels.
-  if hparams['trainer_type'] == 'spam':
-    features, targets = train_ml_helpers \
-      .transform_spam_csv_to_features(contents, labels)
-  else:
-    #top_list = top_words.make_top_words_list(contents, hparams['job_dir'])
-    top_list = top_words.parse_words_from_content(contents)
-    features, targets, index_to_component = train_ml_helpers \
-      .transform_component_csv_to_features(contents, labels, top_list)
-
-  # Split training and testing set.
-  logger.info('Features generated')
-  features_train, features_test, targets_train, targets_test = train_test_split(
-      features, targets, test_size=0.2, random_state=42)
-
-  # Generate TrainSpec and EvalSpec for train and evaluate.
-  estimator = model.build_estimator(config=config,
-                                    job_dir=hparams['job_dir'],
-                                    trainer_type=hparams['trainer_type'],
-                                    class_count=len(set(labels)))
-  exporter = tf.estimator.LatestExporter(name='saved_model',
-    serving_input_receiver_fn=generate_json_input_fn(hparams['trainer_type']))
-
-  train_spec = tf.estimator.TrainSpec(
-      input_fn=make_input_fn(
-          hparams['trainer_type'], features_train, targets_train,
-          num_epochs=hparams['num_epochs'],
-          batch_size=hparams['train_batch_size']),
-      max_steps=hparams['train_steps'])
-  eval_spec = tf.estimator.EvalSpec(
-      input_fn=make_input_fn(
-          hparams['trainer_type'], features_test, targets_test,
-          shuffle=False, batch_size=hparams['eval_batch_size']),
-      exporters=exporter,
-      steps=hparams['eval_steps'])
-
-  if hparams['trainer_type'] == 'component':
-    store_component_conversion(hparams['job_dir'], index_to_component)
-
-  result = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-  logging.info(result)
-
-  parsing_spec = tf.feature_column.make_parse_example_spec(
-      model.INPUT_COLUMNS[hparams['trainer_type']])
-  serving_input_fn = (
-      tf.estimator.export.build_parsing_serving_input_receiver_fn(parsing_spec))
-  estimator.export_saved_model(hparams['job_dir'], serving_input_fn)
-
-
-def store_component_conversion(job_dir, data):
-  logger.info('job_dir: %s' % job_dir)
-
-  # Store the component conversion mapping locally, creating job_dir
-  # (and any missing parent directories) if needed.
-  os.makedirs(job_dir, exist_ok=True)
-  with open(job_dir + '/component_index.json', 'w') as f:
-    f.write(json.dumps(data))
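-
-# component_index.json maps class indices back to component names so serving
-# code can translate predictions, e.g. (illustrative values only):
-#   {"0": "Blink>DOM", "1": "UI>Browser"}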
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-
-  # Input Arguments
-  parser.add_argument(
-      '--train-file',
-      help='GCS or local path to training data',
-  )
-  parser.add_argument(
-      '--gcs-bucket',
-      help='GCS bucket for training data.',
-  )
-  parser.add_argument(
-      '--gcs-prefix',
-      help='Training data path prefix inside GCS bucket.',
-  )
-  parser.add_argument(
-    '--num-epochs',
-    help="""\
-    Maximum number of training data epochs on which to train.
-    If both --train-steps and --num-epochs are specified,
-    the training job will run for --num-epochs.
-    If unspecified, the job will run for --train-steps.\
-    """,
-    type=int,
-  )
-  parser.add_argument(
-    '--train-batch-size',
-    help='Batch size for training steps',
-    type=int,
-    default=128
-  )
-  parser.add_argument(
-    '--eval-batch-size',
-    help='Batch size for evaluation steps',
-    type=int,
-    default=128
-  )
-
-  # Training arguments
-  parser.add_argument(
-    '--job-dir',
-    help='GCS location to write checkpoints and export models',
-    required=True
-  )
-
-  # Logging arguments
-  parser.add_argument(
-    '--verbosity',
-    choices=[
-        'DEBUG',
-        'ERROR',
-        'CRITICAL',
-        'INFO',
-        'WARNING'
-    ],
-    default='INFO',
-  )
-
-  # Input function arguments
-  parser.add_argument(
-    '--train-steps',
-    help="""\
-    Number of steps to run the training job for. If --num-epochs is not
-    specified, this must be set; otherwise the training job will run
-    indefinitely.\
-    """,
-    type=int,
-    required=True
-  )
-  parser.add_argument(
-    '--eval-steps',
-    help='Number of steps to run evaluation for at each checkpoint',
-    default=100,
-    type=int
-  )
-  parser.add_argument(
-    '--trainer-type',
-    help='Which trainer to use (spam or component)',
-    choices=['spam', 'component'],
-    required=True
-  )
-
-  args = parser.parse_args()
-
-  logger = logging.getLogger()
-  logger.setLevel(getattr(logging, args.verbosity))
-
-  if not args.num_epochs:
-    args.num_epochs = args.train_steps
-
-  # Set C++ Graph Execution level verbosity.
-  os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(
-    getattr(logging, args.verbosity) // 10)
-
-  # Run the training job.
-  train_and_evaluate_model(
-    config=RunConfig(model_dir=args.job_dir),
-    hparams=vars(args))
diff --git a/tools/ml/trainer2/top_words.py b/tools/ml/trainer2/top_words.py
deleted file mode 100644
index bb57699..0000000
--- a/tools/ml/trainer2/top_words.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2019 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-# Or at https://developers.google.com/open-source/licenses/bsd
-
-from __future__ import absolute_import
-
-import os
-
-from trainer2 import train_ml_helpers
-from trainer2.stopwords import STOP_WORDS
-
-
-def GenerateTopWords(word_dict):
-  """Requires ./stopwords.txt exist in folder for the function to run.
-  """
-  stop_words = [s.encode('utf-8') for s in STOP_WORDS]
-  sorted_words = sorted(word_dict, key=word_dict.get, reverse=True)
-  top_words = []
-  index = 0
-
-  while len(top_words) < train_ml_helpers.COMPONENT_FEATURES:
-    if sorted_words[index] not in stop_words:
-      top_words.append(sorted_words[index])
-    index += 1
-
-  return top_words
-
-
-def parse_words_from_content(contents):
-  """Returns given list of strings, extract the top (most common) words.
-  """
-  word_dict = {}
-  for content in contents:
-    words = content.encode('utf-8').split()
-    for word in words:
-      if word in word_dict:
-        word_dict[word] += 1
-      else:
-        word_dict[word] = 1
-
-  return GenerateTopWords(word_dict)
-
-
-def make_top_words_list(contents, job_dir):
-  """Returns the top (most common) words in the entire dataset for component
-  prediction. If a file is already stored in job_dir containing these words, the
-  words from the file are simply returned. Otherwise, the most common words are
-  determined and written to job_dir, before being returned.
-
-  Returns:
-    A list of the most common words in the dataset (the number of them
-    determined by train_ml_helpers.COMPONENT_FEATURES).
-  """
-  if not os.path.exists(job_dir):
-    os.mkdir(job_dir)
-  if os.access(job_dir + 'topwords.txt', os.R_OK):
-    print("Found topwords.txt")
-    with open(job_dir + 'topwords.txt', 'rb') as f:
-      top_words = f.read().split()
-  else:
-    top_words = parse_words_from_content(contents)
-    with open(job_dir + 'topwords.txt', 'w') as f:
-      for word in top_words:
-        f.write('%s\n' % word.decode('utf-8'))
-  return top_words
diff --git a/tools/ml/trainer2/train_ml_helpers.py b/tools/ml/trainer2/train_ml_helpers.py
deleted file mode 100644
index 36113a2..0000000
--- a/tools/ml/trainer2/train_ml_helpers.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright 2019 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-# Or at https://developers.google.com/open-source/licenses/bsd
-
-"""
-Helper functions for spam and component classification. These are mostly for
-feature extraction, so that the serving code and training code both use the same
-set of features.
-TODO(jeffcarp): This file is a duplicate of services/ml_helpers.py
-  (with slight differences). The two will eventually be merged into one.
-"""
-
-from __future__ import absolute_import
-
-import csv
-import hashlib
-import re
-import sys
-
-SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
-LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
-DELIMITERS = [r'\s', r'\,', r'\.', r'\?', r'!', r'\:', r'\(', r'\)']
-
-# Must be identical to settings.spam_feature_hashes.
-SPAM_FEATURE_HASHES = 500
-# Must be identical to settings.component_features.
-COMPONENT_FEATURES = 5000
-
-
-def _ComponentFeatures(content, num_features, top_words):
-  """
-    Uses the most common words in the entire dataset as features.
-    Each feature value is the count of the corresponding common word
-    in the issue's text.
-  """
-
-  features = [0] * num_features
-  for blob in content:
-    words = blob.split()
-    for word in words:
-      if word in top_words:
-        features[top_words[word]] += 1
-
-  return features
-
-
-def _SpamHashFeatures(content, num_features):
-  """
-    Feature hashing is a fast and compact way to turn a string of text into a
-    vector of feature values for classification and training.
-    See also: https://en.wikipedia.org/wiki/Feature_hashing
-    This is a simple implementation that doesn't try to minimize collisions
-    or anything else fancy.
-  """
-  features = [0] * num_features
-  total = 0.0
-  for blob in content:
-    words = re.split('|'.join(DELIMITERS).encode('utf-8'), blob)
-    for word in words:
-      feature_index = int(int(hashlib.sha1(word).hexdigest(), 16)
-                          % num_features)
-      features[feature_index] += 1.0
-      total += 1.0
-
-  if total > 0:
-    features = [f / total for f in features]
-
-  return features
-
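-# Worked example (illustrative, not part of the original module): with
-# num_features=SPAM_FEATURE_HASHES (500), each token is mapped to a bucket via
-#   int(int(hashlib.sha1(word).hexdigest(), 16) % 500)
-# and the vector is then normalized by the total token count, so every
-# document becomes a 500-dimensional vector regardless of vocabulary size.
-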
-
-def GenerateFeaturesRaw(content, num_features, top_words=None):
-  """Generates a vector of features for a given issue or comment.
-
-  Args:
-    content: The content of the issue's description and comments.
-    num_features: The number of features to generate.
-  """
-  # If we've been passed real unicode strings, convert them to just bytestrings.
-  for idx, value in enumerate(content):
-    content[idx] = value.encode('utf-8')
-  if top_words:
-    return {'word_features': _ComponentFeatures(content,
-                                                   num_features,
-                                                   top_words)}
-
-  return {'word_hashes': _SpamHashFeatures(content, num_features)}
-
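-# Illustrative calls (hypothetical inputs, not part of the original module):
-#   GenerateFeaturesRaw(['free money'], SPAM_FEATURE_HASHES)
-#     -> {'word_hashes': [...]}    # 500 normalized hash-bucket counts
-#   GenerateFeaturesRaw(['crash on start'], COMPONENT_FEATURES, top_words)
-#     -> {'word_features': [...]}  # 5000 top-word counts
-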
-
-def transform_spam_csv_to_features(contents, labels):
-  """Generate arrays of features and targets for spam.
-  """
-  features = []
-  targets = []
-  for i, row in enumerate(contents):
-    subject, content = row
-    label = labels[i]
-    features.append(GenerateFeaturesRaw([str(subject), str(content)],
-                                 SPAM_FEATURE_HASHES))
-    targets.append(1 if label == 'spam' else 0)
-  return features, targets
-
-
-def transform_component_csv_to_features(contents, labels, top_list):
-  """Generate arrays of features and targets for components.
-  """
-  features = []
-  targets = []
-  top_words = {}
-
-  for i, row in enumerate(top_list):
-    top_words[row] = i
-
-  component_to_index = {}
-  index_to_component = {}
-  component_index = 0
-
-  for i, content in enumerate(contents):
-    component = labels[i]
-    component = str(component).split(",")[0]
-
-    if component not in component_to_index:
-      component_to_index[component] = component_index
-      index_to_component[component_index] = component
-      component_index += 1
-
-    features.append(GenerateFeaturesRaw([content],
-                                 COMPONENT_FEATURES,
-                                 top_words))
-    targets.append(component_to_index[component])
-
-  return features, targets, index_to_component
-
-
-def spam_from_file(f):
-  """Reads a training data file and returns arrays of contents and labels."""
-  contents = []
-  labels = []
-  skipped_rows = 0
-  for row in csv.reader(f):
-    if len(row) >= len(LEGACY_CSV_COLUMNS):
-      # Keep subject and content; any trailing email column is dropped.
-      contents.append(row[1:3])
-      labels.append(row[0])
-    else:
-      skipped_rows += 1
-  return contents, labels, skipped_rows
-
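-# Illustrative row (hypothetical data, not from a real CSV): the line
-#   spam,"Win a prize","Click this link",user@example.com
-# yields the contents entry ['Win a prize', 'Click this link'] and label 'spam'.
-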
-
-def component_from_file(f):
-  """Reads a training data file and returns arrays of contents and labels."""
-  contents = []
-  labels = []
-  csv.field_size_limit(sys.maxsize)
-  for row in csv.reader(f):
-    label, content = row
-    contents.append(content)
-    labels.append(label)
-  return contents, labels