Project import generated by Copybara.

GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/tools/ml/trainer2/README.md b/tools/ml/trainer2/README.md
new file mode 100644
index 0000000..d32c8bf
--- /dev/null
+++ b/tools/ml/trainer2/README.md
@@ -0,0 +1,35 @@
+# Trainer
+
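+Both trainers assume the Python dependencies from `requirements.txt` are
+installed, for example (a minimal sketch, assuming `pip` is available for
+Python 3):
+
+```sh
+python3 -m pip install -r requirements.txt
+```
+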
+## Monorail Spam Classifier
+
+To run the trainer locally, supply a local `--train-file`; the remaining
+flags are shown in the example below.
+
+```sh
+TRAIN_FILE=./spam_training_examples.csv
+OUTPUT_DIR=/tmp/monospam-local-training/
+rm -rf $OUTPUT_DIR
+python3 ./task.py \
+    --train-file $TRAIN_FILE \
+    --job-dir $OUTPUT_DIR \
+    --train-steps 1000 \
+    --verbosity DEBUG \
+    --trainer-type spam
+```
+## Monorail Component Predictor
+
+To run the trainer locally, supply a local `--train-file`; the remaining
+flags are shown in the example below.
+
+```sh
+TRAIN_FILE=./component_training_examples.csv
+OUTPUT_DIR=/tmp/monospam-local-training/
+rm -rf $OUTPUT_DIR
+python3 ./task.py \
+    --train-file $TRAIN_FILE \
+    --job-dir $OUTPUT_DIR \
+    --train-steps 10000 \
+    --eval-steps 1000 \
+    --verbosity DEBUG \
+    --trainer-type component
+```
\ No newline at end of file
diff --git a/tools/ml/trainer2/__init__.py b/tools/ml/trainer2/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tools/ml/trainer2/__init__.py
diff --git a/tools/ml/trainer2/dataset.py b/tools/ml/trainer2/dataset.py
new file mode 100644
index 0000000..9e7ae77
--- /dev/null
+++ b/tools/ml/trainer2/dataset.py
@@ -0,0 +1,95 @@
+# Copyright 2020 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+import io
+import tensorflow as tf
+
+from googleapiclient import discovery
+from googleapiclient import errors
+from oauth2client.client import GoogleCredentials
+
+from trainer2 import train_ml_helpers
+
+
+def fetch_training_data(bucket, prefix, trainer_type):
+
+  credentials = GoogleCredentials.get_application_default()
+  storage = discovery.build('storage', 'v1', credentials=credentials)
+  objects = storage.objects()
+
+  request = objects.list(bucket=bucket, prefix=prefix)
+  response = make_api_request(request)
+  items = response.get('items')
+  csv_filepaths = [blob.get('name') for blob in items]
+
+  if trainer_type == 'spam':
+    return fetch_spam(csv_filepaths, bucket, objects)
+  else:
+    return fetch_component(csv_filepaths, bucket, objects)
+
+
+def fetch_spam(csv_filepaths, bucket, objects):
+
+  all_contents = []
+  all_labels = []
+  # Always include the two full spam training datasets in addition to any
+  # files found under the GCS prefix.
+  csv_filepaths = [
+      'spam-training-data/full-android.csv',
+      'spam-training-data/full-support.csv',
+  ] + csv_filepaths
+
+  for filepath in csv_filepaths:
+    media = fetch_training_csv(filepath, objects, bucket)
+    contents, labels, skipped_rows = train_ml_helpers.spam_from_file(
+        io.StringIO(media))
+
+    # Sanity check: the contents and labels should be matched pairs.
+    if len(contents) == len(labels) != 0:
+      all_contents.extend(contents)
+      all_labels.extend(labels)
+
+    tf.get_logger().info(
+        '{:<40}{:<20}{:<20}'.format(
+            filepath, 'added %d rows' % len(contents),
+            'skipped %d rows' % skipped_rows))
+
+  return all_contents, all_labels
+
+
+def fetch_component(csv_filepaths, bucket, objects):
+
+  all_contents = []
+  all_labels = []
+  for filepath in csv_filepaths:
+    media = fetch_training_csv(filepath, objects, bucket)
+    contents, labels = train_ml_helpers.component_from_file(io.StringIO(media))
+
+    # Sanity check: the contents and labels should be matched pairs.
+    if len(contents) == len(labels) != 0:
+      all_contents.extend(contents)
+      all_labels.extend(labels)
+
+    tf.get_logger().info(
+        '{:<40}{:<20}'.format(filepath, 'added %d rows' % len(contents)))
+
+  return all_contents, all_labels
+
+
+def fetch_training_csv(filepath, objects, bucket):
+  request = objects.get_media(bucket=bucket, object=filepath)
+  return str(make_api_request(request), 'utf-8')
+
+
+def make_api_request(request):
+  try:
+    return request.execute()
+  except errors.HttpError as err:
+    tf.get_logger().error('There was an error with the API. Details:')
+    tf.get_logger().error(err._get_reason())
+    raise
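+
+
+# Example usage (a sketch; the bucket and prefix are placeholders, and GCP
+# application-default credentials must be available):
+#   contents, labels = fetch_training_data(
+#       'my-training-bucket', 'spam-training-data/', 'spam')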
diff --git a/tools/ml/trainer2/model.py b/tools/ml/trainer2/model.py
new file mode 100644
index 0000000..823d0d1
--- /dev/null
+++ b/tools/ml/trainer2/model.py
@@ -0,0 +1,45 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+
+import tensorflow as tf
+
+from trainer2.train_ml_helpers import COMPONENT_FEATURES
+from trainer2.train_ml_helpers import SPAM_FEATURE_HASHES
+
+# Important: we assume this list mirrors the output of GenerateFeaturesRaw.
+INPUT_COLUMNS = {'component': [
+                     tf.feature_column.numeric_column(
+                         key='word_features',
+                         shape=(COMPONENT_FEATURES,)),
+                 ],
+                 'spam': [
+                     tf.feature_column.numeric_column(
+                         key='word_hashes',
+                         shape=(SPAM_FEATURE_HASHES,)),
+                 ]}
+
+def build_estimator(config, job_dir, trainer_type, class_count):
+  """Returns a tf.Estimator.
+
+  Args:
+    config: tf.contrib.learn.RunConfig defining the runtime environment for the
+      estimator (including model_dir).
+  Returns:
+    A LinearClassifier
+  """
+  return tf.estimator.DNNClassifier(
+    config=config,
+    model_dir=job_dir,
+    feature_columns=(INPUT_COLUMNS[trainer_type]),
+    hidden_units=[1024, 512, 256],
+    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
+      beta_1=0.9,
+      beta_2=0.999,
+      epsilon=1e-08,
+      name='Adam'),
+    n_classes=class_count
+  )
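+
+
+# Example usage (a sketch mirroring how task.py calls build_estimator; the
+# paths and class count are placeholders):
+#   estimator = build_estimator(
+#       config=tf.estimator.RunConfig(model_dir='/tmp/monospam-local-training/'),
+#       job_dir='/tmp/monospam-local-training/',
+#       trainer_type='spam',
+#       class_count=2)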
diff --git a/tools/ml/trainer2/requirements.txt b/tools/ml/trainer2/requirements.txt
new file mode 100644
index 0000000..7ff5ef7
--- /dev/null
+++ b/tools/ml/trainer2/requirements.txt
@@ -0,0 +1,3 @@
+google-cloud-storage==1.26.0
+tensorflow==2.1.0
+scikit-learn[alldeps]
diff --git a/tools/ml/trainer2/stopwords.py b/tools/ml/trainer2/stopwords.py
new file mode 100644
index 0000000..c4e4c31
--- /dev/null
+++ b/tools/ml/trainer2/stopwords.py
@@ -0,0 +1,21 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+# A list of stopwords to parse text in component predictor.
+STOP_WORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
+  'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
+  'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
+  'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
+  'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am',
+  'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+  'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but',
+  'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
+  'with', 'about', 'against', 'between', 'into', 'through', 'during',
+  'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
+  'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
+  'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
+  'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
+  'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
+  'can', 'will', 'just', 'don', 'should', 'now']
diff --git a/tools/ml/trainer2/task.py b/tools/ml/trainer2/task.py
new file mode 100644
index 0000000..2fa8580
--- /dev/null
+++ b/tools/ml/trainer2/task.py
@@ -0,0 +1,256 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+
+import argparse
+import json
+import logging
+import os
+
+import tensorflow as tf
+from tensorflow.estimator import RunConfig
+from sklearn.model_selection import train_test_split
+
+from trainer2 import dataset
+from trainer2 import model
+from trainer2 import top_words
+from trainer2 import train_ml_helpers
+from trainer2.train_ml_helpers import COMPONENT_FEATURES
+from trainer2.train_ml_helpers import SPAM_FEATURE_HASHES
+
+INPUT_TYPE_MAP = {
+  'component': {'key': 'word_features', 'shape': (COMPONENT_FEATURES,)},
+  'spam': {'key': 'word_hashes', 'shape': (SPAM_FEATURE_HASHES,)}
+}
+
+
+def make_input_fn(trainer_type, features, targets,
+  num_epochs=None, shuffle=True, batch_size=128):
+  """Generate input function for training and testing.
+
+  Args:
+    trainer_type: spam / component
+    features: an array of feature dicts shaped as described by INPUT_TYPE_MAP
+    targets: an array of labels with the same length as features
+    num_epochs: number of training epochs
+    shuffle: whether to shuffle the dataset before batching
+    batch_size: dataset batch size
+
+  Returns:
+    input function to feed into TrainSpec and EvalSpec.
+  """
+  def _input_fn():
+    def gen():
+      """Generator function to format feature and target. """
+      for feature, target in zip(features, targets):
+        yield feature[INPUT_TYPE_MAP[trainer_type]['key']], target
+
+    data = tf.data.Dataset.from_generator(
+        gen, (tf.float64, tf.int32),
+        output_shapes=(INPUT_TYPE_MAP[trainer_type]['shape'], ()))
+    data = data.map(lambda x, y: ({INPUT_TYPE_MAP[trainer_type]['key']: x}, y))
+    if shuffle:
+      data = data.shuffle(buffer_size=batch_size * 10)
+    data = data.repeat(num_epochs).batch(batch_size)
+    return data
+
+  return _input_fn
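+
+# Example (a sketch): for local spam training, an input function might be
+# built as
+#   input_fn = make_input_fn('spam', features, targets, num_epochs=1,
+#                            batch_size=32)
+# and, when called, returns batches of ({'word_hashes': <500 floats>}, label).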
+
+
+def generate_json_input_fn(trainer_type):
+  """Generate ServingInputReceiver function for testing.
+
+  Args:
+    trainer_type: spam / component
+
+  Returns:
+    ServingInputReceiver function to feed into exporter.
+  """
+  feature_spec = {
+    INPUT_TYPE_MAP[trainer_type]['key']:
+    tf.io.FixedLenFeature(INPUT_TYPE_MAP[trainer_type]['shape'], tf.float32)
+  }
+  return tf.estimator.export.build_parsing_serving_input_receiver_fn(
+    feature_spec)
+
+
+def train_and_evaluate_model(config, hparams):
+  """Runs the local training job given provided command line arguments.
+
+  Args:
+    config: RunConfig object
+    hparams: dictionary passed by command line arguments
+
+  """
+
+  if hparams['train_file']:
+    with open(hparams['train_file']) as f:
+      if hparams['trainer_type'] == 'spam':
+        contents, labels, _ = train_ml_helpers.spam_from_file(f)
+      else:
+        contents, labels = train_ml_helpers.component_from_file(f)
+  else:
+    contents, labels = dataset.fetch_training_data(
+        hparams['gcs_bucket'], hparams['gcs_prefix'], hparams['trainer_type'])
+
+  logger.info('Training data received. Len: %d' % len(contents))
+
+  # Generate features and targets from extracted contents and labels.
+  if hparams['trainer_type'] == 'spam':
+    features, targets = train_ml_helpers \
+      .transform_spam_csv_to_features(contents, labels)
+  else:
+    #top_list = top_words.make_top_words_list(contents, hparams['job_dir'])
+    top_list = top_words.parse_words_from_content(contents)
+    features, targets, index_to_component = train_ml_helpers \
+      .transform_component_csv_to_features(contents, labels, top_list)
+
+  # Split training and testing set.
+  logger.info('Features generated')
+  features_train, features_test, targets_train, targets_test = train_test_split(
+      features, targets, test_size=0.2, random_state=42)
+
+  # Generate TrainSpec and EvalSpec for train and evaluate.
+  estimator = model.build_estimator(config=config,
+                                    job_dir=hparams['job_dir'],
+                                    trainer_type=hparams['trainer_type'],
+                                    class_count=len(set(labels)))
+  exporter = tf.estimator.LatestExporter(name='saved_model',
+    serving_input_receiver_fn=generate_json_input_fn(hparams['trainer_type']))
+
+  train_spec = tf.estimator.TrainSpec(
+    input_fn=make_input_fn(hparams['trainer_type'],
+    features_train, targets_train, num_epochs=hparams['num_epochs'],
+    batch_size=hparams['train_batch_size']),
+    max_steps=hparams['train_steps'])
+  eval_spec = tf.estimator.EvalSpec(
+    input_fn=make_input_fn(hparams['trainer_type'],
+    features_test, targets_test, shuffle=False,
+    batch_size=hparams['eval_batch_size']),
+    exporters=exporter, steps=hparams['eval_steps'])
+
+  if hparams['trainer_type'] == 'component':
+    store_component_conversion(hparams['job_dir'], index_to_component)
+
+  result = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
+  logging.info(result)
+
+  parsing_spec = tf.feature_column.make_parse_example_spec(
+      model.INPUT_COLUMNS[hparams['trainer_type']])
+  serving_input_fn = (
+      tf.estimator.export.build_parsing_serving_input_receiver_fn(parsing_spec))
+  estimator.export_saved_model(hparams['job_dir'], serving_input_fn)
+
+
+def store_component_conversion(job_dir, data):
+  logger.info('job_dir: %s' % job_dir)
+
+  # Store component conversion locally.
+  os.makedirs(job_dir, exist_ok=True)
+  with open(job_dir + '/component_index.json', 'w') as f:
+    f.write(json.dumps(data))
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+
+  # Input Arguments
+  parser.add_argument(
+      '--train-file',
+      help='GCS or local path to training data',
+  )
+  parser.add_argument(
+      '--gcs-bucket',
+      help='GCS bucket for training data.',
+  )
+  parser.add_argument(
+      '--gcs-prefix',
+      help='Training data path prefix inside GCS bucket.',
+  )
+  parser.add_argument(
+    '--num-epochs',
+    help="""\
+    Maximum number of training data epochs on which to train.
+    If both --train-steps and --num-epochs are specified,
+    the training job will run for --num-epochs.
+    If unspecified, the job will run for --train-steps.\
+    """,
+    type=int,
+  )
+  parser.add_argument(
+    '--train-batch-size',
+    help='Batch size for training steps',
+    type=int,
+    default=128
+  )
+  parser.add_argument(
+    '--eval-batch-size',
+    help='Batch size for evaluation steps',
+    type=int,
+    default=128
+  )
+
+  # Training arguments
+  parser.add_argument(
+    '--job-dir',
+    help='GCS location to write checkpoints and export models',
+    required=True
+  )
+
+  # Logging arguments
+  parser.add_argument(
+    '--verbosity',
+    choices=[
+        'DEBUG',
+        'ERROR',
+        'CRITICAL',
+        'INFO',
+        'WARNING'
+    ],
+    default='INFO',
+  )
+
+  # Input function arguments
+  parser.add_argument(
+    '--train-steps',
+    help="""\
+    Number of steps to run the training job for. If --num-epochs is not
+    specified, this must be set; otherwise the training job will run
+    indefinitely.\
+    """,
+    type=int,
+    required=True
+  )
+  parser.add_argument(
+    '--eval-steps',
+    help='Number of steps to run evaluation for at each checkpoint',
+    default=100,
+    type=int
+  )
+  parser.add_argument(
+    '--trainer-type',
+    help='Which trainer to use (spam or component)',
+    choices=['spam', 'component'],
+    required=True
+  )
+
+  args = parser.parse_args()
+
+  logger = logging.getLogger()
+  logger.setLevel(getattr(logging, args.verbosity))
+
+  if not args.num_epochs:
+    args.num_epochs = args.train_steps
+
+  # Set C++ Graph Execution level verbosity.
+  os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(
+    getattr(logging, args.verbosity) // 10)
+
+  # Run the training job.
+  train_and_evaluate_model(
+    config=RunConfig(model_dir=args.job_dir),
+    hparams=vars(args))
diff --git a/tools/ml/trainer2/top_words.py b/tools/ml/trainer2/top_words.py
new file mode 100644
index 0000000..bb57699
--- /dev/null
+++ b/tools/ml/trainer2/top_words.py
@@ -0,0 +1,66 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import absolute_import
+
+import os
+
+from trainer2 import train_ml_helpers
+from trainer2.stopwords import STOP_WORDS
+
+
+def GenerateTopWords(word_dict):
+  """Requires ./stopwords.txt exist in folder for the function to run.
+  """
+  stop_words = [s.encode('utf-8') for s in STOP_WORDS]
+  sorted_words = sorted(word_dict, key=word_dict.get, reverse=True)
+  top_words = []
+  index = 0
+
+  while (len(top_words) < train_ml_helpers.COMPONENT_FEATURES
+         and index < len(sorted_words)):
+    if sorted_words[index] not in stop_words:
+      top_words.append(sorted_words[index])
+    index += 1
+
+  return top_words
+
+
+def parse_words_from_content(contents):
+  """Returns given list of strings, extract the top (most common) words.
+  """
+  word_dict = {}
+  for content in contents:
+    words = content.encode('utf-8').split()
+    for word in words:
+      if word in word_dict:
+        word_dict[word] += 1
+      else:
+        word_dict[word] = 1
+
+  return GenerateTopWords(word_dict)
+
+
+def make_top_words_list(contents, job_dir):
+  """Returns the top (most common) words in the entire dataset for component
+  prediction. If a file is already stored in job_dir containing these words, the
+  words from the file are simply returned. Otherwise, the most common words are
+  determined and written to job_dir, before being returned.
+
+  Returns:
+    A list of the most common words in the dataset (the number of them
+    determined by train_ml_helpers.COMPONENT_FEATURES).
+  """
+  if not os.path.exists(job_dir):
+    os.mkdir(job_dir)
+  topwords_path = os.path.join(job_dir, 'topwords.txt')
+  if os.access(topwords_path, os.R_OK):
+    print("Found topwords.txt")
+    with open(topwords_path, 'rb') as f:
+      top_words = f.read().split()
+  else:
+    top_words = parse_words_from_content(contents)
+    with open(topwords_path, 'w') as f:
+      for word in top_words:
+        f.write('%s\n' % word.decode('utf-8'))
+  return top_words
diff --git a/tools/ml/trainer2/train_ml_helpers.py b/tools/ml/trainer2/train_ml_helpers.py
new file mode 100644
index 0000000..36113a2
--- /dev/null
+++ b/tools/ml/trainer2/train_ml_helpers.py
@@ -0,0 +1,158 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+"""
+Helper functions for spam and component classification. These are mostly for
+feature extraction, so that the serving code and training code both use the same
+set of features.
+TODO(jeffcarp): This file is a duplicate of services/ml_helpers.py
+  (with slight differences). The two will eventually be merged into one.
+"""
+
+from __future__ import absolute_import
+
+import csv
+import hashlib
+import re
+import sys
+
+SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
+LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
+DELIMITERS = [r'\s', r'\,', r'\.', r'\?', r'!', r'\:', r'\(', r'\)']
+
+# Must be identical to settings.spam_feature_hashes.
+SPAM_FEATURE_HASHES = 500
+# Must be identical to settings.component_features.
+COMPONENT_FEATURES = 5000
+
+
+def _ComponentFeatures(content, num_features, top_words):
+  """
+    This uses the most common words in the entire dataset as features.
+    The count of common words in the issue comments makes up the features.
+  """
+
+  features = [0] * num_features
+  for blob in content:
+    words = blob.split()
+    for word in words:
+      if word in top_words:
+        features[top_words[word]] += 1
+
+  return features
+
+
+def _SpamHashFeatures(content, num_features):
+  """
+    Feature hashing is a fast and compact way to turn a string of text into a
+    vector of feature values for classification and training.
+    See also: https://en.wikipedia.org/wiki/Feature_hashing
+    This is a simple implementation that doesn't try to minimize collisions
+    or anything else fancy.
+  """
+  features = [0] * num_features
+  total = 0.0
+  for blob in content:
+    words = re.split('|'.join(DELIMITERS).encode('utf-8'), blob)
+    for word in words:
+      feature_index = int(int(hashlib.sha1(word).hexdigest(), 16)
+                          % num_features)
+      features[feature_index] += 1.0
+      total += 1.0
+
+  if total > 0:
+    features = [f / total for f in features]
+
+  return features
+
+
+def GenerateFeaturesRaw(content, num_features, top_words=None):
+  """Generates a vector of features for a given issue or comment.
+
+  Args:
+    content: The content of the issue's description and comments.
+    num_features: The number of features to generate.
+    top_words: Optional dict mapping each top word to its feature index. When
+      provided, component word-count features are generated instead of spam
+      hash features.
+  """
+  # If we've been passed real unicode strings, convert them to just bytestrings.
+  for idx, value in enumerate(content):
+    content[idx] = value.encode('utf-8')
+  if top_words:
+    return {
+        'word_features': _ComponentFeatures(content, num_features, top_words)
+    }
+
+  return {'word_hashes': _SpamHashFeatures(content, num_features)}
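+
+# Example (a sketch): without top_words,
+#   GenerateFeaturesRaw(['a subject', 'a comment'], SPAM_FEATURE_HASHES)
+# returns {'word_hashes': [...]}, a normalized vector of SPAM_FEATURE_HASHES
+# hash-bucket frequencies; with top_words it returns {'word_features': [...]},
+# raw counts of the top words.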
+
+
+def transform_spam_csv_to_features(contents, labels):
+  """Generate arrays of features and targets for spam.
+  """
+  features = []
+  targets = []
+  for i, row in enumerate(contents):
+    subject, content = row
+    label = labels[i]
+    features.append(GenerateFeaturesRaw([str(subject), str(content)],
+                                 SPAM_FEATURE_HASHES))
+    targets.append(1 if label == 'spam' else 0)
+  return features, targets
+
+
+def transform_component_csv_to_features(contents, labels, top_list):
+  """Generate arrays of features and targets for components.
+  """
+  features = []
+  targets = []
+  top_words = {}
+
+  for i, row in enumerate(top_list):
+    top_words[row] = i
+
+  component_to_index = {}
+  index_to_component = {}
+  component_index = 0
+
+  for i, content in enumerate(contents):
+    component = labels[i]
+    component = str(component).split(",")[0]
+
+    if component not in component_to_index:
+      component_to_index[component] = component_index
+      index_to_component[component_index] = component
+      component_index += 1
+
+    features.append(GenerateFeaturesRaw([content],
+                                 COMPONENT_FEATURES,
+                                 top_words))
+    targets.append(component_to_index[component])
+
+  return features, targets, index_to_component
+
+
+def spam_from_file(f):
+  """Reads a training data file and returns arrays of contents and labels."""
+  contents = []
+  labels = []
+  skipped_rows = 0
+  for row in csv.reader(f):
+    if len(row) >= len(LEGACY_CSV_COLUMNS):
+      # Throw out email field.
+      contents.append(row[1:3])
+      labels.append(row[0])
+    else:
+      skipped_rows += 1
+  return contents, labels, skipped_rows
+
+
+def component_from_file(f):
+  """Reads a training data file and returns arrays of contents and labels."""
+  contents = []
+  labels = []
+  csv.field_size_limit(sys.maxsize)
+  for row in csv.reader(f):
+    label, content = row
+    contents.append(content)
+    labels.append(label)
+  return contents, labels