Project import generated by Copybara.
GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/tools/ml/trainer2/dataset.py b/tools/ml/trainer2/dataset.py
new file mode 100644
index 0000000..9e7ae77
--- /dev/null
+++ b/tools/ml/trainer2/dataset.py
@@ -0,0 +1,97 @@
+# Copyright 2020 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+# Or at https://developers.google.com/open-source/licenses/bsd
+
+from __future__ import print_function
+from __future__ import division
+from __future__ import absolute_import
+
+import io
+import tensorflow as tf
+
+from googleapiclient import discovery
+from googleapiclient import errors
+from oauth2client.client import GoogleCredentials
+
+from trainer2 import train_ml_helpers
+
+
+def fetch_training_data(bucket, prefix, trainer_type):
+  """Fetch and parse training data CSVs from the given GCS bucket/prefix."""
+  credentials = GoogleCredentials.get_application_default()
+  storage = discovery.build('storage', 'v1', credentials=credentials)
+  objects = storage.objects()
+
+  request = objects.list(bucket=bucket, prefix=prefix)
+  response = make_api_request(request)
+  # An empty listing carries no 'items' key, so default to an empty list.
+  items = response.get('items', [])
+  csv_filepaths = [blob.get('name') for blob in items]
+
+  if trainer_type == 'spam':
+    return fetch_spam(csv_filepaths, bucket, objects)
+  else:
+    return fetch_component(csv_filepaths, bucket, objects)
+
+
+def fetch_spam(csv_filepaths, bucket, objects):
+  """Fetch and parse the spam training data, returning (contents, labels)."""
+  all_contents = []
+  all_labels = []
+  # Always include the full-android and full-support CSVs as well.
+  csv_filepaths = [
+      'spam-training-data/full-android.csv',
+      'spam-training-data/full-support.csv',
+  ] + csv_filepaths
+
+  for filepath in csv_filepaths:
+    media = fetch_training_csv(filepath, objects, bucket)
+    contents, labels, skipped_rows = train_ml_helpers.spam_from_file(
+        io.StringIO(media))
+
+    # Sanity check: the contents and labels should be matched pairs.
+    if len(contents) == len(labels) != 0:
+      all_contents.extend(contents)
+      all_labels.extend(labels)
+
+    tf.get_logger().info(
+        '{:<40}{:<20}{:<20}'.format(
+            filepath, 'added %d rows' % len(contents),
+            'skipped %d rows' % skipped_rows))
+
+  return all_contents, all_labels
+
+
+def fetch_component(csv_filepaths, bucket, objects):
+  """Fetch and parse the component training data from GCS."""
+  all_contents = []
+  all_labels = []
+  for filepath in csv_filepaths:
+    media = fetch_training_csv(filepath, objects, bucket)
+    contents, labels = train_ml_helpers.component_from_file(io.StringIO(media))
+
+    # Sanity check: the contents and labels should be matched pairs.
+    if len(contents) == len(labels) != 0:
+      all_contents.extend(contents)
+      all_labels.extend(labels)
+
+    tf.get_logger().info(
+        '{:<40}{:<20}'.format(filepath, 'added %d rows' % len(contents)))
+
+  return all_contents, all_labels
+
+
+def fetch_training_csv(filepath, objects, bucket):
+  request = objects.get_media(bucket=bucket, object=filepath)
+  # get_media returns raw bytes; decode them to str for the csv parser.
+  return str(make_api_request(request), 'utf-8')
+
+
+def make_api_request(request):
+  try:
+    return request.execute()
+  except errors.HttpError as err:
+    tf.get_logger().error('There was an error with the API. Details:')
+    tf.get_logger().error(err._get_reason())
+    raise
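
Usage sketch (assumes application-default credentials are configured; the
bucket and prefix names below are hypothetical, not taken from this change):

    from trainer2 import dataset

    # Placeholder bucket/prefix values for illustration only.
    contents, labels = dataset.fetch_training_data(
        bucket='monorail-training-bucket',
        prefix='component-training-data/',
        trainer_type='component')
    print('fetched %d training examples' % len(contents))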