blob: 9e7ae7710a65453a8c2eb7b38cb8b8d3fcac58e6 [file] [log] [blame]
Copybara854996b2021-09-07 19:36:02 +00001# Copyright 2020 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4# Or at https://developers.google.com/open-source/licenses/bsd
5
6from __future__ import print_function
7from __future__ import division
8from __future__ import absolute_import
9
10import io
11import tensorflow as tf
12
13from googleapiclient import discovery
14from googleapiclient import errors
15from oauth2client.client import GoogleCredentials
16
17from trainer2 import train_ml_helpers
18
19
def fetch_training_data(bucket, prefix, trainer_type):
  """List training CSVs under bucket/prefix and load their labeled contents.

  Args:
    bucket: Name of the GCS bucket to read from.
    prefix: Object-name prefix used to filter the bucket listing.
    trainer_type: 'spam' loads spam training data; any other value loads
      component training data.

  Returns:
    A (contents, labels) pair of parallel lists aggregated across every
    CSV found under the prefix.
  """
  credentials = GoogleCredentials.get_application_default()
  storage = discovery.build('storage', 'v1', credentials=credentials)
  objects = storage.objects()

  request = objects.list(bucket=bucket, prefix=prefix)
  response = make_api_request(request)
  # Fix: 'items' is absent from the response when nothing matches the
  # prefix; defaulting to [] avoids a TypeError in the comprehension below.
  # NOTE(review): only the first page of results is read — no pagination
  # via nextPageToken; confirm buckets stay under one page of objects.
  items = response.get('items', [])
  csv_filepaths = [blob.get('name') for blob in items]

  if trainer_type == 'spam':
    return fetch_spam(csv_filepaths, bucket, objects)
  else:
    return fetch_component(csv_filepaths, bucket, objects)
35
36
def fetch_spam(csv_filepaths, bucket, objects):
  """Load spam training examples from CSV files stored in GCS.

  Two shared baseline CSVs are always read in addition to the
  caller-supplied paths. Files whose parsed contents and labels are not
  non-empty matched pairs are skipped.

  Returns:
    A (all_contents, all_labels) pair of parallel lists.
  """
  all_contents = []
  all_labels = []

  # Always include the shared baseline datasets ahead of the discovered
  # files.
  paths = [
      'spam-training-data/full-android.csv',
      'spam-training-data/full-support.csv',
  ] + csv_filepaths

  logger = tf.get_logger()
  for path in paths:
    media = fetch_training_csv(path, objects, bucket)
    contents, labels, skipped_rows = train_ml_helpers.spam_from_file(
        io.StringIO(media))

    # Keep only files that produced a non-empty, aligned set of examples.
    if len(contents) == len(labels) != 0:
      all_contents.extend(contents)
      all_labels.extend(labels)

    logger.info(
        '{:<40}{:<20}{:<20}'.format(
            path, 'added %d rows' % len(contents),
            'skipped %d rows' % skipped_rows))

  return all_contents, all_labels
63
64
def fetch_component(csv_filepaths, bucket, objects):
  """Load component training examples from CSV files stored in GCS.

  Files whose parsed contents and labels are not non-empty matched pairs
  are skipped.

  Returns:
    A (all_contents, all_labels) pair of parallel lists.
  """
  all_contents = []
  all_labels = []

  logger = tf.get_logger()
  for path in csv_filepaths:
    media = fetch_training_csv(path, objects, bucket)
    contents, labels = train_ml_helpers.component_from_file(io.StringIO(media))

    # Keep only files that produced a non-empty, aligned set of examples.
    if len(contents) == len(labels) != 0:
      all_contents.extend(contents)
      all_labels.extend(labels)

    logger.info(
        '{:<40}{:<20}'.format(path, 'added %d rows' % len(contents)))

  return all_contents, all_labels
82
83
def fetch_training_csv(filepath, objects, bucket):
  """Download one object from GCS and return its contents decoded as UTF-8."""
  media_request = objects.get_media(bucket=bucket, object=filepath)
  raw_bytes = make_api_request(media_request)
  return str(raw_bytes, 'utf-8')
87
88
def make_api_request(request):
  """Execute a googleapiclient request, logging HTTP errors before re-raising.

  Args:
    request: A googleapiclient HttpRequest-like object exposing execute().

  Returns:
    Whatever request.execute() returns.

  Raises:
    errors.HttpError: Re-raised after logging its reason.
      NOTE(review): _get_reason() is a private googleapiclient API —
      consider the public `reason` attribute if the library version allows.
  """
  try:
    return request.execute()
  except errors.HttpError as err:
    logger = tf.get_logger()
    logger.error('There was an error with the API. Details:')
    logger.error(err._get_reason())
    raise