# Copyright 2020 The Chromium Authors. All rights reserved.
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | # Or at https://developers.google.com/open-source/licenses/bsd |
| 5 | |
| 6 | from __future__ import print_function |
| 7 | from __future__ import division |
| 8 | from __future__ import absolute_import |
| 9 | |
| 10 | import io |
| 11 | import tensorflow as tf |
| 12 | |
| 13 | from googleapiclient import discovery |
| 14 | from googleapiclient import errors |
| 15 | from oauth2client.client import GoogleCredentials |
| 16 | |
| 17 | from trainer2 import train_ml_helpers |
| 18 | |
| 19 | |
def fetch_training_data(bucket, prefix, trainer_type):
  """Fetch labeled training data from a Cloud Storage bucket.

  Lists every object under `prefix` in `bucket` and hands the resulting CSV
  paths to the parser matching `trainer_type`.

  Args:
    bucket: Name of the GCS bucket holding the training CSVs.
    prefix: Object-name prefix used to filter the bucket listing.
    trainer_type: 'spam' to parse spam training data; any other value
      parses component training data.

  Returns:
    A (contents, labels) pair of parallel lists aggregated across all files.
  """
  credentials = GoogleCredentials.get_application_default()
  storage = discovery.build('storage', 'v1', credentials=credentials)
  objects = storage.objects()

  request = objects.list(bucket=bucket, prefix=prefix)
  response = make_api_request(request)
  # The objects.list response omits 'items' entirely when nothing matches
  # the prefix; fall back to an empty list instead of iterating over None.
  items = response.get('items') or []
  csv_filepaths = [blob.get('name') for blob in items]

  if trainer_type == 'spam':
    return fetch_spam(csv_filepaths, bucket, objects)
  return fetch_component(csv_filepaths, bucket, objects)
| 35 | |
| 36 | |
def fetch_spam(csv_filepaths, bucket, objects):
  """Download and parse spam training CSVs, aggregating rows across files.

  Args:
    csv_filepaths: GCS object names of the CSV files found under the
      requested prefix.
    bucket: Name of the GCS bucket containing the files.
    objects: A Cloud Storage `objects` API resource used to issue requests.

  Returns:
    A (all_contents, all_labels) pair of parallel lists.
  """
  all_contents = []
  all_labels = []
  # Always train on the full Android and support corpora in addition to
  # whatever CSVs were found under the requested prefix.
  csv_filepaths = [
      'spam-training-data/full-android.csv',
      'spam-training-data/full-support.csv',
  ] + csv_filepaths

  for filepath in csv_filepaths:
    media = fetch_training_csv(filepath, objects, bucket)
    contents, labels, skipped_rows = train_ml_helpers.spam_from_file(
        io.StringIO(media))

    # Sanity check: the contents and labels should be matched pairs.
    # Files failing the check are skipped entirely (but still logged below).
    if len(contents) == len(labels) != 0:
      all_contents.extend(contents)
      all_labels.extend(labels)

    tf.get_logger().info(
        '{:<40}{:<20}{:<20}'.format(
            filepath, 'added %d rows' % len(contents),
            'skipped %d rows' % skipped_rows))

  return all_contents, all_labels
| 63 | |
| 64 | |
def fetch_component(csv_filepaths, bucket, objects):
  """Download and parse component training CSVs, aggregating rows.

  Args:
    csv_filepaths: GCS object names of the CSV files to fetch.
    bucket: Name of the GCS bucket containing the files.
    objects: A Cloud Storage `objects` API resource used to issue requests.

  Returns:
    A (contents, labels) pair of parallel lists aggregated across files.
  """
  aggregated_contents = []
  aggregated_labels = []

  for path in csv_filepaths:
    csv_text = fetch_training_csv(path, objects, bucket)
    contents, labels = train_ml_helpers.component_from_file(
        io.StringIO(csv_text))

    # Only keep files whose contents and labels form non-empty matched pairs.
    rows_are_paired = len(contents) == len(labels) != 0
    if rows_are_paired:
      aggregated_contents += contents
      aggregated_labels += labels

    tf.get_logger().info(
        '{:<40}{:<20}'.format(path, 'added %d rows' % len(contents)))

  return aggregated_contents, aggregated_labels
| 82 | |
| 83 | |
def fetch_training_csv(filepath, objects, bucket):
  """Download one CSV object from GCS and return its UTF-8 decoded text."""
  media_request = objects.get_media(bucket=bucket, object=filepath)
  raw_bytes = make_api_request(media_request)
  return str(raw_bytes, 'utf-8')
| 87 | |
| 88 | |
def make_api_request(request):
  """Execute an API request, logging and re-raising HTTP failures.

  Args:
    request: A googleapiclient request object ready to execute.

  Returns:
    Whatever `request.execute()` returns on success.

  Raises:
    errors.HttpError: re-raised unchanged after logging the failure reason.
  """
  try:
    response = request.execute()
  except errors.HttpError as err:
    logger = tf.get_logger()
    logger.error('There was an error with the API. Details:')
    logger.error(err._get_reason())  # pylint: disable=protected-access
    raise
  else:
    return response