tools/ml/trainer2/dataset.py - monorail-avm99963 - Gitiles

 # Copyright 2020 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 # Or at https://developers.google.com/open-source/licenses/bsd

 from __future__ import print_function
 from __future__ import division
 from __future__ import absolute_import

 import io
 import tensorflow as tf

 from googleapiclient import discovery
 from googleapiclient import errors
 from oauth2client.client import GoogleCredentials

 from trainer2 import train_ml_helpers


 def fetch_training_data(bucket, prefix, trainer_type):

   credentials = GoogleCredentials.get_application_default()
   storage = discovery.build('storage', 'v1', credentials=credentials)
   objects = storage.objects()

   request = objects.list(bucket=bucket, prefix=prefix)
   response = make_api_request(request)
   items = response.get('items')
   csv_filepaths = [blob.get('name') for blob in items]

   if trainer_type == 'spam':
     return fetch_spam(csv_filepaths, bucket, objects)
   else:
     return fetch_component(csv_filepaths, bucket, objects)


 def fetch_spam(csv_filepaths, bucket, objects):

   all_contents = []
   all_labels = []
   # Add code
   csv_filepaths = [
       'spam-training-data/full-android.csv',
       'spam-training-data/full-support.csv',
   ] + csv_filepaths

   for filepath in csv_filepaths:
     media = fetch_training_csv(filepath, objects, bucket)
     contents, labels, skipped_rows = train_ml_helpers.spam_from_file(
         io.StringIO(media))

     # Sanity check: the contents and labels should be matched pairs.
     if len(contents) == len(labels) != 0:
       all_contents.extend(contents)
       all_labels.extend(labels)

     tf.get_logger().info(
         '{:<40}{:<20}{:<20}'.format(
             filepath, 'added %d rows' % len(contents),
             'skipped %d rows' % skipped_rows))

   return all_contents, all_labels


 def fetch_component(csv_filepaths, bucket, objects):

   all_contents = []
   all_labels = []
   for filepath in csv_filepaths:
     media = fetch_training_csv(filepath, objects, bucket)
     contents, labels = train_ml_helpers.component_from_file(io.StringIO(media))

     # Sanity check: the contents and labels should be matched pairs.
     if len(contents) == len(labels) != 0:
       all_contents.extend(contents)
       all_labels.extend(labels)

     tf.get_logger().info(
         '{:<40}{:<20}'.format(filepath, 'added %d rows' % len(contents)))

   return all_contents, all_labels


 def fetch_training_csv(filepath, objects, bucket):
   request = objects.get_media(bucket=bucket, object=filepath)
   return str(make_api_request(request), 'utf-8')


 def make_api_request(request):
   try:
     return request.execute()
   except errors.HttpError as err:
     tf.get_logger().error('There was an error with the API. Details:')
     tf.get_logger().error(err._get_reason())
     raise
	# Copyright 2020 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	# Or at https://developers.google.com/open-source/licenses/bsd

	from __future__ import print_function
	from __future__ import division
	from __future__ import absolute_import

	import io
	import tensorflow as tf

	from googleapiclient import discovery
	from googleapiclient import errors
	from oauth2client.client import GoogleCredentials

	from trainer2 import train_ml_helpers


	def fetch_training_data(bucket, prefix, trainer_type):

	credentials = GoogleCredentials.get_application_default()
	storage = discovery.build('storage', 'v1', credentials=credentials)
	objects = storage.objects()

	request = objects.list(bucket=bucket, prefix=prefix)
	response = make_api_request(request)
	items = response.get('items')
	csv_filepaths = [blob.get('name') for blob in items]

	if trainer_type == 'spam':
	return fetch_spam(csv_filepaths, bucket, objects)
	else:
	return fetch_component(csv_filepaths, bucket, objects)


	def fetch_spam(csv_filepaths, bucket, objects):

	all_contents = []
	all_labels = []
	# Add code
	csv_filepaths = [
	'spam-training-data/full-android.csv',
	'spam-training-data/full-support.csv',
	] + csv_filepaths

	for filepath in csv_filepaths:
	media = fetch_training_csv(filepath, objects, bucket)
	contents, labels, skipped_rows = train_ml_helpers.spam_from_file(
	io.StringIO(media))

	# Sanity check: the contents and labels should be matched pairs.
	if len(contents) == len(labels) != 0:
	all_contents.extend(contents)
	all_labels.extend(labels)

	tf.get_logger().info(
	'{:<40}{:<20}{:<20}'.format(
	filepath, 'added %d rows' % len(contents),
	'skipped %d rows' % skipped_rows))

	return all_contents, all_labels


	def fetch_component(csv_filepaths, bucket, objects):

	all_contents = []
	all_labels = []
	for filepath in csv_filepaths:
	media = fetch_training_csv(filepath, objects, bucket)
	contents, labels = train_ml_helpers.component_from_file(io.StringIO(media))

	# Sanity check: the contents and labels should be matched pairs.
	if len(contents) == len(labels) != 0:
	all_contents.extend(contents)
	all_labels.extend(labels)

	tf.get_logger().info(
	'{:<40}{:<20}'.format(filepath, 'added %d rows' % len(contents)))

	return all_contents, all_labels


	def fetch_training_csv(filepath, objects, bucket):
	request = objects.get_media(bucket=bucket, object=filepath)
	return str(make_api_request(request), 'utf-8')


	def make_api_request(request):
	try:
	return request.execute()
	except errors.HttpError as err:
	tf.get_logger().error('There was an error with the API. Details:')
	tf.get_logger().error(err._get_reason())
	raise