Blame - features/component_helpers.py - monorail-avm99963

blob: 1392f0ba6bf321a3d4f9f74ffd47130f59b9225d [file] [log] [blame]

Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame]	1	# Copyright 2018 The Chromium Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style
				3	# license that can be found in the LICENSE file or at
				4	# https://developers.google.com/open-source/licenses/bsd
				5
				6	from __future__ import print_function
				7	from __future__ import division
				8	from __future__ import absolute_import
				9
				10	import json
				11	import logging
				12	import re
				13
				14	import settings
				15	import cloudstorage
				16
				17	from features import generate_dataset
				18	from framework import framework_helpers
				19	from services import ml_helpers
				20	from tracker import tracker_bizobj
				21
				22	from googleapiclient import discovery
				23	from oauth2client.client import GoogleCredentials
				24
				25
				26	MODEL_NAME = 'projects/{}/models/{}'.format(
				27	settings.classifier_project_id, settings.component_model_name)
				28
				29
				30	def _GetTopWords(trainer_name): # pragma: no cover
				31	# TODO(carapew): Use memcache to get top words rather than storing as a
				32	# variable.
				33	credentials = GoogleCredentials.get_application_default()
				34	storage = discovery.build('storage', 'v1', credentials=credentials)
				35	request = storage.objects().get_media(
				36	bucket=settings.component_ml_bucket,
				37	object=trainer_name + '/topwords.txt')
				38	response = request.execute()
				39
				40	# This turns the top words list into a dictionary for faster feature
				41	# generation.
				42	return {word: idx for idx, word in enumerate(response.split())}
				43
				44
				45	def _GetComponentsByIndex(trainer_name):
				46	# TODO(carapew): Memcache the index mapping file.
				47	mapping_path = '/%s/%s/component_index.json' % (
				48	settings.component_ml_bucket, trainer_name)
				49	logging.info('Mapping path full name: %r', mapping_path)
				50
				51	with cloudstorage.open(mapping_path, 'r') as index_mapping_file:
				52	logging.info('Index component mapping opened')
				53	mapping = index_mapping_file.read()
				54	logging.info(mapping)
				55	return json.loads(mapping)
				56
				57
				58	@framework_helpers.retry(3)
				59	def _GetComponentPrediction(ml_engine, instance):
				60	"""Predict the component from the default model based on the provided text.
				61
				62	Args:
				63	ml_engine: An ML Engine instance for making predictions.
				64	instance: The dict object returned from ml_helpers.GenerateFeaturesRaw
				65	containing the features generated from the provided text.
				66
				67	Returns:
				68	The index of the component with the highest score. ML engine's predict
				69	api returns a dict of the format
				70	{'predictions': [{'classes': ['0', '1', ...], 'scores': [.00234, ...]}]}
				71	where each class has a score at the same index. Classes are sequential,
				72	so the index of the highest score also happens to be the component's
				73	index.
				74	"""
				75	body = {'instances': [{'inputs': instance['word_features']}]}
				76	request = ml_engine.projects().predict(name=MODEL_NAME, body=body)
				77	response = request.execute()
				78
				79	logging.info('ML Engine API response: %r' % response)
				80	scores = response['predictions'][0]['scores']
				81
				82	return scores.index(max(scores))
				83
				84
				85	def PredictComponent(raw_text, config):
				86	"""Get the component ID predicted for the given text.
				87
				88	Args:
				89	raw_text: The raw text for which we want to predict a component.
				90	config: The config of the project. Used to decide if the predicted component
				91	is valid.
				92
				93	Returns:
				94	The component ID predicted for the provided component, or None if no
				95	component was predicted.
				96	"""
				97	# Set-up ML engine.
				98	ml_engine = ml_helpers.setup_ml_engine()
				99
				100	# Gets the timestamp number from the folder containing the model's trainer
				101	# in order to get the correct files for mappings and features.
				102	request = ml_engine.projects().models().get(name=MODEL_NAME)
				103	response = request.execute()
				104
				105	version = re.search(r'v_(\d+)', response['defaultVersion']['name']).group(1)
				106	trainer_name = 'component_trainer_%s' % version
				107
				108	top_words = _GetTopWords(trainer_name)
				109	components_by_index = _GetComponentsByIndex(trainer_name)
				110	logging.info('Length of top words list: %s', len(top_words))
				111
				112	clean_text = generate_dataset.CleanText(raw_text)
				113	instance = ml_helpers.GenerateFeaturesRaw(
				114	[clean_text], settings.component_features, top_words)
				115
				116	# Get the component id with the highest prediction score. Component ids are
				117	# stored in GCS as strings, but represented in the app as longs.
				118	best_score_index = _GetComponentPrediction(ml_engine, instance)
				119	component_id = components_by_index.get(str(best_score_index))
				120	if component_id:
				121	component_id = int(component_id)
				122
				123	# The predicted component id might not exist.
				124	if tracker_bizobj.FindComponentDefByID(component_id, config) is None:
				125	return None
				126
				127	return component_id