blob: 6db23d480919d2a0116af3ddb813dca090a1d181 [file] [log] [blame]
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +01001# Copyright 2018 The Chromium Authors
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
Copybara854996b2021-09-07 19:36:02 +00004
5"""
6Helper functions for spam and component classification. These are mostly for
7feature extraction, so that the serving code and training code both use the same
8set of features.
9"""
Adrià Vilanova Martínezde942802022-07-15 14:06:55 +020010# TODO(crbug.com/monorail/7515): DELETE THIS FILE and all references.
Copybara854996b2021-09-07 19:36:02 +000011
12from __future__ import division
13from __future__ import print_function
14from __future__ import absolute_import
15
16import csv
17import hashlib
Copybara854996b2021-09-07 19:36:02 +000018import logging
19import re
20import sys
21
22from six import text_type
23
24from apiclient.discovery import build
25from apiclient.errors import Error as ApiClientError
26from oauth2client.client import GoogleCredentials
27from oauth2client.client import Error as Oauth2ClientError
28
29
# Column layouts for spam training CSVs; newer exports carry the reporter's
# email as a fourth column, legacy exports do not.
SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
# Regex alternatives joined with '|' to tokenize text in _SpamHashFeatures.
DELIMITERS = [r'\s', r'\,', r'\.', r'\?', '!', r'\:', r'\(', r'\)']

# Must be identical to settings.spam_feature_hashes.
SPAM_FEATURE_HASHES = 500
# Must be identical to settings.component_features.
COMPONENT_FEATURES = 5000
38
39
def _ComponentFeatures(content, num_features, top_words):
  """Builds a bag-of-words count vector over the dataset's common words.

  This uses the most common words in the entire dataset as features.
  The count of common words in the issue comments makes up the features.

  Args:
    content: iterable of text blobs (issue description and comments).
    num_features: length of the returned feature vector.
    top_words: dict mapping each common word to its feature index.

  Returns:
    A list of num_features integer counts.
  """
  counts = [0] * num_features
  for blob in content:
    for token in blob.split():
      index = top_words.get(token)
      if index is not None:
        counts[index] += 1
  return counts
54
55
def _SpamHashFeatures(content, num_features):
  """Hashes words in the content into a normalized feature vector.

  Feature hashing is a fast and compact way to turn a string of text into a
  vector of feature values for classification and training.
  See also: https://en.wikipedia.org/wiki/Feature_hashing
  This is a simple implementation that doesn't try to minimize collisions
  or anything else fancy.

  Args:
    content: iterable of text blobs.
    num_features: length of the returned feature vector.

  Returns:
    A list of num_features floats.  If any tokens were seen, each entry is
    that hash bucket's share of the total token count (entries sum to 1.0);
    otherwise all zeros.
  """
  features = [0] * num_features
  total = 0.0
  for blob in content:
    for word in re.split('|'.join(DELIMITERS), blob):
      # Hash bytes, not text: convert real unicode strings to bytestrings.
      # six.text_type was only needed for Python 2 compatibility; on
      # Python 3, str is the text type.
      encoded_word = word.encode('utf-8') if isinstance(word, str) else word
      feature_index = int(
          int(hashlib.sha1(encoded_word).hexdigest(), 16) % num_features)
      features[feature_index] += 1.0
      total += 1.0

  if total > 0:
    features = [f / total for f in features]

  return features
82
83
def GenerateFeaturesRaw(content, num_features, top_words=None):
  """Generates a vector of features for a given issue or comment.

  Args:
    content: The content of the issue's description and comments.
    num_features: The number of features to generate.
    top_words: Optional dict mapping common words to feature indexes.  When
      truthy, word-count features are produced; otherwise hashed features.

  Returns:
    A single-entry dict: {'word_features': [...]} when top_words is given,
    {'word_hashes': [...]} otherwise.
  """
  if not top_words:
    return {'word_hashes': _SpamHashFeatures(content, num_features)}
  return {
      'word_features': _ComponentFeatures(content, num_features, top_words)
  }
97
98
def transform_spam_csv_to_features(csv_training_data):
  """Turns spam CSV rows into parallel feature and label arrays.

  Args:
    csv_training_data: list of (verdict, subject, content[, email]) rows,
      possibly wrapped in an extra outer list.

  Returns:
    A tuple (X, y) where X is a list of hashed-feature dicts and y is a
    list of 1/0 labels (1 means the row's verdict was 'spam').
  """
  # Handle if the list is double-wrapped.
  if csv_training_data and len(csv_training_data[0]) > 4:
    csv_training_data = csv_training_data[0]

  X = []
  y = []
  for row in csv_training_data:
    if len(row) == 4:
      # Drop the trailing email field.
      verdict, subject, content = row[:3]
    else:
      verdict, subject, content = row
    X.append(GenerateFeaturesRaw([str(subject), str(content)],
                                 SPAM_FEATURE_HASHES))
    y.append(int(verdict == 'spam'))
  return X, y
116
117
def transform_component_csv_to_features(csv_training_data, top_list):
  """Turns component CSV rows into feature/label arrays plus a label map.

  Args:
    csv_training_data: iterable of (component, content) rows.
    top_list: list of the most common words; a word's list position becomes
      its feature index.

  Returns:
    A tuple (X, y, index_to_component) where X is a list of word-count
    feature dicts, y is a list of integer component indexes, and
    index_to_component maps each index back to its component name.
  """
  X = []
  y = []
  # Idiomatic enumerate instead of indexing top_list by range(len(...)).
  top_words = {word: i for i, word in enumerate(top_list)}

  component_to_index = {}
  index_to_component = {}

  for row in csv_training_data:
    component, content = row
    # Rows may list several components; label with the first one only.
    component = str(component).split(",")[0]

    if component not in component_to_index:
      # Next free index is simply the number of components seen so far.
      index = len(component_to_index)
      component_to_index[component] = index
      index_to_component[index] = component

    X.append(GenerateFeaturesRaw([content], COMPONENT_FEATURES, top_words))
    y.append(component_to_index[component])

  return X, y, index_to_component
145
146
def spam_from_file(f):
  """Reads a spam training data file and returns its usable rows.

  Args:
    f: an open file (or any iterable of CSV lines).

  Returns:
    A tuple (rows, skipped_rows): rows is a list of 3-column
    [verdict, subject, content] lists, skipped_rows counts lines whose
    column count matched neither the current nor the legacy layout.
  """
  rows = []
  skipped = 0
  for row in csv.reader(f):
    width = len(row)
    if width == len(SPAM_COLUMNS):
      # Throw out email field.
      rows.append(row[:3])
    elif width == len(LEGACY_CSV_COLUMNS):
      rows.append(row)
    else:
      skipped += 1
  return rows, skipped
160
161
def component_from_file(f):
  """Reads a component training data file and returns all of its rows.

  Args:
    f: an open file (or any iterable of CSV lines).

  Returns:
    A list of rows, each a list of column strings.
  """
  # Component 'content' cells can be huge; lift the default field cap.
  csv.field_size_limit(sys.maxsize)
  return list(csv.reader(f))
170
171
def setup_ml_engine():
  """Builds an authenticated Cloud ML Engine ('ml' v1) API client.

  Returns:
    A googleapiclient service object on success, or None if application
    default credentials could not be obtained or the client could not be
    built (the error is logged).
  """
  try:
    credentials = GoogleCredentials.get_application_default()
    return build('ml', 'v1', credentials=credentials)

  except (Oauth2ClientError, ApiClientError) as e:
    # Log the actual exception (not just its class, as sys.exc_info()[0]
    # gave), and let logging do the %-formatting lazily.
    logging.error("Error setting up ML Engine API: %s", e)