Blame - tools/ml/trainer2/train_ml_helpers.py - monorail-avm99963

blob: 36113a295546128cc7f1f9d6c3b49076a77400b9 [file] [log] [blame]

Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame]	1	# Copyright 2019 The Chromium Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
				4	# Or at https://developers.google.com/open-source/licenses/bsd
				5
				6	"""
				7	Helper functions for spam and component classification. These are mostly for
				8	feature extraction, so that the serving code and training code both use the same
				9	set of features.
				10	TODO(jeffcarp): This file is a duplicate of services/ml_helpers.py
				11	(with slight differences). They will eventually be merged into one.
				12	"""
				13
				14	from __future__ import absolute_import
				15
				16	import csv
				17	import hashlib
				18	import re
				19	import sys
				20
# Column names for spam training rows, in CSV order.
SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
# Older column layout without the email field; spam_from_file uses its length
# as the minimum number of columns a row must have to be kept.
LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
# Regex fragments that _SpamHashFeatures joins into its word-splitting pattern.
DELIMITERS = [r'\s', r'\,', r'\.', r'\?', r'!', r'\:', r'\(', r'\)']

# Must be identical to settings.spam_feature_hashes.
SPAM_FEATURE_HASHES = 500
# Must be identical to settings.component_features.
COMPONENT_FEATURES = 5000
				29
				30
				31	def _ComponentFeatures(content, num_features, top_words):
				32	"""
				33	This uses the most common words in the entire dataset as features.
				34	The count of common words in the issue comments makes up the features.
				35	"""
				36
				37	features = [0] * num_features
				38	for blob in content:
				39	words = blob.split()
				40	for word in words:
				41	if word in top_words:
				42	features[top_words[word]] += 1
				43
				44	return features
				45
				46
				47	def _SpamHashFeatures(content, num_features):
				48	"""
				49	Feature hashing is a fast and compact way to turn a string of text into a
				50	vector of feature values for classification and training.
				51	See also: https://en.wikipedia.org/wiki/Feature_hashing
				52	This is a simple implementation that doesn't try to minimize collisions
				53	or anything else fancy.
				54	"""
				55	features = [0] * num_features
				56	total = 0.0
				57	for blob in content:
				58	words = re.split('\|'.join(DELIMITERS).encode('utf-8'), blob)
				59	for word in words:
				60	feature_index = int(int(hashlib.sha1(word).hexdigest(), 16)
				61	% num_features)
				62	features[feature_index] += 1.0
				63	total += 1.0
				64
				65	if total > 0:
				66	features = [f / total for f in features]
				67
				68	return features
				69
				70
				71	def GenerateFeaturesRaw(content, num_features, top_words=None):
				72	"""Generates a vector of features for a given issue or comment.
				73
				74	Args:
				75	content: The content of the issue's description and comments.
				76	num_features: The number of features to generate.
				77	"""
				78	# If we've been passed real unicode strings, convert them to just bytestrings.
				79	for idx, value in enumerate(content):
				80	content[idx] = value.encode('utf-8')
				81	if top_words:
				82	return {'word_features': _ComponentFeatures(content,
				83	num_features,
				84	top_words)}
				85
				86	return {'word_hashes': _SpamHashFeatures(content, num_features)}
				87
				88
				89	def transform_spam_csv_to_features(contents, labels):
				90	"""Generate arrays of features and targets for spam.
				91	"""
				92	features = []
				93	targets = []
				94	for i, row in enumerate(contents):
				95	subject, content = row
				96	label = labels[i]
				97	features.append(GenerateFeaturesRaw([str(subject), str(content)],
				98	SPAM_FEATURE_HASHES))
				99	targets.append(1 if label == 'spam' else 0)
				100	return features, targets
				101
				102
				103	def transform_component_csv_to_features(contents, labels, top_list):
				104	"""Generate arrays of features and targets for components.
				105	"""
				106	features = []
				107	targets = []
				108	top_words = {}
				109
				110	for i, row in enumerate(top_list):
				111	top_words[row] = i
				112
				113	component_to_index = {}
				114	index_to_component = {}
				115	component_index = 0
				116
				117	for i, content in enumerate(contents):
				118	component = labels[i]
				119	component = str(component).split(",")[0]
				120
				121	if component not in component_to_index:
				122	component_to_index[component] = component_index
				123	index_to_component[component_index] = component
				124	component_index += 1
				125
				126	features.append(GenerateFeaturesRaw([content],
				127	COMPONENT_FEATURES,
				128	top_words))
				129	targets.append(component_to_index[component])
				130
				131	return features, targets, index_to_component
				132
				133
				134	def spam_from_file(f):
				135	"""Reads a training data file and returns arrays of contents and labels."""
				136	contents = []
				137	labels = []
				138	skipped_rows = 0
				139	for row in csv.reader(f):
				140	if len(row) >= len(LEGACY_CSV_COLUMNS):
				141	# Throw out email field.
				142	contents.append(row[1:3])
				143	labels.append(row[0])
				144	else:
				145	skipped_rows += 1
				146	return contents, labels, skipped_rows
				147
				148
				149	def component_from_file(f):
				150	"""Reads a training data file and returns arrays of contents and labels."""
				151	contents = []
				152	labels = []
				153	csv.field_size_limit(sys.maxsize)
				154	for row in csv.reader(f):
				155	label, content = row
				156	contents.append(content)
				157	labels.append(label)
				158	return contents, labels