tools/ml/trainer2/train_ml_helpers.py - monorail-avm99963 - Gitiles

 # Copyright 2019 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 # Or at https://developers.google.com/open-source/licenses/bsd

 """
 Helper functions for spam and component classification. These are mostly for
 feature extraction, so that the serving code and training code both use the same
 set of features.
 TODO(jeffcarp): This file is a duplicate of services/ml_helpers.py
   (with slight differences). The two will eventually be merged into one.
 """

 from __future__ import absolute_import

 import csv
 import hashlib
 import re
 import sys

 # Presumably the column layout of a spam training CSV row; not referenced
 # elsewhere in this file.
 SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
 # spam_from_file() uses the length of this list as the minimum number of
 # columns a CSV row must have to be accepted.
 LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
 # Regex fragments (whitespace and common punctuation) that
 # _SpamHashFeatures() joins with '|' to split text into words.
 DELIMITERS = [r'\s', r'\,', r'\.', r'\?', r'!', r'\:', r'\(', r'\)']

 # Must be identical to settings.spam_feature_hashes.
 SPAM_FEATURE_HASHES = 500
 # Must be identical to settings.component_features.
 COMPONENT_FEATURES = 5000


 def _ComponentFeatures(content, num_features, top_words):
   """
     This uses the most common words in the entire dataset as features.
     The count of common words in the issue comments makes up the features.
   """

   features = [0] * num_features
   for blob in content:
     words = blob.split()
     for word in words:
       if word in top_words:
         features[top_words[word]] += 1

   return features


 def _SpamHashFeatures(content, num_features):
   """
     Feature hashing is a fast and compact way to turn a string of text into a
     vector of feature values for classification and training.
     See also: https://en.wikipedia.org/wiki/Feature_hashing
     This is a simple implementation that doesn't try to minimize collisions
     or anything else fancy.
   """
   features = [0] * num_features
   total = 0.0
   for blob in content:
     words = re.split('|'.join(DELIMITERS).encode('utf-8'), blob)
     for word in words:
       feature_index = int(int(hashlib.sha1(word).hexdigest(), 16)
                           % num_features)
       features[feature_index] += 1.0
       total += 1.0

   if total > 0:
     features = [f / total for f in features]

   return features


 def GenerateFeaturesRaw(content, num_features, top_words=None):
   """Generates a vector of features for a given issue or comment.

   Args:
     content: The content of the issue's description and comments, as a
       sequence of text (or already-encoded bytes) values. The sequence is
       not modified.
     num_features: The number of features to generate.
     top_words: Optional mapping from word to feature index. When it is
       truthy, word-count features are generated; otherwise (None or empty)
       hashed features are generated.

   Returns:
     {'word_features': [...]} when top_words is truthy, else
     {'word_hashes': [...]}; each list has num_features entries.
   """
   # Encode into a fresh list: the caller's list must not be overwritten, and
   # values that are already bytes have no .encode() on Python 3.
   # NOTE(review): lookups in top_words happen on the encoded bytes, so its
   # keys presumably need to be bytes under Python 3 -- confirm with callers.
   encoded = [
       value if isinstance(value, bytes) else value.encode('utf-8')
       for value in content
   ]
   if top_words:
     return {'word_features': _ComponentFeatures(encoded,
                                                  num_features,
                                                  top_words)}

   return {'word_hashes': _SpamHashFeatures(encoded, num_features)}


 def transform_spam_csv_to_features(contents, labels):
   """Turns parsed spam rows into parallel feature and target lists.

   Args:
     contents: Sequence of (subject, content) pairs.
     labels: Sequence of verdicts, indexed in step with contents.

   Returns:
     A (features, targets) tuple. features holds one GenerateFeaturesRaw()
     result per row, built with SPAM_FEATURE_HASHES features; targets holds
     1 where the label equals 'spam' and 0 otherwise.
   """
   feature_rows = []
   spam_flags = []
   for position, (subject, content) in enumerate(contents):
     verdict = labels[position]
     texts = [str(subject), str(content)]
     feature_rows.append(GenerateFeaturesRaw(texts, SPAM_FEATURE_HASHES))
     if verdict == 'spam':
       spam_flags.append(1)
     else:
       spam_flags.append(0)
   return feature_rows, spam_flags


 def transform_component_csv_to_features(contents, labels, top_list):
   """Turns parsed component rows into features, targets and an index map.

   Args:
     contents: Sequence of text values, one per training example.
     labels: Sequence of component labels, indexed in step with contents.
       Only the part before the first comma of each label is used.
     top_list: Sequence of words; a word's position becomes its feature
       index.

   Returns:
     A (features, targets, index_to_component) tuple. features holds one
     GenerateFeaturesRaw() result per example, built with COMPONENT_FEATURES
     features; targets holds the integer id of each example's component; and
     index_to_component maps those ids (assigned in order of first
     appearance, starting at 0) back to component names.
   """
   word_to_slot = {word: slot for slot, word in enumerate(top_list)}

   component_ids = {}
   id_to_component = {}
   feature_rows = []
   target_ids = []

   for position, text in enumerate(contents):
     primary = str(labels[position]).split(",")[0]
     # A component seen for the first time gets the next unused id.
     component_id = component_ids.setdefault(primary, len(component_ids))
     id_to_component[component_id] = primary

     feature_rows.append(
         GenerateFeaturesRaw([text], COMPONENT_FEATURES, word_to_slot))
     target_ids.append(component_id)

   return feature_rows, target_ids, id_to_component


 def spam_from_file(f):
   """Parses spam training CSV data into contents, labels and a skip count.

   Args:
     f: An iterable of CSV lines, such as an open text file.

   Returns:
     A (contents, labels, skipped_rows) tuple. contents holds the
     [subject, content] pair of every accepted row, labels holds the matching
     first column, and skipped_rows counts the rows that had fewer columns
     than LEGACY_CSV_COLUMNS.
   """
   subject_content_pairs = []
   verdicts = []
   num_skipped = 0
   for record in csv.reader(f):
     if len(record) < len(LEGACY_CSV_COLUMNS):
       num_skipped += 1
       continue
     # Keep only subject and content; a trailing email field is dropped.
     subject_content_pairs.append(record[1:3])
     verdicts.append(record[0])
   return subject_content_pairs, verdicts, num_skipped


 def component_from_file(f):
   """Reads a component training CSV and returns its contents and labels.

   Args:
     f: An iterable of CSV lines, such as an open text file. Every non-blank
       row must have exactly two columns: the label, then the content.

   Returns:
     A (contents, labels) tuple of parallel lists.

   Raises:
     ValueError: If a non-blank row does not have exactly two columns.
   """
   contents = []
   labels = []
   # Raise the csv module's maximum field size so very large content fields
   # can be parsed. This is a module-wide csv setting.
   csv.field_size_limit(sys.maxsize)
   for row in csv.reader(f):
     if not row:
       # csv.reader yields [] for a blank line; it carries no training data
       # and would otherwise break the two-column unpack below.
       continue
     label, content = row
     contents.append(content)
     labels.append(label)
   return contents, labels
	# Copyright 2019 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	# Or at https://developers.google.com/open-source/licenses/bsd

	"""
	Helper functions for spam and component classification. These are mostly for
	feature extraction, so that the serving code and training code both use the same
	set of features.
	TODO(jeffcarp): This file is a duplicate of services/ml_helpers.py
	(with slight differences). The two will eventually be merged into one.
	"""

	from __future__ import absolute_import

	import csv
	import hashlib
	import re
	import sys

	# Presumably the column layout of a spam training CSV row; not referenced
	# elsewhere in this file.
	SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
	# spam_from_file() uses the length of this list as the minimum number of
	# columns a CSV row must have to be accepted.
	LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
	# Regex fragments (whitespace and common punctuation) that
	# _SpamHashFeatures() joins into a pattern to split text into words.
	DELIMITERS = [r'\s', r'\,', r'\.', r'\?', r'!', r'\:', r'\(', r'\)']

	# Must be identical to settings.spam_feature_hashes.
	SPAM_FEATURE_HASHES = 500
	# Must be identical to settings.component_features.
	COMPONENT_FEATURES = 5000


	def _ComponentFeatures(content, num_features, top_words):
	"""
	This uses the most common words in the entire dataset as features.
	The count of common words in the issue comments makes up the features.
	"""

	features = [0] * num_features
	for blob in content:
	words = blob.split()
	for word in words:
	if word in top_words:
	features[top_words[word]] += 1

	return features


	def _SpamHashFeatures(content, num_features):
	"""
	Feature hashing is a fast and compact way to turn a string of text into a
	vector of feature values for classification and training.
	See also: https://en.wikipedia.org/wiki/Feature_hashing
	This is a simple implementation that doesn't try to minimize collisions
	or anything else fancy.
	"""
	features = [0] * num_features
	total = 0.0
	for blob in content:
	words = re.split('\|'.join(DELIMITERS).encode('utf-8'), blob)
	for word in words:
	feature_index = int(int(hashlib.sha1(word).hexdigest(), 16)
	% num_features)
	features[feature_index] += 1.0
	total += 1.0

	if total > 0:
	features = [f / total for f in features]

	return features


	def GenerateFeaturesRaw(content, num_features, top_words=None):
	  """Generates a vector of features for a given issue or comment.

	  Args:
	    content: The content of the issue's description and comments, as a
	      sequence of text (or already-encoded bytes) values. The sequence is
	      not modified.
	    num_features: The number of features to generate.
	    top_words: Optional mapping from word to feature index. When it is
	      truthy, word-count features are generated; otherwise (None or empty)
	      hashed features are generated.

	  Returns:
	    {'word_features': [...]} when top_words is truthy, else
	    {'word_hashes': [...]}; each list has num_features entries.
	  """
	  # Encode into a fresh list: the caller's list must not be overwritten,
	  # and values that are already bytes have no .encode() on Python 3.
	  encoded = [
	      value if isinstance(value, bytes) else value.encode('utf-8')
	      for value in content
	  ]
	  if top_words:
	    return {'word_features': _ComponentFeatures(encoded,
	                                                 num_features,
	                                                 top_words)}

	  return {'word_hashes': _SpamHashFeatures(encoded, num_features)}


	def transform_spam_csv_to_features(contents, labels):
	  """Generates arrays of features and targets for spam.

	  Args:
	    contents: Sequence of (subject, content) pairs.
	    labels: Sequence of verdicts, indexed in step with contents.

	  Returns:
	    A (features, targets) tuple. features holds one GenerateFeaturesRaw()
	    result per row, built with SPAM_FEATURE_HASHES features; targets holds
	    1 where the label equals 'spam' and 0 otherwise.
	  """
	  features = []
	  targets = []
	  for i, row in enumerate(contents):
	    subject, content = row
	    label = labels[i]
	    features.append(GenerateFeaturesRaw([str(subject), str(content)],
	                                        SPAM_FEATURE_HASHES))
	    targets.append(1 if label == 'spam' else 0)
	  return features, targets


	def transform_component_csv_to_features(contents, labels, top_list):
	  """Generates arrays of features and targets for components.

	  Args:
	    contents: Sequence of text values, one per training example.
	    labels: Sequence of component labels, indexed in step with contents.
	      Only the part before the first comma of each label is used.
	    top_list: Sequence of words; a word's position becomes its feature
	      index.

	  Returns:
	    A (features, targets, index_to_component) tuple. features holds one
	    GenerateFeaturesRaw() result per example, built with
	    COMPONENT_FEATURES features; targets holds the integer id of each
	    example's component; and index_to_component maps those ids (assigned
	    in order of first appearance, starting at 0) back to component names.
	  """
	  features = []
	  targets = []
	  top_words = {}

	  for i, row in enumerate(top_list):
	    top_words[row] = i

	  component_to_index = {}
	  index_to_component = {}
	  component_index = 0

	  for i, content in enumerate(contents):
	    component = labels[i]
	    component = str(component).split(",")[0]

	    if component not in component_to_index:
	      component_to_index[component] = component_index
	      index_to_component[component_index] = component
	      component_index += 1

	    features.append(GenerateFeaturesRaw([content],
	                                        COMPONENT_FEATURES,
	                                        top_words))
	    targets.append(component_to_index[component])

	  return features, targets, index_to_component


	def spam_from_file(f):
	  """Reads a training data file and returns arrays of contents and labels.

	  Args:
	    f: An iterable of CSV lines, such as an open text file.

	  Returns:
	    A (contents, labels, skipped_rows) tuple. contents holds the
	    [subject, content] pair of every accepted row, labels holds the
	    matching first column, and skipped_rows counts the rows that had fewer
	    columns than LEGACY_CSV_COLUMNS.
	  """
	  contents = []
	  labels = []
	  skipped_rows = 0
	  for row in csv.reader(f):
	    if len(row) >= len(LEGACY_CSV_COLUMNS):
	      # Throw out email field.
	      contents.append(row[1:3])
	      labels.append(row[0])
	    else:
	      skipped_rows += 1
	  return contents, labels, skipped_rows


	def component_from_file(f):
	  """Reads a component training CSV and returns its contents and labels.

	  Args:
	    f: An iterable of CSV lines, such as an open text file. Every
	      non-blank row must have exactly two columns: the label, then the
	      content.

	  Returns:
	    A (contents, labels) tuple of parallel lists.

	  Raises:
	    ValueError: If a non-blank row does not have exactly two columns.
	  """
	  contents = []
	  labels = []
	  # Raise the csv module's maximum field size so very large content fields
	  # can be parsed. This is a module-wide csv setting.
	  csv.field_size_limit(sys.maxsize)
	  for row in csv.reader(f):
	    if not row:
	      # csv.reader yields [] for a blank line; it carries no training data
	      # and would otherwise break the two-column unpack below.
	      continue
	    label, content = row
	    contents.append(content)
	    labels.append(label)
	  return contents, labels