Project import generated by Copybara.

GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/services/ml_helpers.py b/services/ml_helpers.py
new file mode 100644
index 0000000..c4650b4
--- /dev/null
+++ b/services/ml_helpers.py
@@ -0,0 +1,181 @@
+# Copyright 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+"""
+Helper functions for spam and component classification. These are mostly for
+feature extraction, so that the serving code and training code both use the same
+set of features.
+"""
+
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+
+import csv
+import hashlib
+import httplib2
+import logging
+import re
+import sys
+
+from six import text_type
+
+from apiclient.discovery import build
+from apiclient.errors import Error as ApiClientError
+from oauth2client.client import GoogleCredentials
+from oauth2client.client import Error as Oauth2ClientError
+
+
+SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
+LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
+DELIMITERS = [r'\s', r'\,', r'\.', r'\?', '!', r'\:', r'\(', r'\)']
+
+# Must be identical to settings.spam_feature_hashes.
+SPAM_FEATURE_HASHES = 500
+# Must be identical to settings.component_features.
+COMPONENT_FEATURES = 5000
+
+
+def _ComponentFeatures(content, num_features, top_words):
+  """
+    This uses the most common words in the entire dataset as features.
+    The count of common words in the issue comments makes up the features.
+  """
+
+  features = [0] * num_features
+  for blob in content:
+    words = blob.split()
+    for word in words:
+      if word in top_words:
+        features[top_words[word]] += 1
+
+  return features
+
+
+def _SpamHashFeatures(content, num_features):
+  """
+    Feature hashing is a fast and compact way to turn a string of text into a
+    vector of feature values for classification and training.
+    See also: https://en.wikipedia.org/wiki/Feature_hashing
+    This is a simple implementation that doesn't try to minimize collisions
+    or anything else fancy.
+  """
+  features = [0] * num_features
+  total = 0.0
+  for blob in content:
+    words = re.split('|'.join(DELIMITERS), blob)
+    for word in words:
+      encoded_word = word
+      # If we've been passed real unicode strings, convert them to bytestrings.
+      if isinstance(word, text_type):
+        encoded_word = word.encode('utf-8')
+      feature_index = int(
+          int(hashlib.sha1(encoded_word).hexdigest(), 16) % num_features)
+      features[feature_index] += 1.0
+      total += 1.0
+
+  if total > 0:
+    features = [f / total for f in features]
+
+  return features
+
+
+def GenerateFeaturesRaw(content, num_features, top_words=None):
+  """Generates a vector of features for a given issue or comment.
+
+  Args:
+    content: The content of the issue's description and comments.
+    num_features: The number of features to generate.
+    top_words: Optional dict mapping common words to feature indices. If
+      given, word-count features are generated; otherwise hashed features.
+
+  Returns:
+    A dict with a single 'word_features' or 'word_hashes' key, mapping to
+    the list of feature values.
+  """
+  if top_words:
+    return { 'word_features': _ComponentFeatures(content,
+                                                   num_features,
+                                                   top_words)}
+
+  return { 'word_hashes': _SpamHashFeatures(content, num_features)}
+
+
+def transform_spam_csv_to_features(csv_training_data):
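+  """Converts spam training data CSV rows into features and labels.
+
+  Args:
+    csv_training_data: A list of (verdict, subject, content[, email]) rows.
+
+  Returns:
+    A pair (X, y) where X is a list of feature dicts from
+    GenerateFeaturesRaw and y is a list of labels: 1 for spam, 0 for ham.
+  """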
+  X = []
+  y = []
+
+  # Handle the case where the rows arrive wrapped in an extra outer list.
+  if csv_training_data and len(csv_training_data[0]) > 4:
+    csv_training_data = csv_training_data[0]
+
+  for row in csv_training_data:
+    if len(row) == 4:
+      verdict, subject, content, _email = row
+    else:
+      verdict, subject, content = row
+    X.append(GenerateFeaturesRaw([str(subject), str(content)],
+                                 SPAM_FEATURE_HASHES))
+    y.append(1 if verdict == 'spam' else 0)
+  return X, y
+
+
+def transform_component_csv_to_features(csv_training_data, top_list):
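+  """Converts component training data CSV rows into features and labels.
+
+  Args:
+    csv_training_data: A list of (component, content) rows.
+    top_list: The most common words in the dataset, used as the feature
+      vocabulary.
+
+  Returns:
+    A tuple (X, y, index_to_component) where X is a list of feature dicts,
+    y is a list of integer component indices, and index_to_component maps
+    those indices back to component names.
+  """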
+  X = []
+  y = []
+  top_words = {}
+
+  for i, word in enumerate(top_list):
+    top_words[word] = i
+
+  component_to_index = {}
+  index_to_component = {}
+  component_index = 0
+
+  for row in csv_training_data:
+    component, content = row
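+    # A row may list several components; train on the first one only.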
+    component = str(component).split(",")[0]
+
+    if component not in component_to_index:
+      component_to_index[component] = component_index
+      index_to_component[component_index] = component
+      component_index += 1
+
+    X.append(GenerateFeaturesRaw([content],
+                                 COMPONENT_FEATURES,
+                                 top_words))
+    y.append(component_to_index[component])
+
+  return X, y, index_to_component
+
+
+def spam_from_file(f):
+  """Reads a training data file and returns an array."""
+  rows = []
+  skipped_rows = 0
+  for row in csv.reader(f):
+    if len(row) == len(SPAM_COLUMNS):
+      # Throw out email field.
+      rows.append(row[:3])
+    elif len(row) == len(LEGACY_CSV_COLUMNS):
+      rows.append(row)
+    else:
+      skipped_rows += 1
+  return rows, skipped_rows
+
+
+def component_from_file(f):
+  """Reads a training data file and returns an array."""
+  rows = []
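+  # Content fields can be very large; raise csv's default field size limit.
+  # Note that csv.field_size_limit is process-wide, not local to this reader.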
+  csv.field_size_limit(sys.maxsize)
+  for row in csv.reader(f):
+    rows.append(row)
+
+  return rows
+
+
+def setup_ml_engine():
+  """Sets up an instance of ml engine for ml classes."""
+  try:
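+    # Application Default Credentials: the runtime service account in
+    # production, or GOOGLE_APPLICATION_CREDENTIALS when running locally.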
+    credentials = GoogleCredentials.get_application_default()
+    ml_engine = build('ml', 'v1', http=httplib2.Http(), credentials=credentials)
+    return ml_engine
+
+  except (Oauth2ClientError, ApiClientError) as e:
+    logging.error("Error setting up ML Engine API: %s", e)
+    return None