Project import generated by Copybara.

GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/services/test/ml_helpers_test.py b/services/test/ml_helpers_test.py
new file mode 100644
index 0000000..45a29cc
--- /dev/null
+++ b/services/test/ml_helpers_test.py
@@ -0,0 +1,120 @@
+# coding=utf-8
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+
+import io
+import unittest
+
+from services import ml_helpers
+
+
+NUM_WORD_HASHES = 5  # Passed as the hash-count argument in the tests below.
+# Word-to-integer table passed as the top-words argument in the tests below.
+TOP_WORDS = {'cat': 0, 'dog': 1, 'bunny': 2, 'chinchilla': 3, 'hamster': 4}
+NUM_COMPONENT_FEATURES = len(TOP_WORDS)
+
+
+class MLHelpersTest(unittest.TestCase):
+  """Unit tests for helpers in services.ml_helpers."""
+  def testSpamHashFeatures(self):  # Exercises ml_helpers._SpamHashFeatures.
+    hashes = ml_helpers._SpamHashFeatures(tuple(), NUM_WORD_HASHES)
+    self.assertEqual([0, 0, 0, 0, 0], hashes)
+    # Two empty strings: all weight is expected in the first bucket.
+    hashes = ml_helpers._SpamHashFeatures(('', ''), NUM_WORD_HASHES)
+    self.assertEqual([1.0, 0, 0, 0, 0], hashes)
+    # 'abc' appears twice and 'def' once: expects 2/3 and 1/3 in two buckets.
+    hashes = ml_helpers._SpamHashFeatures(('abc', 'abc def'), NUM_WORD_HASHES)
+    self.assertEqual([0, 0, 2 / 3, 0, 1 / 3], hashes)
+
+  def testComponentFeatures(self):
+    """Checks _ComponentFeatures output against the TOP_WORDS table."""
+    features = ml_helpers._ComponentFeatures(['cat dog is not bunny'
+                                              ' chinchilla hamster'],
+                                             NUM_COMPONENT_FEATURES,
+                                             TOP_WORDS)
+    self.assertEqual([1, 1, 1, 1, 1], features)
+    # No TOP_WORDS key appears in the text: expects all zeros.
+    features = ml_helpers._ComponentFeatures(['none of these are features'],
+                                             NUM_COMPONENT_FEATURES,
+                                             TOP_WORDS)
+    self.assertEqual([0, 0, 0, 0, 0], features)
+    # 'hamsters' is not the key 'hamster'; only 'chinchilla' is expected set.
+    features = ml_helpers._ComponentFeatures(['do hamsters look like a'
+                                             ' chinchilla'],
+                                             NUM_COMPONENT_FEATURES,
+                                             TOP_WORDS)
+    self.assertEqual([0, 0, 0, 1, 0], features)
+    # Empty text: expects all zeros.
+    features = ml_helpers._ComponentFeatures([''],
+                                             NUM_COMPONENT_FEATURES,
+                                             TOP_WORDS)
+    self.assertEqual([0, 0, 0, 0, 0], features)
+
+  def testGenerateFeaturesRaw(self):
+    """Checks the 'word_hashes' and 'word_features' of GenerateFeaturesRaw."""
+    features = ml_helpers.GenerateFeaturesRaw(
+        ['abc', 'abc def http://www.google.com http://www.google.com'],
+      NUM_WORD_HASHES)
+    self.assertEqual(
+        [1 / 2.75, 0.0, 1 / 5.5, 0.0, 1 / 2.2], features['word_hashes'])
+    # Same two strings without the URLs.
+    features = ml_helpers.GenerateFeaturesRaw(['abc', 'abc def'],
+      NUM_WORD_HASHES)
+    self.assertEqual([0.0, 0.0, 2 / 3, 0.0, 1 / 3], features['word_hashes'])
+    # NOTE(review): 2nd arg is NUM_COMPONENT_FEATURES here (also 5) -- confirm.
+    features = ml_helpers.GenerateFeaturesRaw(['do hamsters look like a'
+                                               ' chinchilla'],
+                                              NUM_COMPONENT_FEATURES,
+                                              TOP_WORDS)
+    self.assertEqual([0, 0, 0, 1, 0], features['word_features'])
+
+    # BMP Unicode
+    features = ml_helpers.GenerateFeaturesRaw(
+        [u'abc’', u'abc ’ def'], NUM_WORD_HASHES)
+    self.assertEqual([0.0, 0.0, 0.25, 0.25, 0.5], features['word_hashes'])
+
+    # CJK Unicode (U+570B). NOTE(review): this character is still in the BMP.
+    features = ml_helpers.GenerateFeaturesRaw([u'abc國', u'abc 國 def'],
+      NUM_WORD_HASHES)
+    self.assertEqual([0.0, 0.0, 0.25, 0.25, 0.5], features['word_hashes'])
+
+    # A non-unicode bytestring containing unicode characters
+    features = ml_helpers.GenerateFeaturesRaw(['abc…', 'abc … def'],
+      NUM_WORD_HASHES)
+    self.assertEqual([0.25, 0.0, 0.25, 0.25, 0.25], features['word_hashes'])
+
+    # Empty input
+    features = ml_helpers.GenerateFeaturesRaw(['', ''], NUM_WORD_HASHES)
+    self.assertEqual([1.0, 0.0, 0.0, 0.0, 0.0], features['word_hashes'])
+
+  def test_from_file(self):  # spam_from_file on an in-memory 3-row CSV.
+    csv_file = io.StringIO(
+        u'''
+      "spam","the subject 1","the contents 1","spammer@gmail.com"
+      "ham","the subject 2"
+      "spam","the subject 3","the contents 2","spammer2@gmail.com"
+    '''.strip())
+    samples, skipped = ml_helpers.spam_from_file(csv_file)
+    self.assertEqual(len(samples), 2)
+    self.assertEqual(skipped, 1)  # the 2-field "ham" row
+    self.assertEqual(len(samples[1]), 3, 'Strips email')
+    self.assertEqual(samples[1][2], 'the contents 2')
+
+  def test_transform_csv_to_features(self):  # Shapes and labels of X, y.
+    training_data = [
+      ['spam', 'subject 1', 'contents 1'],
+      ['ham', 'subject 2', 'contents 2'],
+      ['spam', 'subject 3', 'contents 3'],
+    ]
+    X, y = ml_helpers.transform_spam_csv_to_features(training_data)
+    # X is expected to be a list of dicts and y a list.
+    self.assertIsInstance(X, list)
+    self.assertIsInstance(X[0], dict)
+    self.assertIsInstance(y, list)
+    # One entry per training row.
+    self.assertEqual(len(X), 3)
+    self.assertEqual(len(y), 3)
+    # 500 word hashes per row; labels are 1 for 'spam' rows, 0 for 'ham'.
+    self.assertEqual(len(X[0]['word_hashes']), 500)
+    self.assertEqual(y, [1, 0, 1])