Project import generated by Copybara.
GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/services/test/ml_helpers_test.py b/services/test/ml_helpers_test.py
new file mode 100644
index 0000000..45a29cc
--- /dev/null
+++ b/services/test/ml_helpers_test.py
@@ -0,0 +1,130 @@
+# coding=utf-8
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+
+import io
+import unittest
+
+from services import ml_helpers
+
+
+NUM_WORD_HASHES = 5
+
+TOP_WORDS = {'cat': 0, 'dog': 1, 'bunny': 2, 'chinchilla': 3, 'hamster': 4}
+NUM_COMPONENT_FEATURES = len(TOP_WORDS)
+
+
+class MLHelpersTest(unittest.TestCase):
+
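+  # _SpamHashFeatures should hash each word into one of NUM_WORD_HASHES
+  # buckets and return the normalized per-bucket word counts.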
+  def testSpamHashFeatures(self):
+    hashes = ml_helpers._SpamHashFeatures(tuple(), NUM_WORD_HASHES)
+    self.assertEqual([0, 0, 0, 0, 0], hashes)
+
+    hashes = ml_helpers._SpamHashFeatures(('', ''), NUM_WORD_HASHES)
+    self.assertEqual([1.0, 0, 0, 0, 0], hashes)
+
+    hashes = ml_helpers._SpamHashFeatures(('abc', 'abc def'), NUM_WORD_HASHES)
+    self.assertEqual([0, 0, 2 / 3, 0, 1 / 3], hashes)
+
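+  # _ComponentFeatures should return a 0/1 indicator per TOP_WORDS entry,
+  # set when that word appears in the given text.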
+  def testComponentFeatures(self):
+
+    features = ml_helpers._ComponentFeatures(['cat dog is not bunny'
+                                              ' chinchilla hamster'],
+                                             NUM_COMPONENT_FEATURES,
+                                             TOP_WORDS)
+    self.assertEqual([1, 1, 1, 1, 1], features)
+
+    features = ml_helpers._ComponentFeatures(['none of these are features'],
+                                             NUM_COMPONENT_FEATURES,
+                                             TOP_WORDS)
+    self.assertEqual([0, 0, 0, 0, 0], features)
+
+    features = ml_helpers._ComponentFeatures(['do hamsters look like a'
+                                              ' chinchilla'],
+                                             NUM_COMPONENT_FEATURES,
+                                             TOP_WORDS)
+    self.assertEqual([0, 0, 0, 1, 0], features)
+
+    features = ml_helpers._ComponentFeatures([''],
+                                             NUM_COMPONENT_FEATURES,
+                                             TOP_WORDS)
+    self.assertEqual([0, 0, 0, 0, 0], features)
+
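+  # GenerateFeaturesRaw returns a dict of feature lists keyed by feature
+  # type, e.g. 'word_hashes' and 'word_features'.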
+  def testGenerateFeaturesRaw(self):
+
+    features = ml_helpers.GenerateFeaturesRaw(
+        ['abc', 'abc def http://www.google.com http://www.google.com'],
+        NUM_WORD_HASHES)
+    self.assertEqual(
+        [1 / 2.75, 0.0, 1 / 5.5, 0.0, 1 / 2.2], features['word_hashes'])
+
+    features = ml_helpers.GenerateFeaturesRaw(['abc', 'abc def'],
+                                              NUM_WORD_HASHES)
+    self.assertEqual([0.0, 0.0, 2 / 3, 0.0, 1 / 3], features['word_hashes'])
+
+    features = ml_helpers.GenerateFeaturesRaw(['do hamsters look like a'
+                                               ' chinchilla'],
+                                              NUM_COMPONENT_FEATURES,
+                                              TOP_WORDS)
+    self.assertEqual([0, 0, 0, 1, 0], features['word_features'])
+
+    # BMP Unicode
+    features = ml_helpers.GenerateFeaturesRaw(
+        [u'abc’', u'abc ’ def'], NUM_WORD_HASHES)
+    self.assertEqual([0.0, 0.0, 0.25, 0.25, 0.5], features['word_hashes'])
+
+    # CJK Unicode (also within the BMP)
+    features = ml_helpers.GenerateFeaturesRaw([u'abc國', u'abc 國 def'],
+                                              NUM_WORD_HASHES)
+    self.assertEqual([0.0, 0.0, 0.25, 0.25, 0.5], features['word_hashes'])
+
+    # A byte string (not unicode) containing UTF-8-encoded characters
+    features = ml_helpers.GenerateFeaturesRaw(['abc…', 'abc … def'],
+                                              NUM_WORD_HASHES)
+    self.assertEqual([0.25, 0.0, 0.25, 0.25, 0.25], features['word_hashes'])
+
+    # Empty input
+    features = ml_helpers.GenerateFeaturesRaw(['', ''], NUM_WORD_HASHES)
+    self.assertEqual([1.0, 0.0, 0.0, 0.0, 0.0], features['word_hashes'])
+
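+  # spam_from_file should skip rows that are too short and strip the
+  # trailing email column from the samples it keeps.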
+  def test_from_file(self):
+    csv_file = io.StringIO(
+        u'''
+ "spam","the subject 1","the contents 1","spammer@gmail.com"
+ "ham","the subject 2"
+ "spam","the subject 3","the contents 2","spammer2@gmail.com"
+        '''.strip())
+    samples, skipped = ml_helpers.spam_from_file(csv_file)
+    self.assertEqual(len(samples), 2)
+    self.assertEqual(skipped, 1)
+    self.assertEqual(len(samples[1]), 3, 'Strips email')
+    self.assertEqual(samples[1][2], 'the contents 2')
+
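+  # transform_spam_csv_to_features should produce one feature dict per row
+  # in X and a binary label in y (spam=1, ham=0).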
+  def test_transform_csv_to_features(self):
+    training_data = [
+        ['spam', 'subject 1', 'contents 1'],
+        ['ham', 'subject 2', 'contents 2'],
+        ['spam', 'subject 3', 'contents 3'],
+    ]
+    X, y = ml_helpers.transform_spam_csv_to_features(training_data)
+
+    self.assertIsInstance(X, list)
+    self.assertIsInstance(X[0], dict)
+    self.assertIsInstance(y, list)
+
+    self.assertEqual(len(X), 3)
+    self.assertEqual(len(y), 3)
+
+    self.assertEqual(len(X[0]['word_hashes']), 500)
+    self.assertEqual(y, [1, 0, 1])