# blob: 45a29cc31ed43a019adfc9b3941fd299f7aa28fa [file] [log] [blame]
# coding=utf-8
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
import io
import unittest
from services import ml_helpers
# Length of the hashed-word feature vector requested throughout these tests.
NUM_WORD_HASHES = 5

# Small vocabulary handed to the component-feature helpers.  Each word maps to
# its position in the tuple below; the tests expect that position to be the
# slot set in the resulting feature vector (e.g. 'chinchilla' -> slot 3).
TOP_WORDS = {
    word: index
    for index, word in enumerate(
        ('cat', 'dog', 'bunny', 'chinchilla', 'hamster'))
}

# One component feature per vocabulary word.
NUM_COMPONENT_FEATURES = len(TOP_WORDS)
class MLHelpersTest(unittest.TestCase):
    """Exercises the feature-generation and CSV helpers in ml_helpers."""

    def testSpamHashFeatures(self):
        """_SpamHashFeatures returns NUM_WORD_HASHES bucket weights."""
        cases = (
            # No content at all: an all-zero vector is expected.
            ((), [0, 0, 0, 0, 0]),
            # Only empty strings: all weight is expected in the first bucket.
            (('', ''), [1.0, 0, 0, 0, 0]),
            # Three words in total ('abc' twice, 'def' once).
            (('abc', 'abc def'), [0, 0, 2 / 3, 0, 1 / 3]),
        )
        for content, expected in cases:
            actual = ml_helpers._SpamHashFeatures(content, NUM_WORD_HASHES)
            self.assertEqual(expected, actual)

    def testComponentFeatures(self):
        """_ComponentFeatures flags which TOP_WORDS occur in the text."""
        cases = (
            # Every vocabulary word is present.
            ('cat dog is not bunny chinchilla hamster', [1, 1, 1, 1, 1]),
            # No vocabulary word is present.
            ('none of these are features', [0, 0, 0, 0, 0]),
            # Only 'chinchilla' is expected to be flagged; the plural
            # 'hamsters' is not expected to count as 'hamster'.
            ('do hamsters look like a chinchilla', [0, 0, 0, 1, 0]),
            # Empty text.
            ('', [0, 0, 0, 0, 0]),
        )
        for text, expected in cases:
            actual = ml_helpers._ComponentFeatures(
                [text], NUM_COMPONENT_FEATURES, TOP_WORDS)
            self.assertEqual(expected, actual)

    def testGenerateFeaturesRaw(self):
        """GenerateFeaturesRaw returns a dict keyed by feature family."""

        def word_hashes(content):
            # Two-argument form; the result is read under 'word_hashes'.
            raw = ml_helpers.GenerateFeaturesRaw(content, NUM_WORD_HASHES)
            return raw['word_hashes']

        # Content that contains the same URL twice.
        self.assertEqual(
            [1 / 2.75, 0.0, 1 / 5.5, 0.0, 1 / 2.2],
            word_hashes(
                ['abc',
                 'abc def http://www.google.com http://www.google.com']))

        # Plain ASCII words.
        self.assertEqual(
            [0.0, 0.0, 2 / 3, 0.0, 1 / 3],
            word_hashes(['abc', 'abc def']))

        # With TOP_WORDS supplied as a third argument the result is read
        # under 'word_features' instead.
        component_features = ml_helpers.GenerateFeaturesRaw(
            ['do hamsters look like a chinchilla'],
            NUM_COMPONENT_FEATURES,
            TOP_WORDS)
        self.assertEqual([0, 0, 0, 1, 0], component_features['word_features'])

        # Unicode strings containing U+2019 (right single quotation mark).
        self.assertEqual(
            [0.0, 0.0, 0.25, 0.25, 0.5],
            word_hashes([u'abc’', u'abc ’ def']))

        # Unicode strings containing a CJK ideograph (U+570B, which is still
        # inside the Basic Multilingual Plane).
        self.assertEqual(
            [0.0, 0.0, 0.25, 0.25, 0.5],
            word_hashes([u'abc國', u'abc 國 def']))

        # Literals without the u prefix that contain a non-ASCII character
        # (bytestrings under Python 2).
        self.assertEqual(
            [0.25, 0.0, 0.25, 0.25, 0.25],
            word_hashes(['abc…', 'abc … def']))

        # Empty input.
        self.assertEqual(
            [1.0, 0.0, 0.0, 0.0, 0.0],
            word_hashes(['', '']))

    def test_from_file(self):
        """spam_from_file returns the parsed samples and a skipped count."""
        # Three CSV rows; the middle one has only two columns.
        raw_csv = u'''
"spam","the subject 1","the contents 1","spammer@gmail.com"
"ham","the subject 2"
"spam","the subject 3","the contents 2","spammer2@gmail.com"
'''.strip()

        samples, skipped = ml_helpers.spam_from_file(io.StringIO(raw_csv))

        self.assertEqual(len(samples), 2)
        self.assertEqual(skipped, 1)

        # The second kept sample is expected to have three fields (no
        # trailing email column).
        second_sample = samples[1]
        self.assertEqual(len(second_sample), 3, 'Strips email')
        self.assertEqual(second_sample[2], 'the contents 2')

    def test_transform_csv_to_features(self):
        """transform_spam_csv_to_features returns parallel feature/label lists."""
        rows = [
            ['spam', 'subject 1', 'contents 1'],
            ['ham', 'subject 2', 'contents 2'],
            ['spam', 'subject 3', 'contents 3'],
        ]

        features, labels = ml_helpers.transform_spam_csv_to_features(rows)

        # Container types of the two results.
        self.assertIsInstance(features, list)
        self.assertIsInstance(features[0], dict)
        self.assertIsInstance(labels, list)

        # One entry per input row, each with 500 word hashes.
        self.assertEqual(len(features), 3)
        self.assertEqual(len(labels), 3)
        self.assertEqual(len(features[0]['word_hashes']), 500)

        # 'spam' rows are expected to be labelled 1 and 'ham' rows 0.
        self.assertEqual(labels, [1, 0, 1])