# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Or at https://developers.google.com/open-source/licenses/bsd

"""
Helper functions for spam and component classification. These are mostly for
feature extraction, so that the serving code and training code both use the same
set of features.

TODO(jeffcarp): This file is a duplicate of services/ml_helpers.py
  (with slight differences). Will eventually be merged into one.
"""

from __future__ import absolute_import

import csv
import hashlib
import re
import sys

# Column layouts of the spam training CSVs; newer exports append the
# reporter's email address as a fourth column.
SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']

# Regex alternatives used as token boundaries when splitting text for
# feature hashing.
DELIMITERS = [r'\s', r'\,', r'\.', r'\?', r'!', r'\:', r'\(', r'\)']

# Must be identical to settings.spam_feature_hashes.
SPAM_FEATURE_HASHES = 500
# Must be identical to settings.component_features.
COMPONENT_FEATURES = 5000
29
30
31def _ComponentFeatures(content, num_features, top_words):
32 """
33 This uses the most common words in the entire dataset as features.
34 The count of common words in the issue comments makes up the features.
35 """
36
37 features = [0] * num_features
38 for blob in content:
39 words = blob.split()
40 for word in words:
41 if word in top_words:
42 features[top_words[word]] += 1
43
44 return features
45
46
def _SpamHashFeatures(content, num_features):
  """
  Feature hashing is a fast and compact way to turn a string of text into a
  vector of feature values for classification and training.
  See also: https://en.wikipedia.org/wiki/Feature_hashing
  This is a simple implementation that doesn't try to minimize collisions
  or anything else fancy.

  Args:
    content: Iterable of UTF-8 bytestring blobs to hash.
    num_features: Length of the returned feature vector.

  Returns:
    A list of num_features floats, normalized so they sum to 1.0 when any
    words were seen (all zeros for empty input).
  """
  # The delimiter pattern is loop-invariant: compile it once here instead of
  # re-joining and re-encoding it for every blob. The pattern is bytes
  # because the blobs are UTF-8 encoded bytestrings.
  delimiter_re = re.compile('|'.join(DELIMITERS).encode('utf-8'))

  features = [0] * num_features
  total = 0.0
  for blob in content:
    for word in delimiter_re.split(blob):
      # Bucket each token by its SHA-1 digest, reduced modulo the vector
      # length. (int % int is already an int; no extra cast needed.)
      feature_index = int(hashlib.sha1(word).hexdigest(), 16) % num_features
      features[feature_index] += 1.0
      total += 1.0

  # Normalize to relative frequencies so vectors from blobs of different
  # lengths are comparable.
  if total > 0:
    features = [f / total for f in features]

  return features
69
70
def GenerateFeaturesRaw(content, num_features, top_words=None):
  """Generates a vector of features for a given issue or comment.

  Args:
    content: List of strings: the issue's description and comments.
    num_features: The number of features to generate.
    top_words: Optional dict mapping each common corpus word to its feature
      index. When provided, word-count features are produced; otherwise
      hashed features are produced.

  Returns:
    A single-key dict: {'word_features': [...]} when top_words is given,
    else {'word_hashes': [...]}.
  """
  # Encode unicode text to UTF-8 bytestrings for the hashers. Build a new
  # list rather than mutating the caller's (the original code rewrote
  # `content` in place), and pass through values that are already
  # bytestrings instead of crashing on bytes.encode under Python 3.
  encoded = [
      value.encode('utf-8') if isinstance(value, str) else value
      for value in content
  ]

  if top_words:
    return {'word_features': _ComponentFeatures(encoded,
                                                num_features,
                                                top_words)}

  return {'word_hashes': _SpamHashFeatures(encoded, num_features)}
87
88
def transform_spam_csv_to_features(contents, labels):
  """Generate parallel arrays of feature dicts and 0/1 spam targets."""
  features = []
  targets = []
  # Each row of `contents` is a (subject, content) pair; `labels` is indexed
  # in lockstep with it.
  for i, (subject, content) in enumerate(contents):
    feature_vector = GenerateFeaturesRaw(
        [str(subject), str(content)], SPAM_FEATURE_HASHES)
    features.append(feature_vector)
    targets.append(1 if labels[i] == 'spam' else 0)
  return features, targets
101
102
def transform_component_csv_to_features(contents, labels, top_list):
  """Generate arrays of features and integer targets for components.

  Returns (features, targets, index_to_component), where targets are dense
  integer ids assigned to components in first-seen order and
  index_to_component maps those ids back to component names.
  """
  # Rank each common word by its position in top_list.
  top_words = {word: rank for rank, word in enumerate(top_list)}

  features = []
  targets = []
  component_to_index = {}
  index_to_component = {}

  for i, content in enumerate(contents):
    # A row may carry several comma-separated components; train on the first.
    component = str(labels[i]).split(",")[0]

    if component not in component_to_index:
      # Assign the next dense index in first-seen order.
      next_index = len(component_to_index)
      component_to_index[component] = next_index
      index_to_component[next_index] = component

    features.append(GenerateFeaturesRaw([content],
                                        COMPONENT_FEATURES,
                                        top_words))
    targets.append(component_to_index[component])

  return features, targets, index_to_component
132
133
def spam_from_file(f):
  """Reads a training data file and returns arrays of contents and labels.

  Rows with fewer columns than the legacy layout are counted and skipped;
  the count is returned as the third element.
  """
  contents = []
  labels = []
  skipped_rows = 0
  min_columns = len(LEGACY_CSV_COLUMNS)
  for row in csv.reader(f):
    if len(row) < min_columns:
      skipped_rows += 1
      continue
    labels.append(row[0])
    # Keep only subject and content; any trailing email field is dropped.
    contents.append(row[1:3])
  return contents, labels, skipped_rows
147
148
def component_from_file(f):
  """Reads a training data file and returns arrays of contents and labels."""
  # Training rows can be very large, so lift the csv module's field size cap.
  csv.field_size_limit(sys.maxsize)
  contents = []
  labels = []
  # Each row is exactly (label, content).
  for label, content in csv.reader(f):
    labels.append(label)
    contents.append(content)
  return contents, labels