blob: 6db23d480919d2a0116af3ddb813dca090a1d181 [file] [log] [blame]
Adrià Vilanova Martínezf19ea432024-01-23 20:20:52 +01001# Copyright 2018 The Chromium Authors
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
Copybara854996b2021-09-07 19:36:02 +00004
5"""
6Helper functions for spam and component classification. These are mostly for
7feature extraction, so that the serving code and training code both use the same
8set of features.
9"""
Adrià Vilanova Martínezde942802022-07-15 14:06:55 +020010# TODO(crbug.com/monorail/7515): DELETE THIS FILE and all references.
Copybara854996b2021-09-07 19:36:02 +000011
12from __future__ import division
13from __future__ import print_function
14from __future__ import absolute_import
15
16import csv
17import hashlib
Copybara854996b2021-09-07 19:36:02 +000018import logging
19import re
20import sys
21
22from six import text_type
23
24from apiclient.discovery import build
25from apiclient.errors import Error as ApiClientError
26from oauth2client.client import GoogleCredentials
27from oauth2client.client import Error as Oauth2ClientError
28
29
# Column layouts for spam training CSVs; newer exports carry the reporter's
# email as a fourth column, legacy exports do not.
SPAM_COLUMNS = ['verdict', 'subject', 'content', 'email']
LEGACY_CSV_COLUMNS = ['verdict', 'subject', 'content']
# Regex alternatives joined with '|' to tokenize text in _SpamHashFeatures.
DELIMITERS = [r'\s', r'\,', r'\.', r'\?', '!', r'\:', r'\(', r'\)']

# Must be identical to settings.spam_feature_hashes.
SPAM_FEATURE_HASHES = 500
# Must be identical to settings.component_features.
COMPONENT_FEATURES = 5000
38
39
def _ComponentFeatures(content, num_features, top_words):
  """Builds a bag-of-words count vector over the dataset's common words.

  This uses the most common words in the entire dataset as features.
  The count of common words in the issue comments makes up the features.

  Args:
    content: iterable of text blobs (issue description and comments).
    num_features: length of the returned feature vector.
    top_words: dict mapping each common word to its feature index.

  Returns:
    A list of num_features integer counts.
  """
  counts = [0] * num_features
  for blob in content:
    for token in blob.split():
      index = top_words.get(token)
      if index is not None:
        counts[index] += 1
  return counts
54
55
def _SpamHashFeatures(content, num_features):
  """Hashes words in the content into a normalized feature vector.

  Feature hashing is a fast and compact way to turn a string of text into a
  vector of feature values for classification and training.
  See also: https://en.wikipedia.org/wiki/Feature_hashing
  This is a simple implementation that doesn't try to minimize collisions
  or anything else fancy.

  Args:
    content: iterable of text blobs.
    num_features: length of the returned feature vector.

  Returns:
    A list of num_features floats.  If any tokens were seen, each entry is
    that hash bucket's share of the total token count (entries sum to 1.0);
    otherwise all zeros.
  """
  features = [0] * num_features
  total = 0.0
  for blob in content:
    for word in re.split('|'.join(DELIMITERS), blob):
      # Hash bytes, not text: convert real unicode strings to bytestrings.
      # six.text_type was only needed for Python 2 compatibility; on
      # Python 3, str is the text type.
      encoded_word = word.encode('utf-8') if isinstance(word, str) else word
      feature_index = int(
          int(hashlib.sha1(encoded_word).hexdigest(), 16) % num_features)
      features[feature_index] += 1.0
      total += 1.0

  if total > 0:
    features = [f / total for f in features]

  return features
82
83
def GenerateFeaturesRaw(content, num_features, top_words=None):
  """Generates a vector of features for a given issue or comment.

  Args:
    content: The content of the issue's description and comments.
    num_features: The number of features to generate.
    top_words: Optional dict mapping common words to feature indexes.  When
      truthy, word-count features are produced; otherwise hashed features.

  Returns:
    A single-entry dict: {'word_features': [...]} when top_words is given,
    {'word_hashes': [...]} otherwise.
  """
  if not top_words:
    return {'word_hashes': _SpamHashFeatures(content, num_features)}
  return {
      'word_features': _ComponentFeatures(content, num_features, top_words)
  }
97
98
def transform_spam_csv_to_features(csv_training_data):
  """Turns spam CSV rows into parallel feature and label arrays.

  Args:
    csv_training_data: list of (verdict, subject, content[, email]) rows,
      possibly wrapped in an extra outer list.

  Returns:
    A tuple (X, y) where X is a list of hashed-feature dicts and y is a
    list of 1/0 labels (1 means the row's verdict was 'spam').
  """
  # Handle if the list is double-wrapped.
  if csv_training_data and len(csv_training_data[0]) > 4:
    csv_training_data = csv_training_data[0]

  X = []
  y = []
  for row in csv_training_data:
    if len(row) == 4:
      # Drop the trailing email field.
      verdict, subject, content = row[:3]
    else:
      verdict, subject, content = row
    X.append(GenerateFeaturesRaw([str(subject), str(content)],
                                 SPAM_FEATURE_HASHES))
    y.append(int(verdict == 'spam'))
  return X, y
116
117
def transform_component_csv_to_features(csv_training_data, top_list):
  """Turns component CSV rows into feature/label arrays plus a label map.

  Args:
    csv_training_data: iterable of (component, content) rows.
    top_list: list of the most common words; a word's list position becomes
      its feature index.

  Returns:
    A tuple (X, y, index_to_component) where X is a list of word-count
    feature dicts, y is a list of integer component indexes, and
    index_to_component maps each index back to its component name.
  """
  X = []
  y = []
  # Idiomatic enumerate instead of indexing top_list by range(len(...)).
  top_words = {word: i for i, word in enumerate(top_list)}

  component_to_index = {}
  index_to_component = {}

  for row in csv_training_data:
    component, content = row
    # Rows may list several components; label with the first one only.
    component = str(component).split(",")[0]

    if component not in component_to_index:
      # Next free index is simply the number of components seen so far.
      index = len(component_to_index)
      component_to_index[component] = index
      index_to_component[index] = component

    X.append(GenerateFeaturesRaw([content], COMPONENT_FEATURES, top_words))
    y.append(component_to_index[component])

  return X, y, index_to_component
145
146
def spam_from_file(f):
  """Reads a spam training data file and returns its usable rows.

  Args:
    f: an open file (or any iterable of CSV lines).

  Returns:
    A tuple (rows, skipped_rows): rows is a list of 3-column
    [verdict, subject, content] lists, skipped_rows counts lines whose
    column count matched neither the current nor the legacy layout.
  """
  rows = []
  skipped = 0
  for row in csv.reader(f):
    width = len(row)
    if width == len(SPAM_COLUMNS):
      # Throw out email field.
      rows.append(row[:3])
    elif width == len(LEGACY_CSV_COLUMNS):
      rows.append(row)
    else:
      skipped += 1
  return rows, skipped
160
161
def component_from_file(f):
  """Reads a component training data file and returns all of its rows.

  Args:
    f: an open file (or any iterable of CSV lines).

  Returns:
    A list of rows, each a list of column strings.
  """
  # Component 'content' cells can be huge; lift the default field cap.
  csv.field_size_limit(sys.maxsize)
  return list(csv.reader(f))
170
171
def setup_ml_engine():
  """Builds an authenticated Cloud ML Engine ('ml' v1) API client.

  Returns:
    A googleapiclient service object on success, or None if application
    default credentials could not be obtained or the client could not be
    built (the error is logged).
  """
  try:
    credentials = GoogleCredentials.get_application_default()
    return build('ml', 'v1', credentials=credentials)

  except (Oauth2ClientError, ApiClientError) as e:
    # Log the actual exception (not just its class, as sys.exc_info()[0]
    # gave), and let logging do the %-formatting lazily.
    logging.error("Error setting up ML Engine API: %s", e)