blob: bb5769930f337a3fb08f4862d1bdaf4f0086dc8d [file] [log] [blame]
Copybara854996b2021-09-07 19:36:02 +00001# Copyright 2019 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4# Or at https://developers.google.com/open-source/licenses/bsd
5
6from __future__ import absolute_import
7
8import os
9
10from trainer2 import train_ml_helpers
11from trainer2.stopwords import STOP_WORDS
12
13
def GenerateTopWords(word_dict):
  """Returns the most frequent words in word_dict that are not stop words.

  Words are ranked by descending count; words with equal counts keep the order
  in which word_dict iterates them. Stop words come from the imported
  STOP_WORDS collection, UTF-8 encoded so that they compare equal to the
  encoded keys of word_dict.

  Args:
    word_dict: Dict mapping each word (a UTF-8 encoded byte string) to the
      number of times it occurs.

  Returns:
    A list of up to train_ml_helpers.COMPONENT_FEATURES words, most frequent
    first. The list is shorter when word_dict holds fewer non-stop words than
    that, instead of raising IndexError.
  """
  # A set gives constant-time membership tests inside the loop below.
  stop_words = frozenset(s.encode('utf-8') for s in STOP_WORDS)
  limit = train_ml_helpers.COMPONENT_FEATURES
  top_words = []

  # Iterating the ranked words directly (rather than indexing until the quota
  # is met) means running out of candidates simply ends the loop.
  for word in sorted(word_dict, key=word_dict.get, reverse=True):
    if len(top_words) >= limit:
      break
    if word not in stop_words:
      top_words.append(word)

  return top_words
28
29
def parse_words_from_content(contents):
  """Extracts the top (most common) words from a list of strings.

  Each string is UTF-8 encoded and split on whitespace; the occurrences of
  every resulting word are tallied across all strings and the tally is handed
  to GenerateTopWords to pick the top words.
  """
  counts = {}
  for text in contents:
    for token in text.encode('utf-8').split():
      # First sighting starts from zero; later sightings bump the tally.
      counts[token] = counts.get(token, 0) + 1

  return GenerateTopWords(counts)
43
44
def make_top_words_list(contents, job_dir):
  """Returns the top (most common) words in the entire dataset for component
  prediction, caching them in job_dir.

  If job_dir already holds a readable topwords.txt, the whitespace-separated
  words from that file are simply returned. Otherwise, the most common words
  are determined from contents and written to topwords.txt in job_dir (one
  word per line), before being returned.

  Args:
    contents: Iterable of text strings to count words in. Only consulted when
      no readable topwords.txt exists yet.
    job_dir: Directory that holds topwords.txt, with or without a trailing
      path separator. It is created (including missing parents) if absent.

  Returns:
    A list of the most common words in the dataset as byte strings (the number
    of them determined by train_ml_helpers.COMPONENT_FEATURES when freshly
    computed).
  """
  if not os.path.exists(job_dir):
    os.makedirs(job_dir)

  # Join instead of concatenating so that a job_dir without a trailing
  # separator still resolves to a file inside the directory.
  top_words_path = os.path.join(job_dir, 'topwords.txt')

  if os.access(top_words_path, os.R_OK):
    print("Found topwords.txt")
    with open(top_words_path, 'rb') as f:
      return f.read().split()

  top_words = parse_words_from_content(contents)
  # The words are already UTF-8 bytes and the cache is read back in binary
  # mode, so write the bytes directly rather than decoding and re-encoding
  # them with the locale's default encoding.
  with open(top_words_path, 'wb') as f:
    f.writelines(word + b'\n' for word in top_words)
  return top_words