Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 1 | # Copyright 2019 The Chromium Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | # Or at https://developers.google.com/open-source/licenses/bsd |
| 5 | |
| 6 | from __future__ import absolute_import |
| 7 | |
| 8 | import os |
| 9 | |
| 10 | from trainer2 import train_ml_helpers |
| 11 | from trainer2.stopwords import STOP_WORDS |
| 12 | |
| 13 | |
def GenerateTopWords(word_dict):
  """Return the most frequent non-stop words from a word-count mapping.

  Words are ranked by descending count and any word that appears in
  STOP_WORDS (compared after UTF-8 encoding) is skipped.

  Args:
    word_dict: Mapping of word -> occurrence count. Keys are expected to be
      UTF-8 encoded, since they are compared against encoded stop words.

  Returns:
    A list of at most train_ml_helpers.COMPONENT_FEATURES words, most
    frequent first. The list is shorter when word_dict does not contain
    enough non-stop words (previously this case raised IndexError).
  """
  # A set gives constant-time membership tests; the encoding matches the
  # encoded keys of word_dict.
  stop_words = frozenset(s.encode('utf-8') for s in STOP_WORDS)
  limit = train_ml_helpers.COMPONENT_FEATURES
  top_words = []

  for word in sorted(word_dict, key=word_dict.get, reverse=True):
    # Check the limit first so a limit of zero yields an empty list.
    if len(top_words) >= limit:
      break
    if word not in stop_words:
      top_words.append(word)

  return top_words
| 28 | |
| 29 | |
def parse_words_from_content(contents):
  """Count the words in the given strings and return the most common ones.

  Each string is UTF-8 encoded and split on whitespace; the resulting
  per-word totals are handed to GenerateTopWords, whose result is returned.

  Args:
    contents: Iterable of text strings.

  Returns:
    Whatever GenerateTopWords returns for the accumulated word counts.
  """
  counts = {}
  for text in contents:
    for token in text.encode('utf-8').split():
      counts[token] = counts.get(token, 0) + 1

  return GenerateTopWords(counts)
| 43 | |
| 44 | |
def make_top_words_list(contents, job_dir):
  """Return the top (most common) words in the dataset, cached in job_dir.

  If a readable 'topwords.txt' already exists in job_dir, its
  whitespace-separated words are returned as-is. Otherwise the most common
  words are computed with parse_words_from_content, written to
  'topwords.txt' in job_dir (one word per line) and then returned.

  Args:
    contents: Iterable of text strings making up the dataset. Only used when
      no cached word list is found.
    job_dir: Directory holding the cache file. It is created, including any
      missing parents, if it does not exist. A trailing path separator is
      optional.

  Returns:
    A list of the most common words in the dataset (the number of them
    determined by train_ml_helpers.COMPONENT_FEATURES). Words read from the
    cache file are bytes, because the file is opened in binary mode.
  """
  if not os.path.exists(job_dir):
    # makedirs also creates missing parent directories.
    os.makedirs(job_dir)

  # os.path.join keeps the file inside job_dir whether or not job_dir ends
  # with a separator; plain concatenation produced a sibling path such as
  # '/tmp/jobtopwords.txt' when it did not.
  top_words_path = os.path.join(job_dir, 'topwords.txt')

  if os.access(top_words_path, os.R_OK):
    print("Found topwords.txt")
    with open(top_words_path, 'rb') as f:
      top_words = f.read().split()
  else:
    top_words = parse_words_from_content(contents)
    # Write the UTF-8 encoded words as raw bytes, mirroring the binary read
    # above, so the output does not depend on the locale's default encoding.
    with open(top_words_path, 'wb') as f:
      for word in top_words:
        f.write(word + b'\n')
  return top_words