Blame - tools/ml/trainer2/top_words.py - monorail-avm99963

blob: bb5769930f337a3fb08f4862d1bdaf4f0086dc8d [file] [log] [blame]

Copybara	854996b	2021-09-07 19:36:02 +0000	[diff] [blame]	1	# Copyright 2019 The Chromium Authors. All rights reserved.
				2	# Use of this source code is governed by a BSD-style license that can be
				3	# found in the LICENSE file.
				4	# Or at https://developers.google.com/open-source/licenses/bsd
				5
				6	from __future__ import absolute_import
				7
				8	import os
				9
				10	from trainer2 import train_ml_helpers
				11	from trainer2.stopwords import STOP_WORDS
				12
				13
				14	def GenerateTopWords(word_dict):
				15	"""Requires ./stopwords.txt exist in folder for the function to run.
				16	"""
				17	stop_words = [s.encode('utf-8') for s in STOP_WORDS]
				18	sorted_words = sorted(word_dict, key=word_dict.get, reverse=True)
				19	top_words = []
				20	index = 0
				21
				22	while len(top_words) < train_ml_helpers.COMPONENT_FEATURES:
				23	if sorted_words[index] not in stop_words:
				24	top_words.append(sorted_words[index])
				25	index += 1
				26
				27	return top_words
				28
				29
				30	def parse_words_from_content(contents):
				31	"""Returns given list of strings, extract the top (most common) words.
				32	"""
				33	word_dict = {}
				34	for content in contents:
				35	words = content.encode('utf-8').split()
				36	for word in words:
				37	if word in word_dict:
				38	word_dict[word] += 1
				39	else:
				40	word_dict[word] = 1
				41
				42	return GenerateTopWords(word_dict)
				43
				44
				45	def make_top_words_list(contents, job_dir):
				46	"""Returns the top (most common) words in the entire dataset for component
				47	prediction. If a file is already stored in job_dir containing these words, the
				48	words from the file are simply returned. Otherwise, the most common words are
				49	determined and written to job_dir, before being returned.
				50
				51	Returns:
				52	A list of the most common words in the dataset (the number of them
				53	determined by train_ml_helpers.COMPONENT_FEATURES).
				54	"""
				55	if not os.path.exists(job_dir):
				56	os.mkdir(job_dir)
				57	if os.access(job_dir + 'topwords.txt', os.R_OK):
				58	print("Found topwords.txt")
				59	with open(job_dir + 'topwords.txt', 'rb') as f:
				60	top_words = f.read().split()
				61	else:
				62	top_words = parse_words_from_content(contents)
				63	with open(job_dir + 'topwords.txt', 'w') as f:
				64	for word in top_words:
				65	f.write('%s\n' % word.decode('utf-8'))
				66	return top_words