tools/ml/trainer2/top_words.py - monorail-avm99963 - Gitiles

 # Copyright 2019 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 # Or at https://developers.google.com/open-source/licenses/bsd

 from __future__ import absolute_import

 import os

 from trainer2 import train_ml_helpers
 from trainer2.stopwords import STOP_WORDS


 def GenerateTopWords(word_dict):
   """Returns the most frequent words in word_dict that are not stop words.

   Stop words come from trainer2.stopwords.STOP_WORDS and are UTF-8 encoded
   before being compared with the keys of word_dict.

   Args:
     word_dict: dict mapping a word to its number of occurrences.

   Returns:
     A list of at most train_ml_helpers.COMPONENT_FEATURES words, ordered
     from most to least frequent. The list is shorter when word_dict holds
     fewer non-stop words than that limit.
   """
   # A set makes each membership test O(1) instead of scanning a list.
   stop_words = set(s.encode('utf-8') for s in STOP_WORDS)
   limit = train_ml_helpers.COMPONENT_FEATURES
   top_words = []

   # Iterating (rather than indexing with a counter) stops cleanly when the
   # vocabulary runs out instead of raising IndexError.
   for word in sorted(word_dict, key=word_dict.get, reverse=True):
     if len(top_words) >= limit:
       break
     if word not in stop_words:
       top_words.append(word)

   return top_words


 def parse_words_from_content(contents):
   """Tallies the words found in contents and returns the top ones.

   Every string in contents is UTF-8 encoded and split on whitespace. The
   per-word occurrence counts are then handed to GenerateTopWords, whose
   result is returned unchanged.
   """
   counts = {}
   for text in contents:
     for token in text.encode('utf-8').split():
       counts[token] = counts.get(token, 0) + 1

   return GenerateTopWords(counts)


 def make_top_words_list(contents, job_dir):
   """Returns the top (most common) words in the dataset, cached in job_dir.

   If job_dir already contains a readable topwords.txt, the whitespace
   separated words stored in it are returned. Otherwise the words are
   computed with parse_words_from_content, written to topwords.txt (one word
   per line) and returned.

   Args:
     contents: the text strings to extract words from; only used when no
       cached topwords.txt exists.
     job_dir: directory holding topwords.txt. It is created when missing and
       may be given with or without a trailing path separator.

   Returns:
     A list of the most common words, each as a UTF-8 encoded byte string.
   """
   if not os.path.exists(job_dir):
     os.mkdir(job_dir)

   # Join instead of concatenating so that a job_dir without a trailing
   # separator still places the file inside the directory.
   top_words_path = os.path.join(job_dir, 'topwords.txt')

   if os.access(top_words_path, os.R_OK):
     print("Found topwords.txt")
     with open(top_words_path, 'rb') as f:
       return f.read().split()

   top_words = parse_words_from_content(contents)
   # The words are already UTF-8 bytes; writing them in binary mode keeps the
   # file independent of the locale encoding and symmetric with the 'rb' read.
   with open(top_words_path, 'wb') as f:
     for word in top_words:
       f.write(word + b'\n')
   return top_words
	# Copyright 2019 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	# Or at https://developers.google.com/open-source/licenses/bsd

	from __future__ import absolute_import

	import os

	from trainer2 import train_ml_helpers
	from trainer2.stopwords import STOP_WORDS


	def GenerateTopWords(word_dict):
	  """Returns the most frequent words in word_dict that are not stop words.

	  Stop words come from trainer2.stopwords.STOP_WORDS and are UTF-8 encoded
	  before being compared with the keys of word_dict.

	  Args:
	    word_dict: dict mapping a word to its number of occurrences.

	  Returns:
	    A list of at most train_ml_helpers.COMPONENT_FEATURES words, ordered
	    from most to least frequent. The list is shorter when word_dict holds
	    fewer non-stop words than that limit.
	  """
	  # A set makes each membership test O(1) instead of scanning a list.
	  stop_words = set(s.encode('utf-8') for s in STOP_WORDS)
	  limit = train_ml_helpers.COMPONENT_FEATURES
	  top_words = []

	  # Iterating (rather than indexing with a counter) stops cleanly when the
	  # vocabulary runs out instead of raising IndexError.
	  for word in sorted(word_dict, key=word_dict.get, reverse=True):
	    if len(top_words) >= limit:
	      break
	    if word not in stop_words:
	      top_words.append(word)

	  return top_words


	def parse_words_from_content(contents):
	  """Tallies the words found in contents and returns the top ones.

	  Every string in contents is UTF-8 encoded and split on whitespace. The
	  per-word occurrence counts are then handed to GenerateTopWords, whose
	  result is returned unchanged.
	  """
	  counts = {}
	  for text in contents:
	    for token in text.encode('utf-8').split():
	      counts[token] = counts.get(token, 0) + 1

	  return GenerateTopWords(counts)


	def make_top_words_list(contents, job_dir):
	  """Returns the top (most common) words in the dataset, cached in job_dir.

	  If job_dir already contains a readable topwords.txt, the whitespace
	  separated words stored in it are returned. Otherwise the words are
	  computed with parse_words_from_content, written to topwords.txt (one word
	  per line) and returned.

	  Args:
	    contents: the text strings to extract words from; only used when no
	      cached topwords.txt exists.
	    job_dir: directory holding topwords.txt. It is created when missing and
	      may be given with or without a trailing path separator.

	  Returns:
	    A list of the most common words, each as a UTF-8 encoded byte string.
	  """
	  if not os.path.exists(job_dir):
	    os.mkdir(job_dir)

	  # Join instead of concatenating so that a job_dir without a trailing
	  # separator still places the file inside the directory.
	  top_words_path = os.path.join(job_dir, 'topwords.txt')

	  if os.access(top_words_path, os.R_OK):
	    print("Found topwords.txt")
	    with open(top_words_path, 'rb') as f:
	      return f.read().split()

	  top_words = parse_words_from_content(contents)
	  # The words are already UTF-8 bytes; writing them in binary mode keeps the
	  # file independent of the locale encoding and symmetric with the 'rb' read.
	  with open(top_words_path, 'wb') as f:
	    for word in top_words:
	      f.write(word + b'\n')
	  return top_words