features/generate_dataset.py - monorail-avm99963 - Gitiles

 # Copyright 2018 The Chromium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 """This module is used to go from raw data to a csv dataset to build models for
    component prediction.
 """
 from __future__ import print_function
 from __future__ import division
 from __future__ import absolute_import

 import argparse
 import string
 import sys
 import csv
 import re
 import logging
 import random
 import time
 import os
 import settings
 from framework import sql

 if not settings.unit_test_mode:
   import MySQLdb as mdb
 ISSUE_LIMIT = 7000
 ISSUES_PER_RUN = 50
 COMPONENT_PREDICTOR_PROJECT = 16

 def build_component_dataset(issue, csv_file):
   """Main function to build dataset for training models.

   Args:
     issue: The issue service with set up data.
     csv_file: The csv file path to store the dataset.
   """

   logging.info('Building dataset')
   con = sql.MonorailConnection()

   csv_writer = csv.writer(csv_file)

   logging.info('Downloading the dataset from database.')

   issue_table = sql.SQLTableManager('Issue')
   issue_component_table = sql.SQLTableManager('Issue2Component')
   closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

   close = closed_index_table.SelectValue(con, col='closed_index')

   last_close = issue_table.Select(con,
                                   cols=['closed'],
                                   where=[('closed > %s', [str(close)]),
                                          ('project_id = %s',
                                           [str(COMPONENT_PREDICTOR_PROJECT)])],
                                   order_by=[('closed', [])],
                                   limit=ISSUE_LIMIT)[-1][0]

   issue_ids = issue_table.Select(con,
                               cols=['id'],
                               where=[('closed > %s', [str(close)]),
                                      ('closed <= %s', [str(last_close)]),
                                      ('project_id = %s',
                                       [str(COMPONENT_PREDICTOR_PROJECT)])])


   logging.info('Close: ' + str(close))
   logging.info('Last close: ' + str(last_close))

   # Get the comments and components for 50 issues at a time so as to not
   # overwhelm a single shard with all 7000 issues at once
   for i in range(0, len(issue_ids), ISSUES_PER_RUN):
     issue_list = [str(x[0]) for x in issue_ids[i:i+ISSUES_PER_RUN]]

     comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)

     shard_id = random.randint(0, settings.num_logical_shards - 1)

     components = issue_component_table.Select(con,
                                         cols=['issue_id',
                                               'GROUP_CONCAT(component_id '
                                               + 'SEPARATOR \',\')'],
                                         joins=[('ComponentDef ON '
                                                 'ComponentDef.id = '
                                                 'Issue2Component.component_id',
                                                 [])],
                                         where=[('(deprecated = %s OR deprecated'
                                                 ' IS NULL)', [False]),
                                                 ('is_deleted = %s', [False])],
                                         group_by=['issue_id'],
                                         shard_id=shard_id,
                                         issue_id=issue_list)

     for issue_id, component_ids in components:
       comment_string = ' '.join(
           [comment.content for comment in comments[issue_id]])

       final_text = CleanText(comment_string)

       final_issue = component_ids, final_text
       csv_writer.writerow(final_issue)

   closed_index_table.Update(con, delta={'closed_index' : last_close})

   return csv_file


 def CleanText(text):
   """Cleans provided text by lower casing words, removing punctuation, and
   normalizing spacing so that there is exactly one space between each word.

   Args:
     text: Raw text to be cleaned.

   Returns:
     Cleaned version of text.

   """

   pretty_issue = text.lower().strip()

   quoteless_issue = re.sub('\'', '', pretty_issue)
   no_punctuation_issue = re.sub(r'[^\w\s]|_+', ' ', quoteless_issue)
   one_space_issue = ' '.join(no_punctuation_issue.split())

   return one_space_issue
	# Copyright 2018 The Chromium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	"""This module is used to go from raw data to a csv dataset to build models for
	component prediction.
	"""
	from __future__ import print_function
	from __future__ import division
	from __future__ import absolute_import

	import argparse
	import string
	import sys
	import csv
	import re
	import logging
	import random
	import time
	import os
	import settings
	from framework import sql

	if not settings.unit_test_mode:
	import MySQLdb as mdb
	ISSUE_LIMIT = 7000
	ISSUES_PER_RUN = 50
	COMPONENT_PREDICTOR_PROJECT = 16

	def build_component_dataset(issue, csv_file):
	"""Main function to build dataset for training models.

	Args:
	issue: The issue service with set up data.
	csv_file: The csv file path to store the dataset.
	"""

	logging.info('Building dataset')
	con = sql.MonorailConnection()

	csv_writer = csv.writer(csv_file)

	logging.info('Downloading the dataset from database.')

	issue_table = sql.SQLTableManager('Issue')
	issue_component_table = sql.SQLTableManager('Issue2Component')
	closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

	close = closed_index_table.SelectValue(con, col='closed_index')

	last_close = issue_table.Select(con,
	cols=['closed'],
	where=[('closed > %s', [str(close)]),
	('project_id = %s',
	[str(COMPONENT_PREDICTOR_PROJECT)])],
	order_by=[('closed', [])],
	limit=ISSUE_LIMIT)[-1][0]

	issue_ids = issue_table.Select(con,
	cols=['id'],
	where=[('closed > %s', [str(close)]),
	('closed <= %s', [str(last_close)]),
	('project_id = %s',
	[str(COMPONENT_PREDICTOR_PROJECT)])])


	logging.info('Close: ' + str(close))
	logging.info('Last close: ' + str(last_close))

	# Get the comments and components for 50 issues at a time so as to not
	# overwhelm a single shard with all 7000 issues at once
	for i in range(0, len(issue_ids), ISSUES_PER_RUN):
	issue_list = [str(x[0]) for x in issue_ids[i:i+ISSUES_PER_RUN]]

	comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)

	shard_id = random.randint(0, settings.num_logical_shards - 1)

	components = issue_component_table.Select(con,
	cols=['issue_id',
	'GROUP_CONCAT(component_id '
	+ 'SEPARATOR \',\')'],
	joins=[('ComponentDef ON '
	'ComponentDef.id = '
	'Issue2Component.component_id',
	[])],
	where=[('(deprecated = %s OR deprecated'
	' IS NULL)', [False]),
	('is_deleted = %s', [False])],
	group_by=['issue_id'],
	shard_id=shard_id,
	issue_id=issue_list)

	for issue_id, component_ids in components:
	comment_string = ' '.join(
	[comment.content for comment in comments[issue_id]])

	final_text = CleanText(comment_string)

	final_issue = component_ids, final_text
	csv_writer.writerow(final_issue)

	closed_index_table.Update(con, delta={'closed_index' : last_close})

	return csv_file


	def CleanText(text):
	"""Cleans provided text by lower casing words, removing punctuation, and
	normalizing spacing so that there is exactly one space between each word.

	Args:
	text: Raw text to be cleaned.

	Returns:
	Cleaned version of text.

	"""

	pretty_issue = text.lower().strip()

	quoteless_issue = re.sub('\'', '', pretty_issue)
	no_punctuation_issue = re.sub(r'[^\w\s]\|_+', ' ', quoteless_issue)
	one_space_issue = ' '.join(no_punctuation_issue.split())

	return one_space_issue