Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 1 | """This module is used to go from raw data to a csv dataset to build models for |
| 2 | component prediction. |
| 3 | """ |
| 4 | from __future__ import print_function |
| 5 | from __future__ import division |
| 6 | from __future__ import absolute_import |
| 7 | |
| 8 | import argparse |
| 9 | import string |
| 10 | import sys |
| 11 | import csv |
| 12 | import re |
| 13 | import logging |
| 14 | import random |
| 15 | import time |
| 16 | import os |
| 17 | import settings |
| 18 | from framework import sql |
| 19 | from framework import servlet |
| 20 | |
# MySQLdb is only importable in production; unit tests run with
# settings.unit_test_mode set and stub out the database layer.
if not settings.unit_test_mode:
  import MySQLdb as mdb
# Maximum number of closed issues fetched per run (used as the SQL LIMIT
# when scanning for newly closed issues).
ISSUE_LIMIT = 7000
# Batch size for fetching comments/components, so one shard is not asked
# for all ISSUE_LIMIT issues at once.
ISSUES_PER_RUN = 50
# Project id whose issues are used to build the component-predictor dataset.
COMPONENT_PREDICTOR_PROJECT = 16
| 26 | |
def build_component_dataset(issue, csv_file):
  """Main function to build dataset for training models.

  Reads issues closed since the last recorded high-water mark, joins each
  issue's comment text with its component ids, writes one
  (component_ids, cleaned_text) row per issue, then advances the mark.

  Args:
    issue: The issue service with set up data.
    csv_file: The csv file path to store the dataset.
      NOTE(review): this value is passed to csv.writer(), which takes an
      open file object, not a path — confirm what callers actually pass.

  Returns:
    The csv_file that rows were written to.
  """

  logging.info('Building dataset')
  con = sql.MonorailConnection()

  csv_writer = csv.writer(csv_file)

  logging.info('Downloading the dataset from database.')

  issue_table = sql.SQLTableManager('Issue')
  issue_component_table = sql.SQLTableManager('Issue2Component')
  closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

  # High-water mark: the 'closed' timestamp up to which issues were already
  # exported by a previous run.
  close = closed_index_table.SelectValue(con, col='closed_index')

  # Largest 'closed' timestamp within the next batch of up to ISSUE_LIMIT
  # newly closed issues; persisted as the new high-water mark at the end.
  # NOTE(review): the [-1][0] indexing raises IndexError when no issue has
  # closed since `close` — confirm callers tolerate or pre-check that.
  last_close = issue_table.Select(con,
                                  cols=['closed'],
                                  where=[('closed > %s', [str(close)]),
                                         ('project_id = %s',
                                          [str(COMPONENT_PREDICTOR_PROJECT)])],
                                  order_by=[('closed', [])],
                                  limit=ISSUE_LIMIT)[-1][0]

  # Ids of every predictor-project issue closed in (close, last_close].
  issue_ids = issue_table.Select(con,
                                 cols=['id'],
                                 where=[('closed > %s', [str(close)]),
                                        ('closed <= %s', [str(last_close)]),
                                        ('project_id = %s',
                                         [str(COMPONENT_PREDICTOR_PROJECT)])])


  logging.info('Close: ' + str(close))
  logging.info('Last close: ' + str(last_close))

  # Get the comments and components for 50 issues at a time so as to not
  # overwhelm a single shard with all 7000 issues at once
  for i in range(0, len(issue_ids), ISSUES_PER_RUN):
    # Select() rows are tuples; x[0] is the issue id.
    issue_list = [str(x[0]) for x in issue_ids[i:i+ISSUES_PER_RUN]]

    comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)

    # Randomly pick which logical shard serves this batch's component query.
    shard_id = random.randint(0, settings.num_logical_shards - 1)

    # One row per issue id: a comma-separated string of its component ids,
    # excluding deprecated component definitions.
    # NOTE(review): the bare 'is_deleted' column presumably belongs to the
    # joined ComponentDef table — verify against the schema.
    components = issue_component_table.Select(con,
                                              cols=['issue_id',
                                                    'GROUP_CONCAT(component_id '
                                                    + 'SEPARATOR \',\')'],
                                              joins=[('ComponentDef ON '
                                                      'ComponentDef.id = '
                                                      'Issue2Component.component_id',
                                                      [])],
                                              where=[('(deprecated = %s OR deprecated'
                                                      ' IS NULL)', [False]),
                                                     ('is_deleted = %s', [False])],
                                              group_by=['issue_id'],
                                              shard_id=shard_id,
                                              issue_id=issue_list)

    for issue_id, component_ids in components:
      # Concatenate all comment bodies for this issue into one text field.
      comment_string = ' '.join(
          [comment.content for comment in comments[issue_id]])

      final_text = CleanText(comment_string)

      final_issue = component_ids, final_text
      csv_writer.writerow(final_issue)

  # Persist the new high-water mark so the next run resumes after last_close.
  closed_index_table.Update(con, delta={'closed_index' : last_close})

  return csv_file
| 103 | |
| 104 | |
def CleanText(text):
  """Cleans provided text by lower casing words, removing punctuation, and
  normalizing spacing so that there is exactly one space between each word.

  Args:
    text: Raw text to be cleaned.

  Returns:
    Cleaned version of text.

  """

  pretty_issue = text.lower().strip()

  # Strip apostrophes/single quotes first (no regex needed) so contractions
  # collapse into one word ("don't" -> "dont") instead of splitting in two.
  quoteless_issue = pretty_issue.replace('\'', '')
  # Raw string avoids invalid '\w'/'\s' escape warnings (an error in newer
  # Python). Replace remaining punctuation — and runs of underscores, which
  # \w would otherwise keep — with a space.
  no_punctuation_issue = re.sub(r'[^\w\s]|_+', ' ', quoteless_issue)
  # Collapse every whitespace run (spaces, tabs, newlines) to a single space.
  one_space_issue = ' '.join(no_punctuation_issue.split())

  return one_space_issue