Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 1 | # Copyright 2018 The Chromium Authors |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 4 | """This module is used to go from raw data to a csv dataset to build models for |
| 5 | component prediction. |
| 6 | """ |
| 7 | from __future__ import print_function |
| 8 | from __future__ import division |
| 9 | from __future__ import absolute_import |
| 10 | |
| 11 | import argparse |
| 12 | import string |
| 13 | import sys |
| 14 | import csv |
| 15 | import re |
| 16 | import logging |
| 17 | import random |
| 18 | import time |
| 19 | import os |
| 20 | import settings |
| 21 | from framework import sql |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 22 | |
# Only pull in the MySQL C-extension driver for real runs; in unit-test mode
# the database layer is presumably faked, so the import is skipped.
# NOTE(review): `mdb` is not referenced in this file chunk — confirm it is
# imported for use elsewhere (or for side effects) before removing.
if not settings.unit_test_mode:
  import MySQLdb as mdb
# Maximum number of newly-closed issues exported per run of this script.
ISSUE_LIMIT = 7000
# Batch size for comment/component queries (see the loop comment below).
ISSUES_PER_RUN = 50
# Project id whose issues supply the training data.
COMPONENT_PREDICTOR_PROJECT = 16
| 28 | |
def build_component_dataset(issue, csv_file):
  """Main function to build dataset for training models.

  Selects up to ISSUE_LIMIT issues from the component predictor project that
  closed after the stored ComponentIssueClosedIndex watermark, joins each
  issue's comment text with its non-deprecated, non-deleted component ids,
  writes one (component_ids, cleaned_text) csv row per issue, and finally
  advances the watermark so the next run resumes where this one stopped.

  Args:
    issue: The issue service with set up data.
    csv_file: The csv file path to store the dataset.

  Returns:
    The csv_file that was written to.
  """

  logging.info('Building dataset')
  con = sql.MonorailConnection()

  csv_writer = csv.writer(csv_file)

  logging.info('Downloading the dataset from database.')

  issue_table = sql.SQLTableManager('Issue')
  issue_component_table = sql.SQLTableManager('Issue2Component')
  closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

  # Watermark: the close timestamp up to which issues were already exported.
  close = closed_index_table.SelectValue(con, col='closed_index')

  # Newest close timestamp inside the next batch of at most ISSUE_LIMIT
  # issues; serves as this run's upper bound and as the next watermark.
  # NOTE(review): this raises IndexError when nothing closed after the
  # watermark — confirm callers expect a crash rather than a no-op.
  last_close = issue_table.Select(con,
                                  cols=['closed'],
                                  where=[('closed > %s', [str(close)]),
                                         ('project_id = %s',
                                          [str(COMPONENT_PREDICTOR_PROJECT)])],
                                  order_by=[('closed', [])],
                                  limit=ISSUE_LIMIT)[-1][0]

  issue_ids = issue_table.Select(con,
                                 cols=['id'],
                                 where=[('closed > %s', [str(close)]),
                                        ('closed <= %s', [str(last_close)]),
                                        ('project_id = %s',
                                         [str(COMPONENT_PREDICTOR_PROJECT)])])

  # Lazy %-style args: the message is only formatted if INFO is enabled.
  logging.info('Close: %s', close)
  logging.info('Last close: %s', last_close)

  # Get the comments and components for 50 issues at a time so as to not
  # overwhelm a single shard with all 7000 issues at once
  for start in range(0, len(issue_ids), ISSUES_PER_RUN):
    issue_list = [str(row[0])
                  for row in issue_ids[start:start + ISSUES_PER_RUN]]

    comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)

    # Spread the read load across logical shards at random.
    shard_id = random.randint(0, settings.num_logical_shards - 1)

    components = issue_component_table.Select(con,
                                              cols=['issue_id',
                                                    'GROUP_CONCAT(component_id '
                                                    + 'SEPARATOR \',\')'],
                                              joins=[('ComponentDef ON '
                                                      'ComponentDef.id = '
                                                      'Issue2Component.component_id',
                                                      [])],
                                              where=[('(deprecated = %s OR deprecated'
                                                      ' IS NULL)', [False]),
                                                     ('is_deleted = %s', [False])],
                                              group_by=['issue_id'],
                                              shard_id=shard_id,
                                              issue_id=issue_list)

    for issue_id, component_ids in components:
      # Concatenate all of an issue's comments into one training document.
      comment_string = ' '.join(
          comment.content for comment in comments[issue_id])

      final_text = CleanText(comment_string)

      csv_writer.writerow((component_ids, final_text))

  # Advance the watermark so subsequent runs skip the issues just exported.
  closed_index_table.Update(con, delta={'closed_index' : last_close})

  return csv_file
| 105 | |
| 106 | |
def CleanText(text):
  """Cleans provided text by lower casing words, removing punctuation, and
  normalizing spacing so that there is exactly one space between each word.

  Apostrophes are deleted outright (so "it's" becomes "its"), while every
  other punctuation character — and any run of underscores — becomes a
  space before the whitespace is collapsed.

  Args:
    text: Raw text to be cleaned.

  Returns:
    Cleaned version of text.

  """

  lowered = text.lower().strip()

  # Remove (rather than space out) apostrophes so contractions stay one word.
  without_quotes = lowered.replace('\'', '')
  # \w matches the underscore too, so underscore runs need their own branch.
  spaced = re.sub(r'[^\w\s]|_+', ' ', without_quotes)

  return ' '.join(spaced.split())