"""This module is used to go from raw data to a csv dataset to build models for
component prediction.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse
import string
import sys
import csv
import re
import logging
import random
import time
import os

import settings
from framework import sql
from framework import servlet

if not settings.unit_test_mode:
  import MySQLdb as mdb
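
# ISSUE_LIMIT caps how many newly closed issues are included per run,
# ISSUES_PER_RUN is the batch size used when fetching comments and components,
# and COMPONENT_PREDICTOR_PROJECT is the project_id whose closed issues supply
# the training data.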
ISSUE_LIMIT = 7000
ISSUES_PER_RUN = 50
COMPONENT_PREDICTOR_PROJECT = 16
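

# A typical call is sketched below; 'issue_service' and the output filename
# are assumptions for illustration, not names defined in this module:
#
#   with open('component_dataset.csv', 'w') as csv_file:
#     build_component_dataset(issue_service, csv_file)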
def build_component_dataset(issue, csv_file):
"""Main function to build dataset for training models.
Args:
issue: The issue service with set up data.
csv_file: The csv file path to store the dataset.
"""
logging.info('Building dataset')
  con = sql.MonorailConnection()
  csv_writer = csv.writer(csv_file)

  logging.info('Downloading the dataset from the database.')

  issue_table = sql.SQLTableManager('Issue')
  issue_component_table = sql.SQLTableManager('Issue2Component')
  closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')
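
  # 'closed_index' stores the close timestamp of the newest issue included in
  # the previous dataset build, so only issues closed after it are pulled now.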
  close = closed_index_table.SelectValue(con, col='closed_index')
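
  # Take up to ISSUE_LIMIT issues closed after that index (ordered by close
  # time) and remember the close time of the last one; it becomes the new
  # high-water mark written back at the end of the run.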
  last_close = issue_table.Select(
      con,
      cols=['closed'],
      where=[('closed > %s', [str(close)]),
             ('project_id = %s', [str(COMPONENT_PREDICTOR_PROJECT)])],
      order_by=[('closed', [])],
      limit=ISSUE_LIMIT)[-1][0]

  issue_ids = issue_table.Select(
      con,
      cols=['id'],
      where=[('closed > %s', [str(close)]),
             ('closed <= %s', [str(last_close)]),
             ('project_id = %s', [str(COMPONENT_PREDICTOR_PROJECT)])])

  logging.info('Close: ' + str(close))
  logging.info('Last close: ' + str(last_close))

  # Get the comments and components for 50 issues at a time so as to not
  # overwhelm a single shard with all 7000 issues at once.
  for i in range(0, len(issue_ids), ISSUES_PER_RUN):
    issue_list = [str(x[0]) for x in issue_ids[i:i + ISSUES_PER_RUN]]

    comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)
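
    # Pick a random logical shard so the read queries below are spread across
    # shards instead of always hitting the same one.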
    shard_id = random.randint(0, settings.num_logical_shards - 1)

    components = issue_component_table.Select(
        con,
        cols=['issue_id', 'GROUP_CONCAT(component_id SEPARATOR \',\')'],
        joins=[('ComponentDef ON ComponentDef.id = '
                'Issue2Component.component_id', [])],
        where=[('(deprecated = %s OR deprecated IS NULL)', [False]),
               ('is_deleted = %s', [False])],
        group_by=['issue_id'],
        shard_id=shard_id,
        issue_id=issue_list)
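
    # Each returned row is (issue_id, 'comp_id_1,comp_id_2,...'); pair the
    # component ids with the issue's cleaned comment text to form one CSV row.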
    for issue_id, component_ids in components:
      comment_string = ' '.join(
          [comment.content for comment in comments[issue_id]])

      final_text = CleanText(comment_string)

      final_issue = component_ids, final_text
      csv_writer.writerow(final_issue)

  closed_index_table.Update(con, delta={'closed_index': last_close})

  return csv_file


def CleanText(text):
  """Cleans provided text by lowercasing words, removing punctuation, and
  normalizing spacing so that there is exactly one space between each word.

  Args:
    text: Raw text to be cleaned.

  Returns:
    Cleaned version of text.
  """

  pretty_issue = text.lower().strip()

  quoteless_issue = re.sub(r"'", '', pretty_issue)
  no_punctuation_issue = re.sub(r'[^\w\s]|_+', ' ', quoteless_issue)
  one_space_issue = ' '.join(no_punctuation_issue.split())

  return one_space_issue
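
# A quick illustration of CleanText (the sample string is made up):
#   CleanText("Here's a bug!!  Fix the _parser_ ASAP.")
#   returns 'heres a bug fix the parser asap'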