# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This module is used to go from raw data to a csv dataset to build models for
component prediction.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse
import string
import sys
import csv
import re
import logging
import random
import time
import os

import settings
from framework import sql

if not settings.unit_test_mode:
  import MySQLdb as mdb

# Maximum number of newly closed issues to pull into the dataset per run.
ISSUE_LIMIT = 7000
# How many issues to fetch comments and components for at a time, so that a
# single shard is not overwhelmed.
ISSUES_PER_RUN = 50
# ID of the project whose issues are used as training data.
COMPONENT_PREDICTOR_PROJECT = 16


def build_component_dataset(issue, csv_file):
  """Main function to build the dataset for training component models.

  Args:
    issue: The issue service used to fetch issue comments.
    csv_file: An open, writable csv file in which to store the dataset.

  Returns:
    The csv file, with one (component_ids, cleaned_text) row per issue.
  """
  logging.info('Building dataset')
  con = sql.MonorailConnection()
  csv_writer = csv.writer(csv_file)

  logging.info('Downloading the dataset from the database.')

  issue_table = sql.SQLTableManager('Issue')
  issue_component_table = sql.SQLTableManager('Issue2Component')
  closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

  # Closing time recorded by the previous run; only issues closed after this
  # point are new.
  close = closed_index_table.SelectValue(con, col='closed_index')
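
  # Find the closing time of the newest issue that will be included in this
  # run: take up to ISSUE_LIMIT issues closed after the stored index, ordered
  # by closing time, and keep the 'closed' value of the last row.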
  last_close = issue_table.Select(
      con,
      cols=['closed'],
      where=[('closed > %s', [str(close)]),
             ('project_id = %s', [str(COMPONENT_PREDICTOR_PROJECT)])],
      order_by=[('closed', [])],
      limit=ISSUE_LIMIT)[-1][0]
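
  # Collect the IDs of every issue in the predictor project that closed in
  # the (close, last_close] window.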
  issue_ids = issue_table.Select(
      con,
      cols=['id'],
      where=[('closed > %s', [str(close)]),
             ('closed <= %s', [str(last_close)]),
             ('project_id = %s', [str(COMPONENT_PREDICTOR_PROJECT)])])

  logging.info('Close: %s', close)
  logging.info('Last close: %s', last_close)

  # Get the comments and components for ISSUES_PER_RUN issues at a time so as
  # to not overwhelm a single shard with all ISSUE_LIMIT issues at once.
  for i in range(0, len(issue_ids), ISSUES_PER_RUN):
    issue_list = [str(x[0]) for x in issue_ids[i:i + ISSUES_PER_RUN]]

    comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)
    shard_id = random.randint(0, settings.num_logical_shards - 1)

    components = issue_component_table.Select(
        con,
        cols=['issue_id',
              'GROUP_CONCAT(component_id SEPARATOR \',\')'],
        joins=[('ComponentDef ON '
                'ComponentDef.id = Issue2Component.component_id', [])],
        where=[('(deprecated = %s OR deprecated IS NULL)', [False]),
               ('is_deleted = %s', [False])],
        group_by=['issue_id'],
        shard_id=shard_id,
        issue_id=issue_list)

    for issue_id, component_ids in components:
      comment_string = ' '.join(
          comment.content for comment in comments[issue_id])

      final_text = CleanText(comment_string)
      final_issue = component_ids, final_text
      csv_writer.writerow(final_issue)

  # Remember how far we got so the next run only picks up newer issues.
  closed_index_table.Update(con, delta={'closed_index': last_close})

  return csv_file
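

# A minimal usage sketch (the issue service setup here is hypothetical):
#
#   issue_svc = ...  # an initialized Monorail issue service
#   with open('component_dataset.csv', 'w') as f:
#     build_component_dataset(issue_svc, f)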


def CleanText(text):
  """Cleans provided text by lower casing words, removing punctuation, and
  normalizing spacing so that there is exactly one space between each word.

  Args:
    text: Raw text to be cleaned.

  Returns:
    Cleaned version of text.
  """
  pretty_issue = text.lower().strip()

  quoteless_issue = re.sub('\'', '', pretty_issue)
  no_punctuation_issue = re.sub(r'[^\w\s]|_+', ' ', quoteless_issue)
  one_space_issue = ' '.join(no_punctuation_issue.split())

  return one_space_issue
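

# A small sketch of CleanText's behavior (the input string is made up):
#
#   >>> CleanText("Won't fix; see bug_123!")
#   'wont fix see bug 123'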