"""This module is used to go from raw data to a csv dataset to build models for
component prediction.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse
import string
import sys
import csv
import re
import logging
import random
import time
import os
import settings
from framework import sql
from framework import servlet

if not settings.unit_test_mode:
  import MySQLdb as mdb

ISSUE_LIMIT = 7000
ISSUES_PER_RUN = 50
COMPONENT_PREDICTOR_PROJECT = 16

def build_component_dataset(issue, csv_file):
  """Main function to build dataset for training models.

  Args:
    issue: The issue service used to fetch issue comments.
    csv_file: An open csv file object to write the dataset to.

  Returns:
    The csv file with the dataset written to it.
  """

  logging.info('Building dataset')
  con = sql.MonorailConnection()

  csv_writer = csv.writer(csv_file)

  logging.info('Downloading the dataset from database.')

  issue_table = sql.SQLTableManager('Issue')
  issue_component_table = sql.SQLTableManager('Issue2Component')
  closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

  # closed_index records how far through the closed issues the previous run
  # got, so each run resumes where the last one stopped.
  close = closed_index_table.SelectValue(con, col='closed_index')

  # Find the closing timestamp of the last issue this run will process, at
  # most ISSUE_LIMIT issues past the saved index.
  last_close = issue_table.Select(con,
                                  cols=['closed'],
                                  where=[('closed > %s', [str(close)]),
                                         ('project_id = %s',
                                          [str(COMPONENT_PREDICTOR_PROJECT)])],
                                  order_by=[('closed', [])],
                                  limit=ISSUE_LIMIT)[-1][0]

  issue_ids = issue_table.Select(con,
                                 cols=['id'],
                                 where=[('closed > %s', [str(close)]),
                                        ('closed <= %s', [str(last_close)]),
                                        ('project_id = %s',
                                         [str(COMPONENT_PREDICTOR_PROJECT)])])

  logging.info('Close: ' + str(close))
  logging.info('Last close: ' + str(last_close))

  # Get the comments and components for ISSUES_PER_RUN issues at a time so as
  # not to overwhelm a single shard with all ISSUE_LIMIT issues at once.
  for i in range(0, len(issue_ids), ISSUES_PER_RUN):
    issue_list = [str(x[0]) for x in issue_ids[i:i+ISSUES_PER_RUN]]

    comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)

    # Pick a random logical shard to spread the read load.
    shard_id = random.randint(0, settings.num_logical_shards - 1)

    # Concatenate each issue's component ids into one comma-separated value,
    # filtering out deprecated and deleted components.
    components = issue_component_table.Select(con,
                                              cols=['issue_id',
                                                    'GROUP_CONCAT(component_id '
                                                    'SEPARATOR \',\')'],
                                              joins=[('ComponentDef ON '
                                                      'ComponentDef.id = '
                                                      'Issue2Component.component_id',
                                                      [])],
                                              where=[('(deprecated = %s OR deprecated'
                                                      ' IS NULL)', [False]),
                                                     ('is_deleted = %s', [False])],
                                              group_by=['issue_id'],
                                              shard_id=shard_id,
                                              issue_id=issue_list)

    for issue_id, component_ids in components:
      comment_string = ' '.join(
          [comment.content for comment in comments[issue_id]])

      final_text = CleanText(comment_string)

      # Each row pairs the comma-separated component ids with the cleaned
      # comment text.
      final_issue = (component_ids, final_text)
      csv_writer.writerow(final_issue)

  # Record how far this run got so the next run starts after last_close.
  closed_index_table.Update(con, delta={'closed_index': last_close})

  return csv_file
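
# A minimal usage sketch (hypothetical wiring; `issue_service` stands in for
# Monorail's issue service object and is not defined in this module):
#
#   with open('component_dataset.csv', 'w') as out_file:
#     build_component_dataset(issue_service, out_file)
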
def CleanText(text):
  """Cleans provided text by lowercasing words, removing punctuation, and
  normalizing spacing so that there is exactly one space between words.

  Args:
    text: Raw text to be cleaned.

  Returns:
    Cleaned version of text.
  """

  pretty_issue = text.lower().strip()

  quoteless_issue = re.sub(r"'", '', pretty_issue)
  no_punctuation_issue = re.sub(r'[^\w\s]|_+', ' ', quoteless_issue)
  one_space_issue = ' '.join(no_punctuation_issue.split())

  return one_space_issue
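
# Example of the cleaning pipeline above on a made-up input (not from the
# source): apostrophes are dropped, other punctuation and underscores become
# spaces, and whitespace is collapsed to single spaces.
#
#   CleanText("Can't repro -- see Bug_Report #123!")
#   -> 'cant repro see bug report 123'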