# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This module is used to go from raw data to a csv dataset to build models for
component prediction.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse
import string
import sys
import csv
import re
import logging
import random
import time
import os
import settings
from framework import sql

if not settings.unit_test_mode:
  import MySQLdb as mdb
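
# Each run exports at most ISSUE_LIMIT newly closed issues, fetched
# ISSUES_PER_RUN at a time, from the project used for predictor training data.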
ISSUE_LIMIT = 7000
ISSUES_PER_RUN = 50
COMPONENT_PREDICTOR_PROJECT = 16

def build_component_dataset(issue, csv_file):
  """Main function to build dataset for training models.

  Args:
    issue: The issue service used to fetch issue comments.
    csv_file: An open csv file object to which the dataset is written.
  """

  logging.info('Building dataset')
  con = sql.MonorailConnection()

  csv_writer = csv.writer(csv_file)

  logging.info('Downloading the dataset from database.')

  issue_table = sql.SQLTableManager('Issue')
  issue_component_table = sql.SQLTableManager('Issue2Component')
  closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

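  # closed_index is a watermark: the 'closed' timestamp of the newest issue
  # exported so far, so each run resumes where the previous one stopped.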
  close = closed_index_table.SelectValue(con, col='closed_index')

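  # Upper bound for this batch: the largest 'closed' timestamp among the next
  # (at most ISSUE_LIMIT) issues closed after the watermark.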
  last_close = issue_table.Select(con,
                                  cols=['closed'],
                                  where=[('closed > %s', [str(close)]),
                                         ('project_id = %s',
                                          [str(COMPONENT_PREDICTOR_PROJECT)])],
                                  order_by=[('closed', [])],
                                  limit=ISSUE_LIMIT)[-1][0]

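  # Collect the ids of every issue in the project closed within the
  # (close, last_close] window.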
  issue_ids = issue_table.Select(con,
                                 cols=['id'],
                                 where=[('closed > %s', [str(close)]),
                                        ('closed <= %s', [str(last_close)]),
                                        ('project_id = %s',
                                         [str(COMPONENT_PREDICTOR_PROJECT)])])

  logging.info('Close: ' + str(close))
  logging.info('Last close: ' + str(last_close))

  # Get the comments and components for ISSUES_PER_RUN issues at a time so as
  # not to overwhelm a single shard with all ISSUE_LIMIT issues at once.
  for i in range(0, len(issue_ids), ISSUES_PER_RUN):
    issue_list = [str(x[0]) for x in issue_ids[i:i+ISSUES_PER_RUN]]

    comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)

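    # Choose a random logical shard so component reads are spread across the
    # available database shards.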
    shard_id = random.randint(0, settings.num_logical_shards - 1)

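    # One row per issue: GROUP_CONCAT collapses the issue's component ids into
    # a single comma-separated string, skipping deprecated and deleted
    # components.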
    components = issue_component_table.Select(con,
                                              cols=['issue_id',
                                                    'GROUP_CONCAT(component_id '
                                                    + 'SEPARATOR \',\')'],
                                              joins=[('ComponentDef ON '
                                                      'ComponentDef.id = '
                                                      'Issue2Component.component_id',
                                                      [])],
                                              where=[('(deprecated = %s OR deprecated'
                                                      ' IS NULL)', [False]),
                                                     ('is_deleted = %s', [False])],
                                              group_by=['issue_id'],
                                              shard_id=shard_id,
                                              issue_id=issue_list)

    for issue_id, component_ids in components:
      comment_string = ' '.join(
          [comment.content for comment in comments[issue_id]])

      final_text = CleanText(comment_string)

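      # Each training row is (comma-separated component ids, cleaned text).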
      final_issue = component_ids, final_text
      csv_writer.writerow(final_issue)

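  # Advance the watermark so the next run starts after this batch.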
  closed_index_table.Update(con, delta={'closed_index': last_close})

  return csv_file
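
# Example usage (a sketch; `issue_svc` stands in for whatever configured issue
# service the caller passes in):
#   with open('component_dataset.csv', 'w') as f:
#     build_component_dataset(issue_svc, f)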


def CleanText(text):
108 """Cleans provided text by lower casing words, removing punctuation, and
109 normalizing spacing so that there is exactly one space between each word.
110
111 Args:
112 text: Raw text to be cleaned.
113
114 Returns:
115 Cleaned version of text.
116
117 """

  pretty_issue = text.lower().strip()

  quoteless_issue = re.sub('\'', '', pretty_issue)
  no_punctuation_issue = re.sub(r'[^\w\s]|_+', ' ', quoteless_issue)
  one_space_issue = ' '.join(no_punctuation_issue.split())

  return one_space_issue
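
# For example, CleanText("Can't reproduce -- see bug #42!") returns
# 'cant reproduce see bug 42'.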