Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 1 | """This module is used to go from raw data to a csv dataset to build models for |
| 2 | component prediction. |
| 3 | """ |
| 4 | from __future__ import print_function |
| 5 | from __future__ import division |
| 6 | from __future__ import absolute_import |
| 7 | |
| 8 | import argparse |
| 9 | import string |
| 10 | import sys |
| 11 | import csv |
| 12 | import re |
| 13 | import logging |
| 14 | import random |
| 15 | import time |
| 16 | import os |
| 17 | import settings |
| 18 | from framework import sql |
| 19 | from framework import servlet |
| 20 | |
# MySQLdb is only importable in production; unit tests run with
# settings.unit_test_mode set and stub out the database layer.
if not settings.unit_test_mode:
  import MySQLdb as mdb
# Maximum number of closed issues fetched per run (used as the SQL LIMIT
# when scanning for newly closed issues).
ISSUE_LIMIT = 7000
# Batch size for fetching comments/components, so one shard is not asked
# for all ISSUE_LIMIT issues at once.
ISSUES_PER_RUN = 50
# Project id whose issues are used to build the component-predictor dataset.
COMPONENT_PREDICTOR_PROJECT = 16
| 26 | |
def build_component_dataset(issue, csv_file):
  """Main function to build dataset for training models.

  Reads issues closed since the last recorded high-water mark, joins each
  issue's comment text with its component ids, writes one
  (component_ids, cleaned_text) row per issue, then advances the mark.

  Args:
    issue: The issue service with set up data.
    csv_file: The csv file path to store the dataset.
      NOTE(review): this value is passed to csv.writer(), which takes an
      open file object, not a path — confirm what callers actually pass.

  Returns:
    The csv_file that rows were written to.
  """

  logging.info('Building dataset')
  con = sql.MonorailConnection()

  csv_writer = csv.writer(csv_file)

  logging.info('Downloading the dataset from database.')

  issue_table = sql.SQLTableManager('Issue')
  issue_component_table = sql.SQLTableManager('Issue2Component')
  closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

  # High-water mark: the 'closed' timestamp up to which issues were already
  # exported by a previous run.
  close = closed_index_table.SelectValue(con, col='closed_index')

  # Largest 'closed' timestamp within the next batch of up to ISSUE_LIMIT
  # newly closed issues; persisted as the new high-water mark at the end.
  # NOTE(review): the [-1][0] indexing raises IndexError when no issue has
  # closed since `close` — confirm callers tolerate or pre-check that.
  last_close = issue_table.Select(con,
                                  cols=['closed'],
                                  where=[('closed > %s', [str(close)]),
                                         ('project_id = %s',
                                          [str(COMPONENT_PREDICTOR_PROJECT)])],
                                  order_by=[('closed', [])],
                                  limit=ISSUE_LIMIT)[-1][0]

  # Ids of every predictor-project issue closed in (close, last_close].
  issue_ids = issue_table.Select(con,
                                 cols=['id'],
                                 where=[('closed > %s', [str(close)]),
                                        ('closed <= %s', [str(last_close)]),
                                        ('project_id = %s',
                                         [str(COMPONENT_PREDICTOR_PROJECT)])])


  logging.info('Close: ' + str(close))
  logging.info('Last close: ' + str(last_close))

  # Get the comments and components for 50 issues at a time so as to not
  # overwhelm a single shard with all 7000 issues at once
  for i in range(0, len(issue_ids), ISSUES_PER_RUN):
    # Select() rows are tuples; x[0] is the issue id.
    issue_list = [str(x[0]) for x in issue_ids[i:i+ISSUES_PER_RUN]]

    comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)

    # Randomly pick which logical shard serves this batch's component query.
    shard_id = random.randint(0, settings.num_logical_shards - 1)

    # One row per issue id: a comma-separated string of its component ids,
    # excluding deprecated component definitions.
    # NOTE(review): the bare 'is_deleted' column presumably belongs to the
    # joined ComponentDef table — verify against the schema.
    components = issue_component_table.Select(con,
                                              cols=['issue_id',
                                                    'GROUP_CONCAT(component_id '
                                                    + 'SEPARATOR \',\')'],
                                              joins=[('ComponentDef ON '
                                                      'ComponentDef.id = '
                                                      'Issue2Component.component_id',
                                                      [])],
                                              where=[('(deprecated = %s OR deprecated'
                                                      ' IS NULL)', [False]),
                                                     ('is_deleted = %s', [False])],
                                              group_by=['issue_id'],
                                              shard_id=shard_id,
                                              issue_id=issue_list)

    for issue_id, component_ids in components:
      # Concatenate all comment bodies for this issue into one text field.
      comment_string = ' '.join(
          [comment.content for comment in comments[issue_id]])

      final_text = CleanText(comment_string)

      final_issue = component_ids, final_text
      csv_writer.writerow(final_issue)

  # Persist the new high-water mark so the next run resumes after last_close.
  closed_index_table.Update(con, delta={'closed_index' : last_close})

  return csv_file
| 103 | |
| 104 | |
def CleanText(text):
  """Cleans provided text by lower casing words, removing punctuation, and
  normalizing spacing so that there is exactly one space between each word.

  Args:
    text: Raw text to be cleaned.

  Returns:
    Cleaned version of text.

  """

  pretty_issue = text.lower().strip()

  # Strip apostrophes/single quotes first (no regex needed) so contractions
  # collapse into one word ("don't" -> "dont") instead of splitting in two.
  quoteless_issue = pretty_issue.replace('\'', '')
  # Raw string avoids invalid '\w'/'\s' escape warnings (an error in newer
  # Python). Replace remaining punctuation — and runs of underscores, which
  # \w would otherwise keep — with a space.
  no_punctuation_issue = re.sub(r'[^\w\s]|_+', ' ', quoteless_issue)
  # Collapse every whitespace run (spaces, tabs, newlines) to a single space.
  one_space_issue = ' '.join(no_punctuation_issue.split())

  return one_space_issue