Adrià Vilanova Martínez | f19ea43 | 2024-01-23 20:20:52 +0100 | [diff] [blame] | 1 | # Copyright 2018 The Chromium Authors |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 4 | """This module is used to go from raw data to a csv dataset to build models for |
| 5 | component prediction. |
| 6 | """ |
| 7 | from __future__ import print_function |
| 8 | from __future__ import division |
| 9 | from __future__ import absolute_import |
| 10 | |
| 11 | import argparse |
| 12 | import string |
| 13 | import sys |
| 14 | import csv |
| 15 | import re |
| 16 | import logging |
| 17 | import random |
| 18 | import time |
| 19 | import os |
| 20 | import settings |
| 21 | from framework import sql |
Copybara | 854996b | 2021-09-07 19:36:02 +0000 | [diff] [blame] | 22 | |
# Only pull in the MySQL C-extension driver for real runs; in unit-test mode
# the database layer is presumably faked, so the import is skipped.
# NOTE(review): `mdb` is not referenced in this file chunk — confirm it is
# imported for use elsewhere (or for side effects) before removing.
if not settings.unit_test_mode:
  import MySQLdb as mdb
# Maximum number of newly-closed issues exported per run of this script.
ISSUE_LIMIT = 7000
# Batch size for comment/component queries (see the loop comment below).
ISSUES_PER_RUN = 50
# Project id whose issues supply the training data.
COMPONENT_PREDICTOR_PROJECT = 16
| 28 | |
def build_component_dataset(issue, csv_file):
  """Main function to build dataset for training models.

  Selects up to ISSUE_LIMIT issues from the component predictor project that
  closed after the stored ComponentIssueClosedIndex watermark, joins each
  issue's comment text with its non-deprecated, non-deleted component ids,
  writes one (component_ids, cleaned_text) csv row per issue, and finally
  advances the watermark so the next run resumes where this one stopped.

  Args:
    issue: The issue service with set up data.
    csv_file: The csv file path to store the dataset.

  Returns:
    The csv_file that was written to.
  """

  logging.info('Building dataset')
  con = sql.MonorailConnection()

  csv_writer = csv.writer(csv_file)

  logging.info('Downloading the dataset from database.')

  issue_table = sql.SQLTableManager('Issue')
  issue_component_table = sql.SQLTableManager('Issue2Component')
  closed_index_table = sql.SQLTableManager('ComponentIssueClosedIndex')

  # Watermark: the close timestamp up to which issues were already exported.
  close = closed_index_table.SelectValue(con, col='closed_index')

  # Newest close timestamp inside the next batch of at most ISSUE_LIMIT
  # issues; serves as this run's upper bound and as the next watermark.
  # NOTE(review): this raises IndexError when nothing closed after the
  # watermark — confirm callers expect a crash rather than a no-op.
  last_close = issue_table.Select(con,
                                  cols=['closed'],
                                  where=[('closed > %s', [str(close)]),
                                         ('project_id = %s',
                                          [str(COMPONENT_PREDICTOR_PROJECT)])],
                                  order_by=[('closed', [])],
                                  limit=ISSUE_LIMIT)[-1][0]

  issue_ids = issue_table.Select(con,
                                 cols=['id'],
                                 where=[('closed > %s', [str(close)]),
                                        ('closed <= %s', [str(last_close)]),
                                        ('project_id = %s',
                                         [str(COMPONENT_PREDICTOR_PROJECT)])])

  # Lazy %-style args: the message is only formatted if INFO is enabled.
  logging.info('Close: %s', close)
  logging.info('Last close: %s', last_close)

  # Get the comments and components for 50 issues at a time so as to not
  # overwhelm a single shard with all 7000 issues at once
  for start in range(0, len(issue_ids), ISSUES_PER_RUN):
    issue_list = [str(row[0])
                  for row in issue_ids[start:start + ISSUES_PER_RUN]]

    comments = issue.GetCommentsForIssues(con, issue_list, content_only=True)

    # Spread the read load across logical shards at random.
    shard_id = random.randint(0, settings.num_logical_shards - 1)

    components = issue_component_table.Select(con,
                                              cols=['issue_id',
                                                    'GROUP_CONCAT(component_id '
                                                    + 'SEPARATOR \',\')'],
                                              joins=[('ComponentDef ON '
                                                      'ComponentDef.id = '
                                                      'Issue2Component.component_id',
                                                      [])],
                                              where=[('(deprecated = %s OR deprecated'
                                                      ' IS NULL)', [False]),
                                                     ('is_deleted = %s', [False])],
                                              group_by=['issue_id'],
                                              shard_id=shard_id,
                                              issue_id=issue_list)

    for issue_id, component_ids in components:
      # Concatenate all of an issue's comments into one training document.
      comment_string = ' '.join(
          comment.content for comment in comments[issue_id])

      final_text = CleanText(comment_string)

      csv_writer.writerow((component_ids, final_text))

  # Advance the watermark so subsequent runs skip the issues just exported.
  closed_index_table.Update(con, delta={'closed_index' : last_close})

  return csv_file
| 105 | |
| 106 | |
def CleanText(text):
  """Cleans provided text by lower casing words, removing punctuation, and
  normalizing spacing so that there is exactly one space between each word.

  Apostrophes are deleted outright (so "it's" becomes "its"), while every
  other punctuation character — and any run of underscores — becomes a
  space before the whitespace is collapsed.

  Args:
    text: Raw text to be cleaned.

  Returns:
    Cleaned version of text.

  """

  lowered = text.lower().strip()

  # Remove (rather than space out) apostrophes so contractions stay one word.
  without_quotes = lowered.replace('\'', '')
  # \w matches the underscore too, so underscore runs need their own branch.
  spaced = re.sub(r'[^\w\s]|_+', ' ', without_quotes)

  return ' '.join(spaced.split())