blob: 988bfbcf90e700102f0a18b5e75a6f55fbf28686 [file] [log] [blame]
Copybara854996b2021-09-07 19:36:02 +00001{
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "This notebook is intended to show the process for going from \n",
8 "a database connection to two files the first associating an issue\n",
9 "to its components and the second to assocating an issue to its \n",
10 "comments. These files can then be used a machine learning pipeline\n",
11 "that will apply cleaning, vectorization of the text and building models."
12 ]
13 },
14 {
15 "cell_type": "code",
16 "execution_count": null,
17 "metadata": {
18 "collapsed": false
19 },
20 "outputs": [],
21 "source": [
22 "%pylab inline"
23 ]
24 },
25 {
26 "cell_type": "code",
27 "execution_count": null,
28 "metadata": {
29 "collapsed": true
30 },
31 "outputs": [],
32 "source": [
33 "from __future__ import print_function\n",
34 "from __future__ import division"
35 ]
36 },
37 {
38 "cell_type": "code",
39 "execution_count": null,
40 "metadata": {
41 "collapsed": true
42 },
43 "outputs": [],
44 "source": [
45 "import pandas as pd\n",
46 "from bs4 import BeautifulSoup\n",
47 "from collections import defaultdict\n",
48 "import pickle\n",
49 "import MySQLdb as mdb"
50 ]
51 },
52 {
53 "cell_type": "code",
54 "execution_count": null,
55 "metadata": {
56 "collapsed": true
57 },
58 "outputs": [],
59 "source": [
60 "connection = mdb.connect(host='', user='', db='monorail')\n",
61 "cursor = connection.cursor()"
62 ]
63 },
64 {
65 "cell_type": "code",
66 "execution_count": null,
67 "metadata": {
68 "collapsed": true
69 },
70 "outputs": [],
71 "source": [
72 "def table_to_dataframe(name, connection):\n",
73 " return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)"
74 ]
75 },
76 {
77 "cell_type": "code",
78 "execution_count": null,
79 "metadata": {
80 "collapsed": true
81 },
82 "outputs": [],
83 "source": [
84 "issue = table_to_dataframe('Issue', connection)\n",
85 "comment = table_to_dataframe('Comment', connection)\n",
86 "issue_component = table_to_dataframe('Issue2Component', connection)"
87 ]
88 },
89 {
90 "cell_type": "code",
91 "execution_count": null,
92 "metadata": {
93 "collapsed": true
94 },
95 "outputs": [],
96 "source": [
97 "issue.rename(columns={'id':'issue_id'}, inplace=True)\n",
98 "chrome_issue = issue[issue['project_id'] == 16].copy()\n",
99 "chrome_issue_id_set = set(chrome_issue['issue_id'])"
100 ]
101 },
102 {
103 "cell_type": "markdown",
104 "metadata": {},
105 "source": [
106 "### Associate an issue withs its components"
107 ]
108 },
109 {
110 "cell_type": "code",
111 "execution_count": null,
112 "metadata": {
113 "collapsed": true
114 },
115 "outputs": [],
116 "source": [
117 "components_by_issue = defaultdict(list)\n",
118 "i = 0\n",
119 "for index, row in issue_component.iterrows():\n",
120 " if row['issue_id'] in chrome_issue_id_set:\n",
121 " components_by_issue[row['issue_id']].append(row['component_id'])\n",
122 " if i % 100000 == 0:\n",
123 " print(i)\n",
124 " i += 1\n",
125 "\n",
126 "chrome_issue['components'] = chrome_issue['issue_id'].apply(lambda i_id: components_by_issue[i_id])"
127 ]
128 },
129 {
130 "cell_type": "markdown",
131 "metadata": {},
132 "source": [
133 "### Associate an issue withs its comments"
134 ]
135 },
136 {
137 "cell_type": "code",
138 "execution_count": null,
139 "metadata": {
140 "collapsed": true
141 },
142 "outputs": [],
143 "source": [
144 "comments_by_issue = defaultdict(list)\n",
145 "i = 0\n",
146 "for index, row in chrome_comment.iterrows():\n",
147 " comments_by_issue[row[\"issue_id\"]].append((index, row.created))\n",
148 " if i % 1000000 == 0:\n",
149 " print(i)\n",
150 " i += 1\n",
151 "\n",
152 "chrome_issue[\"comments\"] = chrome_issue[\"issue_id\"].apply(lambda i_id: \n",
153 " [tup[0] for tup \n",
154 " in sorted(comments_by_issue[i_id], \n",
155 " key=lambda x: x[1])])"
156 ]
157 },
158 {
159 "cell_type": "markdown",
160 "metadata": {},
161 "source": [
162 "### Only work with closed issues for training"
163 ]
164 },
165 {
166 "cell_type": "code",
167 "execution_count": null,
168 "metadata": {
169 "collapsed": true
170 },
171 "outputs": [],
172 "source": [
173 "closed_chrome_issues = chrome_issue[chrome_issue[\"closed\"] > 0]"
174 ]
175 },
176 {
177 "cell_type": "markdown",
178 "metadata": {},
179 "source": [
180 "### Subsample the data (faster to run experiments)"
181 ]
182 },
183 {
184 "cell_type": "code",
185 "execution_count": null,
186 "metadata": {
187 "collapsed": true
188 },
189 "outputs": [],
190 "source": [
191 "num_issues = len(closed_chrome_issues)"
192 ]
193 },
194 {
195 "cell_type": "code",
196 "execution_count": null,
197 "metadata": {
198 "collapsed": false
199 },
200 "outputs": [],
201 "source": [
202 "issue_subset = closed_chrome_issues.sample(int(num_issues * 0.05))"
203 ]
204 },
205 {
206 "cell_type": "markdown",
207 "metadata": {},
208 "source": [
209 "### Very light cleaning of text (removing markup)."
210 ]
211 },
212 {
213 "cell_type": "code",
214 "execution_count": null,
215 "metadata": {
216 "collapsed": false
217 },
218 "outputs": [],
219 "source": [
220 "comment_index_to_text = defaultdict(unicode)\n",
221 "\n",
222 "i = 0\n",
223 "for index, row in issue_subset.iterrows():\n",
224 " for num, comment_id in enumerate(row['comments']):\n",
225 " text = BeautifulSoup(comment.loc[comment_id]['content']).get_text().strip().lower()\n",
226 " comment_index_to_text[comment_id] = text\n",
227 " \n",
228 " if i % 10000 == 0:\n",
229 " print(i)\n",
230 " i += 1"
231 ]
232 },
233 {
234 "cell_type": "code",
235 "execution_count": null,
236 "metadata": {
237 "collapsed": true
238 },
239 "outputs": [],
240 "source": [
241 "issue_subset.to_pickle('subset_issue.pkl')"
242 ]
243 },
244 {
245 "cell_type": "code",
246 "execution_count": null,
247 "metadata": {
248 "collapsed": false
249 },
250 "outputs": [],
251 "source": [
252 "pickle.dump(comment_index_to_text, open('comment_text.pkl', 'w'))"
253 ]
254 },
255 {
256 "cell_type": "code",
257 "execution_count": null,
258 "metadata": {
259 "collapsed": true
260 },
261 "outputs": [],
262 "source": []
263 }
264 ],
265 "metadata": {
266 "kernelspec": {
267 "display_name": "Python 2",
268 "language": "python",
269 "name": "python2"
270 },
271 "language_info": {
272 "codemirror_mode": {
273 "name": "ipython",
274 "version": 2
275 },
276 "file_extension": ".py",
277 "mimetype": "text/x-python",
278 "name": "python",
279 "nbconvert_exporter": "python",
280 "pygments_lexer": "ipython2",
281 "version": "2.7.6"
282 }
283 },
284 "nbformat": 4,
285 "nbformat_minor": 0
286}