blob: 56dbe749747d1c274b67c21627e7505df9fa76d6 [file] [log] [blame]
Copybara854996b2021-09-07 19:36:02 +00001{
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": null,
6 "metadata": {
7 "collapsed": false
8 },
9 "outputs": [],
10 "source": [
11 "%pylab inline"
12 ]
13 },
14 {
15 "cell_type": "code",
16 "execution_count": null,
17 "metadata": {
18 "collapsed": false
19 },
20 "outputs": [],
21 "source": [
22 "from __future__ import print_function\n",
23 "from __future__ import division\n",
24 "from IPython.display import display, HTML"
25 ]
26 },
27 {
28 "cell_type": "code",
29 "execution_count": null,
30 "metadata": {
31 "collapsed": false
32 },
33 "outputs": [],
34 "source": [
35 "import seaborn as sns\n",
36 "import pandas as pd\n",
37 "import MySQLdb as mdb\n",
38 "import bs4\n",
39 "import datetime\n",
40 "from collections import defaultdict\n",
41 "from matplotlib import pyplot as plt\n",
42 "from ipywidgets import widgets"
43 ]
44 },
45 {
46 "cell_type": "markdown",
47 "metadata": {},
48 "source": [
49 "### Load the Data"
50 ]
51 },
52 {
53 "cell_type": "code",
54 "execution_count": null,
55 "metadata": {
56 "collapsed": false
57 },
58 "outputs": [],
59 "source": [
60 "def table_to_dataframe(name, connection):\n",
61 " return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)\n",
62 "\n",
63 "def project_table_to_dataframe(name, connection):\n",
64 " # project_id 1 is monorail\n",
65 " return pd.read_sql(\"SELECT * FROM {} where project_id = 1;\".format(name) , con=connection)"
66 ]
67 },
68 {
69 "cell_type": "code",
70 "execution_count": null,
71 "metadata": {
72 "collapsed": false
73 },
74 "outputs": [],
75 "source": [
76 "connection = mdb.connect(host=\"localhost\", user=\"root\", db=\"monorail\")"
77 ]
78 },
79 {
80 "cell_type": "code",
81 "execution_count": null,
82 "metadata": {
83 "collapsed": false
84 },
85 "outputs": [],
86 "source": [
87 "cursor = connection.cursor()"
88 ]
89 },
90 {
91 "cell_type": "code",
92 "execution_count": null,
93 "metadata": {
94 "collapsed": false
95 },
96 "outputs": [],
97 "source": [
98 "# Only look at monorail issues, and only look at issues opened in the past year.\n",
99 "issue = pd.read_sql(\"SELECT * FROM Issue where project_id = 1 and opened > 1436396241;\", con=connection)\n",
100 "comment = pd.read_sql(\"SELECT * FROM Comment where project_id = 1 and created > 1436396241;\", con=connection)\n",
101 "status_def = project_table_to_dataframe(\"StatusDef\", connection)\n",
102 "issue_summarny = table_to_dataframe(\"IssueSummary\", connection)\n",
103 "issue_label = table_to_dataframe(\"Issue2Label\", connection)\n",
104 "issue_component = table_to_dataframe(\"Issue2Component\", connection)\n",
105 "issue_update = table_to_dataframe(\"IssueUpdate\", connection)\n",
106 "issue.rename(columns={\"id\":\"issue_id\"}, inplace=True)"
107 ]
108 },
109 {
110 "cell_type": "code",
111 "execution_count": null,
112 "metadata": {
113 "collapsed": false
114 },
115 "outputs": [],
116 "source": [
117 "print(\"Number of Issues\", issue.shape[0])\n",
118 "print(\"Number of IssueUpdates\", issue_update.shape[0])\n"
119 ]
120 },
121 {
122 "cell_type": "markdown",
123 "metadata": {},
124 "source": [
125 "### Associate IssueUpdates with their Issues\n",
126 "This next step is resource intensive and can take a while."
127 ]
128 },
129 {
130 "cell_type": "code",
131 "execution_count": null,
132 "metadata": {
133 "collapsed": false
134 },
135 "outputs": [],
136 "source": [
137 "updates_by_issue = defaultdict(list)\n",
138 "i = 0\n",
139 "for index, row in issue_update.iterrows():\n",
140 " updates_by_issue[row[\"issue_id\"]].append(row)\n",
141 " if i % 1000000 == 0:\n",
142 " print(i)\n",
143 " i += 1"
144 ]
145 },
146 {
147 "cell_type": "code",
148 "execution_count": null,
149 "metadata": {
150 "collapsed": false
151 },
152 "outputs": [],
153 "source": [
154 "issues_by_id = {}\n",
155 "i = 0\n",
156 "for index, row in issue.iterrows():\n",
157 " issues_by_id[row[\"issue_id\"]] = row\n",
158 " if i % 1000000 == 0:\n",
159 " print(i)\n",
160 " i += 1"
161 ]
162 },
163 {
164 "cell_type": "code",
165 "execution_count": null,
166 "metadata": {
167 "collapsed": false
168 },
169 "outputs": [],
170 "source": [
171 "status_by_id = {}\n",
172 "i = 0\n",
173 "for index, row in status_def.iterrows():\n",
174 " status_by_id[row[\"id\"]] = row\n",
175 " if i % 1000000 == 0:\n",
176 " print(i)\n",
177 " i += 1"
178 ]
179 },
180 {
181 "cell_type": "code",
182 "execution_count": null,
183 "metadata": {
184 "collapsed": false
185 },
186 "outputs": [],
187 "source": [
188 "issue[\"updates\"] = issue[\"issue_id\"].apply(lambda i_id: [u for u in sorted(updates_by_issue[i_id], key=lambda x: x.id)])\n",
189 "issue[\"num_updates\"] = issue[\"updates\"].apply(lambda updates: len(updates))"
190 ]
191 },
192 {
193 "cell_type": "code",
194 "execution_count": null,
195 "metadata": {
196 "collapsed": false
197 },
198 "outputs": [],
199 "source": [
200 "sns.distplot(issue[\"num_updates\"], kde=False)"
201 ]
202 },
203 {
204 "cell_type": "code",
205 "execution_count": null,
206 "metadata": {
207 "collapsed": false
208 },
209 "outputs": [],
210 "source": [
211 "def StatusPath(i_id, updates):\n",
212 " statuses = []\n",
213 " for update in updates:\n",
214 " if update.field == 'status':\n",
215 " if len(statuses) == 0:\n",
216 " statuses.append(update.old_value if update.old_value else 'none')\n",
217 " statuses.append(update.new_value if update.new_value else 'none')\n",
218 "\n",
219 " if len(statuses) == 0:\n",
220 " # use ~np.isnan here instead?\n",
221 " if issues_by_id[i_id].status_id == issues_by_id[i_id].status_id: # cheap NaN hack\n",
222 " status_id = int(issues_by_id[i_id].status_id)\n",
223 " if status_id is not NaN and status_id in status_by_id:\n",
224 " statuses = [status_by_id[status_id].status]\n",
225 " else:\n",
226 " statuses = ['mystery status id: %d' % status_id]\n",
227 " else:\n",
228 " statuses = ['never had status']\n",
229 " statuses = [s.decode('utf-8', errors='replace') for s in statuses]\n",
230 " return u'->'.join(statuses)\n",
231 "\n"
232 ]
233 },
234 {
235 "cell_type": "code",
236 "execution_count": null,
237 "metadata": {
238 "collapsed": false
239 },
240 "outputs": [],
241 "source": [
242 "issue[\"status_path\"] = issue[\"issue_id\"].apply(lambda i_id: StatusPath(i_id, sorted(updates_by_issue[i_id], key=lambda x: x.id)))"
243 ]
244 },
245 {
246 "cell_type": "code",
247 "execution_count": null,
248 "metadata": {
249 "collapsed": false,
250 "scrolled": true
251 },
252 "outputs": [],
253 "source": [
254 "plt.rcParams['figure.figsize']=(10,25)\n",
255 "by_path = issue.groupby([\"status_path\"]).size()\n",
256 "by_path.sort()\n",
257 "by_path.plot(kind='barh')"
258 ]
259 },
260 {
261 "cell_type": "code",
262 "execution_count": null,
263 "metadata": {
264 "collapsed": false
265 },
266 "outputs": [],
267 "source": [
268 "# Find distributions of time-to-close for various closed states.\n",
269 "\n",
270 "closed_issue = issue[issue[\"closed\"] > 0]\n",
271 " \n",
272 "closed_issue[\"time_to_close\"] = closed_issue[\"issue_id\"].apply(lambda i_id: issues_by_id[i_id].closed - issues_by_id[i_id].opened)\n",
273 "closed_issue[\"issue_state\"] = closed_issue[\"status_id\"].apply(lambda s_id: status_by_id[s_id].status)\n",
274 "print(\"Number of closed issues %d\" % closed_issue.shape[0])"
275 ]
276 },
277 {
278 "cell_type": "code",
279 "execution_count": null,
280 "metadata": {
281 "collapsed": false
282 },
283 "outputs": [],
284 "source": [
285 "plt.rcParams['figure.figsize']=(10,5)\n",
286 "sns.distplot(closed_issue[closed_issue[\"time_to_close\"] < 1e7][\"time_to_close\"], kde=False)"
287 ]
288 },
289 {
290 "cell_type": "code",
291 "execution_count": null,
292 "metadata": {
293 "collapsed": false,
294 "scrolled": true
295 },
296 "outputs": [],
297 "source": [
298 "# filter for time_to_close < 1e7 (~11 days since timestamps are seconds)\n",
299 "# since the time_to_close distribution skews waaaay out\n",
300 "sns.boxplot(data=closed_issue, x=\"time_to_close\", y=\"issue_state\", palette=\"colorblind\")"
301 ]
302 }
303 ],
304 "metadata": {
305 "kernelspec": {
306 "display_name": "Python 2",
307 "language": "python",
308 "name": "python2"
309 },
310 "language_info": {
311 "codemirror_mode": {
312 "name": "ipython",
313 "version": 2
314 },
315 "file_extension": ".py",
316 "mimetype": "text/x-python",
317 "name": "python",
318 "nbconvert_exporter": "python",
319 "pygments_lexer": "ipython2",
320 "version": "2.7.6"
321 }
322 },
323 "nbformat": 4,
324 "nbformat_minor": 0
325}