blob: 56dbe749747d1c274b67c21627e7505df9fa76d6 [file] [log] [blame]
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"from __future__ import division\n",
"from IPython.display import display, HTML"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import seaborn as sns\n",
"import pandas as pd\n",
"import MySQLdb as mdb\n",
"import bs4\n",
"import datetime\n",
"from collections import defaultdict\n",
"from matplotlib import pyplot as plt\n",
"from ipywidgets import widgets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def table_to_dataframe(name, connection):\n",
" return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)\n",
"\n",
"def project_table_to_dataframe(name, connection):\n",
" # project_id 1 is monorail\n",
" return pd.read_sql(\"SELECT * FROM {} where project_id = 1;\".format(name) , con=connection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"connection = mdb.connect(host=\"localhost\", user=\"root\", db=\"monorail\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cursor = connection.cursor()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Only look at monorail issues, and only look at issues opened in the past year.\n",
"issue = pd.read_sql(\"SELECT * FROM Issue where project_id = 1 and opened > 1436396241;\", con=connection)\n",
"comment = pd.read_sql(\"SELECT * FROM Comment where project_id = 1 and created > 1436396241;\", con=connection)\n",
"status_def = project_table_to_dataframe(\"StatusDef\", connection)\n",
"issue_summarny = table_to_dataframe(\"IssueSummary\", connection)\n",
"issue_label = table_to_dataframe(\"Issue2Label\", connection)\n",
"issue_component = table_to_dataframe(\"Issue2Component\", connection)\n",
"issue_update = table_to_dataframe(\"IssueUpdate\", connection)\n",
"issue.rename(columns={\"id\":\"issue_id\"}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(\"Number of Issues\", issue.shape[0])\n",
"print(\"Number of IssueUpdates\", issue_update.shape[0])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Associate IssueUpdates with their Issues\n",
"This next step is resource intensive and can take a while."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"updates_by_issue = defaultdict(list)\n",
"i = 0\n",
"for index, row in issue_update.iterrows():\n",
" updates_by_issue[row[\"issue_id\"]].append(row)\n",
" if i % 1000000 == 0:\n",
" print(i)\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"issues_by_id = {}\n",
"i = 0\n",
"for index, row in issue.iterrows():\n",
" issues_by_id[row[\"issue_id\"]] = row\n",
" if i % 1000000 == 0:\n",
" print(i)\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"status_by_id = {}\n",
"i = 0\n",
"for index, row in status_def.iterrows():\n",
" status_by_id[row[\"id\"]] = row\n",
" if i % 1000000 == 0:\n",
" print(i)\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"issue[\"updates\"] = issue[\"issue_id\"].apply(lambda i_id: [u for u in sorted(updates_by_issue[i_id], key=lambda x: x.id)])\n",
"issue[\"num_updates\"] = issue[\"updates\"].apply(lambda updates: len(updates))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sns.distplot(issue[\"num_updates\"], kde=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def StatusPath(i_id, updates):\n",
" statuses = []\n",
" for update in updates:\n",
" if update.field == 'status':\n",
" if len(statuses) == 0:\n",
" statuses.append(update.old_value if update.old_value else 'none')\n",
" statuses.append(update.new_value if update.new_value else 'none')\n",
"\n",
" if len(statuses) == 0:\n",
" # use ~np.isnan here instead?\n",
" if issues_by_id[i_id].status_id == issues_by_id[i_id].status_id: # cheap NaN hack\n",
" status_id = int(issues_by_id[i_id].status_id)\n",
" if status_id is not NaN and status_id in status_by_id:\n",
" statuses = [status_by_id[status_id].status]\n",
" else:\n",
" statuses = ['mystery status id: %d' % status_id]\n",
" else:\n",
" statuses = ['never had status']\n",
" statuses = [s.decode('utf-8', errors='replace') for s in statuses]\n",
" return u'->'.join(statuses)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"issue[\"status_path\"] = issue[\"issue_id\"].apply(lambda i_id: StatusPath(i_id, sorted(updates_by_issue[i_id], key=lambda x: x.id)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"plt.rcParams['figure.figsize']=(10,25)\n",
"by_path = issue.groupby([\"status_path\"]).size()\n",
"by_path.sort()\n",
"by_path.plot(kind='barh')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Find distributions of time-to-close for various closed states.\n",
"\n",
"closed_issue = issue[issue[\"closed\"] > 0]\n",
" \n",
"closed_issue[\"time_to_close\"] = closed_issue[\"issue_id\"].apply(lambda i_id: issues_by_id[i_id].closed - issues_by_id[i_id].opened)\n",
"closed_issue[\"issue_state\"] = closed_issue[\"status_id\"].apply(lambda s_id: status_by_id[s_id].status)\n",
"print(\"Number of closed issues %d\" % closed_issue.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.rcParams['figure.figsize']=(10,5)\n",
"sns.distplot(closed_issue[closed_issue[\"time_to_close\"] < 1e7][\"time_to_close\"], kde=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# filter for time_to_close < 1e7 (~11 days since timestamps are seconds)\n",
"# since the time_to_close distribution skews waaaay out\n",
"sns.boxplot(data=closed_issue, x=\"time_to_close\", y=\"issue_state\", palette=\"colorblind\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}