| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "%pylab inline" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "from __future__ import print_function\n", |
| "from __future__ import division\n", |
| "from IPython.display import display, HTML" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "import seaborn as sns\n", |
| "import pandas as pd\n", |
| "import MySQLdb as mdb\n", |
| "import bs4\n", |
| "import datetime\n", |
| "from collections import defaultdict\n", |
| "from matplotlib import pyplot as plt\n", |
| "from ipywidgets import widgets" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### Load the Data" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "def table_to_dataframe(name, connection):\n", |
| " return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)\n", |
| "\n", |
| "def project_table_to_dataframe(name, connection):\n", |
| " # project_id 1 is monorail\n", |
| " return pd.read_sql(\"SELECT * FROM {} where project_id = 1;\".format(name) , con=connection)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "connection = mdb.connect(host=\"localhost\", user=\"root\", db=\"monorail\")" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "cursor = connection.cursor()" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "# Only look at monorail issues, and only look at issues opened in the past year.\n", |
| "issue = pd.read_sql(\"SELECT * FROM Issue where project_id = 1 and opened > 1436396241;\", con=connection)\n", |
| "comment = pd.read_sql(\"SELECT * FROM Comment where project_id = 1 and created > 1436396241;\", con=connection)\n", |
| "status_def = project_table_to_dataframe(\"StatusDef\", connection)\n", |
| "issue_summarny = table_to_dataframe(\"IssueSummary\", connection)\n", |
| "issue_label = table_to_dataframe(\"Issue2Label\", connection)\n", |
| "issue_component = table_to_dataframe(\"Issue2Component\", connection)\n", |
| "issue_update = table_to_dataframe(\"IssueUpdate\", connection)\n", |
| "issue.rename(columns={\"id\":\"issue_id\"}, inplace=True)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "print(\"Number of Issues\", issue.shape[0])\n", |
| "print(\"Number of IssueUpdates\", issue_update.shape[0])\n" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "### Associate IssueUpdates with their Issues\n", |
| "This next step is resource intensive and can take a while." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "updates_by_issue = defaultdict(list)\n", |
| "i = 0\n", |
| "for index, row in issue_update.iterrows():\n", |
| " updates_by_issue[row[\"issue_id\"]].append(row)\n", |
| " if i % 1000000 == 0:\n", |
| " print(i)\n", |
| " i += 1" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "issues_by_id = {}\n", |
| "i = 0\n", |
| "for index, row in issue.iterrows():\n", |
| " issues_by_id[row[\"issue_id\"]] = row\n", |
| " if i % 1000000 == 0:\n", |
| " print(i)\n", |
| " i += 1" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "status_by_id = {}\n", |
| "i = 0\n", |
| "for index, row in status_def.iterrows():\n", |
| " status_by_id[row[\"id\"]] = row\n", |
| " if i % 1000000 == 0:\n", |
| " print(i)\n", |
| " i += 1" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "issue[\"updates\"] = issue[\"issue_id\"].apply(lambda i_id: [u for u in sorted(updates_by_issue[i_id], key=lambda x: x.id)])\n", |
| "issue[\"num_updates\"] = issue[\"updates\"].apply(lambda updates: len(updates))" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "sns.distplot(issue[\"num_updates\"], kde=False)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "def StatusPath(i_id, updates):\n", |
| " statuses = []\n", |
| " for update in updates:\n", |
| " if update.field == 'status':\n", |
| " if len(statuses) == 0:\n", |
| " statuses.append(update.old_value if update.old_value else 'none')\n", |
| " statuses.append(update.new_value if update.new_value else 'none')\n", |
| "\n", |
| " if len(statuses) == 0:\n", |
| " # use ~np.isnan here instead?\n", |
| " if issues_by_id[i_id].status_id == issues_by_id[i_id].status_id: # cheap NaN hack\n", |
| " status_id = int(issues_by_id[i_id].status_id)\n", |
| " if status_id is not NaN and status_id in status_by_id:\n", |
| " statuses = [status_by_id[status_id].status]\n", |
| " else:\n", |
| " statuses = ['mystery status id: %d' % status_id]\n", |
| " else:\n", |
| " statuses = ['never had status']\n", |
| " statuses = [s.decode('utf-8', errors='replace') for s in statuses]\n", |
| " return u'->'.join(statuses)\n", |
| "\n" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "issue[\"status_path\"] = issue[\"issue_id\"].apply(lambda i_id: StatusPath(i_id, sorted(updates_by_issue[i_id], key=lambda x: x.id)))" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false, |
| "scrolled": true |
| }, |
| "outputs": [], |
| "source": [ |
| "plt.rcParams['figure.figsize']=(10,25)\n", |
| "by_path = issue.groupby([\"status_path\"]).size()\n", |
| "by_path.sort()\n", |
| "by_path.plot(kind='barh')" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "# Find distributions of time-to-close for various closed states.\n", |
| "\n", |
| "closed_issue = issue[issue[\"closed\"] > 0]\n", |
| " \n", |
| "closed_issue[\"time_to_close\"] = closed_issue[\"issue_id\"].apply(lambda i_id: issues_by_id[i_id].closed - issues_by_id[i_id].opened)\n", |
| "closed_issue[\"issue_state\"] = closed_issue[\"status_id\"].apply(lambda s_id: status_by_id[s_id].status)\n", |
| "print(\"Number of closed issues %d\" % closed_issue.shape[0])" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false |
| }, |
| "outputs": [], |
| "source": [ |
| "plt.rcParams['figure.figsize']=(10,5)\n", |
| "sns.distplot(closed_issue[closed_issue[\"time_to_close\"] < 1e7][\"time_to_close\"], kde=False)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": false, |
| "scrolled": true |
| }, |
| "outputs": [], |
| "source": [ |
| "# filter for time_to_close < 1e7 (~11 days since timestamps are seconds)\n", |
| "# since the time_to_close distribution skews waaaay out\n", |
| "sns.boxplot(data=closed_issue, x=\"time_to_close\", y=\"issue_state\", palette=\"colorblind\")" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 2", |
| "language": "python", |
| "name": "python2" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 2 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython2", |
| "version": "2.7.6" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 0 |
| } |