Project import generated by Copybara.

GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/tools/datalab/update_paths.ipynb b/tools/datalab/update_paths.ipynb
new file mode 100644
index 0000000..56dbe74
--- /dev/null
+++ b/tools/datalab/update_paths.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "%pylab inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from __future__ import print_function\n",
+    "from __future__ import division\n",
+    "from IPython.display import display, HTML"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import seaborn as sns\n",
+    "import pandas as pd\n",
+    "import MySQLdb as mdb\n",
+    "import bs4\n",
+    "import datetime\n",
+    "from collections import defaultdict\n",
+    "from matplotlib import pyplot as plt\n",
+    "from ipywidgets import widgets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def table_to_dataframe(name, connection):\n",
+    "    return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)\n",
+    "\n",
+    "def project_table_to_dataframe(name, connection):\n",
+    "    # project_id 1 is monorail\n",
+    "    return pd.read_sql(\"SELECT * FROM {} where project_id = 1;\".format(name) , con=connection)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "connection = mdb.connect(host=\"localhost\", user=\"root\", db=\"monorail\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "cursor = connection.cursor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Only look at monorail issues, and only look at issues opened in the past year.\n",
+    "issue = pd.read_sql(\"SELECT * FROM Issue where project_id = 1 and opened > 1436396241;\", con=connection)\n",
+    "comment = pd.read_sql(\"SELECT * FROM Comment where project_id = 1 and created > 1436396241;\", con=connection)\n",
+    "status_def = project_table_to_dataframe(\"StatusDef\", connection)\n",
+    "issue_summarny = table_to_dataframe(\"IssueSummary\", connection)\n",
+    "issue_label = table_to_dataframe(\"Issue2Label\", connection)\n",
+    "issue_component = table_to_dataframe(\"Issue2Component\", connection)\n",
+    "issue_update = table_to_dataframe(\"IssueUpdate\", connection)\n",
+    "issue.rename(columns={\"id\":\"issue_id\"}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "print(\"Number of Issues\", issue.shape[0])\n",
+    "print(\"Number of IssueUpdates\", issue_update.shape[0])\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Associate IssueUpdates with their Issues\n",
+    "This next step is resource intensive and can take a while."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "updates_by_issue = defaultdict(list)\n",
+    "i = 0\n",
+    "for index, row in issue_update.iterrows():\n",
+    "    updates_by_issue[row[\"issue_id\"]].append(row)\n",
+    "    if i % 1000000 == 0:\n",
+    "        print(i)\n",
+    "    i += 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "issues_by_id = {}\n",
+    "i = 0\n",
+    "for index, row in issue.iterrows():\n",
+    "    issues_by_id[row[\"issue_id\"]] = row\n",
+    "    if i % 1000000 == 0:\n",
+    "        print(i)\n",
+    "    i += 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "status_by_id = {}\n",
+    "i = 0\n",
+    "for index, row in status_def.iterrows():\n",
+    "    status_by_id[row[\"id\"]] = row\n",
+    "    if i % 1000000 == 0:\n",
+    "        print(i)\n",
+    "    i += 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "issue[\"updates\"] = issue[\"issue_id\"].apply(lambda i_id: [u for u in sorted(updates_by_issue[i_id], key=lambda x: x.id)])\n",
+    "issue[\"num_updates\"] = issue[\"updates\"].apply(lambda updates: len(updates))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "sns.distplot(issue[\"num_updates\"], kde=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def StatusPath(i_id, updates):\n",
+    "    statuses = []\n",
+    "    for update in updates:\n",
+    "        if update.field == 'status':\n",
+    "            if len(statuses) == 0:\n",
+    "                statuses.append(update.old_value if update.old_value else 'none')\n",
+    "            statuses.append(update.new_value if update.new_value else 'none')\n",
+    "\n",
+    "    if len(statuses) == 0:\n",
+    "        # use ~np.isnan here instead?\n",
+    "        if issues_by_id[i_id].status_id == issues_by_id[i_id].status_id: # cheap NaN hack\n",
+    "            status_id = int(issues_by_id[i_id].status_id)\n",
+    "            if status_id is not NaN and status_id in status_by_id:\n",
+    "                statuses = [status_by_id[status_id].status]\n",
+    "            else:\n",
+    "                statuses = ['mystery status id: %d' % status_id]\n",
+    "        else:\n",
+    "            statuses = ['never had status']\n",
+    "    statuses = [s.decode('utf-8', errors='replace') for s in statuses]\n",
+    "    return u'->'.join(statuses)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "issue[\"status_path\"] = issue[\"issue_id\"].apply(lambda i_id: StatusPath(i_id, sorted(updates_by_issue[i_id], key=lambda x: x.id)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "plt.rcParams['figure.figsize']=(10,25)\n",
+    "by_path = issue.groupby([\"status_path\"]).size()\n",
+    "by_path.sort()\n",
+    "by_path.plot(kind='barh')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Find distributions of time-to-close for various closed states.\n",
+    "\n",
+    "closed_issue = issue[issue[\"closed\"] > 0]\n",
+    "    \n",
+    "closed_issue[\"time_to_close\"] = closed_issue[\"issue_id\"].apply(lambda i_id: issues_by_id[i_id].closed - issues_by_id[i_id].opened)\n",
+    "closed_issue[\"issue_state\"] = closed_issue[\"status_id\"].apply(lambda s_id: status_by_id[s_id].status)\n",
+    "print(\"Number of closed issues %d\" % closed_issue.shape[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "plt.rcParams['figure.figsize']=(10,5)\n",
+    "sns.distplot(closed_issue[closed_issue[\"time_to_close\"] < 1e7][\"time_to_close\"], kde=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# filter for time_to_close < 1e7 (~11 days since timestamps are seconds)\n",
+    "# since the time_to_close distribution skews waaaay out\n",
+    "sns.boxplot(data=closed_issue, x=\"time_to_close\", y=\"issue_state\", palette=\"colorblind\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}