Project import generated by Copybara.
GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/tools/datalab/update_paths.ipynb b/tools/datalab/update_paths.ipynb
new file mode 100644
index 0000000..56dbe74
--- /dev/null
+++ b/tools/datalab/update_paths.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%pylab inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "from __future__ import print_function\n",
+ "from __future__ import division\n",
+ "from IPython.display import display, HTML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import pandas as pd\n",
+ "import MySQLdb as mdb\n",
+ "import bs4\n",
+ "import datetime\n",
+ "from collections import defaultdict\n",
+ "from matplotlib import pyplot as plt\n",
+ "from ipywidgets import widgets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load the Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "def table_to_dataframe(name, connection):\n",
+ " return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)\n",
+ "\n",
+ "def project_table_to_dataframe(name, connection):\n",
+ " # project_id 1 is monorail\n",
+ " return pd.read_sql(\"SELECT * FROM {} where project_id = 1;\".format(name) , con=connection)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "connection = mdb.connect(host=\"localhost\", user=\"root\", db=\"monorail\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "cursor = connection.cursor()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "# Only look at monorail issues, and only look at issues opened in the past year.\n",
+ "issue = pd.read_sql(\"SELECT * FROM Issue where project_id = 1 and opened > 1436396241;\", con=connection)\n",
+ "comment = pd.read_sql(\"SELECT * FROM Comment where project_id = 1 and created > 1436396241;\", con=connection)\n",
+ "status_def = project_table_to_dataframe(\"StatusDef\", connection)\n",
+ "issue_summarny = table_to_dataframe(\"IssueSummary\", connection)\n",
+ "issue_label = table_to_dataframe(\"Issue2Label\", connection)\n",
+ "issue_component = table_to_dataframe(\"Issue2Component\", connection)\n",
+ "issue_update = table_to_dataframe(\"IssueUpdate\", connection)\n",
+ "issue.rename(columns={\"id\":\"issue_id\"}, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "print(\"Number of Issues\", issue.shape[0])\n",
+ "print(\"Number of IssueUpdates\", issue_update.shape[0])\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Associate IssueUpdates with their Issues\n",
+ "This next step is resource intensive and can take a while."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "updates_by_issue = defaultdict(list)\n",
+ "i = 0\n",
+ "for index, row in issue_update.iterrows():\n",
+ " updates_by_issue[row[\"issue_id\"]].append(row)\n",
+ " if i % 1000000 == 0:\n",
+ " print(i)\n",
+ " i += 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "issues_by_id = {}\n",
+ "i = 0\n",
+ "for index, row in issue.iterrows():\n",
+ " issues_by_id[row[\"issue_id\"]] = row\n",
+ " if i % 1000000 == 0:\n",
+ " print(i)\n",
+ " i += 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "status_by_id = {}\n",
+ "i = 0\n",
+ "for index, row in status_def.iterrows():\n",
+ " status_by_id[row[\"id\"]] = row\n",
+ " if i % 1000000 == 0:\n",
+ " print(i)\n",
+ " i += 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "issue[\"updates\"] = issue[\"issue_id\"].apply(lambda i_id: [u for u in sorted(updates_by_issue[i_id], key=lambda x: x.id)])\n",
+ "issue[\"num_updates\"] = issue[\"updates\"].apply(lambda updates: len(updates))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "sns.distplot(issue[\"num_updates\"], kde=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "def StatusPath(i_id, updates):\n",
+ " statuses = []\n",
+ " for update in updates:\n",
+ " if update.field == 'status':\n",
+ " if len(statuses) == 0:\n",
+ " statuses.append(update.old_value if update.old_value else 'none')\n",
+ " statuses.append(update.new_value if update.new_value else 'none')\n",
+ "\n",
+ " if len(statuses) == 0:\n",
+ " # use ~np.isnan here instead?\n",
+ " if issues_by_id[i_id].status_id == issues_by_id[i_id].status_id: # cheap NaN hack\n",
+ " status_id = int(issues_by_id[i_id].status_id)\n",
+ " if status_id is not NaN and status_id in status_by_id:\n",
+ " statuses = [status_by_id[status_id].status]\n",
+ " else:\n",
+ " statuses = ['mystery status id: %d' % status_id]\n",
+ " else:\n",
+ " statuses = ['never had status']\n",
+ " statuses = [s.decode('utf-8', errors='replace') for s in statuses]\n",
+ " return u'->'.join(statuses)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "issue[\"status_path\"] = issue[\"issue_id\"].apply(lambda i_id: StatusPath(i_id, sorted(updates_by_issue[i_id], key=lambda x: x.id)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "plt.rcParams['figure.figsize']=(10,25)\n",
+ "by_path = issue.groupby([\"status_path\"]).size()\n",
+ "by_path.sort()\n",
+ "by_path.plot(kind='barh')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "# Find distributions of time-to-close for various closed states.\n",
+ "\n",
+ "closed_issue = issue[issue[\"closed\"] > 0]\n",
+ " \n",
+ "closed_issue[\"time_to_close\"] = closed_issue[\"issue_id\"].apply(lambda i_id: issues_by_id[i_id].closed - issues_by_id[i_id].opened)\n",
+ "closed_issue[\"issue_state\"] = closed_issue[\"status_id\"].apply(lambda s_id: status_by_id[s_id].status)\n",
+ "print(\"Number of closed issues %d\" % closed_issue.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "plt.rcParams['figure.figsize']=(10,5)\n",
+ "sns.distplot(closed_issue[closed_issue[\"time_to_close\"] < 1e7][\"time_to_close\"], kde=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# filter for time_to_close < 1e7 (~11 days since timestamps are seconds)\n",
+ "# since the time_to_close distribution skews waaaay out\n",
+ "sns.boxplot(data=closed_issue, x=\"time_to_close\", y=\"issue_state\", palette=\"colorblind\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}