Project import generated by Copybara.

GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/tools/datalab/subsample_data.ipynb b/tools/datalab/subsample_data.ipynb
new file mode 100644
index 0000000..988bfbc
--- /dev/null
+++ b/tools/datalab/subsample_data.ipynb
@@ -0,0 +1,286 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook is intended to show the process for going from \n",
+    "a database connection to two files the first associating an issue\n",
+    "to its components and the second to assocating an issue to its \n",
+    "comments. These files can then be used a machine learning pipeline\n",
+    "that will apply cleaning, vectorization of the text and building models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "%pylab inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from __future__ import print_function\n",
+    "from __future__ import division"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from bs4 import BeautifulSoup\n",
+    "from collections import defaultdict\n",
+    "import pickle\n",
+    "import MySQLdb as mdb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "connection = mdb.connect(host='', user='', db='monorail')\n",
+    "cursor = connection.cursor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def table_to_dataframe(name, connection):\n",
+    "    return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "issue = table_to_dataframe('Issue', connection)\n",
+    "comment = table_to_dataframe('Comment', connection)\n",
+    "issue_component = table_to_dataframe('Issue2Component', connection)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "issue.rename(columns={'id':'issue_id'}, inplace=True)\n",
+    "chrome_issue = issue[issue['project_id'] == 16].copy()\n",
+    "chrome_issue_id_set = set(chrome_issue['issue_id'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Associate an issue withs its components"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "components_by_issue = defaultdict(list)\n",
+    "i = 0\n",
+    "for index, row in issue_component.iterrows():\n",
+    "    if row['issue_id'] in chrome_issue_id_set:\n",
+    "        components_by_issue[row['issue_id']].append(row['component_id'])\n",
+    "    if i % 100000 == 0:\n",
+    "        print(i)\n",
+    "    i += 1\n",
+    "\n",
+    "chrome_issue['components'] = chrome_issue['issue_id'].apply(lambda i_id: components_by_issue[i_id])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Associate an issue withs its comments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "comments_by_issue = defaultdict(list)\n",
+    "i = 0\n",
+    "for index, row in chrome_comment.iterrows():\n",
+    "    comments_by_issue[row[\"issue_id\"]].append((index, row.created))\n",
+    "    if i % 1000000 == 0:\n",
+    "        print(i)\n",
+    "    i += 1\n",
+    "\n",
+    "chrome_issue[\"comments\"] = chrome_issue[\"issue_id\"].apply(lambda i_id: \n",
+    "                                                          [tup[0] for tup \n",
+    "                                                           in sorted(comments_by_issue[i_id], \n",
+    "                                                                     key=lambda x: x[1])])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Only work with closed issues for training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "closed_chrome_issues = chrome_issue[chrome_issue[\"closed\"] > 0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Subsample the data (faster to run experiments)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "num_issues = len(closed_chrome_issues)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "issue_subset = closed_chrome_issues.sample(int(num_issues * 0.05))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Very light cleaning of text (removing markup)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "comment_index_to_text = defaultdict(unicode)\n",
+    "\n",
+    "i = 0\n",
+    "for index, row in issue_subset.iterrows():\n",
+    "    for num, comment_id in enumerate(row['comments']):\n",
+    "        text =  BeautifulSoup(comment.loc[comment_id]['content']).get_text().strip().lower()\n",
+    "        comment_index_to_text[comment_id] = text\n",
+    "    \n",
+    "    if i % 10000 == 0:\n",
+    "        print(i)\n",
+    "    i += 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "issue_subset.to_pickle('subset_issue.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "pickle.dump(comment_index_to_text, open('comment_text.pkl', 'w'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}