Project import generated by Copybara.
GitOrigin-RevId: d9e9e3fb4e31372ec1fb43b178994ca78fa8fe70
diff --git a/tools/datalab/subsample_data.ipynb b/tools/datalab/subsample_data.ipynb
new file mode 100644
index 0000000..988bfbc
--- /dev/null
+++ b/tools/datalab/subsample_data.ipynb
@@ -0,0 +1,286 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook is intended to show the process for going from \n",
+ "a database connection to two files the first associating an issue\n",
+ "to its components and the second to assocating an issue to its \n",
+ "comments. These files can then be used a machine learning pipeline\n",
+ "that will apply cleaning, vectorization of the text and building models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "%pylab inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "from __future__ import print_function\n",
+ "from __future__ import division"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from bs4 import BeautifulSoup\n",
+ "from collections import defaultdict\n",
+ "import pickle\n",
+ "import MySQLdb as mdb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "connection = mdb.connect(host='', user='', db='monorail')\n",
+ "cursor = connection.cursor()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def table_to_dataframe(name, connection):\n",
+ " return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "issue = table_to_dataframe('Issue', connection)\n",
+ "comment = table_to_dataframe('Comment', connection)\n",
+ "issue_component = table_to_dataframe('Issue2Component', connection)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "issue.rename(columns={'id':'issue_id'}, inplace=True)\n",
+ "chrome_issue = issue[issue['project_id'] == 16].copy()\n",
+ "chrome_issue_id_set = set(chrome_issue['issue_id'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Associate an issue withs its components"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "components_by_issue = defaultdict(list)\n",
+ "i = 0\n",
+ "for index, row in issue_component.iterrows():\n",
+ " if row['issue_id'] in chrome_issue_id_set:\n",
+ " components_by_issue[row['issue_id']].append(row['component_id'])\n",
+ " if i % 100000 == 0:\n",
+ " print(i)\n",
+ " i += 1\n",
+ "\n",
+ "chrome_issue['components'] = chrome_issue['issue_id'].apply(lambda i_id: components_by_issue[i_id])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Associate an issue withs its comments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "comments_by_issue = defaultdict(list)\n",
+ "i = 0\n",
+ "for index, row in chrome_comment.iterrows():\n",
+ " comments_by_issue[row[\"issue_id\"]].append((index, row.created))\n",
+ " if i % 1000000 == 0:\n",
+ " print(i)\n",
+ " i += 1\n",
+ "\n",
+ "chrome_issue[\"comments\"] = chrome_issue[\"issue_id\"].apply(lambda i_id: \n",
+ " [tup[0] for tup \n",
+ " in sorted(comments_by_issue[i_id], \n",
+ " key=lambda x: x[1])])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Only work with closed issues for training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "closed_chrome_issues = chrome_issue[chrome_issue[\"closed\"] > 0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Subsample the data (faster to run experiments)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "num_issues = len(closed_chrome_issues)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "issue_subset = closed_chrome_issues.sample(int(num_issues * 0.05))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Very light cleaning of text (removing markup)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "comment_index_to_text = defaultdict(unicode)\n",
+ "\n",
+ "i = 0\n",
+ "for index, row in issue_subset.iterrows():\n",
+ " for num, comment_id in enumerate(row['comments']):\n",
+ " text = BeautifulSoup(comment.loc[comment_id]['content']).get_text().strip().lower()\n",
+ " comment_index_to_text[comment_id] = text\n",
+ " \n",
+ " if i % 10000 == 0:\n",
+ " print(i)\n",
+ " i += 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "issue_subset.to_pickle('subset_issue.pkl')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "pickle.dump(comment_index_to_text, open('comment_text.pkl', 'w'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}