{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook is intended to show the process for going from \n",
"a database connection to two files the first associating an issue\n",
"to its components and the second to assocating an issue to its \n",
"comments. These files can then be used a machine learning pipeline\n",
"that will apply cleaning, vectorization of the text and building models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"from __future__ import division"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"from collections import defaultdict\n",
"import pickle\n",
"import MySQLdb as mdb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"connection = mdb.connect(host='', user='', db='monorail')\n",
"cursor = connection.cursor()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def table_to_dataframe(name, connection):\n",
" return pd.read_sql(\"SELECT * FROM {};\".format(name) , con=connection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"issue = table_to_dataframe('Issue', connection)\n",
"comment = table_to_dataframe('Comment', connection)\n",
"issue_component = table_to_dataframe('Issue2Component', connection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"issue.rename(columns={'id':'issue_id'}, inplace=True)\n",
"chrome_issue = issue[issue['project_id'] == 16].copy()\n",
"chrome_issue_id_set = set(chrome_issue['issue_id'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Associate an issue withs its components"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"components_by_issue = defaultdict(list)\n",
"i = 0\n",
"for index, row in issue_component.iterrows():\n",
" if row['issue_id'] in chrome_issue_id_set:\n",
" components_by_issue[row['issue_id']].append(row['component_id'])\n",
" if i % 100000 == 0:\n",
" print(i)\n",
" i += 1\n",
"\n",
"chrome_issue['components'] = chrome_issue['issue_id'].apply(lambda i_id: components_by_issue[i_id])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Associate an issue withs its comments"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"comments_by_issue = defaultdict(list)\n",
"i = 0\n",
"for index, row in chrome_comment.iterrows():\n",
" comments_by_issue[row[\"issue_id\"]].append((index, row.created))\n",
" if i % 1000000 == 0:\n",
" print(i)\n",
" i += 1\n",
"\n",
"chrome_issue[\"comments\"] = chrome_issue[\"issue_id\"].apply(lambda i_id: \n",
" [tup[0] for tup \n",
" in sorted(comments_by_issue[i_id], \n",
" key=lambda x: x[1])])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Only work with closed issues for training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"closed_chrome_issues = chrome_issue[chrome_issue[\"closed\"] > 0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Subsample the data (faster to run experiments)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"num_issues = len(closed_chrome_issues)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"issue_subset = closed_chrome_issues.sample(int(num_issues * 0.05))"
]
},
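{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above draws a fresh 5% sample on every run. An optional,\n",
"reproducible variant of the same subsample is sketched below; running it\n",
"instead of (or after) the cell above keeps the sampled issues fixed across\n",
"notebook runs. `random_state` is pandas' standard seeding argument for\n",
"`sample`, and the value 0 is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Reproducible 5% subsample: fixing random_state makes repeated runs\n",
"# operate on the same set of issues.\n",
"issue_subset = closed_chrome_issues.sample(int(num_issues * 0.05), random_state=0)"
]
},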
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Very light cleaning of text (removing markup)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"comment_index_to_text = defaultdict(unicode)\n",
"\n",
"i = 0\n",
"for index, row in issue_subset.iterrows():\n",
" for num, comment_id in enumerate(row['comments']):\n",
" text = BeautifulSoup(comment.loc[comment_id]['content']).get_text().strip().lower()\n",
" comment_index_to_text[comment_id] = text\n",
" \n",
" if i % 10000 == 0:\n",
" print(i)\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"issue_subset.to_pickle('subset_issue.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pickle.dump(comment_index_to_text, open('comment_text.pkl', 'w'))"
]
},
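{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, the two pickled files written above can be loaded\n",
"back the way a downstream pipeline would consume them. This is only a\n",
"sketch: the file names match the cells above, but the actual cleaning,\n",
"vectorization, and modeling steps are outside the scope of this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Reload the issue subset and the comment texts written above.\n",
"loaded_issues = pd.read_pickle('subset_issue.pkl')\n",
"with open('comment_text.pkl', 'rb') as f:\n",
"    loaded_comment_text = pickle.load(f)\n",
"\n",
"print('{} issues, {} comments with text'.format(len(loaded_issues), len(loaded_comment_text)))"
]
},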
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}