blob: 326c49ccc278b093bcaabb248fb81c46a682d20c [file] [log] [blame]
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"from __future__ import division"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import pickle\n",
"import unicodedata\n",
"import time\n",
"import sklearn\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.multiclass import OneVsRestClassifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"issues = pickle.load(open(\"subset_issue.pkl\"))\n",
"comment_text = pickle.load(open(\"comment_text.pkl\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"table for removing punctuation from text."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"table = dict.fromkeys(i for i in xrange(sys.maxunicode)\n",
" if unicodedata.category(unichr(i)).startswith('P'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clean The text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_text_components_per_issue(issues):\n",
" text_per_issue = []\n",
" components_per_issue = []\n",
"\n",
" for index, row in issues.iterrows():\n",
" issue_text = \"\"\n",
" for comment_id in row[\"comments\"]:\n",
" text = comment_text[comment_id].strip()\n",
" # Remove punctuation\n",
" text = text.translate(table)\n",
" issue_text += text + \" \"\n",
" text_per_issue.append(issue_text.strip())\n",
"\n",
" components_per_issue.append(set(row[\"components\"]))\n",
" \n",
" return text_per_issue, components_per_issue\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"text_per_issue, components_per_issue = get_text_components_per_issue(issues)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Filter out components that are used infrequently(not enough singal) or too frequently (signal not meaningful)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def prune_and_bin_components(components_per_issue, prune_low=0.005, prune_high=0.25):\n",
" mlb = MultiLabelBinarizer()\n",
" bins = mlb.fit_transform(components_per_issue)\n",
" exclude_comp_ids = set(mlb.classes_[~(((bins.sum(axis=0) / bins.sum()) > prune_low) & \n",
" ((bins.sum(axis=0) / bins.sum()) < prune_high))])\n",
" \n",
" comps_per_issue_exclude = []\n",
" for comp_set in components_per_issue:\n",
" comps = comp_set - exclude_comp_ids\n",
" comps_per_issue_exclude.append(comps)\n",
" \n",
" mlb = MultiLabelBinarizer()\n",
" bins = mlb.fit_transform(comps_per_issue_exclude)\n",
" return bins, mlb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"bins, mlb = prune_and_bin_components(components_per_issue)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tokenize the text and perform tfidf transformations"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),\n",
" token_pattern=r'\\b\\w+\\b',\n",
" min_df=5,\n",
" max_df=0.5,\n",
" stop_words='english')\n",
"\n",
"tfidf_transformer = TfidfTransformer()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"counts = bigram_vectorizer.fit_transform(text_per_issue)\n",
"tfidf = tfidf_transformer.fit_transform(counts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(tfidf, bins, train_size=0.8, random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train a very simple linear model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"clf = OneVsRestClassifier(LinearSVC(C=1.0))\n",
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict and analyze the results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"predictions = clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"(y_test == predictions).sum() / (y_test.shape[0] * y_test.shape[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"np.sum((y_test == predictions).sum(axis=1) == 44) / y_test.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sns.distplot(y_test.sum(axis=1), kde=False)\n",
"sns.distplot(predictions.sum(axis=1), kde=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sns.barplot(range(44), y_test.sum(axis=0), color=\"red\")\n",
"sns.barplot(range(44), predictions.sum(axis=0), color=\"blue\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Serialize the data and the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def serialize_data_model(vectorizer, classifier, features, targets, transformer=None):\n",
" current_time = int(time.time())\n",
" pickle.dump(vectorizer, open(\"{}-vectorizer.pkl\".format(current_time), \"wb\"))\n",
" pickle.dump(classifier, open(\"{}-classifier.pkl\".format(current_time), \"wb\"))\n",
" \n",
" training = {\"features\": features, \"targets\": targets}\n",
" pickle.dump(training, open(\"{}.pkl\".format(current_time), \"wb\"))\n",
" \n",
" if transformer:\n",
" pickle.dump(transformer, open(\"{}-transformer.pkl\".format(current_time), \"wb\"))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"serialize_data_model(bigram_vectorizer, clf, tfidf, bins, tfidf_transformer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}