tiyd-python-2015-05 · tshealy · Jun 4, 2015 · Jun 4, 2015 · Jun 5, 2015 · Jun 6, 2015
diff --git a/.gitignore b/.gitignore
@@ -65,3 +65,13 @@ docs/_build/
 # PyBuilder
 target/
 
+.direnv/
+.envrc
+.idea/
+.DS_Store
+scraper_500x100.pkl
+scraper_50x10.pkl
+scraper_50x1_17.pkl
+scraper_700x1_17.pkl
+test_X_values.pkl
+test_y_values.pkl
diff --git a/bs4_scratch_work.ipynb b/bs4_scratch_work.ipynb
diff --git a/classification_accuracy_by_dataframe_size.ipynb b/classification_accuracy_by_dataframe_size.ipynb
@@ -0,0 +1,337 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from scraper import pipeline_runner\n",
+    "from feature_vectors import *\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Small Dataframe"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Only using CountVectorizer, no additional regex feature vectors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df_50x10 = pd.read_pickle('scraper_50x10.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df_mnb = pipeline_runner(df_50x10, 'Multinomial')\n",
+    "df_knn = pipeline_runner(df_50x10, 'KNeighbors')\n",
+    "df_forest = pipeline_runner(df_50x10, 'Forest')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "df_mnb: (0.89317507418397624, 0.51333333333333331)\n",
+      "df_knn: (0.49406528189910981, 0.31777777777777777)\n",
+      "df_forest: (0.99183976261127593, 0.62666666666666671)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"df_mnb: {}\".format(df_mnb))\n",
+    "print(\"df_knn: {}\".format(df_knn))\n",
+    "print(\"df_forest: {}\".format(df_forest))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "### Getting low scores. Random Forest is the most accurate, with a mean score of 0.63."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Larger Dataframe"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Still no additional feature vectorizers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "#see scraper.py for how the 500x100 dataframe was pulled from RosettaCode. \n",
+    "df_500x100 = pd.read_pickle('scraper_500x100.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df_mnb_500 = pipeline_runner(df_500x100, 'Multinomial')\n",
+    "df_knn_500 = pipeline_runner(df_500x100, 'KNeighbors')\n",
+    "df_forest_500 = pipeline_runner(df_500x100, 'Forest')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "df_mnb_500: (0.80047166883882581, 0.62267311988086371)\n",
+      "df_knn_500: (0.60628064295910133, 0.40171258376768426)\n",
+      "df_forest_500: (0.98907714267982372, 0.72040208488458679)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"df_mnb_500: {}\".format(df_mnb_500))\n",
+    "print(\"df_knn_500: {}\".format(df_knn_500))\n",
+    "print(\"df_forest_500: {}\".format(df_forest_500))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Random Forest, again, produces the best mean accuracy."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Adding additional feature vectors to improve score accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.naive_bayes import MultinomialNB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "X = df_500x100.loc[:, 1]\n",
+    "y = df_500x100.loc[:, 0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "language_featurizer = make_union(CountVectorizer(),\n",
+    "                                     FunctionFeaturizer(longest_run_of_capital_letters_feature,\n",
+    "                                                        longest_run_of_character_feature,\n",
+    "                                                        percent_character_combinations,\n",
+    "                                                        percent_character_feature,\n",
+    "                                                        binary_character_combinations))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<21485x67251 sparse matrix of type '<class 'numpy.float64'>'\n",
+       "\twith 833585 stored elements in Compressed Sparse Row format>"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "language_featurizer.fit_transform(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "pipe = make_pipeline(language_featurizer, RandomForestClassifier())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n",
+       "       transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
+       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
+       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
+       " ...n_jobs=1,\n",
+       "            oob_score=False, random_state=None, verbose=0,\n",
+       "            warm_start=False))])"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pipe.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.76954579300074455"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pipe.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "The mean test accuracy is 0.77 using only the data pulled from RosettaCode (not using the test data from the file)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Score accuracy improved by 0.05 after adding the additional feature vectors."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}