From e1f5ac90964d3897a2a8ce135c0b8ca7e7dc9261 Mon Sep 17 00:00:00 2001
From: Manish Patel <maddypatel@gmail.com>
Date: Fri, 5 Jun 2015 19:48:15 -0400
Subject: [PATCH 1/3] Read code files

---
 read_data.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 read_data.py

diff --git a/read_data.py b/read_data.py
new file mode 100644
index 0000000..416df47
--- /dev/null
+++ b/read_data.py
@@ -0,0 +1,39 @@
+import numpy as np
+import pandas as pd
+import glob
+
+file_ext = {"C": ["gcc", "c", "h"],  # language name -> file extensions
+            "C#": ["csharp"],
+            "Clojure": ["clj", "cljs", "edn", "clojure"],
+            "Common Lisp": ["sbcl"],
+            "Haskell": ["hs", "lhs", "ghc"],
+            "Java": ["java", "class", "jar"],
+            "Javascript": ["js", "javascript"],
+            "OCaml": ["ocaml", "ml"],
+            "Perl": ["pl", "pm", "t", "pod", "perl"],
+            "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"],
+            "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"],
+            "Ruby": ["rb", "rbw", "jruby", "yarv"],
+            "Scala": ["scala"],
+            "Scheme": ["scm", "ss", "racket"],
+            "Tcl": ["tcl"]}
+
+def read_bench_files():
+    """Return (source text, language) pairs for recognised benchmark files."""
+    texts = []
+    for path in glob.glob("benchmarksgame/benchmarksgame/bench/*/*.*"):
+        language = get_ext(path.split(".")[-1])
+        if language is not None:  # unknown extension: skip without opening
+            with open(path) as fh:
+                texts.append((fh.read(), language))
+    return texts
+
+def get_ext(ext):
+    """Return the language whose extension list contains ext, else None."""
+    matches = (lang for lang, exts in file_ext.items() if ext in exts)
+    return next(matches, None)
+
+
+# Load every recognised benchmark source into a (code, language) frame.
+data = pd.DataFrame(read_bench_files(), columns=["code", "language"])
+print(data)

From 689566adb408e2c9de5d35cf35335de172770fa4 Mon Sep 17 00:00:00 2001
From: Manish Patel <maddypatel@gmail.com>
Date: Sun, 7 Jun 2015 22:28:57 -0400
Subject: [PATCH 2/3] Added ipython notebook

---
 Untitled.ipynb | 923 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 923 insertions(+)
 create mode 100644 Untitled.ipynb

diff --git a/Untitled.ipynb b/Untitled.ipynb
new file mode 100644
index 0000000..fef59ba
--- /dev/null
+++ b/Untitled.ipynb
@@ -0,0 +1,923 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import glob"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "file_ext = {\"C\": [\"gcc\", \"c\", \"h\"],\n",
+    "            \"C#\": [\"csharp\"],\n",
+    "            \"Clojure\": [\"clj\", \"cljs\", \"cljs\", \"edn\", \"clojure\"],\n",
+    "            \"Common Lisp\": [\"sbcl\"],\n",
+    "            \"Haskell\": [\"hs\", \"lhs\", \"ghc\"],\n",
+    "            \"Java\": [\"java\", \"class\", \"jar\"],\n",
+    "            \"Javascript\": [\"js\", \"javascript\"],\n",
+    "            \"OCaml\": [\"ocaml\", \"ml\"],\n",
+    "            \"Perl\": [\"pl\", \"pm\", \"t\", \"pod\", \"perl\"],\n",
+    "            \"PHP\": [\"php\", \"phtml\", \"php4\", \"php3\", \"php5\", \"phps\", \"hack\"],\n",
+    "            \"Python\": [\"py\", \"pyw\", \"pyc\", \"pyo\", \"pyd\", \"python3\", \"Python2\"],\n",
+    "            \"Ruby\": [\"rb\", \"rbw\", \"jruby\", \"yarv\"],\n",
+    "            \"Scala\": [\"scala\"],\n",
+    "            \"Scheme\": [\"scm\", \"ss\", \"racket\"],\n",
+    "            \"Tcl\": [\"tcl\"]}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def read_bench_files():\n",
+    "    files = glob.glob(\"benchmarksgame/benchmarksgame/bench/*/*.*\")\n",
+    "    texts = []\n",
+    "    for file in files:\n",
+    "        ext = get_ext(file.split(\".\")[-1])\n",
+    "        with open(file) as fh:\n",
+    "            if ext != None:\n",
+    "                texts.append((fh.read(), ext))\n",
+    "    return texts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def get_ext(ext):\n",
+    "    for key, value in file_ext.items():\n",
+    "        if ext in value:\n",
+    "            return key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Code</th>\n",
+       "      <th>Language</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>Clojure</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>Clojure</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Code Language\n",
+       "0  /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...        C\n",
+       "1  /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...        C\n",
+       "2  /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...        C\n",
+       "3  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure\n",
+       "4  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = read_bench_files()\n",
+    "data = pd.DataFrame(data, columns = [\"Code\", \"Language\"])\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Ruby           73\n",
+       "C              61\n",
+       "PHP            55\n",
+       "Java           51\n",
+       "Scala          43\n",
+       "C#             41\n",
+       "Clojure        38\n",
+       "Python         36\n",
+       "Common Lisp    34\n",
+       "OCaml          34\n",
+       "Perl           34\n",
+       "Haskell        33\n",
+       "Scheme         29\n",
+       "Javascript     25\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.Language.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0          C\n",
+       "1          C\n",
+       "2          C\n",
+       "3    Clojure\n",
+       "4    Clojure\n",
+       "Name: Language, dtype: object"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y = data.loc[:,\"Language\"]\n",
+    "y.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n",
+       "1    /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n",
+       "2    /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n",
+       "3    ;; The Computer Language Benchmarks Game\\n;; h...\n",
+       "4    ;; The Computer Language Benchmarks Game\\n;; h...\n",
+       "Name: Code, dtype: object"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X = data.loc[:,\"Code\"]\n",
+    "X.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.cross_validation import train_test_split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.feature_extraction.text import CountVectorizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.base import TransformerMixin\n",
+    "class DumbFeaturizer(TransformerMixin):\n",
+    "    def __init__(self):\n",
+    "        pass\n",
+    "    \n",
+    "    def fit(self, X, y=None):\n",
+    "        return self\n",
+    "    \n",
+    "    def transform(self, X):\n",
+    "        return [[1] for _ in X]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.11914893617021277"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())\n",
+    "pipe.fit(X_train, y_train)\n",
+    "pipe.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def char_count(text, char):\n",
+    "    return text.count(char)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def string_count(string, code):\n",
+    "    value = len(re.findall(string, code))\n",
+    "    return value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "keywords = [\"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n",
+    "            \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n",
+    "            \"and\", \"var\", \"loop\", \"array\", \"local\"]\n",
+    "symbols = [\":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(22, 10)"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(keywords), len(symbols)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def add_features():\n",
+    "    df = data\n",
+    "    my_dict = {}\n",
+    "    for index, row in df.iterrows():\n",
+    "        for keyword in keywords:\n",
+    "            value = string_count(keyword, row[\"Code\"])\n",
+    "            if keyword not in my_dict:\n",
+    "                my_dict[keyword] = [value]\n",
+    "            else:\n",
+    "                my_dict[keyword].append(value)\n",
+    "        for symbol in symbols:\n",
+    "            count = char_count(symbol, row[\"Code\"])\n",
+    "            if symbol not in my_dict:\n",
+    "                my_dict[symbol] = [count]\n",
+    "            else:\n",
+    "                my_dict[symbol].append(count)\n",
+    "        \n",
+    "    return my_dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "features = add_features()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "fdf = pd.DataFrame.from_dict(features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "data = fdf.join(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 587 entries, 0 to 586\n",
+      "Data columns (total 34 columns):\n",
+      "#           587 non-null int64\n",
+      "(           587 non-null int64\n",
+      ")           587 non-null int64\n",
+      ",           587 non-null int64\n",
+      ":           587 non-null int64\n",
+      ";           587 non-null int64\n",
+      "[           587 non-null int64\n",
+      "]           587 non-null int64\n",
+      "and         587 non-null int64\n",
+      "array       587 non-null int64\n",
+      "def         587 non-null int64\n",
+      "define      587 non-null int64\n",
+      "elif        587 non-null int64\n",
+      "else        587 non-null int64\n",
+      "float       587 non-null int64\n",
+      "for         587 non-null int64\n",
+      "format      587 non-null int64\n",
+      "function    587 non-null int64\n",
+      "if          587 non-null int64\n",
+      "import      587 non-null int64\n",
+      "int         587 non-null int64\n",
+      "local       587 non-null int64\n",
+      "loop        587 non-null int64\n",
+      "private     587 non-null int64\n",
+      "public      587 non-null int64\n",
+      "return      587 non-null int64\n",
+      "static      587 non-null int64\n",
+      "var         587 non-null int64\n",
+      "void        587 non-null int64\n",
+      "while       587 non-null int64\n",
+      "{           587 non-null int64\n",
+      "}           587 non-null int64\n",
+      "Code        587 non-null object\n",
+      "Language    587 non-null object\n",
+      "dtypes: int64(32), object(2)\n",
+      "memory usage: 160.5+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "y = data.loc[:,(\"Language\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0          C\n",
+       "1          C\n",
+       "2          C\n",
+       "3    Clojure\n",
+       "4    Clojure\n",
+       "Name: Language, dtype: object"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'Series' object has no attribute 'indo'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-50-9f42df1c19c9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   2081\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2082\u001b[0m             raise AttributeError(\"'%s' object has no attribute '%s'\" %\n\u001b[0;32m-> 2083\u001b[0;31m                                  (type(self).__name__, name))\n\u001b[0m\u001b[1;32m   2084\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2085\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'indo'"
+     ]
+    }
+   ],
+   "source": [
+    "y.indo()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "X = data.loc[:, (\"Code\", \"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n",
+    "            \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n",
+    "            \"and\", \"var\", \"loop\", \"array\", \"local\", \":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Code</th>\n",
+       "      <th>public</th>\n",
+       "      <th>private</th>\n",
+       "      <th>static</th>\n",
+       "      <th>if</th>\n",
+       "      <th>else</th>\n",
+       "      <th>elif</th>\n",
+       "      <th>def</th>\n",
+       "      <th>void</th>\n",
+       "      <th>int</th>\n",
+       "      <th>...</th>\n",
+       "      <th>:</th>\n",
+       "      <th>;</th>\n",
+       "      <th>{</th>\n",
+       "      <th>}</th>\n",
+       "      <th>(</th>\n",
+       "      <th>)</th>\n",
+       "      <th>#</th>\n",
+       "      <th>[</th>\n",
+       "      <th>]</th>\n",
+       "      <th>,</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>25</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>1</td>\n",
+       "      <td>21</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>29</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>12</td>\n",
+       "      <td>1</td>\n",
+       "      <td>22</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>26</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>1</td>\n",
+       "      <td>25</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>11</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>9</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 33 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Code  public  private  static  \\\n",
+       "0  /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...       0        0       1   \n",
+       "1  /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...       0        0       1   \n",
+       "2  /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...       0        0       3   \n",
+       "3  ;; The Computer Language Benchmarks Game\\n;; h...       0        0       0   \n",
+       "4  ;; The Computer Language Benchmarks Game\\n;; h...       0        0       0   \n",
+       "\n",
+       "   if  else  elif  def  void  int ...  :  ;  {  }  (  )  #  [  ]  ,  \n",
+       "0  25     2     0    8     1   21 ...  0  0  0  0  0  0  0  0  0  0  \n",
+       "1  29     3     0   12     1   22 ...  0  0  0  0  0  0  0  0  0  0  \n",
+       "2  26     2     0   13     1   25 ...  0  0  0  0  0  0  0  0  0  0  \n",
+       "3   8     0     0   11     0    4 ...  0  0  0  0  0  0  0  0  0  0  \n",
+       "4  11     0     0    9     0    4 ...  0  0  0  0  0  0  0  0  0  0  \n",
+       "\n",
+       "[5 rows x 33 columns]"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 587 entries, 0 to 586\n",
+      "Data columns (total 33 columns):\n",
+      "Code        587 non-null object\n",
+      "public      587 non-null int64\n",
+      "private     587 non-null int64\n",
+      "static      587 non-null int64\n",
+      "if          587 non-null int64\n",
+      "else        587 non-null int64\n",
+      "elif        587 non-null int64\n",
+      "def         587 non-null int64\n",
+      "void        587 non-null int64\n",
+      "int         587 non-null int64\n",
+      "float       587 non-null int64\n",
+      "for         587 non-null int64\n",
+      "while       587 non-null int64\n",
+      "import      587 non-null int64\n",
+      "define      587 non-null int64\n",
+      "function    587 non-null int64\n",
+      "return      587 non-null int64\n",
+      "format      587 non-null int64\n",
+      "and         587 non-null int64\n",
+      "var         587 non-null int64\n",
+      "loop        587 non-null int64\n",
+      "array       587 non-null int64\n",
+      "local       587 non-null int64\n",
+      ":           587 non-null int64\n",
+      ";           587 non-null int64\n",
+      "{           587 non-null int64\n",
+      "}           587 non-null int64\n",
+      "(           587 non-null int64\n",
+      ")           587 non-null int64\n",
+      "#           587 non-null int64\n",
+      "[           587 non-null int64\n",
+      "]           587 non-null int64\n",
+      ",           587 non-null int64\n",
+      "dtypes: int64(32), object(1)\n",
+      "memory usage: 155.9+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "X.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "Number of labels=352 does not match number of samples=33",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-54-5860e4dcbc10>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.40\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mpipe\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_pipeline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDumbFeaturizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDecisionTreeClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpipe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0mpipe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m    139\u001b[0m         \"\"\"\n\u001b[1;32m    140\u001b[0m         \u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pre_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 141\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    142\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/tree/tree.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[1;32m    219\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mn_samples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    220\u001b[0m             raise ValueError(\"Number of labels=%d does not match \"\n\u001b[0;32m--> 221\u001b[0;31m                              \"number of samples=%d\" % (len(y), n_samples))\n\u001b[0m\u001b[1;32m    222\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin_samples_split\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    223\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"min_samples_split must be greater than zero.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mValueError\u001b[0m: Number of labels=352 does not match number of samples=33"
+     ]
+    }
+   ],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)\n",
+    "pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())\n",
+    "pipe.fit(X_train, y_train)\n",
+    "pipe.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

From f4c64c5ad80aca816b38ccc79b9c23c577bb2d05 Mon Sep 17 00:00:00 2001
From: Manish Patel <maddypatel@gmail.com>
Date: Mon, 8 Jun 2015 14:49:17 -0400
Subject: [PATCH 3/3] Added file after help from Clinton

---
 Untitled.ipynb | 762 ++++++++++++++++++-------------------------------
 1 file changed, 282 insertions(+), 480 deletions(-)

diff --git a/Untitled.ipynb b/Untitled.ipynb
index fef59ba..35b4d17 100644
--- a/Untitled.ipynb
+++ b/Untitled.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 60,
    "metadata": {
     "collapsed": true
    },
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 61,
    "metadata": {
     "collapsed": true
    },
@@ -40,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 62,
    "metadata": {
     "collapsed": true
    },
@@ -59,7 +59,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 63,
    "metadata": {
     "collapsed": true
    },
@@ -73,7 +73,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 64,
    "metadata": {
     "collapsed": false
    },
@@ -129,7 +129,7 @@
        "4  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 64,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -142,7 +142,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 65,
    "metadata": {
     "collapsed": false
    },
@@ -167,7 +167,7 @@
        "dtype: int64"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 65,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -178,7 +178,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 66,
    "metadata": {
     "collapsed": false
    },
@@ -194,7 +194,7 @@
        "Name: Language, dtype: object"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 66,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -206,35 +206,69 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 83,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Code</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "0    /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n",
-       "1    /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n",
-       "2    /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n",
-       "3    ;; The Computer Language Benchmarks Game\\n;; h...\n",
-       "4    ;; The Computer Language Benchmarks Game\\n;; h...\n",
-       "Name: Code, dtype: object"
+       "                                                Code\n",
+       "0  /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n",
+       "1  /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n",
+       "2  /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n",
+       "3  ;; The Computer Language Benchmarks Game\\n;; h...\n",
+       "4  ;; The Computer Language Benchmarks Game\\n;; h..."
       ]
      },
-     "execution_count": 8,
+     "execution_count": 83,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "X = data.loc[:,\"Code\"]\n",
+    "X = data.loc[:,[\"Code\"]]\n",
     "X.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 84,
    "metadata": {
     "collapsed": false
    },
@@ -246,73 +280,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 85,
    "metadata": {
     "collapsed": false
    },
    "outputs": [],
    "source": [
     "from sklearn.pipeline import make_pipeline\n",
-    "from sklearn.feature_extraction.text import CountVectorizer"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "from sklearn.base import TransformerMixin\n",
-    "class DumbFeaturizer(TransformerMixin):\n",
-    "    def __init__(self):\n",
-    "        pass\n",
-    "    \n",
-    "    def fit(self, X, y=None):\n",
-    "        return self\n",
-    "    \n",
-    "    def transform(self, X):\n",
-    "        return [[1] for _ in X]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0.11914893617021277"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from sklearn.tree import DecisionTreeClassifier\n",
-    "pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())\n",
-    "pipe.fit(X_train, y_train)\n",
-    "pipe.score(X_test, y_test)"
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.base import TransformerMixin"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 86,
    "metadata": {
     "collapsed": false
    },
@@ -323,273 +304,141 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 92,
    "metadata": {
     "collapsed": true
    },
    "outputs": [],
    "source": [
-    "def char_count(text, char):\n",
-    "    return text.count(char)"
+    "def char_count(char, code):  # how many times `char` occurs in `code`\n",
+    "    return code.count(char)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 101,
    "metadata": {
     "collapsed": true
    },
    "outputs": [],
    "source": [
-    "def string_count(string, code):\n",
-    "    value = len(re.findall(string, code))\n",
-    "    return value"
+    "def char_percent(char, code):  # occurrences of `char` divided by len(code); 0.0 when `code` is empty\n",
+    "    return code.count(char) / len(code) if code else 0.0"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 102,
    "metadata": {
     "collapsed": true
    },
    "outputs": [],
    "source": [
-    "keywords = [\"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n",
-    "            \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n",
-    "            \"and\", \"var\", \"loop\", \"array\", \"local\"]\n",
-    "symbols = [\":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(22, 10)"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(keywords), len(symbols)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "def add_features():\n",
-    "    df = data\n",
-    "    my_dict = {}\n",
-    "    for index, row in df.iterrows():\n",
-    "        for keyword in keywords:\n",
-    "            value = string_count(keyword, row[\"Code\"])\n",
-    "            if keyword not in my_dict:\n",
-    "                my_dict[keyword] = [value]\n",
-    "            else:\n",
-    "                my_dict[keyword].append(value)\n",
-    "        for symbol in symbols:\n",
-    "            count = char_count(symbol, row[\"Code\"])\n",
-    "            if symbol not in my_dict:\n",
-    "                my_dict[symbol] = [count]\n",
-    "            else:\n",
-    "                my_dict[symbol].append(count)\n",
-    "        \n",
-    "    return my_dict"
+    "def string_count(string, code):  # number of regex matches of `string` in `code` (substrings count too)\n",
+    "    matches = re.findall(string, code)\n",
+    "    return len(matches)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 103,
    "metadata": {
     "collapsed": true
    },
    "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "features = add_features()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {
-    "collapsed": false,
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "fdf = pd.DataFrame.from_dict(features)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
    "source": [
-    "data = fdf.join(data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {
-    "collapsed": false,
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Int64Index: 587 entries, 0 to 586\n",
-      "Data columns (total 34 columns):\n",
-      "#           587 non-null int64\n",
-      "(           587 non-null int64\n",
-      ")           587 non-null int64\n",
-      ",           587 non-null int64\n",
-      ":           587 non-null int64\n",
-      ";           587 non-null int64\n",
-      "[           587 non-null int64\n",
-      "]           587 non-null int64\n",
-      "and         587 non-null int64\n",
-      "array       587 non-null int64\n",
-      "def         587 non-null int64\n",
-      "define      587 non-null int64\n",
-      "elif        587 non-null int64\n",
-      "else        587 non-null int64\n",
-      "float       587 non-null int64\n",
-      "for         587 non-null int64\n",
-      "format      587 non-null int64\n",
-      "function    587 non-null int64\n",
-      "if          587 non-null int64\n",
-      "import      587 non-null int64\n",
-      "int         587 non-null int64\n",
-      "local       587 non-null int64\n",
-      "loop        587 non-null int64\n",
-      "private     587 non-null int64\n",
-      "public      587 non-null int64\n",
-      "return      587 non-null int64\n",
-      "static      587 non-null int64\n",
-      "var         587 non-null int64\n",
-      "void        587 non-null int64\n",
-      "while       587 non-null int64\n",
-      "{           587 non-null int64\n",
-      "}           587 non-null int64\n",
-      "Code        587 non-null object\n",
-      "Language    587 non-null object\n",
-      "dtypes: int64(32), object(2)\n",
-      "memory usage: 160.5+ KB\n"
-     ]
-    }
-   ],
-   "source": [
-    "data.info()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "metadata": {
-    "collapsed": false,
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "y = data.loc[:,(\"Language\")]"
+    "class CodeVectorizer(TransformerMixin):  # maps a DataFrame with a \"Code\" column to keyword/symbol features\n",
+    "    def __init__(self):\n",
+    "        self.keywords = [\"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n",
+    "                         \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n",
+    "                         \"and\", \"var\", \"loop\", \"array\", \"local\"]\n",
+    "        self.symbols = [\":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\"]\n",
+    "    \n",
+    "    def fit(self, X, y=None):  # stateless: nothing is learned from X\n",
+    "        return self\n",
+    "    \n",
+    "    def transform(self, X):  # returns a DataFrame: one row per sample, one column per keyword/symbol\n",
+    "        feature_list = []\n",
+    "        for code in X[\"Code\"]:\n",
+    "            features = {}\n",
+    "            for keyword in self.keywords:  # use the instance lists, not notebook globals\n",
+    "                features[keyword] = string_count(keyword, code)\n",
+    "            for symbol in self.symbols:\n",
+    "                features[symbol] = char_percent(symbol, code)\n",
+    "            feature_list.append(features)\n",
+    "        return pd.DataFrame(feature_list)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 104,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Code</th>\n",
+       "      <th>Language</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>Clojure</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>Clojure</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "0          C\n",
-       "1          C\n",
-       "2          C\n",
-       "3    Clojure\n",
-       "4    Clojure\n",
-       "Name: Language, dtype: object"
+       "                                                Code Language\n",
+       "0  /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...        C\n",
+       "1  /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...        C\n",
+       "2  /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...        C\n",
+       "3  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure\n",
+       "4  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure"
       ]
      },
-     "execution_count": 47,
+     "execution_count": 104,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "y.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 50,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "ename": "AttributeError",
-     "evalue": "'Series' object has no attribute 'indo'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-50-9f42df1c19c9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   2081\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2082\u001b[0m             raise AttributeError(\"'%s' object has no attribute '%s'\" %\n\u001b[0;32m-> 2083\u001b[0;31m                                  (type(self).__name__, name))\n\u001b[0m\u001b[1;32m   2084\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2085\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'indo'"
-     ]
-    }
-   ],
-   "source": [
-    "y.indo()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 51,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "X = data.loc[:, (\"Code\", \"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n",
-    "            \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n",
-    "            \"and\", \"var\", \"loop\", \"array\", \"local\", \":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\")]"
+    "data.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 105,
    "metadata": {
     "collapsed": false
    },
@@ -602,114 +451,114 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>Code</th>\n",
-       "      <th>public</th>\n",
+       "      <th>#</th>\n",
+       "      <th>(</th>\n",
+       "      <th>)</th>\n",
+       "      <th>,</th>\n",
+       "      <th>:</th>\n",
+       "      <th>;</th>\n",
+       "      <th>[</th>\n",
+       "      <th>]</th>\n",
+       "      <th>and</th>\n",
+       "      <th>array</th>\n",
+       "      <th>...</th>\n",
+       "      <th>loop</th>\n",
        "      <th>private</th>\n",
+       "      <th>public</th>\n",
+       "      <th>return</th>\n",
        "      <th>static</th>\n",
-       "      <th>if</th>\n",
-       "      <th>else</th>\n",
-       "      <th>elif</th>\n",
-       "      <th>def</th>\n",
+       "      <th>var</th>\n",
        "      <th>void</th>\n",
-       "      <th>int</th>\n",
-       "      <th>...</th>\n",
-       "      <th>:</th>\n",
-       "      <th>;</th>\n",
+       "      <th>while</th>\n",
        "      <th>{</th>\n",
        "      <th>}</th>\n",
-       "      <th>(</th>\n",
-       "      <th>)</th>\n",
-       "      <th>#</th>\n",
-       "      <th>[</th>\n",
-       "      <th>]</th>\n",
-       "      <th>,</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.004087</td>\n",
+       "      <td>0.017938</td>\n",
+       "      <td>0.017938</td>\n",
+       "      <td>0.010899</td>\n",
+       "      <td>0.002044</td>\n",
+       "      <td>0.020209</td>\n",
+       "      <td>0.002271</td>\n",
+       "      <td>0.002271</td>\n",
        "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>25</td>\n",
-       "      <td>2</td>\n",
        "      <td>0</td>\n",
-       "      <td>8</td>\n",
-       "      <td>1</td>\n",
-       "      <td>21</td>\n",
        "      <td>...</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1</td>\n",
        "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.006585</td>\n",
+       "      <td>0.006585</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>0.004470</td>\n",
+       "      <td>0.020019</td>\n",
+       "      <td>0.020019</td>\n",
+       "      <td>0.011273</td>\n",
+       "      <td>0.001944</td>\n",
+       "      <td>0.018465</td>\n",
+       "      <td>0.002332</td>\n",
+       "      <td>0.002332</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>29</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>22</td>\n",
        "      <td>...</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
+       "      <td>18</td>\n",
+       "      <td>1</td>\n",
        "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>0.006414</td>\n",
+       "      <td>0.006414</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.005647</td>\n",
+       "      <td>0.015586</td>\n",
+       "      <td>0.015586</td>\n",
+       "      <td>0.010843</td>\n",
+       "      <td>0.002259</td>\n",
+       "      <td>0.017619</td>\n",
+       "      <td>0.002259</td>\n",
+       "      <td>0.002259</td>\n",
        "      <td>0</td>\n",
-       "      <td>3</td>\n",
-       "      <td>26</td>\n",
-       "      <td>2</td>\n",
        "      <td>0</td>\n",
-       "      <td>13</td>\n",
-       "      <td>1</td>\n",
-       "      <td>25</td>\n",
        "      <td>...</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>3</td>\n",
        "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.005421</td>\n",
+       "      <td>0.005421</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>8</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>11</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.035950</td>\n",
+       "      <td>0.035950</td>\n",
+       "      <td>0.000826</td>\n",
+       "      <td>0.002066</td>\n",
+       "      <td>0.009091</td>\n",
+       "      <td>0.008264</td>\n",
+       "      <td>0.008264</td>\n",
+       "      <td>1</td>\n",
        "      <td>0</td>\n",
-       "      <td>4</td>\n",
        "      <td>...</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
@@ -719,22 +568,23 @@
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>11</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>9</td>\n",
+       "      <td>0.000386</td>\n",
+       "      <td>0.035852</td>\n",
+       "      <td>0.035852</td>\n",
+       "      <td>0.000771</td>\n",
+       "      <td>0.002313</td>\n",
+       "      <td>0.008096</td>\n",
+       "      <td>0.007710</td>\n",
+       "      <td>0.007710</td>\n",
+       "      <td>2</td>\n",
        "      <td>0</td>\n",
-       "      <td>4</td>\n",
        "      <td>...</td>\n",
+       "      <td>2</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
@@ -742,161 +592,113 @@
        "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>5 rows × 33 columns</p>\n",
+       "<p>5 rows × 32 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                                Code  public  private  static  \\\n",
-       "0  /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...       0        0       1   \n",
-       "1  /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...       0        0       1   \n",
-       "2  /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...       0        0       3   \n",
-       "3  ;; The Computer Language Benchmarks Game\\n;; h...       0        0       0   \n",
-       "4  ;; The Computer Language Benchmarks Game\\n;; h...       0        0       0   \n",
+       "          #         (         )         ,         :         ;         [  \\\n",
+       "0  0.004087  0.017938  0.017938  0.010899  0.002044  0.020209  0.002271   \n",
+       "1  0.004470  0.020019  0.020019  0.011273  0.001944  0.018465  0.002332   \n",
+       "2  0.005647  0.015586  0.015586  0.010843  0.002259  0.017619  0.002259   \n",
+       "3  0.000000  0.035950  0.035950  0.000826  0.002066  0.009091  0.008264   \n",
+       "4  0.000386  0.035852  0.035852  0.000771  0.002313  0.008096  0.007710   \n",
+       "\n",
+       "          ]  and  array    ...     loop  private  public  return  static  var  \\\n",
+       "0  0.002271    0      0    ...        0        0       0      15       1    0   \n",
+       "1  0.002332    0      0    ...        0        0       0      18       1    0   \n",
+       "2  0.002259    0      0    ...        0        0       0      13       3    0   \n",
+       "3  0.008264    1      0    ...        0        0       0       0       0    0   \n",
+       "4  0.007710    2      0    ...        2        0       0       0       0    0   \n",
        "\n",
-       "   if  else  elif  def  void  int ...  :  ;  {  }  (  )  #  [  ]  ,  \n",
-       "0  25     2     0    8     1   21 ...  0  0  0  0  0  0  0  0  0  0  \n",
-       "1  29     3     0   12     1   22 ...  0  0  0  0  0  0  0  0  0  0  \n",
-       "2  26     2     0   13     1   25 ...  0  0  0  0  0  0  0  0  0  0  \n",
-       "3   8     0     0   11     0    4 ...  0  0  0  0  0  0  0  0  0  0  \n",
-       "4  11     0     0    9     0    4 ...  0  0  0  0  0  0  0  0  0  0  \n",
+       "   void  while         {         }  \n",
+       "0     1      5  0.006585  0.006585  \n",
+       "1     1      6  0.006414  0.006414  \n",
+       "2     1      5  0.005421  0.005421  \n",
+       "3     0      0  0.000000  0.000000  \n",
+       "4     0      0  0.000000  0.000000  \n",
        "\n",
-       "[5 rows x 33 columns]"
+       "[5 rows x 32 columns]"
       ]
      },
-     "execution_count": 52,
+     "execution_count": 105,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "X.head()"
+    "cv = CodeVectorizer()\n",
+    "cv.fit(data)\n",
+    "cv.transform(data).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "y = data.loc[:,(\"Language\")]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 107,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Int64Index: 587 entries, 0 to 586\n",
-      "Data columns (total 33 columns):\n",
-      "Code        587 non-null object\n",
-      "public      587 non-null int64\n",
-      "private     587 non-null int64\n",
-      "static      587 non-null int64\n",
-      "if          587 non-null int64\n",
-      "else        587 non-null int64\n",
-      "elif        587 non-null int64\n",
-      "def         587 non-null int64\n",
-      "void        587 non-null int64\n",
-      "int         587 non-null int64\n",
-      "float       587 non-null int64\n",
-      "for         587 non-null int64\n",
-      "while       587 non-null int64\n",
-      "import      587 non-null int64\n",
-      "define      587 non-null int64\n",
-      "function    587 non-null int64\n",
-      "return      587 non-null int64\n",
-      "format      587 non-null int64\n",
-      "and         587 non-null int64\n",
-      "var         587 non-null int64\n",
-      "loop        587 non-null int64\n",
-      "array       587 non-null int64\n",
-      "local       587 non-null int64\n",
-      ":           587 non-null int64\n",
-      ";           587 non-null int64\n",
-      "{           587 non-null int64\n",
-      "}           587 non-null int64\n",
-      "(           587 non-null int64\n",
-      ")           587 non-null int64\n",
-      "#           587 non-null int64\n",
-      "[           587 non-null int64\n",
-      "]           587 non-null int64\n",
-      ",           587 non-null int64\n",
-      "dtypes: int64(32), object(1)\n",
-      "memory usage: 155.9+ KB\n"
-     ]
+     "data": {
+      "text/plain": [
+       "0          C\n",
+       "1          C\n",
+       "2          C\n",
+       "3    Clojure\n",
+       "4    Clojure\n",
+       "Name: Language, dtype: object"
+      ]
+     },
+     "execution_count": 107,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "X.info()"
+    "y.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 108,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
     {
-     "ename": "ValueError",
-     "evalue": "Number of labels=352 does not match number of samples=33",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-54-5860e4dcbc10>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.40\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mpipe\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_pipeline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDumbFeaturizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDecisionTreeClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpipe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0mpipe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m    139\u001b[0m         \"\"\"\n\u001b[1;32m    140\u001b[0m         \u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pre_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 141\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    142\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/tree/tree.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[1;32m    219\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mn_samples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    220\u001b[0m             raise ValueError(\"Number of labels=%d does not match \"\n\u001b[0;32m--> 221\u001b[0;31m                              \"number of samples=%d\" % (len(y), n_samples))\n\u001b[0m\u001b[1;32m    222\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin_samples_split\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    223\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"min_samples_split must be greater than zero.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mValueError\u001b[0m: Number of labels=352 does not match number of samples=33"
-     ]
+     "data": {
+      "text/plain": [
+       "0.87234042553191493"
+      ]
+     },
+     "execution_count": 108,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)\n",
-    "pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())\n",
+    "pipe = make_pipeline(CodeVectorizer(), DecisionTreeClassifier())\n",
     "pipe.fit(X_train, y_train)\n",
     "pipe.score(X_test, y_test)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {