diff --git a/Untitled.ipynb b/Untitled.ipynb
new file mode 100644
index 0000000..35b4d17
--- /dev/null
+++ b/Untitled.ipynb
@@ -0,0 +1,725 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import glob"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "file_ext = {\"C\": [\"gcc\", \"c\", \"h\"],\n",
+    "            \"C#\": [\"csharp\"],\n",
+    "            \"Clojure\": [\"clj\", \"cljs\", \"cljs\", \"edn\", \"clojure\"],\n",
+    "            \"Common Lisp\": [\"sbcl\"],\n",
+    "            \"Haskell\": [\"hs\", \"lhs\", \"ghc\"],\n",
+    "            \"Java\": [\"java\", \"class\", \"jar\"],\n",
+    "            \"Javascript\": [\"js\", \"javascript\"],\n",
+    "            \"OCaml\": [\"ocaml\", \"ml\"],\n",
+    "            \"Perl\": [\"pl\", \"pm\", \"t\", \"pod\", \"perl\"],\n",
+    "            \"PHP\": [\"php\", \"phtml\", \"php4\", \"php3\", \"php5\", \"phps\", \"hack\"],\n",
+    "            \"Python\": [\"py\", \"pyw\", \"pyc\", \"pyo\", \"pyd\", \"python3\", \"Python2\"],\n",
+    "            \"Ruby\": [\"rb\", \"rbw\", \"jruby\", \"yarv\"],\n",
+    "            \"Scala\": [\"scala\"],\n",
+    "            \"Scheme\": [\"scm\", \"ss\", \"racket\"],\n",
+    "            \"Tcl\": [\"tcl\"]}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def read_bench_files():\n",
+    "    files = glob.glob(\"benchmarksgame/benchmarksgame/bench/*/*.*\")\n",
+    "    texts = []\n",
+    "    for file in files:\n",
+    "        ext = get_ext(file.split(\".\")[-1])\n",
+    "        with open(file) as fh:\n",
+    "            if ext != None:\n",
+    "                texts.append((fh.read(), ext))\n",
+    "    return texts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def get_ext(ext):\n",
+    "    for key, value in file_ext.items():\n",
+    "        if ext in value:\n",
+    "            return key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Code</th>\n",
+       "      <th>Language</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>Clojure</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>Clojure</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Code Language\n",
+       "0  /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...        C\n",
+       "1  /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...        C\n",
+       "2  /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...        C\n",
+       "3  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure\n",
+       "4  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = read_bench_files()\n",
+    "data = pd.DataFrame(data, columns = [\"Code\", \"Language\"])\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Ruby           73\n",
+       "C              61\n",
+       "PHP            55\n",
+       "Java           51\n",
+       "Scala          43\n",
+       "C#             41\n",
+       "Clojure        38\n",
+       "Python         36\n",
+       "Common Lisp    34\n",
+       "OCaml          34\n",
+       "Perl           34\n",
+       "Haskell        33\n",
+       "Scheme         29\n",
+       "Javascript     25\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.Language.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0          C\n",
+       "1          C\n",
+       "2          C\n",
+       "3    Clojure\n",
+       "4    Clojure\n",
+       "Name: Language, dtype: object"
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y = data.loc[:,\"Language\"]\n",
+    "y.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Code</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Code\n",
+       "0  /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n",
+       "1  /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n",
+       "2  /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n",
+       "3  ;; The Computer Language Benchmarks Game\\n;; h...\n",
+       "4  ;; The Computer Language Benchmarks Game\\n;; h..."
+      ]
+     },
+     "execution_count": 83,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X = data.loc[:,[\"Code\"]]\n",
+    "X.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.cross_validation import train_test_split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.base import TransformerMixin"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def char_count(char, code):\n",
+    "    return code.count(char)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def char_percent(char, code):\n",
+    "    return code.count(char) / len(code)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def string_count(string, code):\n",
+    "    value = len(re.findall(string, code))\n",
+    "    return value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "class CodeVectorizer(TransformerMixin):\n",
+    "    def __init__(self):\n",
+    "        self.keywords = [\"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n",
+    "                         \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n",
+    "                         \"and\", \"var\", \"loop\", \"array\", \"local\"]\n",
+    "        self.symbols = [\":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\"]\n",
+    "    \n",
+    "    def fit(self, X, y=None):\n",
+    "        return self\n",
+    "    \n",
+    "    def transform(self, X):\n",
+    "        feature_list = []\n",
+    "        for code in X[\"Code\"]:\n",
+    "            features = {}\n",
+    "            for keyword in keywords:\n",
+    "                features[keyword] = string_count(keyword, code)\n",
+    "            for symbol in symbols:\n",
+    "                features[symbol] = char_percent(symbol, code)\n",
+    "            feature_list.append(features)\n",
+    "        return pd.DataFrame(feature_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Code</th>\n",
+       "      <th>Language</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>Clojure</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>;; The Computer Language Benchmarks Game\\n;; h...</td>\n",
+       "      <td>Clojure</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Code Language\n",
+       "0  /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...        C\n",
+       "1  /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...        C\n",
+       "2  /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...        C\n",
+       "3  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure\n",
+       "4  ;; The Computer Language Benchmarks Game\\n;; h...  Clojure"
+      ]
+     },
+     "execution_count": 104,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>#</th>\n",
+       "      <th>(</th>\n",
+       "      <th>)</th>\n",
+       "      <th>,</th>\n",
+       "      <th>:</th>\n",
+       "      <th>;</th>\n",
+       "      <th>[</th>\n",
+       "      <th>]</th>\n",
+       "      <th>and</th>\n",
+       "      <th>array</th>\n",
+       "      <th>...</th>\n",
+       "      <th>loop</th>\n",
+       "      <th>private</th>\n",
+       "      <th>public</th>\n",
+       "      <th>return</th>\n",
+       "      <th>static</th>\n",
+       "      <th>var</th>\n",
+       "      <th>void</th>\n",
+       "      <th>while</th>\n",
+       "      <th>{</th>\n",
+       "      <th>}</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.004087</td>\n",
+       "      <td>0.017938</td>\n",
+       "      <td>0.017938</td>\n",
+       "      <td>0.010899</td>\n",
+       "      <td>0.002044</td>\n",
+       "      <td>0.020209</td>\n",
+       "      <td>0.002271</td>\n",
+       "      <td>0.002271</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.006585</td>\n",
+       "      <td>0.006585</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.004470</td>\n",
+       "      <td>0.020019</td>\n",
+       "      <td>0.020019</td>\n",
+       "      <td>0.011273</td>\n",
+       "      <td>0.001944</td>\n",
+       "      <td>0.018465</td>\n",
+       "      <td>0.002332</td>\n",
+       "      <td>0.002332</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>18</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>0.006414</td>\n",
+       "      <td>0.006414</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.005647</td>\n",
+       "      <td>0.015586</td>\n",
+       "      <td>0.015586</td>\n",
+       "      <td>0.010843</td>\n",
+       "      <td>0.002259</td>\n",
+       "      <td>0.017619</td>\n",
+       "      <td>0.002259</td>\n",
+       "      <td>0.002259</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.005421</td>\n",
+       "      <td>0.005421</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.035950</td>\n",
+       "      <td>0.035950</td>\n",
+       "      <td>0.000826</td>\n",
+       "      <td>0.002066</td>\n",
+       "      <td>0.009091</td>\n",
+       "      <td>0.008264</td>\n",
+       "      <td>0.008264</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.000386</td>\n",
+       "      <td>0.035852</td>\n",
+       "      <td>0.035852</td>\n",
+       "      <td>0.000771</td>\n",
+       "      <td>0.002313</td>\n",
+       "      <td>0.008096</td>\n",
+       "      <td>0.007710</td>\n",
+       "      <td>0.007710</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 32 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          #         (         )         ,         :         ;         [  \\\n",
+       "0  0.004087  0.017938  0.017938  0.010899  0.002044  0.020209  0.002271   \n",
+       "1  0.004470  0.020019  0.020019  0.011273  0.001944  0.018465  0.002332   \n",
+       "2  0.005647  0.015586  0.015586  0.010843  0.002259  0.017619  0.002259   \n",
+       "3  0.000000  0.035950  0.035950  0.000826  0.002066  0.009091  0.008264   \n",
+       "4  0.000386  0.035852  0.035852  0.000771  0.002313  0.008096  0.007710   \n",
+       "\n",
+       "          ]  and  array    ...     loop  private  public  return  static  var  \\\n",
+       "0  0.002271    0      0    ...        0        0       0      15       1    0   \n",
+       "1  0.002332    0      0    ...        0        0       0      18       1    0   \n",
+       "2  0.002259    0      0    ...        0        0       0      13       3    0   \n",
+       "3  0.008264    1      0    ...        0        0       0       0       0    0   \n",
+       "4  0.007710    2      0    ...        2        0       0       0       0    0   \n",
+       "\n",
+       "   void  while         {         }  \n",
+       "0     1      5  0.006585  0.006585  \n",
+       "1     1      6  0.006414  0.006414  \n",
+       "2     1      5  0.005421  0.005421  \n",
+       "3     0      0  0.000000  0.000000  \n",
+       "4     0      0  0.000000  0.000000  \n",
+       "\n",
+       "[5 rows x 32 columns]"
+      ]
+     },
+     "execution_count": 105,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cv = CodeVectorizer()\n",
+    "cv.fit(data)\n",
+    "cv.transform(data).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "y = data.loc[:,(\"Language\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0          C\n",
+       "1          C\n",
+       "2          C\n",
+       "3    Clojure\n",
+       "4    Clojure\n",
+       "Name: Language, dtype: object"
+      ]
+     },
+     "execution_count": 107,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.87234042553191493"
+      ]
+     },
+     "execution_count": 108,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)\n",
+    "pipe = make_pipeline(CodeVectorizer(), DecisionTreeClassifier())\n",
+    "pipe.fit(X_train, y_train)\n",
+    "pipe.score(X_test, y_test)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/read_data.py b/read_data.py
new file mode 100644
index 0000000..416df47
--- /dev/null
+++ b/read_data.py
@@ -0,0 +1,39 @@
+import numpy as np
+import pandas as pd
+import glob
+
+file_ext = {"C": ["gcc", "c", "h"],
+            "C#": ["csharp"],
+            "Clojure": ["clj", "cljs", "cljs", "edn", "clojure"],
+            "Common Lisp": ["sbcl"],
+            "Haskell": ["hs", "lhs", "ghc"],
+            "Java": ["java", "class", "jar"],
+            "Javascript": ["js", "javascript"],
+            "OCaml": ["ocaml", "ml"],
+            "Perl": ["pl", "pm", "t", "pod", "perl"],
+            "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"],
+            "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"],
+            "Ruby": ["rb", "rbw", "jruby", "yarv"],
+            "Scala": ["scala"],
+            "Scheme": ["scm", "ss", "racket"],
+            "Tcl": ["tcl"]}
+
+def read_bench_files():
+    files = glob.glob("benchmarksgame/benchmarksgame/bench/*/*.*")
+    texts = []
+    for file in files:
+        ext = get_ext(file.split(".")[-1])
+        with open(file) as fh:
+            if ext != None:
+                texts.append((fh.read(), ext))
+    return texts
+
+def get_ext(ext):
+    for key, value in file_ext.items():
+        if ext in value:
+            return key
+
+
+data = read_bench_files()
+data = pd.DataFrame(data, columns = ["code", "language"])
+print(data)

	Code	Language
0	/\\n $Id: simple_hash.h,v 1.1 2013/01/02 04:...	C
1	/\\n $Id: simple_hash2.h,v 1.1 2013/01/02 04...	C
2	/\\n $Id: simple_hash3.h,v 1.1 2013/01/02 04...	C
3	;; The Computer Language Benchmarks Game\\n;; h...	Clojure
4	;; The Computer Language Benchmarks Game\\n;; h...	Clojure
	#	(	)	,	:	;	[	]	and	...	loop	return	static	void	while	{	}
0	0.004087	0.017938	0.017938	0.010899	0.002044	0.020209	0.002271	0.002271	0	...	0	15	1	1	5	0.006585	0.006585
1	0.004470	0.020019	0.020019	0.011273	0.001944	0.018465	0.002332	0.002332	0	...	0	18	1	1	6	0.006414	0.006414
2	0.005647	0.015586	0.015586	0.010843	0.002259	0.017619	0.002259	0.002259	0	...	0	13	3	1	5	0.005421	0.005421
3	0.000000	0.035950	0.035950	0.000826	0.002066	0.009091	0.008264	0.008264	1	...	0	0	0	0	0	0.000000	0.000000
4	0.000386	0.035852	0.035852	0.000771	0.002313	0.008096	0.007710	0.007710	2	...	2	0	0	0	0	0.000000	0.000000