diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..94840b3 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +layout python3 diff --git a/.gitignore b/.gitignore index f00dbf2..a7c3822 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,8 @@ docs/_build/ # PyBuilder target/ +.DS_store +benchmarksgame-2014-08-31/ + +.direnv/ +test_w_ext/ diff --git a/Lang_classifier_use.ipynb b/Lang_classifier_use.ipynb new file mode 100644 index 0000000..d66060b --- /dev/null +++ b/Lang_classifier_use.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from lclassifier.lclassifier import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demonstration of Language Classifier (lclassifier)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "acceptable_file(\"py\") # testing that import is functional" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'py'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_ext(\"python3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " total samples 931\n", + " number of usable files 656\n", + " \n", + " number of read file types: 32\n", + " number of recognized types: 14\n", + " summary of tile types\n", + "ats \n", + "clj 38 \n", + "cs 41 \n", + "dart \n", + "erlang \n", + "fpascal \n", + "fsharp \n", + "c 129 \n", + "hs 33 \n", + "gnat \n", + "go \n", + "php 55 \n", + "ifc \n", + 
"java 51 \n", + "js 25 \n", + "ruby 73 \n", + "lua \n", + "ocaml 35 \n", + "oz \n", + "pl 34 \n", + "py 36 \n", + "racket 29 \n", + "rust \n", + "sbcl 34 \n", + "scala 43 \n", + "vw \n", + "cint \n", + "javasteady \n", + "parrot \n", + "cc \n", + "txt \n", + "ozf \n", + " not included: \n", + " \n" + ] + } + ], + "source": [ + "filelist, testlist = load_file_names()\n", + "contents, ltype, testcont = load_files(filelist, testlist)\n", + "\n", + "plist = [fit2, fit3, fit4, fit5, fit6]\n", + "\n", + "X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pipe = fit6(Xt, yt)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "testing set outcomes\n", + "scala scala \n", + "c c \n", + "c c \n", + "clj clj \n", + "java java \n", + "py py \n", + "clj clj \n", + "js js \n", + "c c \n", + "pl pl \n", + "cs cs \n", + "c c \n", + "c c \n", + "ocaml ocaml \n", + "hs hs \n", + "sbcl sbcl \n", + "racket racket \n", + "php php \n", + "pl pl \n", + "ocaml ocaml \n", + "\n", + " overall score: 1.0\n" + ] + } + ], + "source": [ + "M = pipe.predict(Xt)\n", + "print(\"testing set outcomes\")\n", + "for i in range(20):\n", + " print(M[i].ljust(8)+ \" \" + yt[i].ljust(8))\n", + "print(\"\")\n", + "print(\" overall score: \"+str(pipe.score(Xt, yt)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## Test Data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['php', 'php', 'php', 'clj', 'py', 'py', 'py', 'py', 'js', 'js',\n", + " 'js', 'js', 'ruby', 'ruby', 'ruby', 'hs', 'php', 'hs', 'racket',\n", + " 'php', 'racket', 'java', 'java', 'scala', 'scala', 
'php', 'php',\n", + " 'java', 'php', 'java', 'ocaml', 'php'], \n", + " dtype='\n", + " number of testing file types: 11\n", + " actual_file_type predicted_type\n", + "clj php \n", + "clj php \n", + "clj php \n", + "clj clj \n", + "py py \n", + "py py \n", + "py py \n", + "py py \n", + "js js \n", + "js js \n", + "js js \n", + "js js \n", + "ruby ruby \n", + "ruby ruby \n", + "ruby ruby \n", + "haskell hs \n", + "haskell php \n", + "haskell hs \n", + "racket racket \n", + "racket php \n", + "racket racket \n", + "java java \n", + "java java \n", + "scala scala \n", + "scala scala \n", + "tcl php \n", + "tcl php \n", + "php java \n", + "php php \n", + "php java \n", + "ocaml ocaml \n", + "ocaml php \n", + " \n", + " score: 0.625\n" + ] + } + ], + "source": [ + "ans = read_answers()\n", + "M = pipe.predict(testcont)\n", + "print(\" actual_file_type predicted_type\")\n", + "for i in range(len(ans)):\n", + " print(ans[i].ljust(10)+M[i].ljust(10))\n", + "print(\" \")\n", + "print(\" score: \"+str(pipe.score(testcont, ans)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This score isn't very good, but it would be difficult to match all these no matter what methods were being used due to the small quantity of training data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Single file demo" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "py_file = '''JOIN_RETRANSMIT = 0.7\n", + "CATCHUP_INTERVAL = 0.6\n", + "ACCEPT_RETRANSMIT = 1.0\n", + "PREPARE_RETRANSMIT = 1.0\n", + "INVOKE_RETRANSMIT = 0.5\n", + "LEADER_TIMEOUT = 1.0\n", + "NULL_BALLOT = Ballot(-1, -1) # sorts before all real ballots\n", + "NOOP_PROPOSAL = Proposal(None, None, None) # no-op to fill otherwise empty slots\n", + "\n", + "class Node(object):\n", + " unique_ids = itertools.count()\n", + "\n", + " def __init__(self, network, address):\n", + " self.network = network\n", + " self.address = address or 'N%d' % self.unique_ids.next()\n", + " self.logger = SimTimeLogger(logging.getLogger(self.address), {'network': self.network})\n", + " self.logger.info('starting')\n", + " self.roles = []\n", + " self.send = functools.partial(self.network.send, self)\n", + "\n", + " def register(self, roles):\n", + " self.roles.append(roles)\n", + "\n", + " def unregister(self, roles):\n", + " self.roles.remove(roles)\n", + "\n", + " def receive(self, sender, message):\n", + " handler_name = 'do_%s' % type(message).__name__\n", + "\n", + " for comp in self.roles[:]:\n", + " if not hasattr(comp, handler_name):\n", + " continue\n", + " comp.logger.debug(\"received %s from %s\", message, sender)\n", + " fn = getattr(comp, handler_name)\n", + " fn(sender=sender, **message._asdict())\n", + "\n", + "class Timer(object):\n", + "\n", + " def __init__(self, expires, address, callback):\n", + " self.expires = expires\n", + " self.address = address\n", + " self.callback = callback\n", + " self.cancelled = False'''" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['py'], \n", + " dtype=' + number of testing file types: 11 
+['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml'] + score_quest 1 0.59375 + pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php' + 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java' + 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 2 0.125 + pred 2 ['java' 'racket' 'racket' 'clj' 'c' 'c' 'ocaml' 'c' 'ocaml' 'ocaml' 'php' + 'ruby' 'php' 'pl' 'ruby' 'c' 'ruby' 'ruby' 'sbcl' 'ocaml' 'racket' 'c' 'c' + 'c' 'ruby' 'c' 'c' 'php' 'hs' 'ocaml' 'c' 'py'] + + score_quest 3 0.125 + pred 3 ['scala' 'ocaml' 'ocaml' 'scala' 'ruby' 'ruby' 'ocaml' 'ruby' 'cs' 'cs' + 'cs' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'scala' 'ocaml' + 'scala' 'ocaml' 'ruby' 'ruby' 'ruby' 'scala' 'ruby' 'ruby' 'cs' 'cs' + 'ruby' 'ocaml'] + + score_quest 4 0.0625 + pred 4 ['c' 'clj' 'clj' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' + 'c' 'sbcl' 'sbcl' 'clj' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c'] + + score_quest 5 0.125 + pred 5 ['clj' 'racket' 'racket' 'clj' 'c' 'c' 'ocaml' 'c' 'ocaml' 'ocaml' 'php' + 'ruby' 'php' 'pl' 'py' 'c' 'ruby' 'ruby' 'sbcl' 'ocaml' 'racket' 'c' + 'ruby' 'c' 'ruby' 'clj' 'ruby' 'php' 'hs' 'ocaml' 'c' 'py'] + +clj 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 571 142 142 0 +py 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 285 +js 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 428 285 142 428 +ruby 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 142 142 142 571 +haskell 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 285 +racket 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 142 142 +java 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 142 142 142 0 +scala 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 142 +tcl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 0 +php 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 428 +ocaml 0 0 
# NOTE(review): this span of the patch adds two files.  The first,
# lclassifier/bens_rules.py, is a def-less indented fragment (it would raise
# an IndentationError / "return outside function" if imported); it is
# reconstructed below as the function bens_rules().  The remainder is
# lclassifier/lclassifier.py from its imports through fit6().

import csv
import re
import sys
from glob import glob

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Canonical extensions the classifier can label.
llist = ["c", "cs", "sbcl", "clj", "hs", "java", "js",
         "ocaml", "pl", "php", "py", "ruby", "scala", "racket"]

# Root of the training/test data tree.
# NOTE(review): machine-specific absolute path -- should come from
# configuration or an environment variable.
main_dir = "/Users/admin/Documents/week5/programming-language-classifier"

# Compiler / implementation names mapped to their canonical extension.
_EXT_ALIASES = {
    "gcc": "c", "h": "c", "gpp": "c",
    "hack": "php",
    "yarv": "ruby", "jruby": "ruby",
    "clojure": "clj",
    "python3": "py", "python": "py",
    "perl": "pl",
    "javascript": "js",
    "csharp": "cs",
    "ghc": "hs",
    "scheme": "racket",
}


def acceptable_file(text):
    """Return True if *text* is an extension the classifier recognizes."""
    return text in llist


def clean_ext(textp):
    """Normalize a raw extension or implementation name.

    Strips whitespace and maps known aliases (e.g. "python3" -> "py",
    "gcc" -> "c"); unknown names are returned unchanged.
    """
    text = textp.strip()
    return _EXT_ALIASES.get(text, text)


def list_uniques(alist):
    """Return the unique items of *alist*, preserving first-seen order."""
    rlist = []
    for item in alist:
        if item not in rlist:
            rlist.append(item)
    return rlist


def load_file_names():
    """Glob the benchmark tree (up to 4 directory levels deep) and the
    test directory.

    Returns (filelist, testlist) of path strings.
    """
    base = main_dir + "/benchmarksgame-2014-08-31/benchmarksgame/bench/"
    filelist = []
    for depth in range(4):
        filelist += glob(base + "*/" * depth + "*.*")
    testlist = glob(main_dir + "/test/*")

    print(" total samples " + str(len(filelist)))
    return filelist, testlist


def load_files(filelist, testlist):
    """Read every usable training file and all hold-out test files.

    Returns (contents, ltype, testcont): contents[i] is the text of a
    training sample whose normalized extension ltype[i] is recognized,
    and testcont[k] is the text of test file k+1 (test files are named
    by 1-based integers).
    """
    contents = []
    ltype = []
    ext_list = []
    for filename in filelist:
        i = filename.rfind(".")
        ext = clean_ext(filename[i + 1:])
        if ext == "tcl":  # flag unexpected tcl samples in the benchmark tree
            print(filename)
        if ext not in ext_list:
            ext_list.append(ext)
        if acceptable_file(ext):
            ltype.append(ext)
            # Benchmark sources are not all UTF-8; latin-1 never fails.
            with open(filename, encoding="ISO-8859-1") as file:
                contents.append(file.read())

    print(" number of usable files " + str(len(ltype)))
    print(" ")
    print(" number of read file types: " + str(len(ext_list)))
    print(" number of recognized types: " + str(len(llist)))
    # BUG FIX: message previously read "summary of tile types".
    print(" summary of file types")
    for ext in ext_list:
        print(ext.ljust(12) + " ", end=" ")
        if ext in llist:
            print(ltype.count(ext), end=" ")
        print(" ")
    print(" not included: ", end="")
    for ext in llist:
        if ext not in ext_list:
            print(ext, end=" : ")
    print(" ")

    # Test files are named "1".."32"; slot each into its 1-based position.
    testcont = [0] * 32
    for filename in testlist:
        with open(filename) as file:
            di = filename.rfind("/")
            i = int(filename[di + 1:])
            testcont[i - 1] = file.read()
    print(" ")
    return contents, ltype, testcont


def read_answers():
    """Read test.csv and return the normalized expected extension per row."""
    with open(main_dir + "/test.csv") as csvfile:
        ans_list = csv.reader(csvfile, delimiter=",")
        ans = []
        print(ans_list)
        for row in ans_list:
            ans.append(clean_ext(row[1]))
    print(" number of testing file types: " + str(len(list_uniques(ans))))
    return ans


def fit1(contents, ltype):
    """Bag-of-words + tf-idf + multinomial naive Bayes."""
    pipe = Pipeline([('bag_of_words', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('bayes', MultinomialNB())])
    pipe.fit(contents, ltype)
    return pipe


def fit2(contents, ltype):
    """Bag-of-words + multinomial naive Bayes (no tf-idf)."""
    pipe = Pipeline([('bag_of_words', CountVectorizer()),
                     ('bayes', MultinomialNB())])
    pipe.fit(contents, ltype)
    return pipe


def print_matrix(matrix, p_max=None):
    """Print up to *p_max* rows of *matrix* (all rows when p_max is None),
    each value rounded to 3 decimal places."""
    upper_limit = len(matrix) if p_max is None else p_max
    for vector in matrix[:upper_limit]:
        for val in vector:
            print(str(round(val, 3)).ljust(5) + ",", end="")
        print("")


# BUG FIX: these patterns were plain (non-raw) strings, so every '\b' was a
# literal backspace character (0x08) and the word-boundary matches could
# never fire; raw strings restore the intended regex \b.
_BEN_KEYWORDS = [r'\bbegin\b', r'\bend\b', r'\bdo\b', r'\bvar\b',
                 r'\bdefine\b', r'\bdefn\b', r'\bfunction\b', r'\bclass\b',
                 r'\bmy\b', r'\brequire\b', r'\bvoid\b', r'\bval\b',
                 r'\bpublic\b', r'\blet\b', r'\bwhere\b', r'\busing\b',
                 r'\bextend\b', r'\bfunction\b']
# Longest-run features: length of the longest run of ), }, ] and =.
_BEN_RUNS = [r'[)]+', r'[}]+', r'[\]]+', r'[=]+']


def bens_rules(text):
    """Feature vector for one document: keyword counts followed by the
    longest-run lengths of ), }, ] and =.

    (Reconstruction of the def-less fragment shipped as
    lclassifier/bens_rules.py.)
    """
    results = [len(re.findall(element, text)) for element in _BEN_KEYWORDS]
    for element in _BEN_RUNS:
        runs = sorted(re.findall(element, text), key=len)
        results.append(len(runs[-1]) if runs else 0)
    return results


def ben_transform(X):
    """Apply bens_rules() to every document in X; returns a list of rows."""
    return [bens_rules(text) for text in X]


def _count_matches(reg_list, X):
    """One row per document in X: match counts for each regex in reg_list.

    Patterns are compiled once, outside the per-document loop.  Raw counts
    (not normalized by length) were found to perform best.
    """
    progs = [re.compile(expr, flags=re.MULTILINE) for expr in reg_list]
    return [[len(prog.findall(text)) for prog in progs] for text in X]


def alan_transform(X):
    """Count matches of hand-written, language-indicative regexes."""
    cish = [r"^[ \t]*\*", r"^[ \t]*/\*\*"]
    clojure = [r"^\s*\(\w.*\s*\)$", r"^[ \t]*;", r"\(def(n)? "]
    python = [r"\):[ \t]*\n[ \t]*\w", r"\s__\w*__\(", r"(^from|^import)\s",
              r"def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"]
    js = [r"^[ \t]*var", r"=\s*function",
          r"function\s*\w*\(\w*[\w\s,]*\)\s*\{"]
    ruby = [r"^[ \t]*end$", r"^[ \t]*def *\w*(\(\w*\))?[ \t]*$",
            r"^[ \t]*include \w*[ \t]*$", r"^[ \t]*@", r"super"]
    hs = [r"&&&", r"^\{-"]
    clj = [r"^\(define", r"^[ \t]*;+"]
    java = [r"^[ \t]*public \w* \w*", r"^import .*;$"]
    scl = [r"^[ \t]*object \w*", r"^[ \t]*(final)?val \w* ="]
    tcl = [r"^[ \t]*proc \w*::\w* \{"]
    php = [r"^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*",
           r"^[ \t]*\$\w* ?=.*;$"]
    ocaml = [r"^[ \t]*let \w+", r"^[ \t]*struct[ \t]*$"]
    perl = [r"^[ \t]*my ", r"^[ \t]*sub \w* \{"]
    gcc = [r"^[ \t]*typedef \w* \w* ?\{", r"^#include ?\<",
           r"^using .*;$", r"sealed"]

    # Feature order must stay stable across fit/predict.
    reg_list = (clojure + python + js + ruby + hs + clj + java + scl
                + tcl + php + ocaml + perl + gcc + cish)
    return _count_matches(reg_list, X)


def old_transform(X):
    """Older hand-rolled regex feature set (kept for comparison runs)."""
    char_list = [r"^#", r"\-\>", r"\{", r"\$", r"\<", r"\[", r"func\b",
                 r"this\.", r"^end", r";", r"\*", r"%", r"^do",
                 r"\<\$php", r"/\*", r"__", r"=", r"==",
                 r"===", r"\(\)", r"\{\}", r":", r"\+\+", r"\+=",
                 r"^#include", r"^ \*", r":\s*$", r"\<\<|\>\>",
                 r"int", r"\b\*\w", r"\(&\w", r"argv",
                 # BUG FIX: a missing comma made "\[\]" and "if\s"
                 # concatenate into the single nonsense pattern "\[\]if\s".
                 r"\[\]", r"if\s",
                 r"if\(", r"^\{", r"^\}", r",\s*int\s\w",
                 r"\};", r"\[\d*:\d*\]", r"\]\s*\{", r"^//", r"\w\.\{",
                 r"\(\w+:", r"@", r"\b@\w"]
    word_list = [r"private", r"static", r"make", r"let", r"def", r"^\(defn",
                 r"defn", r"do", r"class", r"^function", r"public",
                 r"unset", r"printf\(", r"return", r"NULL", r"void",
                 r"main\(", r"main_", r"void\s\*\w", r"\{else\}",
                 r"char", r"array\(", r"__init__", r"__str__", r"token",
                 r"^import", r"^from", r"final", r"val", r"type", r"package",
                 r"object", r"String", r"string", r"primitive", r"fixnum",
                 r"error", r"try"]
    return _count_matches(char_list + word_list, X)


class CustomFeaturizer(TransformerMixin):
    """Pluggable featurizer turning raw source text into count vectors.

    Currently delegates to alan_transform(); ben_transform() and
    old_transform() are kept as interchangeable alternatives.
    """

    def fit(self, X, y=None):
        """All scikit-learn compatible transformers share this interface;
        fit is stateless here and always returns self."""
        return self

    def transform(self, X):
        """Return one feature row per document in X."""
        return alan_transform(X)


def fit3(contents, ltype):
    """Custom regex features + decision tree."""
    pipe = make_pipeline(CustomFeaturizer(), DecisionTreeClassifier())
    pipe.fit(contents, ltype)
    return pipe


def fit4(contents, ltype):
    """Custom regex features + SGD linear classifier."""
    pipe = make_pipeline(CustomFeaturizer(), SGDClassifier())
    pipe.fit(contents, ltype)
    return pipe


def fit5(contents, ltype):
    """Custom regex features + multinomial naive Bayes."""
    pipe = make_pipeline(CustomFeaturizer(), MultinomialNB())
    pipe.fit(contents, ltype)
    return pipe


def fit6(contents, ltype):
    """Custom regex features + random forest.

    A random forest fits many decision trees on bootstrap samples and
    aggregates their votes for the final prediction (it does not pick a
    single "most common" tree, as the old docstring claimed).
    """
    pipe = make_pipeline(CustomFeaturizer(), RandomForestClassifier())
    pipe.fit(contents, ltype)
    return pipe
def demo_class(X, y):
    """Concatenate all samples of each label, featurize the concatenations,
    and print each label's feature vector scaled so the global max is 1000.

    X is a sequence of document strings; y the parallel label sequence.
    """
    types = []
    for ext in y:
        if ext not in types:
            types.append(ext)
    # One concatenated "document" per distinct label.
    typecont = [""] * len(types)
    for text, label in zip(X, y):
        typecont[types.index(label)] += text

    M = CustomFeaturizer().transform(typecont)
    peak = max(max(vt) for vt in M)
    # BUG FIX: guard against an all-zero feature matrix, which previously
    # raised ZeroDivisionError.
    ratio = 1000 / peak if peak else 0
    for label, vector in zip(types, M):
        print(label.ljust(8) + " ", end="")
        for val in vector:
            print(str(int(ratio * val)).ljust(5), end="")
        print("")


def default_action():
    """Train every candidate pipeline, report train/test scores, analyze
    misclassifications, then score the hold-out test files against test.csv.
    """
    filelist, testlist = load_file_names()
    contents, ltype, testcont = load_files(filelist, testlist)

    plist = [fit2, fit3, fit4, fit5, fit6]

    X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)
    pipel = [0 for _ in plist]
    print(" score for training_set test_set")
    for i, fitter in enumerate(plist):
        pipe = fitter(X, y)
        print(str(i).ljust(4) + " "
              + str(round(pipe.score(X, y), 4)).ljust(8)
              + str(round(pipe.score(Xt, yt), 4)).ljust(8))
    print(" ")
    # Refit every pipeline on the full corpus for the final test scoring.
    for i, fitter in enumerate(plist):
        pipel[i] = fitter(contents, ltype)

    print(" failed to classify")
    # NOTE(review): `pipe` here is whichever pipeline the scoring loop
    # above bound last (fit6 fitted on the training split) -- presumably
    # intentional, but worth confirming.
    failed_to_classify = {}
    wrongly_classified = {}
    A = pipe.predict(X)
    for predicted, actual in zip(A, y):
        if predicted != actual:
            print(actual.ljust(6) + " misclassified as " + predicted)
            failed_to_classify[actual] = failed_to_classify.get(actual, 0) + 1
            wrongly_classified[predicted] = (
                wrongly_classified.get(predicted, 0) + 1)
    print("")
    print(" failure counts")
    print(" wrongly classified:")
    for ext in wrongly_classified:
        print(ext.ljust(7) + "#" * wrongly_classified[ext])
    print(" failed to classify")
    for ext in failed_to_classify:
        print(ext.ljust(7) + "#" * failed_to_classify[ext])
    print(" ")

    ans = read_answers()
    print(ans)

    for i, pipe in enumerate(pipel, start=1):
        print(" score_quest " + str(i) + " " + str(pipe.score(testcont, ans)))
        print(" pred " + str(i) + " " + str(pipe.predict(testcont)))
        print(" ")

    demo_class(testcont, ans)


if __name__ == "__main__":
    if len(sys.argv) == 1:
        default_action()
    elif len(sys.argv) == 2:
        # Single-file mode: classify the file named on the command line.
        test_file = sys.argv[1]
        print("Estimating file type of " + test_file)

        filelist, testlist = load_file_names()
        X, y, testcont = load_files(filelist, testlist)
        pipe = fit6(X, y)
        with open(test_file) as f:
            test_contents = f.read()
        est_ext = pipe.predict([test_contents])

        print("Predicted extension: " + str(est_ext))
    else:
        print("error: command line arguments not supported")
0.375 + pred 2 ['clj' 'clj' 'js' 'clj' 'js' 'hs' 'js' 'scala' 'js' 'js' 'scala' 'js' 'hs' + 'hs' 'hs' 'hs' 'js' 'hs' 'js' 'js' 'racket' 'ocaml' 'js' 'pl' 'scala' + 'ocaml' 'ocaml' 'php' 'js' 'php' 'ocaml' 'ocaml'] + + score_quest 3 0.5625 + pred 3 ['clj' 'clj' 'cs' 'clj' 'py' 'py' 'sbcl' 'py' 'js' 'js' 'ruby' 'java' + 'ruby' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'c' 'c' + 'scala' 'scala' 'hs' 'hs' 'c' 'ruby' 'hs' 'ocaml' 'ocaml'] + + score_quest 4 0.59375 + pred 4 ['clj' 'clj' 'ocaml' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'scala' 'cs' + 'scala' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'sbcl' 'racket' 'racket' 'js' 'js' + 'scala' 'scala' 'php' 'php' 'sbcl' 'php' 'php' 'ocaml' 'ocaml'] + + score_quest 5 0.4375 + pred 5 ['clj' 'clj' 'ocaml' 'clj' 'js' 'hs' 'js' 'scala' 'js' 'js' 'scala' + 'racket' 'scala' 'ruby' 'ruby' 'hs' 'racket' 'hs' 'racket' 'racket' + 'ocaml' 'c' 'js' 'scala' 'scala' 'pl' 'php' 'php' 'racket' 'php' 'ocaml' + 'racket'] + +clj 0 1 2 0 0 23 0 0 0 5 0 1 0 0 0 0 0 0 0 0 0 20 0 0 0 0 1 1 3 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 10 9 9 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0 0 0 0 1 0 0 1 6 0 0 0 0 +py 3 0 19 0 6 97 0 0 0 6 9 44 0 0 0 75 366 11 0 66 14 217 0 0 0 0 198 0 13 0 0 0 0 0 0 1 0 0 3 0 0 0 0 0 0 0 0 2 0 91 0 0 41 22 0 0 0 0 26 2 0 0 0 0 0 31 0 29 2 0 10 2 0 7 5 0 6 0 0 0 0 0 5 +js 0 2 244 3 9 100 0 11 0 45 29 3 0 0 5 1 297 27 7 55 18 87 3 0 0 0 6 0 0 0 0 0 0 1 0 39 0 1 0 0 15 0 0 1 0 0 0 1 1 20 0 0 114 1 27 0 0 0 66 0 0 0 0 0 0 2 0 0 0 0 0 0 1 61 18 0 9 1 15 0 0 11 1 +ruby 0 0 5 0 10 7 0 0 3 0 5 0 0 0 0 0 44 7 3 0 1 33 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 7 19 0 1 0 1 0 20 0 0 5 9 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 +haskell 0 92 33 63 28 24 0 0 0 6 11 0 0 0 0 0 92 2 0 5 0 81 6 0 0 0 1 1 5 0 0 0 0 0 13 0 0 0 0 0 0 0 0 16 0 0 0 0 3 6 0 0 179 1 0 0 0 0 14 0 0 0 0 0 0 1 0 0 0 0 76 0 0 0 0 0 0 145 1 0 0 5 0 +racket 0 1 0 7 20 48 0 0 0 7 0 0 0 0 0 15 22 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 11 
70 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 5 0 0 0 0 32 65 5 19 0 +java 0 0 6 0 3 1 0 0 0 16 136 0 0 0 18 0 0 0 0 2 0 1 0 0 0 32 0 0 10 0 0 0 0 0 0 2 0 0 0 0 0 0 0 37 0 0 0 0 3 5 0 0 2 0 0 11 0 0 7 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 11 1 0 0 0 3 +scala 0 3 33 32 0 28 0 0 0 0 6 0 0 0 1 0 71 0 0 0 0 57 0 0 0 2 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 23 2 0 0 0 0 0 16 0 0 2 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 15 37 9 2 9 13 0 0 0 0 1 +tcl 0 0 48 92 13 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 0 0 0 0 0 10 0 0 0 6 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 +php 0 93 53 185 1 14 0 0 0 74 999 7 0 0 33 5 27 3 2 28 0 18 0 0 0 0 0 0 5 0 0 0 0 0 1 2 0 0 0 0 0 0 0 36 0 0 0 0 0 0 0 0 6 5 0 23 0 0 35 0 0 0 0 0 0 0 3 0 0 0 0 0 0 20 0 0 1 0 18 0 0 0 0 +ocaml 0 83 19 0 2 11 0 0 0 13 76 0 0 0 0 0 75 0 0 3 2 35 0 0 0 0 1 0 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 57 3 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 22 0 0 20 9 0 0 0 2 diff --git a/lclassifier/output.txt b/lclassifier/output.txt new file mode 100644 index 0000000..577a52b --- /dev/null +++ b/lclassifier/output.txt @@ -0,0 +1,92 @@ + total samples 931 + number of usable files 656 + + number of read file types: 32 + number of recognized types: 14 + summary of tile types +ats +clj 38 +cs 41 +dart +erlang +fpascal +fsharp +c 129 +hs 33 +gnat +go +php 55 +ifc +java 51 +js 25 +ruby 73 +lua +ocaml 35 +oz +pl 34 +py 36 +racket 29 +rust +sbcl 34 +scala 43 +vw +cint +javasteady +parrot +cc +txt +ozf + not included: + + score for training_set test_set +0 0.9818 0.871 +1 1.0 0.977 +2 0.9658 0.9355 +3 0.9795 0.9677 +4 1.0 0.977 + + failed to classify + + failure counts + wrongly classified: + failed to classify + +<_csv.reader object at 0x113420a58> + number of testing file types: 11 +['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 
'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml'] + score_quest 1 0.59375 + pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php' + 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java' + 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 2 0.65625 + pred 2 ['clj' 'clj' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'racket' 'sbcl' 'java' 'java' + 'scala' 'scala' 'ruby' 'ruby' 'java' 'java' 'java' 'ocaml' 'ruby'] + + score_quest 3 0.6875 + pred 3 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'racket' 'hs' 'racket' 'racket' 'racket' 'pl' 'pl' + 'scala' 'scala' 'racket' 'racket' 'pl' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 4 0.71875 + pred 4 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'java' 'c' 'scala' + 'scala' 'py' 'py' 'java' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 5 0.625 + pred 5 ['ruby' 'ruby' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'ruby' 'ruby' 'java' 'java' + 'scala' 'scala' 'ruby' 'ruby' 'java' 'php' 'php' 'ocaml' 'ruby'] + +clj 79 31 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +py 0 0 0 587 142 79 476 0 0 0 0 0 0 0 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +js 7 15 0 0 0 0 0 333 158 682 0 0 0 0 0 0 0 0 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 0 +ruby 0 0 0 0 0 0 0 0 0 0 158 87 23 55 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +haskell 0 0 0 0 0 468 0 0 0 0 0 0 0 0 0 7 79 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 +racket 1000 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 412 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +java 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 619 103 +scala 7 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 55 0 0 0 0 0 0 0 0 0 0 15 15 0 +tcl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 15 0 0 0 0 0 0 0 0 0 0 0 0 +php 0 0 0 0 0 0 0 0 0 63 0 0 0 0 0 0 0 0 0 134 0 0 0 0 87 39 0 0 0 0 0 0 0 0 730 206 +ocaml 47 0 0 0 0 0 0 0 31 0 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 277 7 0 0 0 0 0 0 7 0 diff --git a/lclassifier/tests/test_lclassifier.py b/lclassifier/tests/test_lclassifier.py new file mode 100644 index 0000000..502b789 --- /dev/null +++ b/lclassifier/tests/test_lclassifier.py @@ -0,0 +1,40 @@ +from lclassifier import * + +def test_ext(): + ext = "cowboy" + assert acceptable_file(ext) == False + +def test_correct_ext(): + ext = "perl" + assert clean_ext(ext) == "pl" + +def test_reg_use(): + reg_expr = "\s__\w*__\(" + prog = re.compile(reg_expr) + text ='''import packlag +def __init__(self): + var = thing''' + val = prog.findall(text) + print(val) + assert len(val) == 1 + + reg_expr = "\):[ \t]*\n[ \t]*\w" + prog = re.compile(reg_expr) + val = prog.findall(text) + print(val) + assert len(val) == 1 + + reg_expr = "(^from|^import)\s" + prog = re.compile(reg_expr) + val = prog.findall(text) + print(val) + assert len(val) == 1 + + textjs = '''function noAction() { + } + ''' + reg_expr = "function\s*\w*\(\w*[\w\s,]*\)\s*\{" + prog = re.compile(reg_expr) + val = prog.findall(textjs) + print(val) + assert len(val) == 1 diff --git a/ref_program.py b/ref_program.py new file mode 100644 index 0000000..d6b7a6d --- /dev/null +++ b/ref_program.py @@ -0,0 +1,54 @@ +import csv +import re +import numpy as np +import random + +#from textblob import TextBlob +from collections import Counter + +from sklearn.pipeline import make_pipeline, make_union +from sklearn.base import TransformerMixin +from sklearn.tree import DecisionTreeClassifier +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.cross_validation import train_test_split +from sklearn.metrics import classification_report, confusion_matrix + + +class DumbFeaturizer(TransformerMixin): + def __init__(self): + pass + + def fit(self, X, y=None): + return self + + def transform(self, 
X): + matrix = [] + for i in range(len(X)): + vector = [] + for j in range(11): + if j == X[i]: + vector.append(1) + else: + vector.append(0) + matrix.append(vector) + return matrix + +N = 22 +y = [0] * N +X = [0] * N +for k in range(N): + val = random.randrange(11) + y[k] = val + X[k] = val + + +dumb = DumbFeaturizer() +print(dumb.transform(X)) + +pipe = make_pipeline(dumb, DecisionTreeClassifier()) +pipe.fit(X, y) +# Our baseline +print(pipe.score(X, y)) +print(" ") +print(" transform ") +print(pipe.transform(X))