From 2483ffcb3c05fa3b4f15b9ec157b00588ad5c050 Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Wed, 3 Jun 2015 18:44:50 -0400
Subject: [PATCH 1/9] Have a correlation score, but lots left to do

---
 .gitignore     |  2 ++
 lclassifier.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 lclassifier.py

diff --git a/.gitignore b/.gitignore
index f00dbf2..02937f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,5 @@ docs/_build/
 # PyBuilder
 target/
 
+.DS_store
+benchmarksgame-2014-08-31/
diff --git a/lclassifier.py b/lclassifier.py
new file mode 100644
index 0000000..e7f5e23
--- /dev/null
+++ b/lclassifier.py
@@ -0,0 +1,77 @@
+from glob import glob
+
+l1 = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/*")
+filelist = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/*/*.*")
+
+print(str(len(l1))+" l2 "+str(len(filelist)))
+
+contents = []
+ltype = []
+for filename in filelist:
+    if "ocaml-2" not in filename:
+        i = filename.index(".")
+        ltype.append(filename[i:])
+        with open(filename) as file:
+            contents.append(file.read())
+
+testcont = []
+testlist = glob("test/*")
+for filename in testlist:
+    print(filename)
+    with open(filename) as file:
+        testcont.append(file.read())
+
+print(" ")
+print(ltype)
+print(" ")
+#print(testcont[15])
+#print(testlist)
+
+#from scikit-learn.datasets import load_iris
+from sklearn import datasets
+iris = datasets.load_iris()
+print(iris.keys())
+print(" ")
+#print(iris.data)
+print(" ")
+print(iris.target)
+
+from sklearn import neighbors, datasets
+
+iris = datasets.load_iris()
+X, y = iris.data, iris.target
+
+# create the model
+knn = neighbors.KNeighborsClassifier(n_neighbors=5)
+
+# fit the model
+knn.fit(X, y)
+
+# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal?
+# call the "predict" method:
+result = knn.predict([[3, 5, 4, 2],])
+
+print(iris.target_names[result])
+
+
+
+import pandas as pd
+import numpy as np
+from sklearn import linear_model
+from sklearn.cross_validation import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
+from sklearn.feature_extraction.text import TfidfTransformer
+
+
+
+pipe = Pipeline([('bag_of_words', CountVectorizer()),
+                      ('tfidf', TfidfTransformer()),
+                      ('bayes', MultinomialNB())])
+
+pipe.fit(contents, ltype)
+
+print(pipe.score(contents, ltype))
+
+print(pipe.predict(testcont))

From aaa83f0155e13e4fcd1cbd3b52db92a9850d78df Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Thu, 4 Jun 2015 17:14:39 -0400
Subject: [PATCH 2/9] added tests, close to custom features but not quite

---
 .envrc              |   1 +
 .gitignore          |   2 +
 lclassifier.py      | 254 ++++++++++++++++++++++++++++++++------------
 test_lclassifier.py |   9 ++
 4 files changed, 198 insertions(+), 68 deletions(-)
 create mode 100644 .envrc
 create mode 100644 test_lclassifier.py

diff --git a/.envrc b/.envrc
new file mode 100644
index 0000000..94840b3
--- /dev/null
+++ b/.envrc
@@ -0,0 +1 @@
+layout python3
diff --git a/.gitignore b/.gitignore
index 02937f0..6d081c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,3 +67,5 @@ target/
 
 .DS_store
 benchmarksgame-2014-08-31/
+
+.direnv/
diff --git a/lclassifier.py b/lclassifier.py
index e7f5e23..6c82da0 100644
--- a/lclassifier.py
+++ b/lclassifier.py
@@ -1,60 +1,4 @@
 from glob import glob
-
-l1 = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/*")
-filelist = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/*/*.*")
-
-print(str(len(l1))+" l2 "+str(len(filelist)))
-
-contents = []
-ltype = []
-for filename in filelist:
-    if "ocaml-2" not in filename:
-        i = filename.index(".")
-        ltype.append(filename[i:])
-        with open(filename) as file:
-            contents.append(file.read())
-
-testcont = []
-testlist = glob("test/*")
-for filename in testlist:
-    print(filename)
-    with open(filename) as file:
-        testcont.append(file.read())
-
-print(" ")
-print(ltype)
-print(" ")
-#print(testcont[15])
-#print(testlist)
-
-#from scikit-learn.datasets import load_iris
-from sklearn import datasets
-iris = datasets.load_iris()
-print(iris.keys())
-print(" ")
-#print(iris.data)
-print(" ")
-print(iris.target)
-
-from sklearn import neighbors, datasets
-
-iris = datasets.load_iris()
-X, y = iris.data, iris.target
-
-# create the model
-knn = neighbors.KNeighborsClassifier(n_neighbors=5)
-
-# fit the model
-knn.fit(X, y)
-
-# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal?
-# call the "predict" method:
-result = knn.predict([[3, 5, 4, 2],])
-
-print(iris.target_names[result])
-
-
-
 import pandas as pd
 import numpy as np
 from sklearn import linear_model
@@ -63,15 +7,189 @@
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
 from sklearn.feature_extraction.text import TfidfTransformer
-
-
-
-pipe = Pipeline([('bag_of_words', CountVectorizer()),
-                      ('tfidf', TfidfTransformer()),
-                      ('bayes', MultinomialNB())])
-
-pipe.fit(contents, ltype)
-
-print(pipe.score(contents, ltype))
-
-print(pipe.predict(testcont))
+import csv
+
+
+def acceptable_file(text):
+    if text in llist:
+        return True
+    else:
+        return False
+
+def clean_ext(text):
+    if text == "gcc" or text == "h" or text == "gpp":
+        return "c"
+    elif text == "hack":
+        return "php"
+    elif text == "yarv" or text == "jruby":
+        return "ruby"
+    elif text == "clojure":
+        return "clj"
+    elif text == "python3" and text == "python":
+        return "py"
+    elif text == "perl":
+        return "pl"
+    elif text == "javascript":
+        return "js"
+    elif text == "csharp":
+        return "cs"
+    elif text == "ghc":
+        return "hs"
+    elif text == "scheme":
+        return "racket"
+    else:
+        return text
+
+llist = ["c", "cs", "sbcl", "clj", "hs", "java", "js",
+         "ocaml", "pl", "php", "py", "ruby", "scala", "racket"]
+
+def load_file_names():
+    l = [0 for i in range(5)]
+    s = "benchmarksgame-2014-08-31/benchmarksgame/"
+    max_lvl = 5
+    for i in range(max_lvl):
+        l[i] = glob(s+"*/"*i+"*.*")
+#    l[0] = glob("benchmarksgame-2014-08-31/benchmarksgame/*/*/*/*/*.*")
+#    l2 = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/*/*/*.*")
+#    filelist = l1 + l2
+    filelist = []
+    for i in range(max_lvl):
+        filelist += l[i]
+    testlist = glob("test/*")
+
+    print("   total samples "+str(len(filelist)))
+    return filelist, testlist
+
+
+def load_files(filelist, testlist):
+    contents = []
+    ltype = []
+    ext_list = []
+    for filename in filelist:
+        i = filename.rfind(".")
+        ext = clean_ext(filename[i+1:])
+    #    print(ext, end=" - ")
+    #    print(ext+ str(ext in ext_list) + " - "+str(ext_list))
+        if not ext in ext_list:
+            ext_list.append(ext)
+        if acceptable_file(ext):
+            ltype.append(ext)
+            with open(filename, encoding="ISO-8859-1") as file:
+    #            print(filename)
+                contents.append(file.read())
+#    return contents, ltype
+
+    print(" number of usable files "+str(len(ltype)))
+    print(" summary of tile types")
+    for ext in ext_list:
+        print(ext.ljust(12)+ "  ", end=" ")
+        if ext in llist:
+            print(ltype.count(ext), end=" ")
+        print(" ")
+    print(" not included: ", end="")
+    for ext in llist:
+        if ext not in ext_list:
+            print(ext, end=" : ")
+    print(" ")
+
+    testcont = []
+    for filename in testlist:
+    #    print(filename)
+        with open(filename) as file:
+            testcont.append(file.read())
+
+    print(" ")
+    return contents, ltype, testcont
+    #print(testcont[15])
+    #print(testlist)
+
+def read_answers():
+    with open("test.csv") as csvfile:
+        ans_list = csv.reader(csvfile, delimiter=",")
+        ans = []
+        print(ans_list)
+        for row in ans_list:
+            ans.append(clean_ext(row[1]))
+    return ans
+
+
+def fit1(contents, ltype):
+    pipe = Pipeline([('bag_of_words', CountVectorizer()),
+                          ('tfidf', TfidfTransformer()),
+                          ('bayes', MultinomialNB())])
+    pipe.fit(contents, ltype)
+    return pipe
+#    print(pipe.score(contents, ltype))
+#    print(pipe.predict(testcont))
+#    return pipe.score(contents, ltype)
+
+
+def fit2(contents, ltype):
+    pipe = Pipeline([('bag_of_words', CountVectorizer()),
+#                          ('tfidf', TfidfTransformer()),
+                          ('bayes', MultinomialNB())])
+    pipe.fit(contents, ltype)
+    return pipe
+#    print(pipe.score(contents, ltype))
+#    print(pipe.predict(testcont))
+#    return pipe.score(contents, ltype)
+
+
+class CustomFeaturizer:
+    def __init__(self, *featurizers):
+        self.featurizers = featurizers
+
+    def fit(self, X, y=None):
+        """All scikit-lear compatible transforms and classifiers have the same interface, and
+        fit always returns the same object."""
+        return self
+
+    def transform(self, X):
+        fvs = []
+        for datum in X:
+            fvs.append([f(datum) for f in self.featurizers])
+        return fvs
+
+
+def fit3(contents, ltype):
+    pipe = Pipeline([('custom_feature', CustomFeaturizer()),
+                     ('bayes', MultinomialNB())])
+    MultinomialNB()
+    model = MultinomialNB(X, y)
+    pipe.fit(contents, ltype)
+    return pipe
+
+
+#sms_featurizer = CustomFeaturizer(longest_run_of_capital_letters_feature,
+#                                  percent_periods_feature)
+#big_list = sms_featurizer.transform(sms_data[:10])
+#print(big_list)
+
+if __name__ == "__main__":
+    filelist, testlist = load_file_names()
+    contents, ltype, testcont = load_files(filelist, testlist)
+
+    plist = [fit1, fit2]
+
+    pipel = [0 for i in range(len(plist))]
+    for i in range(len(plist)):
+        pipel[i] = plist[i](contents, ltype)
+    #pipe1 = fit1(contents, ltype)
+    #pipe2 = fit2(contents, ltype)
+
+    ans = read_answers()
+    print(ans)
+
+    i = 0
+    for pipe in pipel:
+        i += 1
+        print(" score_train "+str(i)+" "+str(pipe.score(contents, ltype)))
+        print(" pred "+str(i)+" "+str(pipe.predict(testlist)))
+        print(" score_test "+str(i)+" "+str(pipe.score(testlist, ans)))
+    #print(" score2 "+str(pipe2.score(contents, ltype)))
+
+    #print(" pred1 "+str(pipe1.predict(testlist)))
+    #print(" pred2 "+str(pipe2.predict(testlist)))
+
+    #print(" score1 "+str(pipe1.score(testlist, ans)))
+    #print(" score2 "+str(pipe2.score(testlist, ans)))
diff --git a/test_lclassifier.py b/test_lclassifier.py
new file mode 100644
index 0000000..431962e
--- /dev/null
+++ b/test_lclassifier.py
@@ -0,0 +1,9 @@
+from lclassifier import *
+
+def test_ext():
+    ext = "cowboy"
+    assert acceptable_file(ext) == False
+
+def test_correct_ext():
+    ext = "perl"
+    assert clean_ext(ext) == "pl"

From e530c5ae39ee73cba4aec88e9520813c77ae28dd Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Thu, 4 Jun 2015 22:14:23 -0400
Subject: [PATCH 3/9] custom featurizer running

---
 lclassifier.py | 66 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/lclassifier.py b/lclassifier.py
index 6c82da0..3a6d882 100644
--- a/lclassifier.py
+++ b/lclassifier.py
@@ -5,9 +5,11 @@
 from sklearn.cross_validation import train_test_split
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.naive_bayes import MultinomialNB
-from sklearn.pipeline import Pipeline
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.feature_extraction.text import TfidfTransformer
 import csv
+import re
 
 
 def acceptable_file(text):
@@ -134,29 +136,56 @@ def fit2(contents, ltype):
 #    print(pipe.predict(testcont))
 #    return pipe.score(contents, ltype)
 
+def print_matrix(matrix, p_max=None):
+    if p_max is None:
+        upper_limit = len(matrix)
+    else:
+        upper_limit = p_max
+    for i in range(upper_limit):
+        vector = matrix[i]
+        for val in vector:
+            print(str(round(val, 3)).ljust(5)+",", end="")
+        print("")
+        #print([str(round(val, 3)) for val in vector])
+
 
 class CustomFeaturizer:
-    def __init__(self, *featurizers):
-        self.featurizers = featurizers
+    def __init__(self):
+        pass
+        #self.featurizers = featurizers
 
     def fit(self, X, y=None):
-        """All scikit-lear compatible transforms and classifiers have the same interface, and
-        fit always returns the same object."""
+        """All scikit-lear compatible transforms and classifiers have the
+        same interface, and fit always returns the same object."""
         return self
 
     def transform(self, X):
-        fvs = []
-        for datum in X:
-            fvs.append([f(datum) for f in self.featurizers])
-        return fvs
+        reg_list = ["^#", "-\>", "\{", "\$", "let", "def",
+                    "private", "static", "\<", "\[", "func\b",
+                    "this\."]
+        matrix = []
+        for text in X:
+            vector = []
+            for reg_expr in reg_list:
+                prog = re.compile(reg_expr)
+                vector.append(len(prog.findall(text))/len(text))
+            matrix.append(vector)
+        return matrix
 
 
 def fit3(contents, ltype):
-    pipe = Pipeline([('custom_feature', CustomFeaturizer()),
-                     ('bayes', MultinomialNB())])
-    MultinomialNB()
-    model = MultinomialNB(X, y)
+#    pipe = Pipeline([('custom_feature', CustomFeaturizer()),
+#                     ('bayes', MultinomialNB())])
+#    MultinomialNB()
+#    model = MultinomialNB(X, y)
+#    pipe.fit(contents, ltype)
+
+    custom_feature = CustomFeaturizer()
+#    custom_feature.fit(contents, ltype)
+
+    pipe = make_pipeline(custom_feature, DecisionTreeClassifier())
     pipe.fit(contents, ltype)
+
     return pipe
 
 
@@ -169,7 +198,7 @@ def fit3(contents, ltype):
     filelist, testlist = load_file_names()
     contents, ltype, testcont = load_files(filelist, testlist)
 
-    plist = [fit1, fit2]
+    plist = [fit1, fit2, fit3]
 
     pipel = [0 for i in range(len(plist))]
     for i in range(len(plist)):
@@ -186,6 +215,15 @@ def fit3(contents, ltype):
         print(" score_train "+str(i)+" "+str(pipe.score(contents, ltype)))
         print(" pred "+str(i)+" "+str(pipe.predict(testlist)))
         print(" score_test "+str(i)+" "+str(pipe.score(testlist, ans)))
+
+    word_list = re.findall(r"^#", "# include ")
+    print(word_list)
+    print(len(word_list))
+
+
+    custom_feature = CustomFeaturizer()
+    matrix = custom_feature.transform(contents)
+    print_matrix(matrix, 10)
     #print(" score2 "+str(pipe2.score(contents, ltype)))
 
     #print(" pred1 "+str(pipe1.predict(testlist)))

From 4f03b3897a8dcfeb30c2374775d3e9dadde78771 Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Thu, 4 Jun 2015 23:27:41 -0400
Subject: [PATCH 4/9] massive list of regex, improved but method showing limits

---
 lclassifier.py | 70 ++++++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/lclassifier.py b/lclassifier.py
index 3a6d882..286c893 100644
--- a/lclassifier.py
+++ b/lclassifier.py
@@ -4,10 +4,13 @@
 from sklearn import linear_model
 from sklearn.cross_validation import train_test_split
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.feature_extraction.text import TfidfTransformer
+# estimators
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import SGDClassifier
+# other utilities
 import csv
 import re
 
@@ -160,32 +163,48 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, X):
-        reg_list = ["^#", "-\>", "\{", "\$", "let", "def",
-                    "private", "static", "\<", "\[", "func\b",
-                    "this\."]
+        char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b",
+                    "this\.", "^end", ";", "\*", "%", "^do",
+                    "\<\$php", "/\*", "__", "=", "==",
+                    "===", "\(\)", "\{\}", ":", "\+\+", "\+=",
+                    "^#include", "^ \*", ":\s*$", "\<\<|\>\>",
+                    "int", "\b\*\w", "\(&\w", "argv", "\[\]"
+                    "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w",
+                    "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{",
+                    "\(\w+:", "@", "\b@\w"]
+        word_list = ["private", "static", "make","let", "def", "^\(defn",
+                     "defn", "do", "class", "^function", "public",
+                     "unset", "printf\(", "return", "NULL", "void",
+                     "main\(", "main_", "void\s\*\w", "\{else\}",
+                     "char", "array\(", "__init__", "__str__", "token",
+                     "^import", "^from", "final", "val", "type", "package",
+                     "object", "String", "string", "primitive", "fixnum",
+                     "error", "try"]
+        reg_list = char_list + word_list
         matrix = []
         for text in X:
             vector = []
             for reg_expr in reg_list:
                 prog = re.compile(reg_expr)
-                vector.append(len(prog.findall(text))/len(text))
+                val = len(prog.findall(text))/len(text)
+                if val > 0:
+                    val = 1
+                vector.append(val)
             matrix.append(vector)
         return matrix
 
 
 def fit3(contents, ltype):
-#    pipe = Pipeline([('custom_feature', CustomFeaturizer()),
-#                     ('bayes', MultinomialNB())])
-#    MultinomialNB()
-#    model = MultinomialNB(X, y)
-#    pipe.fit(contents, ltype)
-
     custom_feature = CustomFeaturizer()
-#    custom_feature.fit(contents, ltype)
-
     pipe = make_pipeline(custom_feature, DecisionTreeClassifier())
     pipe.fit(contents, ltype)
+    return pipe
+
 
+def fit4(contents, ltype):
+    custom_feature = CustomFeaturizer()
+    pipe = make_pipeline(custom_feature, SGDClassifier())
+    pipe.fit(contents, ltype)
     return pipe
 
 
@@ -198,11 +217,12 @@ def fit3(contents, ltype):
     filelist, testlist = load_file_names()
     contents, ltype, testcont = load_files(filelist, testlist)
 
-    plist = [fit1, fit2, fit3]
+    plist = [fit1, fit2, fit3, fit4]
 
+    X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)
     pipel = [0 for i in range(len(plist))]
     for i in range(len(plist)):
-        pipel[i] = plist[i](contents, ltype)
+        pipel[i] = plist[i](X, y)
     #pipe1 = fit1(contents, ltype)
     #pipe2 = fit2(contents, ltype)
 
@@ -212,22 +232,12 @@ def fit3(contents, ltype):
     i = 0
     for pipe in pipel:
         i += 1
-        print(" score_train "+str(i)+" "+str(pipe.score(contents, ltype)))
+        print(" score_train "+str(i)+" "+str(pipe.score(X, y)))
+        print(" score_test  "+str(i)+" "+str(pipe.score(Xt, yt)))
+        print(" score_quest "+str(i)+" "+str(pipe.score(testlist, ans)))
         print(" pred "+str(i)+" "+str(pipe.predict(testlist)))
-        print(" score_test "+str(i)+" "+str(pipe.score(testlist, ans)))
+        print(" ")
 
     word_list = re.findall(r"^#", "# include ")
     print(word_list)
     print(len(word_list))
-
-
-    custom_feature = CustomFeaturizer()
-    matrix = custom_feature.transform(contents)
-    print_matrix(matrix, 10)
-    #print(" score2 "+str(pipe2.score(contents, ltype)))
-
-    #print(" pred1 "+str(pipe1.predict(testlist)))
-    #print(" pred2 "+str(pipe2.predict(testlist)))
-
-    #print(" score1 "+str(pipe1.score(testlist, ans)))
-    #print(" score2 "+str(pipe2.score(testlist, ans)))

From 5994e5a25a193e0df78010a39116fadb03a82eb0 Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Fri, 5 Jun 2015 14:11:13 -0400
Subject: [PATCH 5/9] fixed careless error, classifying up to 50% now

---
 .gitignore          |  1 +
 lclassifier.py      | 96 ++++++++++++++++++++++++++++++++-------------
 test_lclassifier.py | 31 +++++++++++++++
 3 files changed, 100 insertions(+), 28 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6d081c5..a7c3822 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,4 @@ target/
 benchmarksgame-2014-08-31/
 
 .direnv/
+test_w_ext/
diff --git a/lclassifier.py b/lclassifier.py
index 286c893..962743c 100644
--- a/lclassifier.py
+++ b/lclassifier.py
@@ -10,6 +10,7 @@
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.linear_model import SGDClassifier
+from sklearn.ensemble import RandomForestClassifier
 # other utilities
 import csv
 import re
@@ -50,8 +51,8 @@ def clean_ext(text):
 
 def load_file_names():
     l = [0 for i in range(5)]
-    s = "benchmarksgame-2014-08-31/benchmarksgame/"
-    max_lvl = 5
+    s = "benchmarksgame-2014-08-31/benchmarksgame/bench/"
+    max_lvl = 4
     for i in range(max_lvl):
         l[i] = glob(s+"*/"*i+"*.*")
 #    l[0] = glob("benchmarksgame-2014-08-31/benchmarksgame/*/*/*/*/*.*")
@@ -97,12 +98,14 @@ def load_files(filelist, testlist):
             print(ext, end=" : ")
     print(" ")
 
-    testcont = []
+    testcont = [0] * 32
     for filename in testlist:
     #    print(filename)
         with open(filename) as file:
-            testcont.append(file.read())
-
+            di = filename.rfind("/")
+            i = int(filename[di+1:])
+            print(filename+" "+str(i))
+            testcont[i-1] = file.read()
     print(" ")
     return contents, ltype, testcont
     #print(testcont[15])
@@ -115,6 +118,7 @@ def read_answers():
         print(ans_list)
         for row in ans_list:
             ans.append(clean_ext(row[1]))
+#            print(row[0])
     return ans
 
 
@@ -163,32 +167,52 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, X):
-        char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b",
-                    "this\.", "^end", ";", "\*", "%", "^do",
-                    "\<\$php", "/\*", "__", "=", "==",
-                    "===", "\(\)", "\{\}", ":", "\+\+", "\+=",
-                    "^#include", "^ \*", ":\s*$", "\<\<|\>\>",
-                    "int", "\b\*\w", "\(&\w", "argv", "\[\]"
-                    "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w",
-                    "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{",
-                    "\(\w+:", "@", "\b@\w"]
-        word_list = ["private", "static", "make","let", "def", "^\(defn",
-                     "defn", "do", "class", "^function", "public",
-                     "unset", "printf\(", "return", "NULL", "void",
-                     "main\(", "main_", "void\s\*\w", "\{else\}",
-                     "char", "array\(", "__init__", "__str__", "token",
-                     "^import", "^from", "final", "val", "type", "package",
-                     "object", "String", "string", "primitive", "fixnum",
-                     "error", "try"]
-        reg_list = char_list + word_list
+        # char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b",
+        #             "this\.", "^end", ";", "\*", "%", "^do",
+        #             "\<\$php", "/\*", "__", "=", "==",
+        #             "===", "\(\)", "\{\}", ":", "\+\+", "\+=",
+        #             "^#include", "^ \*", ":\s*$", "\<\<|\>\>",
+        #             "int", "\b\*\w", "\(&\w", "argv", "\[\]"
+        #             "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w",
+        #             "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{",
+        #             "\(\w+:", "@", "\b@\w"]
+        # word_list = ["private", "static", "make","let", "def", "^\(defn",
+        #              "defn", "do", "class", "^function", "public",
+        #              "unset", "printf\(", "return", "NULL", "void",
+        #              "main\(", "main_", "void\s\*\w", "\{else\}",
+        #              "char", "array\(", "__init__", "__str__", "token",
+        #              "^import", "^from", "final", "val", "type", "package",
+        #              "object", "String", "string", "primitive", "fixnum",
+        #              "error", "try"]
+        clojure = ["^\s*\(\w.*\s*$", "\(:\w+[]\s\w+]*\)"]
+        python = ["\):[ \t]*\n[ \t]*\w", "\s__\w*__\(", "(^from|^import)\s",
+                  "def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"]
+        js = ["^[ \t]var", "=\s*function",
+              "function\s*\w*\(\w*[\w\s,]*\)\s*\{"]
+        ruby = ["^[ \t]*end$", "^[ \t]*def *\w*(\(\w*\))?[ \t]*$",
+                "^[ \t]*include \w*[ \t]*$", "^[ \t]*@", "super"]
+        hs = ["&&&", "^\{-"]
+        clj = ["^\(define", "^[ \t]*;+"]
+        java = ["^[ \t]*public \w* \w*", "^[ \t]*\*", "^[ \t]*/\*\*"]
+        scl = ["^[ \t]*object \w*", "^[ \t]*(final)?val \w* ="]
+        tcl = ["^[ \t]*proc \w*::\w* \{"]
+        php = ["^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*",
+                "^[ \t]*\$\w* ?=.*;$"]
+        ocaml = ["^[ \t]*let \w+", "^[ \t]*struct[ \t]*$"]
+        perl = ["^[ \t]*my ", "^[ \t]*sub \w* \{"]
+        gcc = ["^[ \t]*typedef \w* \w* ?\{", "^#include ?\<"]
+#        reg_list = char_list + word_list
+        reg_list = clojure + python + js + ruby + hs + clj + java + scl\
+                   + tcl + php + ocaml + perl + gcc
         matrix = []
         for text in X:
             vector = []
             for reg_expr in reg_list:
-                prog = re.compile(reg_expr)
+#                print(reg_expr)
+                prog = re.compile(reg_expr, flags=re.MULTILINE)
                 val = len(prog.findall(text))/len(text)
-                if val > 0:
-                    val = 1
+                #if val > 0:
+                #    val = 1
                 vector.append(val)
             matrix.append(vector)
         return matrix
@@ -208,6 +232,19 @@ def fit4(contents, ltype):
     return pipe
 
 
+def fit4(contents, ltype):
+    custom_feature = CustomFeaturizer()
+    pipe = make_pipeline(custom_feature, MultinomialNB())
+    pipe.fit(contents, ltype)
+    return pipe
+
+
+def fit4(contents, ltype):
+    custom_feature = CustomFeaturizer()
+    pipe = make_pipeline(custom_feature, RandomForestClassifier())
+    pipe.fit(contents, ltype)
+    return pipe
+
 #sms_featurizer = CustomFeaturizer(longest_run_of_capital_letters_feature,
 #                                  percent_periods_feature)
 #big_list = sms_featurizer.transform(sms_data[:10])
@@ -225,6 +262,9 @@ def fit4(contents, ltype):
         pipel[i] = plist[i](X, y)
     #pipe1 = fit1(contents, ltype)
     #pipe2 = fit2(contents, ltype)
+    pipe = fit4(X, y)
+    #print(pipe.transform(testlist))
+    #print(testcont)
 
     ans = read_answers()
     print(ans)
@@ -234,8 +274,8 @@ def fit4(contents, ltype):
         i += 1
         print(" score_train "+str(i)+" "+str(pipe.score(X, y)))
         print(" score_test  "+str(i)+" "+str(pipe.score(Xt, yt)))
-        print(" score_quest "+str(i)+" "+str(pipe.score(testlist, ans)))
-        print(" pred "+str(i)+" "+str(pipe.predict(testlist)))
+        print(" score_quest "+str(i)+" "+str(pipe.score(testcont, ans)))
+        print(" pred "+str(i)+" "+str(pipe.predict(testcont)))
         print(" ")
 
     word_list = re.findall(r"^#", "# include ")
diff --git a/test_lclassifier.py b/test_lclassifier.py
index 431962e..502b789 100644
--- a/test_lclassifier.py
+++ b/test_lclassifier.py
@@ -7,3 +7,34 @@ def test_ext():
 def test_correct_ext():
     ext = "perl"
     assert clean_ext(ext) == "pl"
+
+def test_reg_use():
+    reg_expr = "\s__\w*__\("
+    prog = re.compile(reg_expr)
+    text ='''import packlag
+def __init__(self):
+    var = thing'''
+    val = prog.findall(text)
+    print(val)
+    assert len(val) == 1
+
+    reg_expr = "\):[ \t]*\n[ \t]*\w"
+    prog = re.compile(reg_expr)
+    val = prog.findall(text)
+    print(val)
+    assert len(val) == 1
+
+    reg_expr = "(^from|^import)\s"
+    prog = re.compile(reg_expr)
+    val = prog.findall(text)
+    print(val)
+    assert len(val) == 1
+
+    textjs = '''function noAction() {
+    }
+    '''
+    reg_expr = "function\s*\w*\(\w*[\w\s,]*\)\s*\{"
+    prog = re.compile(reg_expr)
+    val = prog.findall(textjs)
+    print(val)
+    assert len(val) == 1

From 585fe3a93b0f9d05034a3823c2f26858dcbbf510 Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Sat, 6 Jun 2015 10:26:13 -0400
Subject: [PATCH 6/9] match rates up to 70%, organized directory structure

---
 Lang_classifier_use.ipynb                     | 186 ++++++++++++++++++
 lclassifier.py => lclassifier/lclassifier.py  | 104 ++++++++--
 lclassifier/output.txt                        | 129 ++++++++++++
 .../tests/test_lclassifier.py                 |   0
 ref_program.py                                |  54 +++++
 5 files changed, 454 insertions(+), 19 deletions(-)
 create mode 100644 Lang_classifier_use.ipynb
 rename lclassifier.py => lclassifier/lclassifier.py (76%)
 create mode 100644 lclassifier/output.txt
 rename test_lclassifier.py => lclassifier/tests/test_lclassifier.py (100%)
 create mode 100644 ref_program.py

diff --git a/Lang_classifier_use.ipynb b/Lang_classifier_use.ipynb
new file mode 100644
index 0000000..3f20fdf
--- /dev/null
+++ b/Lang_classifier_use.ipynb
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from lclassifier.lclassifier import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "acceptable_file(\"py\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   total samples 931\n",
+      " number of usable files 656\n",
+      " \n",
+      " number of read file types:  32\n",
+      " number of recognized types: 14\n",
+      " summary of tile types\n",
+      "ats             \n",
+      "clj            38  \n",
+      "cs             41  \n",
+      "dart            \n",
+      "erlang          \n",
+      "fpascal         \n",
+      "fsharp          \n",
+      "c              129  \n",
+      "hs             33  \n",
+      "gnat            \n",
+      "go              \n",
+      "php            55  \n",
+      "ifc             \n",
+      "java           51  \n",
+      "js             25  \n",
+      "ruby           73  \n",
+      "lua             \n",
+      "ocaml          35  \n",
+      "oz              \n",
+      "pl             34  \n",
+      "py             36  \n",
+      "racket         29  \n",
+      "rust            \n",
+      "sbcl           34  \n",
+      "scala          43  \n",
+      "vw              \n",
+      "cint            \n",
+      "javasteady      \n",
+      "parrot          \n",
+      "cc              \n",
+      "txt             \n",
+      "ozf             \n",
+      " not included:  \n",
+      " \n"
+     ]
+    }
+   ],
+   "source": [
+    "filelist, testlist = load_file_names()\n",
+    "contents, ltype, testcont = load_files(filelist, testlist)\n",
+    "\n",
+    "plist = [fit2, fit3, fit4, fit5, fit6]\n",
+    "\n",
+    "X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "pipe = fit5(Xt, yt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['hs', 'c', 'java', 'ruby', 'clj', 'hs', 'racket', 'c', 'clj',\n",
+       "       'ruby', 'c', 'pl', 'c', 'c', 'scala', 'ruby', 'cs', 'c', 'cs',\n",
+       "       'sbcl', 'java', 'ruby', 'pl', 'ocaml', 'ocaml', 'cs', 'hs', 'php',\n",
+       "       'c', 'pl', 'c', 'scala', 'ruby', 'c', 'cs', 'c', 'ocaml', 'hs',\n",
+       "       'scala', 'c', 'hs', 'ruby', 'c', 'c', 'ocaml', 'sbcl', 'ruby', 'c',\n",
+       "       'c', 'ruby', 'c', 'ocaml', 'java', 'c', 'ruby', 'ruby', 'php', 'c',\n",
+       "       'clj', 'cs', 'php', 'java', 'c', 'ruby', 'py', 'cs', 'scala', 'php',\n",
+       "       'c', 'js', 'cs', 'js', 'c', 'php', 'php', 'php', 'pl', 'c', 'ruby',\n",
+       "       'clj', 'php', 'c', 'pl', 'py', 'clj', 'c', 'c', 'py', 'sbcl', 'py',\n",
+       "       'ruby', 'php', 'py', 'php', 'c', 'php', 'ruby', 'ruby', 'ruby',\n",
+       "       'scala', 'py', 'ruby', 'clj', 'php', 'ruby', 'c', 'ocaml', 'racket',\n",
+       "       'php', 'hs', 'hs', 'sbcl', 'ocaml', 'py', 'scala', 'ruby', 'cs',\n",
+       "       'c', 'c', 'c', 'c', 'clj', 'sbcl', 'scala', 'cs', 'py', 'c', 'cs',\n",
+       "       'cs', 'hs', 'c', 'java', 'php', 'java', 'js', 'clj', 'ruby', 'c',\n",
+       "       'hs', 'ruby', 'c', 'php', 'py', 'scala', 'clj', 'cs', 'cs', 'ruby',\n",
+       "       'sbcl', 'cs', 'scala', 'cs', 'c', 'scala', 'clj', 'c', 'clj', 'pl',\n",
+       "       'ruby', 'racket', 'java', 'cs', 'js', 'ocaml', 'c', 'py', 'c',\n",
+       "       'scala', 'js', 'clj', 'c', 'clj', 'ruby', 'clj', 'racket', 'c',\n",
+       "       'ocaml', 'js', 'pl', 'java', 'hs', 'java', 'py', 'php', 'java',\n",
+       "       'ruby', 'sbcl', 'ruby', 'php', 'scala', 'py', 'c', 'racket', 'php',\n",
+       "       'c', 'js', 'java', 'php', 'java', 'pl', 'c', 'py', 'php', 'py', 'c',\n",
+       "       'cs', 'py', 'cs', 'c', 'c', 'clj', 'java', 'ocaml', 'cs', 'java',\n",
+       "       'ocaml', 'cs'], \n",
+       "      dtype='<U6')"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pipe.predict(Xt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/lclassifier.py b/lclassifier/lclassifier.py
similarity index 76%
rename from lclassifier.py
rename to lclassifier/lclassifier.py
index 962743c..bd14944 100644
--- a/lclassifier.py
+++ b/lclassifier/lclassifier.py
@@ -11,6 +11,7 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.linear_model import SGDClassifier
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.base import TransformerMixin
 # other utilities
 import csv
 import re
@@ -22,7 +23,8 @@ def acceptable_file(text):
     else:
         return False
 
-def clean_ext(text):
+def clean_ext(textp):
+    text = textp.strip()
     if text == "gcc" or text == "h" or text == "gpp":
         return "c"
     elif text == "hack":
@@ -31,7 +33,7 @@ def clean_ext(text):
         return "ruby"
     elif text == "clojure":
         return "clj"
-    elif text == "python3" and text == "python":
+    elif text == "python3" or text == "python":
         return "py"
     elif text == "perl":
         return "pl"
@@ -49,6 +51,13 @@ def clean_ext(text):
 llist = ["c", "cs", "sbcl", "clj", "hs", "java", "js",
          "ocaml", "pl", "php", "py", "ruby", "scala", "racket"]
 
+def list_uniques(alist):
+    rlist = []
+    for item in alist:
+        if item not in rlist:
+            rlist.append(item)
+    return rlist
+
 def load_file_names():
     l = [0 for i in range(5)]
     s = "benchmarksgame-2014-08-31/benchmarksgame/bench/"
@@ -74,6 +83,8 @@ def load_files(filelist, testlist):
     for filename in filelist:
         i = filename.rfind(".")
         ext = clean_ext(filename[i+1:])
+        if ext == "tcl":
+            print(filename)
     #    print(ext, end=" - ")
     #    print(ext+ str(ext in ext_list) + " - "+str(ext_list))
         if not ext in ext_list:
@@ -86,6 +97,9 @@ def load_files(filelist, testlist):
 #    return contents, ltype
 
     print(" number of usable files "+str(len(ltype)))
+    print(" ")
+    print(" number of read file types:  "+str(len(ext_list)))
+    print(" number of recognized types: "+str(len(llist)))
     print(" summary of tile types")
     for ext in ext_list:
         print(ext.ljust(12)+ "  ", end=" ")
@@ -104,11 +118,10 @@ def load_files(filelist, testlist):
         with open(filename) as file:
             di = filename.rfind("/")
             i = int(filename[di+1:])
-            print(filename+" "+str(i))
+#            print(filename+" "+str(i))
             testcont[i-1] = file.read()
     print(" ")
     return contents, ltype, testcont
-    #print(testcont[15])
     #print(testlist)
 
 def read_answers():
@@ -118,6 +131,7 @@ def read_answers():
         print(ans_list)
         for row in ans_list:
             ans.append(clean_ext(row[1]))
+    print(" number of testing file types: "+str(len(list_uniques(ans))))
 #            print(row[0])
     return ans
 
@@ -156,7 +170,7 @@ def print_matrix(matrix, p_max=None):
         #print([str(round(val, 3)) for val in vector])
 
 
-class CustomFeaturizer:
+class CustomFeaturizer(TransformerMixin):
     def __init__(self):
         pass
         #self.featurizers = featurizers
@@ -184,37 +198,45 @@ def transform(self, X):
         #              "^import", "^from", "final", "val", "type", "package",
         #              "object", "String", "string", "primitive", "fixnum",
         #              "error", "try"]
-        clojure = ["^\s*\(\w.*\s*$", "\(:\w+[]\s\w+]*\)"]
+        cish = ["^[ \t]*\*", "^[ \t]*/\*\*"]
+        clojure = ["^\s*\(\w.*\s*\)$", "^[ \t]*;", "\(def(n)? "]
         python = ["\):[ \t]*\n[ \t]*\w", "\s__\w*__\(", "(^from|^import)\s",
                   "def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"]
-        js = ["^[ \t]var", "=\s*function",
+        js = ["^[ \t]*var", "=\s*function",
               "function\s*\w*\(\w*[\w\s,]*\)\s*\{"]
         ruby = ["^[ \t]*end$", "^[ \t]*def *\w*(\(\w*\))?[ \t]*$",
                 "^[ \t]*include \w*[ \t]*$", "^[ \t]*@", "super"]
         hs = ["&&&", "^\{-"]
         clj = ["^\(define", "^[ \t]*;+"]
-        java = ["^[ \t]*public \w* \w*", "^[ \t]*\*", "^[ \t]*/\*\*"]
+        java = ["^[ \t]*public \w* \w*", "^import .*;$"]
         scl = ["^[ \t]*object \w*", "^[ \t]*(final)?val \w* ="]
         tcl = ["^[ \t]*proc \w*::\w* \{"]
         php = ["^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*",
                 "^[ \t]*\$\w* ?=.*;$"]
         ocaml = ["^[ \t]*let \w+", "^[ \t]*struct[ \t]*$"]
         perl = ["^[ \t]*my ", "^[ \t]*sub \w* \{"]
-        gcc = ["^[ \t]*typedef \w* \w* ?\{", "^#include ?\<"]
+        gcc = ["^[ \t]*typedef \w* \w* ?\{", "^#include ?\<",
+               "^using .*;$", "sealed"]
 #        reg_list = char_list + word_list
         reg_list = clojure + python + js + ruby + hs + clj + java + scl\
-                   + tcl + php + ocaml + perl + gcc
+                   + tcl + php + ocaml + perl + gcc + cish
+#        print(len(reg_list))
         matrix = []
         for text in X:
-            vector = []
-            for reg_expr in reg_list:
+            v = [0] * len(reg_list)
+#            print(str(len(v))+" "+str(len(reg_list)))
+            for i in range(len(reg_list)):
 #                print(reg_expr)
+                reg_expr = reg_list[i]
                 prog = re.compile(reg_expr, flags=re.MULTILINE)
-                val = len(prog.findall(text))/len(text)
+                val = len(prog.findall(text))#/len(text)
                 #if val > 0:
                 #    val = 1
-                vector.append(val)
-            matrix.append(vector)
+#                print(i)
+                v[i] = val
+#            print(vector)
+            matrix.append(v)
+#        print(matrix[0])
         return matrix
 
 
@@ -232,19 +254,40 @@ def fit4(contents, ltype):
     return pipe
 
 
-def fit4(contents, ltype):
+def fit5(contents, ltype):
     custom_feature = CustomFeaturizer()
     pipe = make_pipeline(custom_feature, MultinomialNB())
     pipe.fit(contents, ltype)
     return pipe
 
 
-def fit4(contents, ltype):
+def fit6(contents, ltype):
     custom_feature = CustomFeaturizer()
     pipe = make_pipeline(custom_feature, RandomForestClassifier())
     pipe.fit(contents, ltype)
     return pipe
 
+
+def demo_class(X, y):
+    types = []
+    for ext in y:
+        if ext not in types:
+            types.append(ext)
+    typecont = [""] * len(types)
+    for i in range(len(X)):
+        text = X[i]
+        for j in range(len(types)):
+            ext = types[j]
+            if ext == y[i]:
+                typecont[j] += text
+    custom_feature = CustomFeaturizer()
+    M = custom_feature.transform(typecont)
+    for j in range(len(M)):
+        print(types[j].ljust(8)+" ", end="")
+        for k in range(len(M[0])):
+            print(str(int(M[j][k])).ljust(5), end="")
+        print("")
+
 #sms_featurizer = CustomFeaturizer(longest_run_of_capital_letters_feature,
 #                                  percent_periods_feature)
 #big_list = sms_featurizer.transform(sms_data[:10])
@@ -254,7 +297,7 @@ def fit4(contents, ltype):
     filelist, testlist = load_file_names()
     contents, ltype, testcont = load_files(filelist, testlist)
 
-    plist = [fit1, fit2, fit3, fit4]
+    plist = [fit2, fit3, fit4, fit5, fit6]
 
     X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)
     pipel = [0 for i in range(len(plist))]
@@ -263,7 +306,28 @@ def fit4(contents, ltype):
     #pipe1 = fit1(contents, ltype)
     #pipe2 = fit2(contents, ltype)
     pipe = fit4(X, y)
-    #print(pipe.transform(testlist))
+    M = pipe.transform(testcont)
+    print(str(len(M))+" "+str(len(M[0])))
+#    print(M[0])
+    M = pipe.transform(Xt)
+    print(str(len(M))+" "+str(len(M[0])))
+    print("  failed to classify")
+    A = pipe.predict(X)
+    for i in range(len(A)):
+        if A[i] != y[i]:
+#            print(" ")
+            print(y[i].ljust(6)+" misclassified as "+A[i])
+#            print(X[i])
+#    print(M[0])
+
+
+    cf = CustomFeaturizer()
+    M = cf.transform(testcont)
+    print(str(len(M))+" "+str(len(M[0])))
+#    print(M[0])
+    M = cf.transform(Xt)
+    print(str(len(M))+" "+str(len(M[0])))
+#    print(M[0])
     #print(testcont)
 
     ans = read_answers()
@@ -281,3 +345,5 @@ def fit4(contents, ltype):
     word_list = re.findall(r"^#", "# include ")
     print(word_list)
     print(len(word_list))
+
+    demo_class(testcont, ans)
diff --git a/lclassifier/output.txt b/lclassifier/output.txt
new file mode 100644
index 0000000..987c632
--- /dev/null
+++ b/lclassifier/output.txt
@@ -0,0 +1,129 @@
+   total samples 931
+ number of usable files 656
+ 
+ number of read file types:  32
+ number of recognized types: 14
+ summary of tile types
+ats             
+clj            38  
+cs             41  
+dart            
+erlang          
+fpascal         
+fsharp          
+c              129  
+hs             33  
+gnat            
+go              
+php            55  
+ifc             
+java           51  
+js             25  
+ruby           73  
+lua             
+ocaml          35  
+oz              
+pl             34  
+py             36  
+racket         29  
+rust            
+sbcl           34  
+scala          43  
+vw              
+cint            
+javasteady      
+parrot          
+cc              
+txt             
+ozf             
+ not included:  
+ 
+32 15
+217 15
+  failed to classify
+hs     misclassified as py
+racket misclassified as sbcl
+racket misclassified as sbcl
+racket misclassified as sbcl
+clj    misclassified as sbcl
+racket misclassified as sbcl
+hs     misclassified as pl
+racket misclassified as sbcl
+scala  misclassified as c
+clj    misclassified as sbcl
+racket misclassified as sbcl
+hs     misclassified as py
+hs     misclassified as py
+racket misclassified as sbcl
+cs     misclassified as js
+clj    misclassified as sbcl
+cs     misclassified as py
+scala  misclassified as py
+clj    misclassified as sbcl
+racket misclassified as sbcl
+racket misclassified as sbcl
+ruby   misclassified as racket
+racket misclassified as sbcl
+scala  misclassified as py
+clj    misclassified as sbcl
+js     misclassified as racket
+hs     misclassified as pl
+hs     misclassified as sbcl
+racket misclassified as sbcl
+racket misclassified as sbcl
+js     misclassified as racket
+scala  misclassified as py
+racket misclassified as sbcl
+32 36
+217 36
+<_csv.reader object at 0x10855d908>
+ number of testing file types: 11
+['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']
+ score_train 1 0.986332574032
+ score_test  1 0.898617511521
+ score_quest 1 0.59375
+ pred 1 ['clj' 'clj' 'clj' 'ruby' 'py' 'py' 'ruby' 'py' 'js' 'js' 'clj' 'php'
+ 'ruby' 'clj' 'ruby' 'hs' 'hs' 'ruby' 'sbcl' 'racket' 'racket' 'java'
+ 'ruby' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'php' 'ocaml' 'ocaml']
+ 
+ score_train 2 1.0
+ score_test  2 0.972350230415
+ score_quest 2 0.65625
+ pred 2 ['clj' 'clj' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
+ 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'ruby' 'ruby' 'sbcl' 'java' 'java' 'scala'
+ 'scala' 'ruby' 'ruby' 'php' 'php' 'java' 'ocaml' 'ruby']
+ 
+ score_train 3 0.833712984055
+ score_test  3 0.78801843318
+ score_quest 3 0.71875
+ pred 3 ['clj' 'clj' 'clj' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
+ 'ruby' 'ruby' 'hs' 'racket' 'hs' 'scala' 'racket' 'scala' 'java' 'racket'
+ 'scala' 'scala' 'racket' 'racket' 'java' 'php' 'php' 'ocaml' 'ocaml']
+ 
+ score_train 4 0.974943052392
+ score_test  4 0.981566820276
+ score_quest 4 0.71875
+ pred 4 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
+ 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'java' 'c' 'scala'
+ 'scala' 'c' 'c' 'java' 'php' 'js' 'ocaml' 'ocaml']
+ 
+ score_train 5 1.0
+ score_test  5 0.976958525346
+ score_quest 5 0.59375
+ pred 5 ['ruby' 'ruby' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
+ 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'ruby' 'ruby' 'java' 'java' 'hs'
+ 'scala' 'ruby' 'ruby' 'php' 'php' 'cs' 'ocaml' 'ruby']
+ 
+['#']
+1
+clj      10   4    8    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    4    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
+py       0    0    0    74   18   10   60   0    0    0    0    0    0    0    8    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
+js       1    2    0    0    0    0    0    42   20   86   0    0    0    0    0    0    0    0    2    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    2    0    
+ruby     0    0    0    0    0    0    0    0    0    0    20   11   3    7    4    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
+haskell  0    0    0    0    0    59   0    0    0    0    0    0    0    0    0    1    10   0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    
+racket   126  5    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    52   5    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
+java     0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    8    0    0    0    0    0    0    0    0    0    0    0    0    0    0    78   13   
+scala    1    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    5    7    0    0    0    0    0    0    0    0    0    0    2    2    0    
+tcl      0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    2    0    0    0    0    0    0    0    0    0    0    0    0    
+php      0    0    0    0    0    0    0    0    0    8    0    0    0    0    0    0    0    0    0    17   0    0    0    0    11   5    0    0    0    0    0    0    0    0    92   26   
+ocaml    6    0    0    0    0    0    0    0    4    0    2    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    35   1    0    0    0    0    0    0    1    0    
diff --git a/test_lclassifier.py b/lclassifier/tests/test_lclassifier.py
similarity index 100%
rename from test_lclassifier.py
rename to lclassifier/tests/test_lclassifier.py
diff --git a/ref_program.py b/ref_program.py
new file mode 100644
index 0000000..d6b7a6d
--- /dev/null
+++ b/ref_program.py
@@ -0,0 +1,54 @@
+import csv
+import re
+import numpy as np
+import random
+
+#from textblob import TextBlob
+from collections import Counter
+
+from sklearn.pipeline import make_pipeline, make_union
+from sklearn.base import TransformerMixin
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.cross_validation import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix
+
+
+class DumbFeaturizer(TransformerMixin):
+    def __init__(self):
+        pass
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        matrix = []
+        for i in range(len(X)):
+            vector = []
+            for j in range(11):
+                if j == X[i]:
+                    vector.append(1)
+                else:
+                    vector.append(0)
+            matrix.append(vector)
+        return matrix
+
+N = 22
+y = [0] * N
+X = [0] * N
+for k in range(N):
+    val = random.randrange(11)
+    y[k] = val
+    X[k] = val
+
+
+dumb = DumbFeaturizer()
+print(dumb.transform(X))
+
+pipe = make_pipeline(dumb, DecisionTreeClassifier())
+pipe.fit(X, y)
+# Our baseline
+print(pipe.score(X, y))
+print(" ")
+print(" transform ")
+print(pipe.transform(X))

From cef4b3df3857713f5d1b1d8a8053f047c5bb7804 Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Sat, 6 Jun 2015 23:49:09 -0400
Subject: [PATCH 7/9] added command line argument feature, trying to wrap up

---
 lclassifier/lclassifier.py | 93 ++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 40 deletions(-)

diff --git a/lclassifier/lclassifier.py b/lclassifier/lclassifier.py
index bd14944..3d5502e 100644
--- a/lclassifier/lclassifier.py
+++ b/lclassifier/lclassifier.py
@@ -15,6 +15,7 @@
 # other utilities
 import csv
 import re
+import sys
 
 
 def acceptable_file(text):
@@ -60,7 +61,7 @@ def list_uniques(alist):
 
 def load_file_names():
     l = [0 for i in range(5)]
-    s = "benchmarksgame-2014-08-31/benchmarksgame/bench/"
+    s = "../benchmarksgame-2014-08-31/benchmarksgame/bench/"
     max_lvl = 4
     for i in range(max_lvl):
         l[i] = glob(s+"*/"*i+"*.*")
@@ -70,7 +71,7 @@ def load_file_names():
     filelist = []
     for i in range(max_lvl):
         filelist += l[i]
-    testlist = glob("test/*")
+    testlist = glob("../test/*")
 
     print("   total samples "+str(len(filelist)))
     return filelist, testlist
@@ -125,7 +126,7 @@ def load_files(filelist, testlist):
     #print(testlist)
 
 def read_answers():
-    with open("test.csv") as csvfile:
+    with open("../test.csv") as csvfile:
         ans_list = csv.reader(csvfile, delimiter=",")
         ans = []
         print(ans_list)
@@ -224,19 +225,14 @@ def transform(self, X):
         matrix = []
         for text in X:
             v = [0] * len(reg_list)
-#            print(str(len(v))+" "+str(len(reg_list)))
             for i in range(len(reg_list)):
-#                print(reg_expr)
                 reg_expr = reg_list[i]
                 prog = re.compile(reg_expr, flags=re.MULTILINE)
                 val = len(prog.findall(text))#/len(text)
                 #if val > 0:
                 #    val = 1
-#                print(i)
                 v[i] = val
-#            print(vector)
             matrix.append(v)
-#        print(matrix[0])
         return matrix
 
 
@@ -288,12 +284,8 @@ def demo_class(X, y):
             print(str(int(M[j][k])).ljust(5), end="")
         print("")
 
-#sms_featurizer = CustomFeaturizer(longest_run_of_capital_letters_feature,
-#                                  percent_periods_feature)
-#big_list = sms_featurizer.transform(sms_data[:10])
-#print(big_list)
 
-if __name__ == "__main__":
+def default_action():
     filelist, testlist = load_file_names()
     contents, ltype, testcont = load_files(filelist, testlist)
 
@@ -301,34 +293,40 @@ def demo_class(X, y):
 
     X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)
     pipel = [0 for i in range(len(plist))]
+    print(" score for    training_set     test_set")
+    for i in range(len(plist)):
+        pipe = plist[i](X, y)
+        print(str(i).ljust(4)+" "+str(round(pipe.score(X, y),4)).ljust(8)\
+              +str(round(pipe.score(Xt, yt),4)).ljust(8))
+    print(" ")
     for i in range(len(plist)):
-        pipel[i] = plist[i](X, y)
-    #pipe1 = fit1(contents, ltype)
-    #pipe2 = fit2(contents, ltype)
-    pipe = fit4(X, y)
-    M = pipe.transform(testcont)
-    print(str(len(M))+" "+str(len(M[0])))
-#    print(M[0])
-    M = pipe.transform(Xt)
-    print(str(len(M))+" "+str(len(M[0])))
+        pipel[i] = plist[i](contents, ltype)
+
     print("  failed to classify")
+    failed_to_classify = {}
+    wrongly_classified = {}
     A = pipe.predict(X)
     for i in range(len(A)):
         if A[i] != y[i]:
 #            print(" ")
             print(y[i].ljust(6)+" misclassified as "+A[i])
-#            print(X[i])
-#    print(M[0])
-
-
-    cf = CustomFeaturizer()
-    M = cf.transform(testcont)
-    print(str(len(M))+" "+str(len(M[0])))
-#    print(M[0])
-    M = cf.transform(Xt)
-    print(str(len(M))+" "+str(len(M[0])))
-#    print(M[0])
-    #print(testcont)
+            if y[i] in failed_to_classify:
+                failed_to_classify[y[i]] += 1
+            else:
+                failed_to_classify[y[i]] = 1
+            if A[i] in wrongly_classified:
+                wrongly_classified[A[i]] += 1
+            else:
+                wrongly_classified[A[i]] = 1
+    print("")
+    print(" failure counts")
+    print("  wrongly classified:")
+    for ext in wrongly_classified:
+        print(ext.ljust(7) + "#"*wrongly_classified[ext])
+    print("  failed to classify")
+    for ext in failed_to_classify:
+        print(ext.ljust(7) + "#"*failed_to_classify[ext])
+    print(" ")
 
     ans = read_answers()
     print(ans)
@@ -336,14 +334,29 @@ def demo_class(X, y):
     i = 0
     for pipe in pipel:
         i += 1
-        print(" score_train "+str(i)+" "+str(pipe.score(X, y)))
-        print(" score_test  "+str(i)+" "+str(pipe.score(Xt, yt)))
         print(" score_quest "+str(i)+" "+str(pipe.score(testcont, ans)))
         print(" pred "+str(i)+" "+str(pipe.predict(testcont)))
         print(" ")
 
-    word_list = re.findall(r"^#", "# include ")
-    print(word_list)
-    print(len(word_list))
-
     demo_class(testcont, ans)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        default_action()
+    elif len(sys.argv) == 2:
+        test_file = sys.argv[1]
+        print("Estimating file type of "+ test_file)
+
+        filelist, testlist = load_file_names()
+        X, y, testcont = load_files(filelist, testlist)
+        pipe = fit6(X, y)
+        with open(test_file) as f:
+            test_contents = f.read()
+#        print(test_contents)
+        est_ext = pipe.predict([test_contents])
+
+        print("Predicted extension: "+str(est_ext))
+
+    else:
+        print("error: command line arguments not supported")

From cfff3c0b5a4a7fa04703b8a3a319a7841325995b Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Sun, 7 Jun 2015 10:26:33 -0400
Subject: [PATCH 8/9] polished workbook

---
 Lang_classifier_use.ipynb  | 289 ++++++++++++++++++++++++++++++++-----
 lclassifier/lclassifier.py |  92 ++++++------
 lclassifier/output.txt     | 111 +++++---------
 3 files changed, 343 insertions(+), 149 deletions(-)

diff --git a/Lang_classifier_use.ipynb b/Lang_classifier_use.ipynb
index 3f20fdf..d66060b 100644
--- a/Lang_classifier_use.ipynb
+++ b/Lang_classifier_use.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 1,
    "metadata": {
     "collapsed": false
    },
@@ -11,9 +11,16 @@
     "from lclassifier.lclassifier import *"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Demonstration of Language Classifier (lclassifier)"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 2,
    "metadata": {
     "collapsed": false
    },
@@ -24,18 +31,40 @@
        "True"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "acceptable_file(\"py\")"
+    "acceptable_file(\"py\") # testing that import is functional"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'py'"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clean_ext(\"python3\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "metadata": {
     "collapsed": false
    },
@@ -98,18 +127,227 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "pipe = fit6(Xt, yt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "testing set outcomes\n",
+      "scala    scala   \n",
+      "c        c       \n",
+      "c        c       \n",
+      "clj      clj     \n",
+      "java     java    \n",
+      "py       py      \n",
+      "clj      clj     \n",
+      "js       js      \n",
+      "c        c       \n",
+      "pl       pl      \n",
+      "cs       cs      \n",
+      "c        c       \n",
+      "c        c       \n",
+      "ocaml    ocaml   \n",
+      "hs       hs      \n",
+      "sbcl     sbcl    \n",
+      "racket   racket  \n",
+      "php      php     \n",
+      "pl       pl      \n",
+      "ocaml    ocaml   \n",
+      "\n",
+      " overall score: 1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "M = pipe.predict(Xt)\n",
+    "print(\"testing set outcomes\")\n",
+    "for i in range(20):\n",
+    "    print(M[i].ljust(8)+ \" \" + yt[i].ljust(8))\n",
+    "print(\"\")\n",
+    "print(\" overall score: \"+str(pipe.score(Xt, yt)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "## Test Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['php', 'php', 'php', 'clj', 'py', 'py', 'py', 'py', 'js', 'js',\n",
+       "       'js', 'js', 'ruby', 'ruby', 'ruby', 'hs', 'php', 'hs', 'racket',\n",
+       "       'php', 'racket', 'java', 'java', 'scala', 'scala', 'php', 'php',\n",
+       "       'java', 'php', 'java', 'ocaml', 'php'], \n",
+       "      dtype='<U6')"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pipe.predict(testcont) # prediction of file types for the data given"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<_csv.reader object at 0x10bfd6198>\n",
+      " number of testing file types: 11\n",
+      " actual_file_type  predicted_type\n",
+      "clj       php       \n",
+      "clj       php       \n",
+      "clj       php       \n",
+      "clj       clj       \n",
+      "py        py        \n",
+      "py        py        \n",
+      "py        py        \n",
+      "py        py        \n",
+      "js        js        \n",
+      "js        js        \n",
+      "js        js        \n",
+      "js        js        \n",
+      "ruby      ruby      \n",
+      "ruby      ruby      \n",
+      "ruby      ruby      \n",
+      "haskell   hs        \n",
+      "haskell   php       \n",
+      "haskell   hs        \n",
+      "racket    racket    \n",
+      "racket    php       \n",
+      "racket    racket    \n",
+      "java      java      \n",
+      "java      java      \n",
+      "scala     scala     \n",
+      "scala     scala     \n",
+      "tcl       php       \n",
+      "tcl       php       \n",
+      "php       java      \n",
+      "php       php       \n",
+      "php       java      \n",
+      "ocaml     ocaml     \n",
+      "ocaml     php       \n",
+      " \n",
+      " score: 0.625\n"
+     ]
+    }
+   ],
+   "source": [
+    "ans = read_answers()\n",
+    "M = pipe.predict(testcont)\n",
+    "print(\" actual_file_type  predicted_type\")\n",
+    "for i in range(len(ans)):\n",
+    "    print(ans[i].ljust(10)+M[i].ljust(10))\n",
+    "print(\" \")\n",
+    "print(\" score: \"+str(pipe.score(testcont, ans)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This score isn't very good, but it would be difficult to match all of these regardless of the method used, given the small quantity of training data."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Single file demo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
    "metadata": {
     "collapsed": true
    },
    "outputs": [],
    "source": [
-    "pipe = fit5(Xt, yt)"
+    "py_file = '''JOIN_RETRANSMIT = 0.7\n",
+    "CATCHUP_INTERVAL = 0.6\n",
+    "ACCEPT_RETRANSMIT = 1.0\n",
+    "PREPARE_RETRANSMIT = 1.0\n",
+    "INVOKE_RETRANSMIT = 0.5\n",
+    "LEADER_TIMEOUT = 1.0\n",
+    "NULL_BALLOT = Ballot(-1, -1)  # sorts before all real ballots\n",
+    "NOOP_PROPOSAL = Proposal(None, None, None)  # no-op to fill otherwise empty slots\n",
+    "\n",
+    "class Node(object):\n",
+    "    unique_ids = itertools.count()\n",
+    "\n",
+    "    def __init__(self, network, address):\n",
+    "        self.network = network\n",
+    "        self.address = address or 'N%d' % self.unique_ids.next()\n",
+    "        self.logger = SimTimeLogger(logging.getLogger(self.address), {'network': self.network})\n",
+    "        self.logger.info('starting')\n",
+    "        self.roles = []\n",
+    "        self.send = functools.partial(self.network.send, self)\n",
+    "\n",
+    "    def register(self, roles):\n",
+    "        self.roles.append(roles)\n",
+    "\n",
+    "    def unregister(self, roles):\n",
+    "        self.roles.remove(roles)\n",
+    "\n",
+    "    def receive(self, sender, message):\n",
+    "        handler_name = 'do_%s' % type(message).__name__\n",
+    "\n",
+    "        for comp in self.roles[:]:\n",
+    "            if not hasattr(comp, handler_name):\n",
+    "                continue\n",
+    "            comp.logger.debug(\"received %s from %s\", message, sender)\n",
+    "            fn = getattr(comp, handler_name)\n",
+    "            fn(sender=sender, **message._asdict())\n",
+    "\n",
+    "class Timer(object):\n",
+    "\n",
+    "    def __init__(self, expires, address, callback):\n",
+    "        self.expires = expires\n",
+    "        self.address = address\n",
+    "        self.callback = callback\n",
+    "        self.cancelled = False'''"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 17,
    "metadata": {
     "collapsed": false
    },
@@ -117,39 +355,24 @@
     {
      "data": {
       "text/plain": [
-       "array(['hs', 'c', 'java', 'ruby', 'clj', 'hs', 'racket', 'c', 'clj',\n",
-       "       'ruby', 'c', 'pl', 'c', 'c', 'scala', 'ruby', 'cs', 'c', 'cs',\n",
-       "       'sbcl', 'java', 'ruby', 'pl', 'ocaml', 'ocaml', 'cs', 'hs', 'php',\n",
-       "       'c', 'pl', 'c', 'scala', 'ruby', 'c', 'cs', 'c', 'ocaml', 'hs',\n",
-       "       'scala', 'c', 'hs', 'ruby', 'c', 'c', 'ocaml', 'sbcl', 'ruby', 'c',\n",
-       "       'c', 'ruby', 'c', 'ocaml', 'java', 'c', 'ruby', 'ruby', 'php', 'c',\n",
-       "       'clj', 'cs', 'php', 'java', 'c', 'ruby', 'py', 'cs', 'scala', 'php',\n",
-       "       'c', 'js', 'cs', 'js', 'c', 'php', 'php', 'php', 'pl', 'c', 'ruby',\n",
-       "       'clj', 'php', 'c', 'pl', 'py', 'clj', 'c', 'c', 'py', 'sbcl', 'py',\n",
-       "       'ruby', 'php', 'py', 'php', 'c', 'php', 'ruby', 'ruby', 'ruby',\n",
-       "       'scala', 'py', 'ruby', 'clj', 'php', 'ruby', 'c', 'ocaml', 'racket',\n",
-       "       'php', 'hs', 'hs', 'sbcl', 'ocaml', 'py', 'scala', 'ruby', 'cs',\n",
-       "       'c', 'c', 'c', 'c', 'clj', 'sbcl', 'scala', 'cs', 'py', 'c', 'cs',\n",
-       "       'cs', 'hs', 'c', 'java', 'php', 'java', 'js', 'clj', 'ruby', 'c',\n",
-       "       'hs', 'ruby', 'c', 'php', 'py', 'scala', 'clj', 'cs', 'cs', 'ruby',\n",
-       "       'sbcl', 'cs', 'scala', 'cs', 'c', 'scala', 'clj', 'c', 'clj', 'pl',\n",
-       "       'ruby', 'racket', 'java', 'cs', 'js', 'ocaml', 'c', 'py', 'c',\n",
-       "       'scala', 'js', 'clj', 'c', 'clj', 'ruby', 'clj', 'racket', 'c',\n",
-       "       'ocaml', 'js', 'pl', 'java', 'hs', 'java', 'py', 'php', 'java',\n",
-       "       'ruby', 'sbcl', 'ruby', 'php', 'scala', 'py', 'c', 'racket', 'php',\n",
-       "       'c', 'js', 'java', 'php', 'java', 'pl', 'c', 'py', 'php', 'py', 'c',\n",
-       "       'cs', 'py', 'cs', 'c', 'c', 'clj', 'java', 'ocaml', 'cs', 'java',\n",
-       "       'ocaml', 'cs'], \n",
+       "array(['py'], \n",
        "      dtype='<U6')"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pipe.predict(Xt)"
+    "pipe.predict([py_file])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The classifier correctly predicted that this file is Python."
    ]
   },
   {
diff --git a/lclassifier/lclassifier.py b/lclassifier/lclassifier.py
index 3d5502e..884e14e 100644
--- a/lclassifier/lclassifier.py
+++ b/lclassifier/lclassifier.py
@@ -24,6 +24,7 @@ def acceptable_file(text):
     else:
         return False
 
+
 def clean_ext(textp):
     text = textp.strip()
     if text == "gcc" or text == "h" or text == "gpp":
@@ -51,6 +52,8 @@ def clean_ext(textp):
 
 llist = ["c", "cs", "sbcl", "clj", "hs", "java", "js",
          "ocaml", "pl", "php", "py", "ruby", "scala", "racket"]
+main_dir = "/Users/admin/Documents/week5/programming-language-classifier"
+
 
 def list_uniques(alist):
     rlist = []
@@ -59,21 +62,22 @@ def list_uniques(alist):
             rlist.append(item)
     return rlist
 
+
 def load_file_names():
     l = [0 for i in range(5)]
-    s = "../benchmarksgame-2014-08-31/benchmarksgame/bench/"
+    s = main_dir+"/benchmarksgame-2014-08-31/benchmarksgame/bench/"
     max_lvl = 4
     for i in range(max_lvl):
-        l[i] = glob(s+"*/"*i+"*.*")
+        l[i] = glob(s + "*/" * i + "*.*")
 #    l[0] = glob("benchmarksgame-2014-08-31/benchmarksgame/*/*/*/*/*.*")
 #    l2 = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/*/*/*.*")
 #    filelist = l1 + l2
     filelist = []
     for i in range(max_lvl):
         filelist += l[i]
-    testlist = glob("../test/*")
+    testlist = glob(main_dir+"/test/*")
 
-    print("   total samples "+str(len(filelist)))
+    print("   total samples " + str(len(filelist)))
     return filelist, testlist
 
 
@@ -83,27 +87,27 @@ def load_files(filelist, testlist):
     ext_list = []
     for filename in filelist:
         i = filename.rfind(".")
-        ext = clean_ext(filename[i+1:])
+        ext = clean_ext(filename[i + 1:])
         if ext == "tcl":
             print(filename)
     #    print(ext, end=" - ")
     #    print(ext+ str(ext in ext_list) + " - "+str(ext_list))
-        if not ext in ext_list:
+        if ext not in ext_list:
             ext_list.append(ext)
         if acceptable_file(ext):
             ltype.append(ext)
             with open(filename, encoding="ISO-8859-1") as file:
-    #            print(filename)
+                #            print(filename)
                 contents.append(file.read())
 #    return contents, ltype
 
-    print(" number of usable files "+str(len(ltype)))
+    print(" number of usable files " + str(len(ltype)))
     print(" ")
-    print(" number of read file types:  "+str(len(ext_list)))
-    print(" number of recognized types: "+str(len(llist)))
+    print(" number of read file types:  " + str(len(ext_list)))
+    print(" number of recognized types: " + str(len(llist)))
     print(" summary of tile types")
     for ext in ext_list:
-        print(ext.ljust(12)+ "  ", end=" ")
+        print(ext.ljust(12) + "  ", end=" ")
         if ext in llist:
             print(ltype.count(ext), end=" ")
         print(" ")
@@ -115,32 +119,33 @@ def load_files(filelist, testlist):
 
     testcont = [0] * 32
     for filename in testlist:
-    #    print(filename)
+        #    print(filename)
         with open(filename) as file:
             di = filename.rfind("/")
-            i = int(filename[di+1:])
+            i = int(filename[di + 1:])
 #            print(filename+" "+str(i))
-            testcont[i-1] = file.read()
+            testcont[i - 1] = file.read()
     print(" ")
     return contents, ltype, testcont
-    #print(testlist)
+    # print(testlist)
+
 
 def read_answers():
-    with open("../test.csv") as csvfile:
+    with open(main_dir+"/test.csv") as csvfile:
         ans_list = csv.reader(csvfile, delimiter=",")
         ans = []
         print(ans_list)
         for row in ans_list:
             ans.append(clean_ext(row[1]))
-    print(" number of testing file types: "+str(len(list_uniques(ans))))
+    print(" number of testing file types: " + str(len(list_uniques(ans))))
 #            print(row[0])
     return ans
 
 
 def fit1(contents, ltype):
     pipe = Pipeline([('bag_of_words', CountVectorizer()),
-                          ('tfidf', TfidfTransformer()),
-                          ('bayes', MultinomialNB())])
+                     ('tfidf', TfidfTransformer()),
+                     ('bayes', MultinomialNB())])
     pipe.fit(contents, ltype)
     return pipe
 #    print(pipe.score(contents, ltype))
@@ -150,14 +155,15 @@ def fit1(contents, ltype):
 
 def fit2(contents, ltype):
     pipe = Pipeline([('bag_of_words', CountVectorizer()),
-#                          ('tfidf', TfidfTransformer()),
-                          ('bayes', MultinomialNB())])
+                     #                          ('tfidf', TfidfTransformer()),
+                     ('bayes', MultinomialNB())])
     pipe.fit(contents, ltype)
     return pipe
 #    print(pipe.score(contents, ltype))
 #    print(pipe.predict(testcont))
 #    return pipe.score(contents, ltype)
 
+
 def print_matrix(matrix, p_max=None):
     if p_max is None:
         upper_limit = len(matrix)
@@ -166,12 +172,13 @@ def print_matrix(matrix, p_max=None):
     for i in range(upper_limit):
         vector = matrix[i]
         for val in vector:
-            print(str(round(val, 3)).ljust(5)+",", end="")
+            print(str(round(val, 3)).ljust(5) + ",", end="")
         print("")
         #print([str(round(val, 3)) for val in vector])
 
 
 class CustomFeaturizer(TransformerMixin):
+
     def __init__(self):
         pass
         #self.featurizers = featurizers
@@ -213,24 +220,23 @@ def transform(self, X):
         scl = ["^[ \t]*object \w*", "^[ \t]*(final)?val \w* ="]
         tcl = ["^[ \t]*proc \w*::\w* \{"]
         php = ["^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*",
-                "^[ \t]*\$\w* ?=.*;$"]
+               "^[ \t]*\$\w* ?=.*;$"]
         ocaml = ["^[ \t]*let \w+", "^[ \t]*struct[ \t]*$"]
         perl = ["^[ \t]*my ", "^[ \t]*sub \w* \{"]
         gcc = ["^[ \t]*typedef \w* \w* ?\{", "^#include ?\<",
                "^using .*;$", "sealed"]
-#        reg_list = char_list + word_list
+
         reg_list = clojure + python + js + ruby + hs + clj + java + scl\
-                   + tcl + php + ocaml + perl + gcc + cish
-#        print(len(reg_list))
+            + tcl + php + ocaml + perl + gcc + cish
+
         matrix = []
         for text in X:
             v = [0] * len(reg_list)
             for i in range(len(reg_list)):
                 reg_expr = reg_list[i]
                 prog = re.compile(reg_expr, flags=re.MULTILINE)
-                val = len(prog.findall(text))#/len(text)
-                #if val > 0:
-                #    val = 1
+                val = len(prog.findall(text))  # /len(text)
+                # this was found to have best results over normalized forms
                 v[i] = val
             matrix.append(v)
         return matrix
@@ -258,6 +264,8 @@ def fit5(contents, ltype):
 
 
 def fit6(contents, ltype):
+    '''Random Forest uses multiple decision trees and selects the
+       prediction which occurs the most among all of those trees'''
     custom_feature = CustomFeaturizer()
     pipe = make_pipeline(custom_feature, RandomForestClassifier())
     pipe.fit(contents, ltype)
@@ -278,10 +286,11 @@ def demo_class(X, y):
                 typecont[j] += text
     custom_feature = CustomFeaturizer()
     M = custom_feature.transform(typecont)
+    ratio = 1000 / max([max(vt) for vt in M])
     for j in range(len(M)):
-        print(types[j].ljust(8)+" ", end="")
+        print(types[j].ljust(8) + " ", end="")
         for k in range(len(M[0])):
-            print(str(int(M[j][k])).ljust(5), end="")
+            print(str(int(ratio*M[j][k])).ljust(5), end="")
         print("")
 
 
@@ -296,8 +305,8 @@ def default_action():
     print(" score for    training_set     test_set")
     for i in range(len(plist)):
         pipe = plist[i](X, y)
-        print(str(i).ljust(4)+" "+str(round(pipe.score(X, y),4)).ljust(8)\
-              +str(round(pipe.score(Xt, yt),4)).ljust(8))
+        print(str(i).ljust(4) + " " + str(round(pipe.score(X, y), 4)).ljust(8)
+              + str(round(pipe.score(Xt, yt), 4)).ljust(8))
     print(" ")
     for i in range(len(plist)):
         pipel[i] = plist[i](contents, ltype)
@@ -308,8 +317,8 @@ def default_action():
     A = pipe.predict(X)
     for i in range(len(A)):
         if A[i] != y[i]:
-#            print(" ")
-            print(y[i].ljust(6)+" misclassified as "+A[i])
+            #            print(" ")
+            print(y[i].ljust(6) + " misclassified as " + A[i])
             if y[i] in failed_to_classify:
                 failed_to_classify[y[i]] += 1
             else:
@@ -322,10 +331,10 @@ def default_action():
     print(" failure counts")
     print("  wrongly classified:")
     for ext in wrongly_classified:
-        print(ext.ljust(7) + "#"*wrongly_classified[ext])
+        print(ext.ljust(7) + "#" * wrongly_classified[ext])
     print("  failed to classify")
     for ext in failed_to_classify:
-        print(ext.ljust(7) + "#"*failed_to_classify[ext])
+        print(ext.ljust(7) + "#" * failed_to_classify[ext])
     print(" ")
 
     ans = read_answers()
@@ -334,8 +343,8 @@ def default_action():
     i = 0
     for pipe in pipel:
         i += 1
-        print(" score_quest "+str(i)+" "+str(pipe.score(testcont, ans)))
-        print(" pred "+str(i)+" "+str(pipe.predict(testcont)))
+        print(" score_quest " + str(i) + " " + str(pipe.score(testcont, ans)))
+        print(" pred " + str(i) + " " + str(pipe.predict(testcont)))
         print(" ")
 
     demo_class(testcont, ans)
@@ -346,17 +355,16 @@ def default_action():
         default_action()
     elif len(sys.argv) == 2:
         test_file = sys.argv[1]
-        print("Estimating file type of "+ test_file)
+        print("Estimating file type of " + test_file)
 
         filelist, testlist = load_file_names()
         X, y, testcont = load_files(filelist, testlist)
         pipe = fit6(X, y)
         with open(test_file) as f:
             test_contents = f.read()
-#        print(test_contents)
         est_ext = pipe.predict([test_contents])
 
-        print("Predicted extension: "+str(est_ext))
+        print("Predicted extension: " + str(est_ext))
 
     else:
         print("error: command line arguments not supported")
diff --git a/lclassifier/output.txt b/lclassifier/output.txt
index 987c632..577a52b 100644
--- a/lclassifier/output.txt
+++ b/lclassifier/output.txt
@@ -38,92 +38,55 @@ txt
 ozf             
  not included:  
  
-32 15
-217 15
+ score for    training_set     test_set
+0    0.9818  0.871   
+1    1.0     0.977   
+2    0.9658  0.9355  
+3    0.9795  0.9677  
+4    1.0     0.977   
+ 
+  failed to classify
+
+ failure counts
+  wrongly classified:
   failed to classify
-hs     misclassified as py
-racket misclassified as sbcl
-racket misclassified as sbcl
-racket misclassified as sbcl
-clj    misclassified as sbcl
-racket misclassified as sbcl
-hs     misclassified as pl
-racket misclassified as sbcl
-scala  misclassified as c
-clj    misclassified as sbcl
-racket misclassified as sbcl
-hs     misclassified as py
-hs     misclassified as py
-racket misclassified as sbcl
-cs     misclassified as js
-clj    misclassified as sbcl
-cs     misclassified as py
-scala  misclassified as py
-clj    misclassified as sbcl
-racket misclassified as sbcl
-racket misclassified as sbcl
-ruby   misclassified as racket
-racket misclassified as sbcl
-scala  misclassified as py
-clj    misclassified as sbcl
-js     misclassified as racket
-hs     misclassified as pl
-hs     misclassified as sbcl
-racket misclassified as sbcl
-racket misclassified as sbcl
-js     misclassified as racket
-scala  misclassified as py
-racket misclassified as sbcl
-32 36
-217 36
-<_csv.reader object at 0x10855d908>
+ 
+<_csv.reader object at 0x113420a58>
  number of testing file types: 11
 ['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']
- score_train 1 0.986332574032
- score_test  1 0.898617511521
  score_quest 1 0.59375
- pred 1 ['clj' 'clj' 'clj' 'ruby' 'py' 'py' 'ruby' 'py' 'js' 'js' 'clj' 'php'
- 'ruby' 'clj' 'ruby' 'hs' 'hs' 'ruby' 'sbcl' 'racket' 'racket' 'java'
- 'ruby' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'php' 'ocaml' 'ocaml']
+ pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php'
+ 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java'
+ 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml']
  
- score_train 2 1.0
- score_test  2 0.972350230415
  score_quest 2 0.65625
  pred 2 ['clj' 'clj' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
- 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'ruby' 'ruby' 'sbcl' 'java' 'java' 'scala'
- 'scala' 'ruby' 'ruby' 'php' 'php' 'java' 'ocaml' 'ruby']
+ 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'racket' 'sbcl' 'java' 'java'
+ 'scala' 'scala' 'ruby' 'ruby' 'java' 'java' 'java' 'ocaml' 'ruby']
  
- score_train 3 0.833712984055
- score_test  3 0.78801843318
- score_quest 3 0.71875
- pred 3 ['clj' 'clj' 'clj' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
- 'ruby' 'ruby' 'hs' 'racket' 'hs' 'scala' 'racket' 'scala' 'java' 'racket'
- 'scala' 'scala' 'racket' 'racket' 'java' 'php' 'php' 'ocaml' 'ocaml']
+ score_quest 3 0.6875
+ pred 3 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
+ 'ruby' 'ruby' 'hs' 'racket' 'hs' 'racket' 'racket' 'racket' 'pl' 'pl'
+ 'scala' 'scala' 'racket' 'racket' 'pl' 'php' 'js' 'ocaml' 'ocaml']
  
- score_train 4 0.974943052392
- score_test  4 0.981566820276
  score_quest 4 0.71875
  pred 4 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
  'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'java' 'c' 'scala'
- 'scala' 'c' 'c' 'java' 'php' 'js' 'ocaml' 'ocaml']
+ 'scala' 'py' 'py' 'java' 'php' 'js' 'ocaml' 'ocaml']
  
- score_train 5 1.0
- score_test  5 0.976958525346
- score_quest 5 0.59375
+ score_quest 5 0.625
  pred 5 ['ruby' 'ruby' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby'
- 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'ruby' 'ruby' 'java' 'java' 'hs'
- 'scala' 'ruby' 'ruby' 'php' 'php' 'cs' 'ocaml' 'ruby']
+ 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'ruby' 'ruby' 'java' 'java'
+ 'scala' 'scala' 'ruby' 'ruby' 'java' 'php' 'php' 'ocaml' 'ruby']
  
-['#']
-1
-clj      10   4    8    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    4    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
-py       0    0    0    74   18   10   60   0    0    0    0    0    0    0    8    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
-js       1    2    0    0    0    0    0    42   20   86   0    0    0    0    0    0    0    0    2    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    2    0    
-ruby     0    0    0    0    0    0    0    0    0    0    20   11   3    7    4    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
-haskell  0    0    0    0    0    59   0    0    0    0    0    0    0    0    0    1    10   0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    
-racket   126  5    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    52   5    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
-java     0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    8    0    0    0    0    0    0    0    0    0    0    0    0    0    0    78   13   
-scala    1    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    5    7    0    0    0    0    0    0    0    0    0    0    2    2    0    
-tcl      0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    2    0    0    0    0    0    0    0    0    0    0    0    0    
-php      0    0    0    0    0    0    0    0    0    8    0    0    0    0    0    0    0    0    0    17   0    0    0    0    11   5    0    0    0    0    0    0    0    0    92   26   
-ocaml    6    0    0    0    0    0    0    0    4    0    2    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    35   1    0    0    0    0    0    0    1    0    
+clj      79   31   63   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    31   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
+py       0    0    0    587  142  79   476  0    0    0    0    0    0    0    63   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
+js       7    15   0    0    0    0    0    333  158  682  0    0    0    0    0    0    0    0    15   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    15   0    
+ruby     0    0    0    0    0    0    0    0    0    0    158  87   23   55   31   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
+haskell  0    0    0    0    0    468  0    0    0    0    0    0    0    0    0    7    79   0    0    0    0    0    0    0    0    0    7    0    0    0    0    0    0    0    0    0    
+racket   1000 39   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    412  39   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    
+java     0    0    0    0    0    0    0    0    0    0    0    0    0    0    7    0    0    0    0    63   0    0    0    0    0    0    0    0    0    0    0    0    0    0    619  103  
+scala    7    0    0    0    0    7    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    39   55   0    0    0    0    0    0    0    0    0    0    15   15   0    
+tcl      0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    15   0    0    0    0    0    0    0    0    0    0    0    0    
+php      0    0    0    0    0    0    0    0    0    63   0    0    0    0    0    0    0    0    0    134  0    0    0    0    87   39   0    0    0    0    0    0    0    0    730  206  
+ocaml    47   0    0    0    0    0    0    0    31   0    15   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    277  7    0    0    0    0    0    0    7    0    

From 52c7a2563feeb9ee6b7e7b1ae05f6725f7ad85c6 Mon Sep 17 00:00:00 2001
From: Alan R <alan.rominger@gmail.com>
Date: Sun, 7 Jun 2015 11:38:25 -0400
Subject: [PATCH 9/9] compared with Ben's rules and aux set of my rules

---
 lclassifier/ben_output.txt | 319 +++++++++++++++++++++++++++++++++++++
 lclassifier/bens_rules.py  |  19 +++
 lclassifier/lclassifier.py | 150 +++++++++++------
 lclassifier/old_output.txt |  96 +++++++++++
 4 files changed, 534 insertions(+), 50 deletions(-)
 create mode 100644 lclassifier/ben_output.txt
 create mode 100644 lclassifier/bens_rules.py
 create mode 100644 lclassifier/old_output.txt

diff --git a/lclassifier/ben_output.txt b/lclassifier/ben_output.txt
new file mode 100644
index 0000000..c8534c9
--- /dev/null
+++ b/lclassifier/ben_output.txt
@@ -0,0 +1,319 @@
+   total samples 931
+ number of usable files 656
+ 
+ number of read file types:  32
+ number of recognized types: 14
+ summary of tile types
+ats             
+clj            38  
+cs             41  
+dart            
+erlang          
+fpascal         
+fsharp          
+c              129  
+hs             33  
+gnat            
+go              
+php            55  
+ifc             
+java           51  
+js             25  
+ruby           73  
+lua             
+ocaml          35  
+oz              
+pl             34  
+py             36  
+racket         29  
+rust            
+sbcl           34  
+scala          43  
+vw              
+cint            
+javasteady      
+parrot          
+cc              
+txt             
+ozf             
+ not included:  
+ 
+ score for    training_set     test_set
+0    0.9932  0.8387  
+1    0.5399  0.3733  
+2    0.2346  0.235   
+3    0.2916  0.2765  
+4    0.533   0.3779  
+ 
+  failed to classify
+js     misclassified as c
+ocaml  misclassified as c
+java   misclassified as ruby
+cs     misclassified as c
+java   misclassified as ruby
+java   misclassified as c
+scala  misclassified as c
+php    misclassified as c
+scala  misclassified as c
+java   misclassified as c
+c      misclassified as ruby
+ruby   misclassified as c
+cs     misclassified as c
+c      misclassified as ruby
+cs     misclassified as c
+js     misclassified as c
+hs     misclassified as py
+scala  misclassified as c
+java   misclassified as c
+php    misclassified as c
+java   misclassified as c
+php    misclassified as ruby
+js     misclassified as c
+java   misclassified as ocaml
+hs     misclassified as clj
+pl     misclassified as c
+php    misclassified as c
+hs     misclassified as py
+pl     misclassified as ruby
+scala  misclassified as c
+php    misclassified as c
+ocaml  misclassified as c
+ocaml  misclassified as c
+java   misclassified as c
+ruby   misclassified as pl
+cs     misclassified as c
+ruby   misclassified as pl
+py     misclassified as c
+scala  misclassified as c
+scala  misclassified as hs
+js     misclassified as c
+java   misclassified as c
+cs     misclassified as c
+php    misclassified as c
+php    misclassified as ruby
+java   misclassified as c
+php    misclassified as pl
+php    misclassified as c
+hs     misclassified as ocaml
+pl     misclassified as ruby
+java   misclassified as c
+cs     misclassified as ruby
+php    misclassified as pl
+cs     misclassified as c
+c      misclassified as ruby
+java   misclassified as c
+py     misclassified as ruby
+hs     misclassified as c
+scala  misclassified as c
+c      misclassified as pl
+cs     misclassified as c
+c      misclassified as ruby
+cs     misclassified as c
+py     misclassified as c
+php    misclassified as ruby
+ruby   misclassified as hs
+pl     misclassified as ruby
+cs     misclassified as c
+scala  misclassified as c
+java   misclassified as c
+php    misclassified as c
+java   misclassified as c
+hs     misclassified as c
+ruby   misclassified as pl
+java   misclassified as ruby
+ocaml  misclassified as c
+hs     misclassified as ruby
+java   misclassified as pl
+hs     misclassified as c
+js     misclassified as php
+clj    misclassified as racket
+clj    misclassified as racket
+ruby   misclassified as c
+hs     misclassified as c
+ocaml  misclassified as c
+scala  misclassified as c
+cs     misclassified as c
+php    misclassified as c
+java   misclassified as ruby
+ocaml  misclassified as py
+clj    misclassified as racket
+php    misclassified as c
+java   misclassified as c
+ruby   misclassified as pl
+java   misclassified as c
+scala  misclassified as c
+php    misclassified as c
+py     misclassified as scala
+php    misclassified as pl
+py     misclassified as scala
+pl     misclassified as ruby
+js     misclassified as ruby
+php    misclassified as ruby
+hs     misclassified as c
+java   misclassified as c
+scala  misclassified as c
+c      misclassified as scala
+java   misclassified as c
+java   misclassified as c
+java   misclassified as c
+java   misclassified as c
+hs     misclassified as py
+ruby   misclassified as c
+php    misclassified as ruby
+hs     misclassified as py
+java   misclassified as c
+ruby   misclassified as pl
+c      misclassified as pl
+py     misclassified as ruby
+py     misclassified as ruby
+php    misclassified as ruby
+pl     misclassified as c
+java   misclassified as c
+cs     misclassified as c
+pl     misclassified as ruby
+js     misclassified as c
+java   misclassified as c
+pl     misclassified as c
+c      misclassified as scala
+js     misclassified as c
+clj    misclassified as racket
+hs     misclassified as scala
+pl     misclassified as c
+ruby   misclassified as c
+php    misclassified as c
+java   misclassified as c
+cs     misclassified as c
+php    misclassified as ruby
+ocaml  misclassified as ruby
+php    misclassified as c
+cs     misclassified as c
+pl     misclassified as c
+py     misclassified as c
+java   misclassified as c
+java   misclassified as c
+pl     misclassified as c
+php    misclassified as c
+pl     misclassified as ruby
+scala  misclassified as c
+ruby   misclassified as c
+clj    misclassified as racket
+php    misclassified as c
+java   misclassified as c
+ocaml  misclassified as py
+java   misclassified as c
+clj    misclassified as racket
+php    misclassified as ruby
+cs     misclassified as c
+ocaml  misclassified as scala
+ruby   misclassified as pl
+clj    misclassified as racket
+pl     misclassified as ruby
+ocaml  misclassified as c
+pl     misclassified as c
+cs     misclassified as c
+scala  misclassified as c
+hs     misclassified as c
+scala  misclassified as c
+js     misclassified as php
+hs     misclassified as c
+php    misclassified as ruby
+java   misclassified as c
+ruby   misclassified as pl
+scala  misclassified as c
+hs     misclassified as c
+cs     misclassified as c
+pl     misclassified as c
+js     misclassified as c
+cs     misclassified as scala
+pl     misclassified as c
+cs     misclassified as c
+ocaml  misclassified as c
+cs     misclassified as c
+c      misclassified as ruby
+ruby   misclassified as c
+c      misclassified as ruby
+js     misclassified as ruby
+php    misclassified as ruby
+pl     misclassified as c
+hs     misclassified as c
+pl     misclassified as ruby
+java   misclassified as c
+hs     misclassified as py
+scala  misclassified as c
+cs     misclassified as c
+c      misclassified as ruby
+java   misclassified as c
+js     misclassified as ruby
+ruby   misclassified as py
+php    misclassified as c
+java   misclassified as ruby
+java   misclassified as c
+php    misclassified as c
+py     misclassified as ocaml
+cs     misclassified as c
+
+ failure counts
+  wrongly classified:
+c      ###########################################################################################################################
+ruby   #######################################
+racket #######
+py     ########
+hs     ##
+clj    #
+ocaml  ###
+php    ##
+pl     #############
+scala  #######
+  failed to classify
+ruby   ###############
+cs     ######################
+scala  ################
+ocaml  ###########
+pl     ##################
+clj    #######
+c      ###########
+js     ############
+php    #############################
+hs     ##################
+py     #########
+java   #####################################
+ 
+<_csv.reader object at 0x10a1a2c18>
+ number of testing file types: 11
+['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']
+ score_quest 1 0.59375
+ pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php'
+ 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java'
+ 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml']
+ 
+ score_quest 2 0.125
+ pred 2 ['java' 'racket' 'racket' 'clj' 'c' 'c' 'ocaml' 'c' 'ocaml' 'ocaml' 'php'
+ 'ruby' 'php' 'pl' 'ruby' 'c' 'ruby' 'ruby' 'sbcl' 'ocaml' 'racket' 'c' 'c'
+ 'c' 'ruby' 'c' 'c' 'php' 'hs' 'ocaml' 'c' 'py']
+ 
+ score_quest 3 0.125
+ pred 3 ['scala' 'ocaml' 'ocaml' 'scala' 'ruby' 'ruby' 'ocaml' 'ruby' 'cs' 'cs'
+ 'cs' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'scala' 'ocaml'
+ 'scala' 'ocaml' 'ruby' 'ruby' 'ruby' 'scala' 'ruby' 'ruby' 'cs' 'cs'
+ 'ruby' 'ocaml']
+ 
+ score_quest 4 0.0625
+ pred 4 ['c' 'clj' 'clj' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c'
+ 'c' 'sbcl' 'sbcl' 'clj' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c']
+ 
+ score_quest 5 0.125
+ pred 5 ['clj' 'racket' 'racket' 'clj' 'c' 'c' 'ocaml' 'c' 'ocaml' 'ocaml' 'php'
+ 'ruby' 'php' 'pl' 'py' 'c' 'ruby' 'ruby' 'sbcl' 'ocaml' 'racket' 'c'
+ 'ruby' 'c' 'ruby' 'clj' 'ruby' 'php' 'hs' 'ocaml' 'c' 'py']
+ 
+clj      0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    571  142  142  0    
+py       0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    285  142  142  285  
+js       0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    428  285  142  428  
+ruby     0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    142  142  142  571  
+haskell  0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    285  142  142  285  
+racket   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1000 0    142  142  
+java     0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    142  142  142  0    
+scala    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    285  142  142  142  
+tcl      0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    285  142  142  0    
+php      0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    285  142  142  428  
+ocaml    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    285  142  142  142  
diff --git a/lclassifier/bens_rules.py b/lclassifier/bens_rules.py
new file mode 100644
index 0000000..37f8572
--- /dev/null
+++ b/lclassifier/bens_rules.py
@@ -0,0 +1,19 @@
+import re
+
+
+def bens_rules(text):
+    """Return 22 ints for text: 18 keyword counts, then 4 longest-run lengths.
+
+    The def wrapper and the import make this module importable; raw strings
+    keep the word-boundary escape from being read as a backspace character.
+    """
+    keywords = ['begin', 'end', 'do', 'var', 'define', 'defn', 'function',
+                'class', 'my', 'require', 'void', 'val', 'public', 'let',
+                'where', 'using', 'extend', 'function']  # 'function' is listed twice
+    results = [len(re.findall(r'\b%s\b' % word, text)) for word in keywords]
+
+    # Longest unbroken run of ')', '}', ']' and '='; 0 when the character is absent.
+    for element in (r'[)]+', r'[}]+', r'[\]]+', r'[=]+'):
+        runs = re.findall(element, text)
+        results.append(max(map(len, runs)) if runs else 0)
+    return results
diff --git a/lclassifier/lclassifier.py b/lclassifier/lclassifier.py
index 884e14e..9e96436 100644
--- a/lclassifier/lclassifier.py
+++ b/lclassifier/lclassifier.py
@@ -176,6 +176,99 @@ def print_matrix(matrix, p_max=None):
         print("")
         #print([str(round(val, 3)) for val in vector])
 
+def ben_transform(X):
+    """Map each text in X to 22 ints: 18 keyword counts, then 4 longest-run lengths.
+
+    Patterns are raw strings: in a plain literal the word-boundary escape is a
+    backspace character, so keyword counts were 0 for any ordinary source text.
+    """
+    keywords = ['begin', 'end', 'do', 'var', 'define', 'defn', 'function',
+                'class', 'my', 'require', 'void', 'val', 'public', 'let',
+                'where', 'using', 'extend', 'function']  # 'function' is listed twice
+    word_progs = [re.compile(r'\b%s\b' % word) for word in keywords]
+    # Longest unbroken run of ')', '}', ']' and '=' is recorded; 0 when absent.
+    run_progs = [re.compile(p) for p in (r'[)]+', r'[}]+', r'[\]]+', r'[=]+')]
+
+    matrix = []
+    for text in X:
+        results = [len(prog.findall(text)) for prog in word_progs]
+        for prog in run_progs:
+            runs = prog.findall(text)
+            results.append(max(map(len, runs)) if runs else 0)
+        matrix.append(results)
+    return matrix
+
+
+def alan_transform(X):
+    """Count hand-written, per-language regex hits in each text of X.
+
+    Returns a list with one row per text; each row holds one raw match count
+    per pattern, in the order the groups are concatenated into reg_list.
+    Patterns are raw strings so regex escapes are not parsed as string escapes.
+    """
+    cish = [r"^[ \t]*\*", r"^[ \t]*/\*\*"]
+    clojure = [r"^\s*\(\w.*\s*\)$", r"^[ \t]*;", r"\(def(n)? "]
+    python = [r"\):[ \t]*\n[ \t]*\w", r"\s__\w*__\(", r"(^from|^import)\s",
+              r"def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"]
+    js = [r"^[ \t]*var", r"=\s*function",
+          r"function\s*\w*\(\w*[\w\s,]*\)\s*\{"]
+    ruby = [r"^[ \t]*end$", r"^[ \t]*def *\w*(\(\w*\))?[ \t]*$",
+            r"^[ \t]*include \w*[ \t]*$", r"^[ \t]*@", r"super"]
+    hs = [r"&&&", r"^\{-"]
+    clj = [r"^\(define", r"^[ \t]*;+"]
+    java = [r"^[ \t]*public \w* \w*", r"^import .*;$"]
+    scl = [r"^[ \t]*object \w*", r"^[ \t]*(final)?val \w* ="]
+    tcl = [r"^[ \t]*proc \w*::\w* \{"]
+    php = [r"^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*",
+           r"^[ \t]*\$\w* ?=.*;$"]
+    ocaml = [r"^[ \t]*let \w+", r"^[ \t]*struct[ \t]*$"]
+    perl = [r"^[ \t]*my ", r"^[ \t]*sub \w* \{"]
+    gcc = [r"^[ \t]*typedef \w* \w* ?\{", r"^#include ?\<",
+           r"^using .*;$", r"sealed"]
+
+    reg_list = clojure + python + js + ruby + hs + clj + java + scl\
+        + tcl + php + ocaml + perl + gcc + cish
+    # Compile once up front; MULTILINE makes ^ and $ anchor at every line.
+    progs = [re.compile(reg_expr, flags=re.MULTILINE) for reg_expr in reg_list]
+
+    # Raw counts: this was found to have best results over normalized forms.
+    matrix = [[len(prog.findall(text)) for prog in progs] for text in X]
+    return matrix
+
+
+def old_transform(X):
+    """Count matches of the older symbol/keyword regexes in each text of X.
+
+    Returns one row per text: a raw count per pattern, char_list then word_list.
+    Patterns are raw strings (the word-boundary escape is a backspace in a plain
+    literal), and the comma once missing after the empty-brackets pattern is
+    restored, so it and the "if" + whitespace pattern are separate columns.
+    """
+    char_list = [r"^#", r"\-\>", r"\{", r"\$", r"\<", r"\[", r"func\b",
+                 r"this\.", r"^end", r";", r"\*", r"%", r"^do",
+                 r"\<\$php", r"/\*", r"__", r"=", r"==",
+                 r"===", r"\(\)", r"\{\}", r":", r"\+\+", r"\+=",
+                 r"^#include", r"^ \*", r":\s*$", r"\<\<|\>\>",
+                 r"int", r"\b\*\w", r"\(&\w", r"argv", r"\[\]",
+                 r"if\s", r"if\(", r"^\{", r"^\}", r",\s*int\s\w",
+                 r"\};", r"\[\d*:\d*\]", r"\]\s*\{", r"^//", r"\w\.\{",
+                 r"\(\w+:", r"@", r"\b@\w"]
+    word_list = [r"private", r"static", r"make", r"let", r"def", r"^\(defn",
+                 r"defn", r"do", r"class", r"^function", r"public",
+                 r"unset", r"printf\(", r"return", r"NULL", r"void",
+                 r"main\(", r"main_", r"void\s\*\w", r"\{else\}",
+                 r"char", r"array\(", r"__init__", r"__str__", r"token",
+                 r"^import", r"^from", r"final", r"val", r"type", r"package",
+                 r"object", r"String", r"string", r"primitive", r"fixnum",
+                 r"error", r"try"]
+
+    reg_list = char_list + word_list
+    # Compile once up front; MULTILINE makes ^ and $ anchor at every line.
+    progs = [re.compile(reg_expr, flags=re.MULTILINE) for reg_expr in reg_list]
+    # Raw counts: this was found to have best results over normalized forms.
+    matrix = [[len(prog.findall(text)) for prog in progs] for text in X]
+    return matrix
+
 
 class CustomFeaturizer(TransformerMixin):
 
@@ -189,56 +282,13 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, X):
-        # char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b",
-        #             "this\.", "^end", ";", "\*", "%", "^do",
-        #             "\<\$php", "/\*", "__", "=", "==",
-        #             "===", "\(\)", "\{\}", ":", "\+\+", "\+=",
-        #             "^#include", "^ \*", ":\s*$", "\<\<|\>\>",
-        #             "int", "\b\*\w", "\(&\w", "argv", "\[\]"
-        #             "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w",
-        #             "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{",
-        #             "\(\w+:", "@", "\b@\w"]
-        # word_list = ["private", "static", "make","let", "def", "^\(defn",
-        #              "defn", "do", "class", "^function", "public",
-        #              "unset", "printf\(", "return", "NULL", "void",
-        #              "main\(", "main_", "void\s\*\w", "\{else\}",
-        #              "char", "array\(", "__init__", "__str__", "token",
-        #              "^import", "^from", "final", "val", "type", "package",
-        #              "object", "String", "string", "primitive", "fixnum",
-        #              "error", "try"]
-        cish = ["^[ \t]*\*", "^[ \t]*/\*\*"]
-        clojure = ["^\s*\(\w.*\s*\)$", "^[ \t]*;", "\(def(n)? "]
-        python = ["\):[ \t]*\n[ \t]*\w", "\s__\w*__\(", "(^from|^import)\s",
-                  "def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"]
-        js = ["^[ \t]*var", "=\s*function",
-              "function\s*\w*\(\w*[\w\s,]*\)\s*\{"]
-        ruby = ["^[ \t]*end$", "^[ \t]*def *\w*(\(\w*\))?[ \t]*$",
-                "^[ \t]*include \w*[ \t]*$", "^[ \t]*@", "super"]
-        hs = ["&&&", "^\{-"]
-        clj = ["^\(define", "^[ \t]*;+"]
-        java = ["^[ \t]*public \w* \w*", "^import .*;$"]
-        scl = ["^[ \t]*object \w*", "^[ \t]*(final)?val \w* ="]
-        tcl = ["^[ \t]*proc \w*::\w* \{"]
-        php = ["^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*",
-               "^[ \t]*\$\w* ?=.*;$"]
-        ocaml = ["^[ \t]*let \w+", "^[ \t]*struct[ \t]*$"]
-        perl = ["^[ \t]*my ", "^[ \t]*sub \w* \{"]
-        gcc = ["^[ \t]*typedef \w* \w* ?\{", "^#include ?\<",
-               "^using .*;$", "sealed"]
-
-        reg_list = clojure + python + js + ruby + hs + clj + java + scl\
-            + tcl + php + ocaml + perl + gcc + cish
-
-        matrix = []
-        for text in X:
-            v = [0] * len(reg_list)
-            for i in range(len(reg_list)):
-                reg_expr = reg_list[i]
-                prog = re.compile(reg_expr, flags=re.MULTILINE)
-                val = len(prog.findall(text))  # /len(text)
-                # this was found to have best results over normalized forms
-                v[i] = val
-            matrix.append(v)
+        # The feature set is chosen by editing this method: exactly one of
+        # the three module-level transforms below should be left active.
+        #matrix = ben_transform(X)
+        #matrix = old_transform(X)
+        # Active choice: the per-language regex counts from alan_transform.
+        matrix = alan_transform(X)
+
         return matrix
 
 
diff --git a/lclassifier/old_output.txt b/lclassifier/old_output.txt
new file mode 100644
index 0000000..be1c816
--- /dev/null
+++ b/lclassifier/old_output.txt
@@ -0,0 +1,96 @@
+   total samples 931
+ number of usable files 656
+ 
+ number of read file types:  32
+ number of recognized types: 14
+ summary of tile types
+ats             
+clj            38  
+cs             41  
+dart            
+erlang          
+fpascal         
+fsharp          
+c              129  
+hs             33  
+gnat            
+go              
+php            55  
+ifc             
+java           51  
+js             25  
+ruby           73  
+lua             
+ocaml          35  
+oz              
+pl             34  
+py             36  
+racket         29  
+rust            
+sbcl           34  
+scala          43  
+vw              
+cint            
+javasteady      
+parrot          
+cc              
+txt             
+ozf             
+ not included:  
+ 
+ score for    training_set     test_set
+0    0.9863  0.9124  
+1    1.0     0.9401  
+2    0.82    0.7926  
+3    0.9636  0.9447  
+4    0.9977  0.9724  
+ 
+  failed to classify
+sbcl   misclassified as racket
+
+ failure counts
+  wrongly classified:
+racket #
+  failed to classify
+sbcl   #
+ 
+<_csv.reader object at 0x1113abc18>
+ number of testing file types: 11
+['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']
+ score_quest 1 0.59375
+ pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php'
+ 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java'
+ 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml']
+ 
+ score_quest 2 0.375
+ pred 2 ['clj' 'clj' 'js' 'clj' 'js' 'hs' 'js' 'scala' 'js' 'js' 'scala' 'js' 'hs'
+ 'hs' 'hs' 'hs' 'js' 'hs' 'js' 'js' 'racket' 'ocaml' 'js' 'pl' 'scala'
+ 'ocaml' 'ocaml' 'php' 'js' 'php' 'ocaml' 'ocaml']
+ 
+ score_quest 3 0.5625
+ pred 3 ['clj' 'clj' 'cs' 'clj' 'py' 'py' 'sbcl' 'py' 'js' 'js' 'ruby' 'java'
+ 'ruby' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'c' 'c'
+ 'scala' 'scala' 'hs' 'hs' 'c' 'ruby' 'hs' 'ocaml' 'ocaml']
+ 
+ score_quest 4 0.59375
+ pred 4 ['clj' 'clj' 'ocaml' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'scala' 'cs'
+ 'scala' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'sbcl' 'racket' 'racket' 'js' 'js'
+ 'scala' 'scala' 'php' 'php' 'sbcl' 'php' 'php' 'ocaml' 'ocaml']
+ 
+ score_quest 5 0.4375
+ pred 5 ['clj' 'clj' 'ocaml' 'clj' 'js' 'hs' 'js' 'scala' 'js' 'js' 'scala'
+ 'racket' 'scala' 'ruby' 'ruby' 'hs' 'racket' 'hs' 'racket' 'racket'
+ 'ocaml' 'c' 'js' 'scala' 'scala' 'pl' 'php' 'php' 'racket' 'php' 'ocaml'
+ 'racket']
+ 
+clj      0    1    2    0    0    23   0    0    0    5    0    1    0    0    0    0    0    0    0    0    0    20   0    0    0    0    1    1    3    0    0    0    0    0    0    0    0    0    0    2    0    0    0    0    0    0    0    0    3    10   9    9    10   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    9    0    0    0    0    1    0    0    1    6    0    0    0    0    
+py       3    0    19   0    6    97   0    0    0    6    9    44   0    0    0    75   366  11   0    66   14   217  0    0    0    0    198  0    13   0    0    0    0    0    0    1    0    0    3    0    0    0    0    0    0    0    0    2    0    91   0    0    41   22   0    0    0    0    26   2    0    0    0    0    0    31   0    29   2    0    10   2    0    7    5    0    6    0    0    0    0    0    5    
+js       0    2    244  3    9    100  0    11   0    45   29   3    0    0    5    1    297  27   7    55   18   87   3    0    0    0    6    0    0    0    0    0    0    1    0    39   0    1    0    0    15   0    0    1    0    0    0    1    1    20   0    0    114  1    27   0    0    0    66   0    0    0    0    0    0    2    0    0    0    0    0    0    1    61   18   0    9    1    15   0    0    11   1    
+ruby     0    0    5    0    10   7    0    0    3    0    5    0    0    0    0    0    44   7    3    0    1    33   0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    7    19   0    1    0    1    0    20   0    0    5    9    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    1    
+haskell  0    92   33   63   28   24   0    0    0    6    11   0    0    0    0    0    92   2    0    5    0    81   6    0    0    0    1    1    5    0    0    0    0    0    13   0    0    0    0    0    0    0    0    16   0    0    0    0    3    6    0    0    179  1    0    0    0    0    14   0    0    0    0    0    0    1    0    0    0    0    76   0    0    0    0    0    0    145  1    0    0    5    0    
+racket   0    1    0    7    20   48   0    0    0    7    0    0    0    0    0    15   22   0    0    3    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    14   11   70   0    0    5    0    0    0    0    0    0    0    0    0    0    0    0    6    0    0    0    0    0    0    0    5    0    0    0    0    32   65   5    19   0    
+java     0    0    6    0    3    1    0    0    0    16   136  0    0    0    18   0    0    0    0    2    0    1    0    0    0    32   0    0    10   0    0    0    0    0    0    2    0    0    0    0    0    0    0    37   0    0    0    0    3    5    0    0    2    0    0    11   0    0    7    0    6    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    11   1    0    0    0    3    
+scala    0    3    33   32   0    28   0    0    0    0    6    0    0    0    1    0    71   0    0    0    0    57   0    0    0    2    0    0    0    0    0    0    0    0    0    3    0    0    0    0    0    1    23   2    0    0    0    0    0    16   0    0    2    31   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    15   37   9    2    9    13   0    0    0    0    1    
+tcl      0    0    48   92   13   41   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    31   0    0    0    0    0    10   0    0    0    6    0    0    0    2    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    6    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    2    0    
+php      0    93   53   185  1    14   0    0    0    74   999  7    0    0    33   5    27   3    2    28   0    18   0    0    0    0    0    0    5    0    0    0    0    0    1    2    0    0    0    0    0    0    0    36   0    0    0    0    0    0    0    0    6    5    0    23   0    0    35   0    0    0    0    0    0    0    3    0    0    0    0    0    0    20   0    0    1    0    18   0    0    0    0    
+ocaml    0    83   19   0    2    11   0    0    0    13   76   0    0    0    0    0    75   0    0    3    2    35   0    0    0    0    1    0    11   0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    1    57   3    0    0    0    0    0    0    0    0    9    0    0    0    0    0    0    0    0    0    0    0    0    0    0    10   22   0    0    20   9    0    0    0    2