From 2483ffcb3c05fa3b4f15b9ec157b00588ad5c050 Mon Sep 17 00:00:00 2001 From: Alan R Date: Wed, 3 Jun 2015 18:44:50 -0400 Subject: [PATCH 1/9] Have a corelation score, but lots left to do --- .gitignore | 2 ++ lclassifier.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 lclassifier.py diff --git a/.gitignore b/.gitignore index f00dbf2..02937f0 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,5 @@ docs/_build/ # PyBuilder target/ +.DS_store +benchmarksgame-2014-08-31/ diff --git a/lclassifier.py b/lclassifier.py new file mode 100644 index 0000000..e7f5e23 --- /dev/null +++ b/lclassifier.py @@ -0,0 +1,77 @@ +from glob import glob + +l1 = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/*") +filelist = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/*/*.*") + +print(str(len(l1))+" l2 "+str(len(filelist))) + +contents = [] +ltype = [] +for filename in filelist: + if "ocaml-2" not in filename: + i = filename.index(".") + ltype.append(filename[i:]) + with open(filename) as file: + contents.append(file.read()) + +testcont = [] +testlist = glob("test/*") +for filename in testlist: + print(filename) + with open(filename) as file: + testcont.append(file.read()) + +print(" ") +print(ltype) +print(" ") +#print(testcont[15]) +#print(testlist) + +#from scikit-learn.datasets import load_iris +from sklearn import datasets +iris = datasets.load_iris() +print(iris.keys()) +print(" ") +#print(iris.data) +print(" ") +print(iris.target) + +from sklearn import neighbors, datasets + +iris = datasets.load_iris() +X, y = iris.data, iris.target + +# create the model +knn = neighbors.KNeighborsClassifier(n_neighbors=5) + +# fit the model +knn.fit(X, y) + +# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal? +# call the "predict" method: +result = knn.predict([[3, 5, 4, 2],]) + +print(iris.target_names[result]) + + + +import pandas as pd +import numpy as np +from sklearn import linear_model +from sklearn.cross_validation import train_test_split +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import Pipeline +from sklearn.feature_extraction.text import TfidfTransformer + + + +pipe = Pipeline([('bag_of_words', CountVectorizer()), + ('tfidf', TfidfTransformer()), + ('bayes', MultinomialNB())]) + +pipe.fit(contents, ltype) + +print(pipe.score(contents, ltype)) + +print(pipe.predict(testcont)) From aaa83f0155e13e4fcd1cbd3b52db92a9850d78df Mon Sep 17 00:00:00 2001 From: Alan R Date: Thu, 4 Jun 2015 17:14:39 -0400 Subject: [PATCH 2/9] added tests, close to custom features but not quite --- .envrc | 1 + .gitignore | 2 + lclassifier.py | 254 ++++++++++++++++++++++++++++++++------------ test_lclassifier.py | 9 ++ 4 files changed, 198 insertions(+), 68 deletions(-) create mode 100644 .envrc create mode 100644 test_lclassifier.py diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..94840b3 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +layout python3 diff --git a/.gitignore b/.gitignore index 02937f0..6d081c5 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,5 @@ target/ .DS_store benchmarksgame-2014-08-31/ + +.direnv/ diff --git a/lclassifier.py b/lclassifier.py index e7f5e23..6c82da0 100644 --- a/lclassifier.py +++ b/lclassifier.py @@ -1,60 +1,4 @@ from glob import glob - -l1 = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/*") -filelist = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/*/*.*") - -print(str(len(l1))+" l2 "+str(len(filelist))) - -contents = [] -ltype = [] -for filename in filelist: - if "ocaml-2" not in filename: - i = filename.index(".") - ltype.append(filename[i:]) - with open(filename) as file: - contents.append(file.read()) - -testcont = [] -testlist = glob("test/*") -for filename in testlist: - print(filename) - with open(filename) as file: - testcont.append(file.read()) - -print(" ") -print(ltype) -print(" ") -#print(testcont[15]) -#print(testlist) - -#from scikit-learn.datasets import load_iris -from sklearn import datasets -iris = datasets.load_iris() -print(iris.keys()) -print(" ") -#print(iris.data) -print(" ") -print(iris.target) - -from sklearn import neighbors, datasets - -iris = datasets.load_iris() -X, y = iris.data, iris.target - -# create the model -knn = neighbors.KNeighborsClassifier(n_neighbors=5) - -# fit the model -knn.fit(X, y) - -# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal? -# call the "predict" method: -result = knn.predict([[3, 5, 4, 2],]) - -print(iris.target_names[result]) - - - import pandas as pd import numpy as np from sklearn import linear_model @@ -63,15 +7,189 @@ from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import Pipeline from sklearn.feature_extraction.text import TfidfTransformer - - - -pipe = Pipeline([('bag_of_words', CountVectorizer()), - ('tfidf', TfidfTransformer()), - ('bayes', MultinomialNB())]) - -pipe.fit(contents, ltype) - -print(pipe.score(contents, ltype)) - -print(pipe.predict(testcont)) +import csv + + +def acceptable_file(text): + if text in llist: + return True + else: + return False + +def clean_ext(text): + if text == "gcc" or text == "h" or text == "gpp": + return "c" + elif text == "hack": + return "php" + elif text == "yarv" or text == "jruby": + return "ruby" + elif text == "clojure": + return "clj" + elif text == "python3" and text == "python": + return "py" + elif text == "perl": + return "pl" + elif text == "javascript": + return "js" + elif text == "csharp": + return "cs" + elif text == "ghc": + return "hs" + elif text == "scheme": + return "racket" + else: + return text + +llist = ["c", "cs", "sbcl", "clj", "hs", "java", "js", + "ocaml", "pl", "php", "py", "ruby", "scala", "racket"] + +def load_file_names(): + l = [0 for i in range(5)] + s = "benchmarksgame-2014-08-31/benchmarksgame/" + max_lvl = 5 + for i in range(max_lvl): + l[i] = glob(s+"*/"*i+"*.*") +# l[0] = glob("benchmarksgame-2014-08-31/benchmarksgame/*/*/*/*/*.*") +# l2 = glob("benchmarksgame-2014-08-31/benchmarksgame/bench/*/*/*.*") +# filelist = l1 + l2 + filelist = [] + for i in range(max_lvl): + filelist += l[i] + testlist = glob("test/*") + + print(" total samples "+str(len(filelist))) + return filelist, testlist + + +def load_files(filelist, testlist): + contents = [] + ltype = [] + ext_list = [] + for filename in filelist: + i = filename.rfind(".") + ext = clean_ext(filename[i+1:]) + # print(ext, end=" - ") + # print(ext+ str(ext in ext_list) + " - "+str(ext_list)) + if not ext in ext_list: + ext_list.append(ext) + if acceptable_file(ext): + ltype.append(ext) + with open(filename, encoding="ISO-8859-1") as file: + # print(filename) + contents.append(file.read()) +# return contents, ltype + + print(" number of usable files "+str(len(ltype))) + print(" summary of tile types") + for ext in ext_list: + print(ext.ljust(12)+ " ", end=" ") + if ext in llist: + print(ltype.count(ext), end=" ") + print(" ") + print(" not included: ", end="") + for ext in llist: + if ext not in ext_list: + print(ext, end=" : ") + print(" ") + + testcont = [] + for filename in testlist: + # print(filename) + with open(filename) as file: + testcont.append(file.read()) + + print(" ") + return contents, ltype, testcont + #print(testcont[15]) + #print(testlist) + +def read_answers(): + with open("test.csv") as csvfile: + ans_list = csv.reader(csvfile, delimiter=",") + ans = [] + print(ans_list) + for row in ans_list: + ans.append(clean_ext(row[1])) + return ans + + +def fit1(contents, ltype): + pipe = Pipeline([('bag_of_words', CountVectorizer()), + ('tfidf', TfidfTransformer()), + ('bayes', MultinomialNB())]) + pipe.fit(contents, ltype) + return pipe +# print(pipe.score(contents, ltype)) +# print(pipe.predict(testcont)) +# return pipe.score(contents, ltype) + + +def fit2(contents, ltype): + pipe = Pipeline([('bag_of_words', CountVectorizer()), +# ('tfidf', TfidfTransformer()), + ('bayes', MultinomialNB())]) + pipe.fit(contents, ltype) + return pipe +# print(pipe.score(contents, ltype)) +# print(pipe.predict(testcont)) +# return pipe.score(contents, ltype) + + +class CustomFeaturizer: + def __init__(self, *featurizers): + self.featurizers = featurizers + + def fit(self, X, y=None): + """All scikit-lear compatible transforms and classifiers have the same interface, and + fit always returns the same object.""" + return self + + def transform(self, X): + fvs = [] + for datum in X: + fvs.append([f(datum) for f in self.featurizers]) + return fvs + + +def fit3(contents, ltype): + pipe = Pipeline([('custom_feature', CustomFeaturizer()), + ('bayes', MultinomialNB())]) + MultinomialNB() + model = MultinomialNB(X, y) + pipe.fit(contents, ltype) + return pipe + + +#sms_featurizer = CustomFeaturizer(longest_run_of_capital_letters_feature, +# percent_periods_feature) +#big_list = sms_featurizer.transform(sms_data[:10]) +#print(big_list) + +if __name__ == "__main__": + filelist, testlist = load_file_names() + contents, ltype, testcont = load_files(filelist, testlist) + + plist = [fit1, fit2] + + pipel = [0 for i in range(len(plist))] + for i in range(len(plist)): + pipel[i] = plist[i](contents, ltype) + #pipe1 = fit1(contents, ltype) + #pipe2 = fit2(contents, ltype) + + ans = read_answers() + print(ans) + + i = 0 + for pipe in pipel: + i += 1 + print(" score_train "+str(i)+" "+str(pipe.score(contents, ltype))) + print(" pred "+str(i)+" "+str(pipe.predict(testlist))) + print(" score_test "+str(i)+" "+str(pipe.score(testlist, ans))) + #print(" score2 "+str(pipe2.score(contents, ltype))) + + #print(" pred1 "+str(pipe1.predict(testlist))) + #print(" pred2 "+str(pipe2.predict(testlist))) + + #print(" score1 "+str(pipe1.score(testlist, ans))) + #print(" score2 "+str(pipe2.score(testlist, ans))) diff --git a/test_lclassifier.py b/test_lclassifier.py new file mode 100644 index 0000000..431962e --- /dev/null +++ b/test_lclassifier.py @@ -0,0 +1,9 @@ +from lclassifier import * + +def test_ext(): + ext = "cowboy" + assert acceptable_file(ext) == False + +def test_correct_ext(): + ext = "perl" + assert clean_ext(ext) == "pl" From e530c5ae39ee73cba4aec88e9520813c77ae28dd Mon Sep 17 00:00:00 2001 From: Alan R Date: Thu, 4 Jun 2015 22:14:23 -0400 Subject: [PATCH 3/9] custom featurizer running --- lclassifier.py | 66 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/lclassifier.py b/lclassifier.py index 6c82da0..3a6d882 100644 --- a/lclassifier.py +++ b/lclassifier.py @@ -5,9 +5,11 @@ from sklearn.cross_validation import train_test_split from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB -from sklearn.pipeline import Pipeline +from sklearn.tree import DecisionTreeClassifier +from sklearn.pipeline import Pipeline, make_pipeline from sklearn.feature_extraction.text import TfidfTransformer import csv +import re def acceptable_file(text): @@ -134,29 +136,56 @@ def fit2(contents, ltype): # print(pipe.predict(testcont)) # return pipe.score(contents, ltype) +def print_matrix(matrix, p_max=None): + if p_max is None: + upper_limit = len(matrix) + else: + upper_limit = p_max + for i in range(upper_limit): + vector = matrix[i] + for val in vector: + print(str(round(val, 3)).ljust(5)+",", end="") + print("") + #print([str(round(val, 3)) for val in vector]) + class CustomFeaturizer: - def __init__(self, *featurizers): - self.featurizers = featurizers + def __init__(self): + pass + #self.featurizers = featurizers def fit(self, X, y=None): - """All scikit-lear compatible transforms and classifiers have the same interface, and - fit always returns the same object.""" + """All scikit-lear compatible transforms and classifiers have the + same interface, and fit always returns the same object.""" return self def transform(self, X): - fvs = [] - for datum in X: - fvs.append([f(datum) for f in self.featurizers]) - return fvs + reg_list = ["^#", "-\>", "\{", "\$", "let", "def", + "private", "static", "\<", "\[", "func\b", + "this\."] + matrix = [] + for text in X: + vector = [] + for reg_expr in reg_list: + prog = re.compile(reg_expr) + vector.append(len(prog.findall(text))/len(text)) + matrix.append(vector) + return matrix def fit3(contents, ltype): - pipe = Pipeline([('custom_feature', CustomFeaturizer()), - ('bayes', MultinomialNB())]) - MultinomialNB() - model = MultinomialNB(X, y) +# pipe = Pipeline([('custom_feature', CustomFeaturizer()), +# ('bayes', MultinomialNB())]) +# MultinomialNB() +# model = MultinomialNB(X, y) +# pipe.fit(contents, ltype) + + custom_feature = CustomFeaturizer() +# custom_feature.fit(contents, ltype) + + pipe = make_pipeline(custom_feature, DecisionTreeClassifier()) pipe.fit(contents, ltype) + return pipe @@ -169,7 +198,7 @@ def fit3(contents, ltype): filelist, testlist = load_file_names() contents, ltype, testcont = load_files(filelist, testlist) - plist = [fit1, fit2] + plist = [fit1, fit2, fit3] pipel = [0 for i in range(len(plist))] for i in range(len(plist)): @@ -186,6 +215,15 @@ def fit3(contents, ltype): print(" score_train "+str(i)+" "+str(pipe.score(contents, ltype))) print(" pred "+str(i)+" "+str(pipe.predict(testlist))) print(" score_test "+str(i)+" "+str(pipe.score(testlist, ans))) + + word_list = re.findall(r"^#", "# include ") + print(word_list) + print(len(word_list)) + + + custom_feature = CustomFeaturizer() + matrix = custom_feature.transform(contents) + print_matrix(matrix, 10) #print(" score2 "+str(pipe2.score(contents, ltype))) #print(" pred1 "+str(pipe1.predict(testlist))) From 4f03b3897a8dcfeb30c2374775d3e9dadde78771 Mon Sep 17 00:00:00 2001 From: Alan R Date: Thu, 4 Jun 2015 23:27:41 -0400 Subject: [PATCH 4/9] massive list of regex, improved but method showing limits --- lclassifier.py | 70 ++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/lclassifier.py b/lclassifier.py index 3a6d882..286c893 100644 --- a/lclassifier.py +++ b/lclassifier.py @@ -4,10 +4,13 @@ from sklearn import linear_model from sklearn.cross_validation import train_test_split from sklearn.feature_extraction.text import CountVectorizer -from sklearn.naive_bayes import MultinomialNB -from sklearn.tree import DecisionTreeClassifier from sklearn.pipeline import Pipeline, make_pipeline from sklearn.feature_extraction.text import TfidfTransformer +# estimators +from sklearn.naive_bayes import MultinomialNB +from sklearn.tree import DecisionTreeClassifier +from sklearn.linear_model import SGDClassifier +# other utilities import csv import re @@ -160,32 +163,48 @@ def fit(self, X, y=None): return self def transform(self, X): - reg_list = ["^#", "-\>", "\{", "\$", "let", "def", - "private", "static", "\<", "\[", "func\b", - "this\."] + char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b", + "this\.", "^end", ";", "\*", "%", "^do", + "\<\$php", "/\*", "__", "=", "==", + "===", "\(\)", "\{\}", ":", "\+\+", "\+=", + "^#include", "^ \*", ":\s*$", "\<\<|\>\>", + "int", "\b\*\w", "\(&\w", "argv", "\[\]" + "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w", + "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{", + "\(\w+:", "@", "\b@\w"] + word_list = ["private", "static", "make","let", "def", "^\(defn", + "defn", "do", "class", "^function", "public", + "unset", "printf\(", "return", "NULL", "void", + "main\(", "main_", "void\s\*\w", "\{else\}", + "char", "array\(", "__init__", "__str__", "token", + "^import", "^from", "final", "val", "type", "package", + "object", "String", "string", "primitive", "fixnum", + "error", "try"] + reg_list = char_list + word_list matrix = [] for text in X: vector = [] for reg_expr in reg_list: prog = re.compile(reg_expr) - vector.append(len(prog.findall(text))/len(text)) + val = len(prog.findall(text))/len(text) + if val > 0: + val = 1 + vector.append(val) matrix.append(vector) return matrix def fit3(contents, ltype): -# pipe = Pipeline([('custom_feature', CustomFeaturizer()), -# ('bayes', MultinomialNB())]) -# MultinomialNB() -# model = MultinomialNB(X, y) -# pipe.fit(contents, ltype) - custom_feature = CustomFeaturizer() -# custom_feature.fit(contents, ltype) - pipe = make_pipeline(custom_feature, DecisionTreeClassifier()) pipe.fit(contents, ltype) + return pipe + +def fit4(contents, ltype): + custom_feature = CustomFeaturizer() + pipe = make_pipeline(custom_feature, SGDClassifier()) + pipe.fit(contents, ltype) return pipe @@ -198,11 +217,12 @@ def fit3(contents, ltype): filelist, testlist = load_file_names() contents, ltype, testcont = load_files(filelist, testlist) - plist = [fit1, fit2, fit3] + plist = [fit1, fit2, fit3, fit4] + X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33) pipel = [0 for i in range(len(plist))] for i in range(len(plist)): - pipel[i] = plist[i](contents, ltype) + pipel[i] = plist[i](X, y) #pipe1 = fit1(contents, ltype) #pipe2 = fit2(contents, ltype) @@ -212,22 +232,12 @@ def fit3(contents, ltype): i = 0 for pipe in pipel: i += 1 - print(" score_train "+str(i)+" "+str(pipe.score(contents, ltype))) + print(" score_train "+str(i)+" "+str(pipe.score(X, y))) + print(" score_test "+str(i)+" "+str(pipe.score(Xt, yt))) + print(" score_quest "+str(i)+" "+str(pipe.score(testlist, ans))) print(" pred "+str(i)+" "+str(pipe.predict(testlist))) - print(" score_test "+str(i)+" "+str(pipe.score(testlist, ans))) + print(" ") word_list = re.findall(r"^#", "# include ") print(word_list) print(len(word_list)) - - - custom_feature = CustomFeaturizer() - matrix = custom_feature.transform(contents) - print_matrix(matrix, 10) - #print(" score2 "+str(pipe2.score(contents, ltype))) - - #print(" pred1 "+str(pipe1.predict(testlist))) - #print(" pred2 "+str(pipe2.predict(testlist))) - - #print(" score1 "+str(pipe1.score(testlist, ans))) - #print(" score2 "+str(pipe2.score(testlist, ans))) From 5994e5a25a193e0df78010a39116fadb03a82eb0 Mon Sep 17 00:00:00 2001 From: Alan R Date: Fri, 5 Jun 2015 14:11:13 -0400 Subject: [PATCH 5/9] fixed careless error, classifying up to 50% now --- .gitignore | 1 + lclassifier.py | 96 ++++++++++++++++++++++++++++++++------------- test_lclassifier.py | 31 +++++++++++++++ 3 files changed, 100 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 6d081c5..a7c3822 100644 --- a/.gitignore +++ b/.gitignore @@ -69,3 +69,4 @@ target/ benchmarksgame-2014-08-31/ .direnv/ +test_w_ext/ diff --git a/lclassifier.py b/lclassifier.py index 286c893..962743c 100644 --- a/lclassifier.py +++ b/lclassifier.py @@ -10,6 +10,7 @@ from sklearn.naive_bayes import MultinomialNB from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import SGDClassifier +from sklearn.ensemble import RandomForestClassifier # other utilities import csv import re @@ -50,8 +51,8 @@ def clean_ext(text): def load_file_names(): l = [0 for i in range(5)] - s = "benchmarksgame-2014-08-31/benchmarksgame/" - max_lvl = 5 + s = "benchmarksgame-2014-08-31/benchmarksgame/bench/" + max_lvl = 4 for i in range(max_lvl): l[i] = glob(s+"*/"*i+"*.*") # l[0] = glob("benchmarksgame-2014-08-31/benchmarksgame/*/*/*/*/*.*") @@ -97,12 +98,14 @@ def load_files(filelist, testlist): print(ext, end=" : ") print(" ") - testcont = [] + testcont = [0] * 32 for filename in testlist: # print(filename) with open(filename) as file: - testcont.append(file.read()) - + di = filename.rfind("/") + i = int(filename[di+1:]) + print(filename+" "+str(i)) + testcont[i-1] = file.read() print(" ") return contents, ltype, testcont #print(testcont[15]) @@ -115,6 +118,7 @@ def read_answers(): print(ans_list) for row in ans_list: ans.append(clean_ext(row[1])) +# print(row[0]) return ans @@ -163,32 +167,52 @@ def fit(self, X, y=None): return self def transform(self, X): - char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b", - "this\.", "^end", ";", "\*", "%", "^do", - "\<\$php", "/\*", "__", "=", "==", - "===", "\(\)", "\{\}", ":", "\+\+", "\+=", - "^#include", "^ \*", ":\s*$", "\<\<|\>\>", - "int", "\b\*\w", "\(&\w", "argv", "\[\]" - "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w", - "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{", - "\(\w+:", "@", "\b@\w"] - word_list = ["private", "static", "make","let", "def", "^\(defn", - "defn", "do", "class", "^function", "public", - "unset", "printf\(", "return", "NULL", "void", - "main\(", "main_", "void\s\*\w", "\{else\}", - "char", "array\(", "__init__", "__str__", "token", - "^import", "^from", "final", "val", "type", "package", - "object", "String", "string", "primitive", "fixnum", - "error", "try"] - reg_list = char_list + word_list + # char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b", + # "this\.", "^end", ";", "\*", "%", "^do", + # "\<\$php", "/\*", "__", "=", "==", + # "===", "\(\)", "\{\}", ":", "\+\+", "\+=", + # "^#include", "^ \*", ":\s*$", "\<\<|\>\>", + # "int", "\b\*\w", "\(&\w", "argv", "\[\]" + # "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w", + # "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{", + # "\(\w+:", "@", "\b@\w"] + # word_list = ["private", "static", "make","let", "def", "^\(defn", + # "defn", "do", "class", "^function", "public", + # "unset", "printf\(", "return", "NULL", "void", + # "main\(", "main_", "void\s\*\w", "\{else\}", + # "char", "array\(", "__init__", "__str__", "token", + # "^import", "^from", "final", "val", "type", "package", + # "object", "String", "string", "primitive", "fixnum", + # "error", "try"] + clojure = ["^\s*\(\w.*\s*$", "\(:\w+[]\s\w+]*\)"] + python = ["\):[ \t]*\n[ \t]*\w", "\s__\w*__\(", "(^from|^import)\s", + "def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"] + js = ["^[ \t]var", "=\s*function", + "function\s*\w*\(\w*[\w\s,]*\)\s*\{"] + ruby = ["^[ \t]*end$", "^[ \t]*def *\w*(\(\w*\))?[ \t]*$", + "^[ \t]*include \w*[ \t]*$", "^[ \t]*@", "super"] + hs = ["&&&", "^\{-"] + clj = ["^\(define", "^[ \t]*;+"] + java = ["^[ \t]*public \w* \w*", "^[ \t]*\*", "^[ \t]*/\*\*"] + scl = ["^[ \t]*object \w*", "^[ \t]*(final)?val \w* ="] + tcl = ["^[ \t]*proc \w*::\w* \{"] + php = ["^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*", + "^[ \t]*\$\w* ?=.*;$"] + ocaml = ["^[ \t]*let \w+", "^[ \t]*struct[ \t]*$"] + perl = ["^[ \t]*my ", "^[ \t]*sub \w* \{"] + gcc = ["^[ \t]*typedef \w* \w* ?\{", "^#include ?\<"] +# reg_list = char_list + word_list + reg_list = clojure + python + js + ruby + hs + clj + java + scl\ + + tcl + php + ocaml + perl + gcc matrix = [] for text in X: vector = [] for reg_expr in reg_list: - prog = re.compile(reg_expr) +# print(reg_expr) + prog = re.compile(reg_expr, flags=re.MULTILINE) val = len(prog.findall(text))/len(text) - if val > 0: - val = 1 + #if val > 0: + # val = 1 vector.append(val) matrix.append(vector) return matrix @@ -208,6 +232,19 @@ def fit4(contents, ltype): return pipe +def fit4(contents, ltype): + custom_feature = CustomFeaturizer() + pipe = make_pipeline(custom_feature, MultinomialNB()) + pipe.fit(contents, ltype) + return pipe + + +def fit4(contents, ltype): + custom_feature = CustomFeaturizer() + pipe = make_pipeline(custom_feature, RandomForestClassifier()) + pipe.fit(contents, ltype) + return pipe + #sms_featurizer = CustomFeaturizer(longest_run_of_capital_letters_feature, # percent_periods_feature) #big_list = sms_featurizer.transform(sms_data[:10]) @@ -225,6 +262,9 @@ def fit4(contents, ltype): pipel[i] = plist[i](X, y) #pipe1 = fit1(contents, ltype) #pipe2 = fit2(contents, ltype) + pipe = fit4(X, y) + #print(pipe.transform(testlist)) + #print(testcont) ans = read_answers() print(ans) @@ -234,8 +274,8 @@ def fit4(contents, ltype): i += 1 print(" score_train "+str(i)+" "+str(pipe.score(X, y))) print(" score_test "+str(i)+" "+str(pipe.score(Xt, yt))) - print(" score_quest "+str(i)+" "+str(pipe.score(testlist, ans))) - print(" pred "+str(i)+" "+str(pipe.predict(testlist))) + print(" score_quest "+str(i)+" "+str(pipe.score(testcont, ans))) + print(" pred "+str(i)+" "+str(pipe.predict(testcont))) print(" ") word_list = re.findall(r"^#", "# include ") diff --git a/test_lclassifier.py b/test_lclassifier.py index 431962e..502b789 100644 --- a/test_lclassifier.py +++ b/test_lclassifier.py @@ -7,3 +7,34 @@ def test_ext(): def test_correct_ext(): ext = "perl" assert clean_ext(ext) == "pl" + +def test_reg_use(): + reg_expr = "\s__\w*__\(" + prog = re.compile(reg_expr) + text ='''import packlag +def __init__(self): + var = thing''' + val = prog.findall(text) + print(val) + assert len(val) == 1 + + reg_expr = "\):[ \t]*\n[ \t]*\w" + prog = re.compile(reg_expr) + val = prog.findall(text) + print(val) + assert len(val) == 1 + + reg_expr = "(^from|^import)\s" + prog = re.compile(reg_expr) + val = prog.findall(text) + print(val) + assert len(val) == 1 + + textjs = '''function noAction() { + } + ''' + reg_expr = "function\s*\w*\(\w*[\w\s,]*\)\s*\{" + prog = re.compile(reg_expr) + val = prog.findall(textjs) + print(val) + assert len(val) == 1 From 585fe3a93b0f9d05034a3823c2f26858dcbbf510 Mon Sep 17 00:00:00 2001 From: Alan R Date: Sat, 6 Jun 2015 10:26:13 -0400 Subject: [PATCH 6/9] match rates up to 70%, organized directory structure --- Lang_classifier_use.ipynb | 186 ++++++++++++++++++ lclassifier.py => lclassifier/lclassifier.py | 104 ++++++++-- lclassifier/output.txt | 129 ++++++++++++ .../tests/test_lclassifier.py | 0 ref_program.py | 54 +++++ 5 files changed, 454 insertions(+), 19 deletions(-) create mode 100644 Lang_classifier_use.ipynb rename lclassifier.py => lclassifier/lclassifier.py (76%) create mode 100644 lclassifier/output.txt rename test_lclassifier.py => lclassifier/tests/test_lclassifier.py (100%) create mode 100644 ref_program.py diff --git a/Lang_classifier_use.ipynb b/Lang_classifier_use.ipynb new file mode 100644 index 0000000..3f20fdf --- /dev/null +++ b/Lang_classifier_use.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from lclassifier.lclassifier import *" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "acceptable_file(\"py\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " total samples 931\n", + " number of usable files 656\n", + " \n", + " number of read file types: 32\n", + " number of recognized types: 14\n", + " summary of tile types\n", + "ats \n", + "clj 38 \n", + "cs 41 \n", + "dart \n", + "erlang \n", + "fpascal \n", + "fsharp \n", + "c 129 \n", + "hs 33 \n", + "gnat \n", + "go \n", + "php 55 \n", + "ifc \n", + "java 51 \n", + "js 25 \n", + "ruby 73 \n", + "lua \n", + "ocaml 35 \n", + "oz \n", + "pl 34 \n", + "py 36 \n", + "racket 29 \n", + "rust \n", + "sbcl 34 \n", + "scala 43 \n", + "vw \n", + "cint \n", + "javasteady \n", + "parrot \n", + "cc \n", + "txt \n", + "ozf \n", + " not included: \n", + " \n" + ] + } + ], + "source": [ + "filelist, testlist = load_file_names()\n", + "contents, ltype, testcont = load_files(filelist, testlist)\n", + "\n", + "plist = [fit2, fit3, fit4, fit5, fit6]\n", + "\n", + "X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pipe = fit5(Xt, yt)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['hs', 'c', 'java', 'ruby', 'clj', 'hs', 'racket', 'c', 'clj',\n", + " 'ruby', 'c', 'pl', 'c', 'c', 'scala', 'ruby', 'cs', 'c', 'cs',\n", + " 'sbcl', 'java', 'ruby', 'pl', 'ocaml', 'ocaml', 'cs', 'hs', 'php',\n", + " 'c', 'pl', 'c', 'scala', 'ruby', 'c', 'cs', 'c', 'ocaml', 'hs',\n", + " 'scala', 'c', 'hs', 'ruby', 'c', 'c', 'ocaml', 'sbcl', 'ruby', 'c',\n", + " 'c', 'ruby', 'c', 'ocaml', 'java', 'c', 'ruby', 'ruby', 'php', 'c',\n", + " 'clj', 'cs', 'php', 'java', 'c', 'ruby', 'py', 'cs', 'scala', 'php',\n", + " 'c', 'js', 'cs', 'js', 'c', 'php', 'php', 'php', 'pl', 'c', 'ruby',\n", + " 'clj', 'php', 'c', 'pl', 'py', 'clj', 'c', 'c', 'py', 'sbcl', 'py',\n", + " 'ruby', 'php', 'py', 'php', 'c', 'php', 'ruby', 'ruby', 'ruby',\n", + " 'scala', 'py', 'ruby', 'clj', 'php', 'ruby', 'c', 'ocaml', 'racket',\n", + " 'php', 'hs', 'hs', 'sbcl', 'ocaml', 'py', 'scala', 'ruby', 'cs',\n", + " 'c', 'c', 'c', 'c', 'clj', 'sbcl', 'scala', 'cs', 'py', 'c', 'cs',\n", + " 'cs', 'hs', 'c', 'java', 'php', 'java', 'js', 'clj', 'ruby', 'c',\n", + " 'hs', 'ruby', 'c', 'php', 'py', 'scala', 'clj', 'cs', 'cs', 'ruby',\n", + " 'sbcl', 'cs', 'scala', 'cs', 'c', 'scala', 'clj', 'c', 'clj', 'pl',\n", + " 'ruby', 'racket', 'java', 'cs', 'js', 'ocaml', 'c', 'py', 'c',\n", + " 'scala', 'js', 'clj', 'c', 'clj', 'ruby', 'clj', 'racket', 'c',\n", + " 'ocaml', 'js', 'pl', 'java', 'hs', 'java', 'py', 'php', 'java',\n", + " 'ruby', 'sbcl', 'ruby', 'php', 'scala', 'py', 'c', 'racket', 'php',\n", + " 'c', 'js', 'java', 'php', 'java', 'pl', 'c', 'py', 'php', 'py', 'c',\n", + " 'cs', 'py', 'cs', 'c', 'c', 'clj', 'java', 'ocaml', 'cs', 'java',\n", + " 'ocaml', 'cs'], \n", + " dtype=' 0: # val = 1 - vector.append(val) - matrix.append(vector) +# print(i) + v[i] = val +# print(vector) + matrix.append(v) +# print(matrix[0]) return matrix @@ -232,19 +254,40 @@ def fit4(contents, ltype): return pipe -def fit4(contents, ltype): +def fit5(contents, ltype): custom_feature = CustomFeaturizer() pipe = make_pipeline(custom_feature, MultinomialNB()) pipe.fit(contents, ltype) return pipe -def fit4(contents, ltype): +def fit6(contents, ltype): custom_feature = CustomFeaturizer() pipe = make_pipeline(custom_feature, RandomForestClassifier()) pipe.fit(contents, ltype) return pipe + +def demo_class(X, y): + types = [] + for ext in y: + if ext not in types: + types.append(ext) + typecont = [""] * len(types) + for i in range(len(X)): + text = X[i] + for j in range(len(types)): + ext = types[j] + if ext == y[i]: + typecont[j] += text + custom_feature = CustomFeaturizer() + M = custom_feature.transform(typecont) + for j in range(len(M)): + print(types[j].ljust(8)+" ", end="") + for k in range(len(M[0])): + print(str(int(M[j][k])).ljust(5), end="") + print("") + #sms_featurizer = CustomFeaturizer(longest_run_of_capital_letters_feature, # percent_periods_feature) #big_list = sms_featurizer.transform(sms_data[:10]) @@ -254,7 +297,7 @@ def fit4(contents, ltype): filelist, testlist = load_file_names() contents, ltype, testcont = load_files(filelist, testlist) - plist = [fit1, fit2, fit3, fit4] + plist = [fit2, fit3, fit4, fit5, fit6] X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33) pipel = [0 for i in range(len(plist))] @@ -263,7 +306,28 @@ def fit4(contents, ltype): #pipe1 = fit1(contents, ltype) #pipe2 = fit2(contents, ltype) pipe = fit4(X, y) - #print(pipe.transform(testlist)) + M = pipe.transform(testcont) + print(str(len(M))+" "+str(len(M[0]))) +# print(M[0]) + M = pipe.transform(Xt) + print(str(len(M))+" "+str(len(M[0]))) + print(" failed to classify") + A = pipe.predict(X) + for i in range(len(A)): + if A[i] != y[i]: +# print(" ") + print(y[i].ljust(6)+" misclassified as "+A[i]) +# print(X[i]) +# print(M[0]) + + + cf = CustomFeaturizer() + M = cf.transform(testcont) + print(str(len(M))+" "+str(len(M[0]))) +# print(M[0]) + M = cf.transform(Xt) + print(str(len(M))+" "+str(len(M[0]))) +# print(M[0]) #print(testcont) ans = read_answers() @@ -281,3 +345,5 @@ def fit4(contents, ltype): word_list = re.findall(r"^#", "# include ") print(word_list) print(len(word_list)) + + demo_class(testcont, ans) diff --git a/lclassifier/output.txt b/lclassifier/output.txt new file mode 100644 index 0000000..987c632 --- /dev/null +++ b/lclassifier/output.txt @@ -0,0 +1,129 @@ + total samples 931 + number of usable files 656 + + number of read file types: 32 + number of recognized types: 14 + summary of tile types +ats +clj 38 +cs 41 +dart +erlang +fpascal +fsharp +c 129 +hs 33 +gnat +go +php 55 +ifc +java 51 +js 25 +ruby 73 +lua +ocaml 35 +oz +pl 34 +py 36 +racket 29 +rust +sbcl 34 +scala 43 +vw +cint +javasteady +parrot +cc +txt +ozf + not included: + +32 15 +217 15 + failed to classify +hs misclassified as py +racket misclassified as sbcl +racket misclassified as sbcl +racket misclassified as sbcl +clj misclassified as sbcl +racket misclassified as sbcl +hs misclassified as pl +racket misclassified as sbcl +scala misclassified as c +clj misclassified as sbcl +racket misclassified as sbcl +hs misclassified as py +hs misclassified as py +racket misclassified as sbcl +cs misclassified as js +clj misclassified as sbcl +cs misclassified as py +scala misclassified as py +clj misclassified as sbcl +racket misclassified as sbcl +racket misclassified as sbcl +ruby misclassified as racket +racket misclassified as sbcl +scala misclassified as py +clj misclassified as sbcl +js misclassified as racket +hs misclassified as pl +hs misclassified as sbcl +racket misclassified as sbcl +racket misclassified as sbcl +js misclassified as racket +scala misclassified as py +racket misclassified as sbcl +32 36 +217 36 +<_csv.reader object at 0x10855d908> + number of testing file types: 11 +['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml'] + score_train 1 0.986332574032 + score_test 1 0.898617511521 + score_quest 1 0.59375 + pred 1 ['clj' 'clj' 'clj' 'ruby' 'py' 'py' 'ruby' 'py' 'js' 'js' 'clj' 'php' + 'ruby' 'clj' 'ruby' 'hs' 'hs' 'ruby' 'sbcl' 'racket' 'racket' 'java' + 'ruby' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'php' 'ocaml' 'ocaml'] + + score_train 2 1.0 + score_test 2 0.972350230415 + score_quest 2 0.65625 + pred 2 ['clj' 'clj' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'ruby' 'ruby' 'sbcl' 'java' 'java' 'scala' + 'scala' 'ruby' 'ruby' 'php' 'php' 'java' 'ocaml' 'ruby'] + + score_train 3 0.833712984055 + score_test 3 0.78801843318 + score_quest 3 0.71875 + pred 3 ['clj' 'clj' 'clj' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'racket' 'hs' 'scala' 'racket' 'scala' 'java' 'racket' + 'scala' 'scala' 'racket' 'racket' 'java' 'php' 'php' 'ocaml' 'ocaml'] + + score_train 4 0.974943052392 + score_test 4 0.981566820276 + score_quest 4 0.71875 + pred 4 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'java' 'c' 'scala' + 'scala' 'c' 'c' 'java' 'php' 'js' 'ocaml' 'ocaml'] + + score_train 5 1.0 + score_test 5 0.976958525346 + score_quest 5 0.59375 + pred 5 ['ruby' 'ruby' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'ruby' 'ruby' 'java' 'java' 'hs' + 'scala' 'ruby' 'ruby' 'php' 'php' 'cs' 'ocaml' 'ruby'] + +['#'] +1 +clj 10 4 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +py 0 0 0 74 18 10 60 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +js 1 2 0 0 0 0 0 42 20 86 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 +ruby 0 0 0 0 0 0 0 0 0 0 20 11 3 7 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +haskell 0 0 0 0 0 59 0 0 0 0 0 0 0 0 0 1 10 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 +racket 126 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 52 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +java 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 78 13 +scala 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 7 0 0 0 0 0 0 0 0 0 0 2 2 0 +tcl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 +php 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 17 0 0 0 0 11 5 0 0 0 0 0 0 0 0 92 26 +ocaml 6 0 0 0 0 0 0 0 4 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 1 0 0 0 0 0 0 1 0 diff --git a/test_lclassifier.py b/lclassifier/tests/test_lclassifier.py similarity index 100% rename from test_lclassifier.py rename to lclassifier/tests/test_lclassifier.py diff --git a/ref_program.py b/ref_program.py new file mode 100644 index 0000000..d6b7a6d --- /dev/null +++ b/ref_program.py @@ -0,0 +1,54 @@ +import csv +import re +import numpy as np +import random + +#from textblob import TextBlob +from collections import Counter + +from sklearn.pipeline import make_pipeline, make_union +from sklearn.base import TransformerMixin +from sklearn.tree import DecisionTreeClassifier +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.cross_validation import train_test_split +from sklearn.metrics import classification_report, confusion_matrix + + +class DumbFeaturizer(TransformerMixin): + def __init__(self): + pass + + def fit(self, X, y=None): + return self + + def transform(self, X): + matrix = [] + for i in range(len(X)): + vector = [] + for j in range(11): + if j == X[i]: + vector.append(1) + else: + vector.append(0) + matrix.append(vector) + return matrix + +N = 22 +y = [0] * N +X = [0] * N +for k in range(N): + val = random.randrange(11) + y[k] = val + X[k] = val + + +dumb = DumbFeaturizer() +print(dumb.transform(X)) + +pipe = make_pipeline(dumb, DecisionTreeClassifier()) +pipe.fit(X, y) +# Our baseline +print(pipe.score(X, y)) +print(" ") +print(" transform ") +print(pipe.transform(X)) From cef4b3df3857713f5d1b1d8a8053f047c5bb7804 Mon Sep 17 00:00:00 2001 From: Alan R Date: Sat, 6 Jun 2015 23:49:09 -0400 Subject: [PATCH 7/9] added command line argument feature, trying to wrap up --- lclassifier/lclassifier.py | 93 ++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 40 deletions(-) diff --git a/lclassifier/lclassifier.py b/lclassifier/lclassifier.py index bd14944..3d5502e 100644 --- a/lclassifier/lclassifier.py +++ b/lclassifier/lclassifier.py @@ -15,6 +15,7 @@ # other utilities import csv import re +import sys def acceptable_file(text): @@ -60,7 +61,7 @@ def list_uniques(alist): def load_file_names(): l = [0 for i in range(5)] - s = "benchmarksgame-2014-08-31/benchmarksgame/bench/" + s = "../benchmarksgame-2014-08-31/benchmarksgame/bench/" max_lvl = 4 for i in range(max_lvl): l[i] = glob(s+"*/"*i+"*.*") @@ -70,7 +71,7 @@ def load_file_names(): filelist = [] for i in range(max_lvl): filelist += l[i] - testlist = glob("test/*") + testlist = glob("../test/*") print(" total samples "+str(len(filelist))) return filelist, testlist @@ -125,7 +126,7 @@ def load_files(filelist, testlist): #print(testlist) def read_answers(): - with open("test.csv") as csvfile: + with open("../test.csv") as csvfile: ans_list = csv.reader(csvfile, delimiter=",") ans = [] print(ans_list) @@ -224,19 +225,14 @@ def transform(self, X): matrix = [] for text in X: v = [0] * len(reg_list) -# print(str(len(v))+" "+str(len(reg_list))) for i in range(len(reg_list)): -# print(reg_expr) reg_expr = reg_list[i] prog = re.compile(reg_expr, flags=re.MULTILINE) val = len(prog.findall(text))#/len(text) #if val > 0: # val = 1 -# print(i) v[i] = val -# print(vector) matrix.append(v) -# print(matrix[0]) return matrix @@ -288,12 +284,8 @@ def demo_class(X, y): print(str(int(M[j][k])).ljust(5), end="") print("") -#sms_featurizer = CustomFeaturizer(longest_run_of_capital_letters_feature, -# percent_periods_feature) -#big_list = sms_featurizer.transform(sms_data[:10]) -#print(big_list) -if __name__ == "__main__": +def default_action(): filelist, testlist = load_file_names() contents, ltype, testcont = load_files(filelist, testlist) @@ -301,34 +293,40 @@ def demo_class(X, y): X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33) pipel = [0 for i in range(len(plist))] + print(" score for training_set test_set") + for i in range(len(plist)): + pipe = plist[i](X, y) + print(str(i).ljust(4)+" "+str(round(pipe.score(X, y),4)).ljust(8)\ + +str(round(pipe.score(Xt, yt),4)).ljust(8)) + print(" ") for i in range(len(plist)): - pipel[i] = plist[i](X, y) - #pipe1 = fit1(contents, ltype) - #pipe2 = fit2(contents, ltype) - pipe = fit4(X, y) - M = pipe.transform(testcont) - print(str(len(M))+" "+str(len(M[0]))) -# print(M[0]) - M = pipe.transform(Xt) - print(str(len(M))+" "+str(len(M[0]))) + pipel[i] = plist[i](contents, ltype) + print(" failed to classify") + failed_to_classify = {} + wrongly_classified = {} A = pipe.predict(X) for i in range(len(A)): if A[i] != y[i]: # print(" ") print(y[i].ljust(6)+" misclassified as "+A[i]) -# print(X[i]) -# print(M[0]) - - - cf = CustomFeaturizer() - M = cf.transform(testcont) - print(str(len(M))+" "+str(len(M[0]))) -# print(M[0]) - M = cf.transform(Xt) - print(str(len(M))+" "+str(len(M[0]))) -# print(M[0]) - #print(testcont) + if y[i] in failed_to_classify: + failed_to_classify[y[i]] += 1 + else: + failed_to_classify[y[i]] = 1 + if A[i] in wrongly_classified: + wrongly_classified[A[i]] += 1 + else: + wrongly_classified[A[i]] = 1 + print("") + print(" failure counts") + print(" wrongly classified:") + for ext in wrongly_classified: + print(ext.ljust(7) + "#"*wrongly_classified[ext]) + print(" failed to classify") + for ext in failed_to_classify: + print(ext.ljust(7) + "#"*failed_to_classify[ext]) + print(" ") ans = read_answers() print(ans) @@ -336,14 +334,29 @@ def demo_class(X, y): i = 0 for pipe in pipel: i += 1 - print(" score_train "+str(i)+" "+str(pipe.score(X, y))) - print(" score_test "+str(i)+" "+str(pipe.score(Xt, yt))) print(" score_quest "+str(i)+" "+str(pipe.score(testcont, ans))) print(" pred "+str(i)+" "+str(pipe.predict(testcont))) print(" ") - word_list = re.findall(r"^#", "# include ") - print(word_list) - print(len(word_list)) - demo_class(testcont, ans) + + +if __name__ == "__main__": + if len(sys.argv) == 1: + default_action() + elif len(sys.argv) == 2: + test_file = sys.argv[1] + print("Estimating file type of "+ test_file) + + filelist, testlist = load_file_names() + X, y, testcont = load_files(filelist, testlist) + pipe = fit6(X, y) + with open(test_file) as f: + test_contents = f.read() +# print(test_contents) + est_ext = pipe.predict([test_contents]) + + print("Predicted extension: "+str(est_ext)) + + else: + print("error: command line arguments not supported") From cfff3c0b5a4a7fa04703b8a3a319a7841325995b Mon Sep 17 00:00:00 2001 From: Alan R Date: Sun, 7 Jun 2015 10:26:33 -0400 Subject: [PATCH 8/9] polished workbook --- Lang_classifier_use.ipynb | 289 ++++++++++++++++++++++++++++++++----- lclassifier/lclassifier.py | 92 ++++++------ lclassifier/output.txt | 111 +++++--------- 3 files changed, 343 insertions(+), 149 deletions(-) diff --git a/Lang_classifier_use.ipynb b/Lang_classifier_use.ipynb index 3f20fdf..d66060b 100644 --- a/Lang_classifier_use.ipynb +++ b/Lang_classifier_use.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 13, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -11,9 +11,16 @@ "from lclassifier.lclassifier import *" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demonstration of Language Classifier (lclassifier)" + ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": { "collapsed": false }, @@ -24,18 +31,40 @@ "True" ] }, - "execution_count": 15, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "acceptable_file(\"py\")" + "acceptable_file(\"py\") # testing that import is functional" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'py'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_ext(\"python3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": { "collapsed": false }, @@ -98,18 +127,227 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pipe = fit6(Xt, yt)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "testing set outcomes\n", + "scala scala \n", + "c c \n", + "c c \n", + "clj clj \n", + "java java \n", + "py py \n", + "clj clj \n", + "js js \n", + "c c \n", + "pl pl \n", + "cs cs \n", + "c c \n", + "c c \n", + "ocaml ocaml \n", + "hs hs \n", + "sbcl sbcl \n", + "racket racket \n", + "php php \n", + "pl pl \n", + "ocaml ocaml \n", + "\n", + " overall score: 1.0\n" + ] + } + ], + "source": [ + "M = pipe.predict(Xt)\n", + "print(\"testing set outcomes\")\n", + "for i in range(20):\n", + " print(M[i].ljust(8)+ \" \" + yt[i].ljust(8))\n", + "print(\"\")\n", + "print(\" overall score: \"+str(pipe.score(Xt, yt)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## Test Data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['php', 'php', 'php', 'clj', 'py', 'py', 'py', 'py', 'js', 'js',\n", + " 'js', 'js', 'ruby', 'ruby', 'ruby', 'hs', 'php', 'hs', 'racket',\n", + " 'php', 'racket', 'java', 'java', 'scala', 'scala', 'php', 'php',\n", + " 'java', 'php', 'java', 'ocaml', 'php'], \n", + " dtype='\n", + " number of testing file types: 11\n", + " actual_file_type predicted_type\n", + "clj php \n", + "clj php \n", + "clj php \n", + "clj clj \n", + "py py \n", + "py py \n", + "py py \n", + "py py \n", + "js js \n", + "js js \n", + "js js \n", + "js js \n", + "ruby ruby \n", + "ruby ruby \n", + "ruby ruby \n", + "haskell hs \n", + "haskell php \n", + "haskell hs \n", + "racket racket \n", + "racket php \n", + "racket racket \n", + "java java \n", + "java java \n", + "scala scala \n", + "scala scala \n", + "tcl php \n", + "tcl php \n", + "php java \n", + "php php \n", + "php java \n", + "ocaml ocaml \n", + "ocaml php \n", + " \n", + " score: 0.625\n" + ] + } + ], + "source": [ + "ans = read_answers()\n", + "M = pipe.predict(testcont)\n", + "print(\" actual_file_type predicted_type\")\n", + "for i in range(len(ans)):\n", + " print(ans[i].ljust(10)+M[i].ljust(10))\n", + "print(\" \")\n", + "print(\" score: \"+str(pipe.score(testcont, ans)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This score isn't very good, but it would be difficult to match all these no matter what methods were being used due to the small quantity of training data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Single file demo" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "pipe = fit5(Xt, yt)" + "py_file = '''JOIN_RETRANSMIT = 0.7\n", + "CATCHUP_INTERVAL = 0.6\n", + "ACCEPT_RETRANSMIT = 1.0\n", + "PREPARE_RETRANSMIT = 1.0\n", + "INVOKE_RETRANSMIT = 0.5\n", + "LEADER_TIMEOUT = 1.0\n", + "NULL_BALLOT = Ballot(-1, -1) # sorts before all real ballots\n", + "NOOP_PROPOSAL = Proposal(None, None, None) # no-op to fill otherwise empty slots\n", + "\n", + "class Node(object):\n", + " unique_ids = itertools.count()\n", + "\n", + " def __init__(self, network, address):\n", + " self.network = network\n", + " self.address = address or 'N%d' % self.unique_ids.next()\n", + " self.logger = SimTimeLogger(logging.getLogger(self.address), {'network': self.network})\n", + " self.logger.info('starting')\n", + " self.roles = []\n", + " self.send = functools.partial(self.network.send, self)\n", + "\n", + " def register(self, roles):\n", + " self.roles.append(roles)\n", + "\n", + " def unregister(self, roles):\n", + " self.roles.remove(roles)\n", + "\n", + " def receive(self, sender, message):\n", + " handler_name = 'do_%s' % type(message).__name__\n", + "\n", + " for comp in self.roles[:]:\n", + " if not hasattr(comp, handler_name):\n", + " continue\n", + " comp.logger.debug(\"received %s from %s\", message, sender)\n", + " fn = getattr(comp, handler_name)\n", + " fn(sender=sender, **message._asdict())\n", + "\n", + "class Timer(object):\n", + "\n", + " def __init__(self, expires, address, callback):\n", + " self.expires = expires\n", + " self.address = address\n", + " self.callback = callback\n", + " self.cancelled = False'''" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -117,39 +355,24 @@ { "data": { "text/plain": [ - "array(['hs', 'c', 'java', 'ruby', 'clj', 'hs', 'racket', 'c', 'clj',\n", - " 'ruby', 'c', 'pl', 'c', 'c', 'scala', 'ruby', 'cs', 'c', 'cs',\n", - " 'sbcl', 'java', 'ruby', 'pl', 'ocaml', 'ocaml', 'cs', 'hs', 'php',\n", - " 'c', 'pl', 'c', 'scala', 'ruby', 'c', 'cs', 'c', 'ocaml', 'hs',\n", - " 'scala', 'c', 'hs', 'ruby', 'c', 'c', 'ocaml', 'sbcl', 'ruby', 'c',\n", - " 'c', 'ruby', 'c', 'ocaml', 'java', 'c', 'ruby', 'ruby', 'php', 'c',\n", - " 'clj', 'cs', 'php', 'java', 'c', 'ruby', 'py', 'cs', 'scala', 'php',\n", - " 'c', 'js', 'cs', 'js', 'c', 'php', 'php', 'php', 'pl', 'c', 'ruby',\n", - " 'clj', 'php', 'c', 'pl', 'py', 'clj', 'c', 'c', 'py', 'sbcl', 'py',\n", - " 'ruby', 'php', 'py', 'php', 'c', 'php', 'ruby', 'ruby', 'ruby',\n", - " 'scala', 'py', 'ruby', 'clj', 'php', 'ruby', 'c', 'ocaml', 'racket',\n", - " 'php', 'hs', 'hs', 'sbcl', 'ocaml', 'py', 'scala', 'ruby', 'cs',\n", - " 'c', 'c', 'c', 'c', 'clj', 'sbcl', 'scala', 'cs', 'py', 'c', 'cs',\n", - " 'cs', 'hs', 'c', 'java', 'php', 'java', 'js', 'clj', 'ruby', 'c',\n", - " 'hs', 'ruby', 'c', 'php', 'py', 'scala', 'clj', 'cs', 'cs', 'ruby',\n", - " 'sbcl', 'cs', 'scala', 'cs', 'c', 'scala', 'clj', 'c', 'clj', 'pl',\n", - " 'ruby', 'racket', 'java', 'cs', 'js', 'ocaml', 'c', 'py', 'c',\n", - " 'scala', 'js', 'clj', 'c', 'clj', 'ruby', 'clj', 'racket', 'c',\n", - " 'ocaml', 'js', 'pl', 'java', 'hs', 'java', 'py', 'php', 'java',\n", - " 'ruby', 'sbcl', 'ruby', 'php', 'scala', 'py', 'c', 'racket', 'php',\n", - " 'c', 'js', 'java', 'php', 'java', 'pl', 'c', 'py', 'php', 'py', 'c',\n", - " 'cs', 'py', 'cs', 'c', 'c', 'clj', 'java', 'ocaml', 'cs', 'java',\n", - " 'ocaml', 'cs'], \n", + "array(['py'], \n", " dtype=' 0: - # val = 1 + val = len(prog.findall(text)) # /len(text) + # this was found to have best results over normalized forms v[i] = val matrix.append(v) return matrix @@ -258,6 +264,8 @@ def fit5(contents, ltype): def fit6(contents, ltype): + '''Random Forest uses multiple decision trees and selects the + tree out of all of those which has occurs the most''' custom_feature = CustomFeaturizer() pipe = make_pipeline(custom_feature, RandomForestClassifier()) pipe.fit(contents, ltype) @@ -278,10 +286,11 @@ def demo_class(X, y): typecont[j] += text custom_feature = CustomFeaturizer() M = custom_feature.transform(typecont) + ratio = 1000 / max([max(vt) for vt in M]) for j in range(len(M)): - print(types[j].ljust(8)+" ", end="") + print(types[j].ljust(8) + " ", end="") for k in range(len(M[0])): - print(str(int(M[j][k])).ljust(5), end="") + print(str(int(ratio*M[j][k])).ljust(5), end="") print("") @@ -296,8 +305,8 @@ def default_action(): print(" score for training_set test_set") for i in range(len(plist)): pipe = plist[i](X, y) - print(str(i).ljust(4)+" "+str(round(pipe.score(X, y),4)).ljust(8)\ - +str(round(pipe.score(Xt, yt),4)).ljust(8)) + print(str(i).ljust(4) + " " + str(round(pipe.score(X, y), 4)).ljust(8) + + str(round(pipe.score(Xt, yt), 4)).ljust(8)) print(" ") for i in range(len(plist)): pipel[i] = plist[i](contents, ltype) @@ -308,8 +317,8 @@ def default_action(): A = pipe.predict(X) for i in range(len(A)): if A[i] != y[i]: -# print(" ") - print(y[i].ljust(6)+" misclassified as "+A[i]) + # print(" ") + print(y[i].ljust(6) + " misclassified as " + A[i]) if y[i] in failed_to_classify: failed_to_classify[y[i]] += 1 else: @@ -322,10 +331,10 @@ def default_action(): print(" failure counts") print(" wrongly classified:") for ext in wrongly_classified: - print(ext.ljust(7) + "#"*wrongly_classified[ext]) + print(ext.ljust(7) + "#" * wrongly_classified[ext]) print(" failed to classify") for ext in failed_to_classify: - print(ext.ljust(7) + "#"*failed_to_classify[ext]) + print(ext.ljust(7) + "#" * failed_to_classify[ext]) print(" ") ans = read_answers() @@ -334,8 +343,8 @@ def default_action(): i = 0 for pipe in pipel: i += 1 - print(" score_quest "+str(i)+" "+str(pipe.score(testcont, ans))) - print(" pred "+str(i)+" "+str(pipe.predict(testcont))) + print(" score_quest " + str(i) + " " + str(pipe.score(testcont, ans))) + print(" pred " + str(i) + " " + str(pipe.predict(testcont))) print(" ") demo_class(testcont, ans) @@ -346,17 +355,16 @@ def default_action(): default_action() elif len(sys.argv) == 2: test_file = sys.argv[1] - print("Estimating file type of "+ test_file) + print("Estimating file type of " + test_file) filelist, testlist = load_file_names() X, y, testcont = load_files(filelist, testlist) pipe = fit6(X, y) with open(test_file) as f: test_contents = f.read() -# print(test_contents) est_ext = pipe.predict([test_contents]) - print("Predicted extension: "+str(est_ext)) + print("Predicted extension: " + str(est_ext)) else: print("error: command line arguments not supported") diff --git a/lclassifier/output.txt b/lclassifier/output.txt index 987c632..577a52b 100644 --- a/lclassifier/output.txt +++ b/lclassifier/output.txt @@ -38,92 +38,55 @@ txt ozf not included: -32 15 -217 15 + score for training_set test_set +0 0.9818 0.871 +1 1.0 0.977 +2 0.9658 0.9355 +3 0.9795 0.9677 +4 1.0 0.977 + + failed to classify + + failure counts + wrongly classified: failed to classify -hs misclassified as py -racket misclassified as sbcl -racket misclassified as sbcl -racket misclassified as sbcl -clj misclassified as sbcl -racket misclassified as sbcl -hs misclassified as pl -racket misclassified as sbcl -scala misclassified as c -clj misclassified as sbcl -racket misclassified as sbcl -hs misclassified as py -hs misclassified as py -racket misclassified as sbcl -cs misclassified as js -clj misclassified as sbcl -cs misclassified as py -scala misclassified as py -clj misclassified as sbcl -racket misclassified as sbcl -racket misclassified as sbcl -ruby misclassified as racket -racket misclassified as sbcl -scala misclassified as py -clj misclassified as sbcl -js misclassified as racket -hs misclassified as pl -hs misclassified as sbcl -racket misclassified as sbcl -racket misclassified as sbcl -js misclassified as racket -scala misclassified as py -racket misclassified as sbcl -32 36 -217 36 -<_csv.reader object at 0x10855d908> + +<_csv.reader object at 0x113420a58> number of testing file types: 11 ['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml'] - score_train 1 0.986332574032 - score_test 1 0.898617511521 score_quest 1 0.59375 - pred 1 ['clj' 'clj' 'clj' 'ruby' 'py' 'py' 'ruby' 'py' 'js' 'js' 'clj' 'php' - 'ruby' 'clj' 'ruby' 'hs' 'hs' 'ruby' 'sbcl' 'racket' 'racket' 'java' - 'ruby' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'php' 'ocaml' 'ocaml'] + pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php' + 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java' + 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml'] - score_train 2 1.0 - score_test 2 0.972350230415 score_quest 2 0.65625 pred 2 ['clj' 'clj' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' - 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'ruby' 'ruby' 'sbcl' 'java' 'java' 'scala' - 'scala' 'ruby' 'ruby' 'php' 'php' 'java' 'ocaml' 'ruby'] + 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'racket' 'sbcl' 'java' 'java' + 'scala' 'scala' 'ruby' 'ruby' 'java' 'java' 'java' 'ocaml' 'ruby'] - score_train 3 0.833712984055 - score_test 3 0.78801843318 - score_quest 3 0.71875 - pred 3 ['clj' 'clj' 'clj' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' - 'ruby' 'ruby' 'hs' 'racket' 'hs' 'scala' 'racket' 'scala' 'java' 'racket' - 'scala' 'scala' 'racket' 'racket' 'java' 'php' 'php' 'ocaml' 'ocaml'] + score_quest 3 0.6875 + pred 3 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'racket' 'hs' 'racket' 'racket' 'racket' 'pl' 'pl' + 'scala' 'scala' 'racket' 'racket' 'pl' 'php' 'js' 'ocaml' 'ocaml'] - score_train 4 0.974943052392 - score_test 4 0.981566820276 score_quest 4 0.71875 pred 4 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'java' 'c' 'scala' - 'scala' 'c' 'c' 'java' 'php' 'js' 'ocaml' 'ocaml'] + 'scala' 'py' 'py' 'java' 'php' 'js' 'ocaml' 'ocaml'] - score_train 5 1.0 - score_test 5 0.976958525346 - score_quest 5 0.59375 + score_quest 5 0.625 pred 5 ['ruby' 'ruby' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' - 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'ruby' 'ruby' 'java' 'java' 'hs' - 'scala' 'ruby' 'ruby' 'php' 'php' 'cs' 'ocaml' 'ruby'] + 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'ruby' 'ruby' 'java' 'java' + 'scala' 'scala' 'ruby' 'ruby' 'java' 'php' 'php' 'ocaml' 'ruby'] -['#'] -1 -clj 10 4 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -py 0 0 0 74 18 10 60 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -js 1 2 0 0 0 0 0 42 20 86 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 -ruby 0 0 0 0 0 0 0 0 0 0 20 11 3 7 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -haskell 0 0 0 0 0 59 0 0 0 0 0 0 0 0 0 1 10 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 -racket 126 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 52 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -java 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 78 13 -scala 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 7 0 0 0 0 0 0 0 0 0 0 2 2 0 -tcl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 -php 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 17 0 0 0 0 11 5 0 0 0 0 0 0 0 0 92 26 -ocaml 6 0 0 0 0 0 0 0 4 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 1 0 0 0 0 0 0 1 0 +clj 79 31 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +py 0 0 0 587 142 79 476 0 0 0 0 0 0 0 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +js 7 15 0 0 0 0 0 333 158 682 0 0 0 0 0 0 0 0 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 0 +ruby 0 0 0 0 0 0 0 0 0 0 158 87 23 55 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +haskell 0 0 0 0 0 468 0 0 0 0 0 0 0 0 0 7 79 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 +racket 1000 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 412 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +java 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 619 103 +scala 7 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 55 0 0 0 0 0 0 0 0 0 0 15 15 0 +tcl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 0 0 0 0 0 0 0 0 0 0 0 0 +php 0 0 0 0 0 0 0 0 0 63 0 0 0 0 0 0 0 0 0 134 0 0 0 0 87 39 0 0 0 0 0 0 0 0 730 206 +ocaml 47 0 0 0 0 0 0 0 31 0 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 277 7 0 0 0 0 0 0 7 0 From 52c7a2563feeb9ee6b7e7b1ae05f6725f7ad85c6 Mon Sep 17 00:00:00 2001 From: Alan R Date: Sun, 7 Jun 2015 11:38:25 -0400 Subject: [PATCH 9/9] compared with Ben's rules and aux set of my rules --- lclassifier/ben_output.txt | 319 +++++++++++++++++++++++++++++++++++++ lclassifier/bens_rules.py | 19 +++ lclassifier/lclassifier.py | 150 +++++++++++------ lclassifier/old_output.txt | 96 +++++++++++ 4 files changed, 534 insertions(+), 50 deletions(-) create mode 100644 lclassifier/ben_output.txt create mode 100644 lclassifier/bens_rules.py create mode 100644 lclassifier/old_output.txt diff --git a/lclassifier/ben_output.txt b/lclassifier/ben_output.txt new file mode 100644 index 0000000..c8534c9 --- /dev/null +++ b/lclassifier/ben_output.txt @@ -0,0 +1,319 @@ + total samples 931 + number of usable files 656 + + number of read file types: 32 + number of recognized types: 14 + summary of tile types +ats +clj 38 +cs 41 +dart +erlang +fpascal +fsharp +c 129 +hs 33 +gnat +go +php 55 +ifc +java 51 +js 25 +ruby 73 +lua +ocaml 35 +oz +pl 34 +py 36 +racket 29 +rust +sbcl 34 +scala 43 +vw +cint +javasteady +parrot +cc +txt +ozf + not included: + + score for training_set test_set +0 0.9932 0.8387 +1 0.5399 0.3733 +2 0.2346 0.235 +3 0.2916 0.2765 +4 0.533 0.3779 + + failed to classify +js misclassified as c +ocaml misclassified as c +java misclassified as ruby +cs misclassified as c +java misclassified as ruby +java misclassified as c +scala misclassified as c +php misclassified as c +scala misclassified as c +java misclassified as c +c misclassified as ruby +ruby misclassified as c +cs misclassified as c +c misclassified as ruby +cs misclassified as c +js misclassified as c +hs misclassified as py +scala misclassified as c +java misclassified as c +php misclassified as c +java misclassified as c +php misclassified as ruby +js misclassified as c +java misclassified as ocaml +hs misclassified as clj +pl misclassified as c +php misclassified as c +hs misclassified as py +pl misclassified as ruby +scala misclassified as c +php misclassified as c +ocaml misclassified as c +ocaml misclassified as c +java misclassified as c +ruby misclassified as pl +cs misclassified as c +ruby misclassified as pl +py misclassified as c +scala misclassified as c +scala misclassified as hs +js misclassified as c +java misclassified as c +cs misclassified as c +php misclassified as c +php misclassified as ruby +java misclassified as c +php misclassified as pl +php misclassified as c +hs misclassified as ocaml +pl misclassified as ruby +java misclassified as c +cs misclassified as ruby +php misclassified as pl +cs misclassified as c +c misclassified as ruby +java misclassified as c +py misclassified as ruby +hs misclassified as c +scala misclassified as c +c misclassified as pl +cs misclassified as c +c misclassified as ruby +cs misclassified as c +py misclassified as c +php misclassified as ruby +ruby misclassified as hs +pl misclassified as ruby +cs misclassified as c +scala misclassified as c +java misclassified as c +php misclassified as c +java misclassified as c +hs misclassified as c +ruby misclassified as pl +java misclassified as ruby +ocaml misclassified as c +hs misclassified as ruby +java misclassified as pl +hs misclassified as c +js misclassified as php +clj misclassified as racket +clj misclassified as racket +ruby misclassified as c +hs misclassified as c +ocaml misclassified as c +scala misclassified as c +cs misclassified as c +php misclassified as c +java misclassified as ruby +ocaml misclassified as py +clj misclassified as racket +php misclassified as c +java misclassified as c +ruby misclassified as pl +java misclassified as c +scala misclassified as c +php misclassified as c +py misclassified as scala +php misclassified as pl +py misclassified as scala +pl misclassified as ruby +js misclassified as ruby +php misclassified as ruby +hs misclassified as c +java misclassified as c +scala misclassified as c +c misclassified as scala +java misclassified as c +java misclassified as c +java misclassified as c +java misclassified as c +hs misclassified as py +ruby misclassified as c +php misclassified as ruby +hs misclassified as py +java misclassified as c +ruby misclassified as pl +c misclassified as pl +py misclassified as ruby +py misclassified as ruby +php misclassified as ruby +pl misclassified as c +java misclassified as c +cs misclassified as c +pl misclassified as ruby +js misclassified as c +java misclassified as c +pl misclassified as c +c misclassified as scala +js misclassified as c +clj misclassified as racket +hs misclassified as scala +pl misclassified as c +ruby misclassified as c +php misclassified as c +java misclassified as c +cs misclassified as c +php misclassified as ruby +ocaml misclassified as ruby +php misclassified as c +cs misclassified as c +pl misclassified as c +py misclassified as c +java misclassified as c +java misclassified as c +pl misclassified as c +php misclassified as c +pl misclassified as ruby +scala misclassified as c +ruby misclassified as c +clj misclassified as racket +php misclassified as c +java misclassified as c +ocaml misclassified as py +java misclassified as c +clj misclassified as racket +php misclassified as ruby +cs misclassified as c +ocaml misclassified as scala +ruby misclassified as pl +clj misclassified as racket +pl misclassified as ruby +ocaml misclassified as c +pl misclassified as c +cs misclassified as c +scala misclassified as c +hs misclassified as c +scala misclassified as c +js misclassified as php +hs misclassified as c +php misclassified as ruby +java misclassified as c +ruby misclassified as pl +scala misclassified as c +hs misclassified as c +cs misclassified as c +pl misclassified as c +js misclassified as c +cs misclassified as scala +pl misclassified as c +cs misclassified as c +ocaml misclassified as c +cs misclassified as c +c misclassified as ruby +ruby misclassified as c +c misclassified as ruby +js misclassified as ruby +php misclassified as ruby +pl misclassified as c +hs misclassified as c +pl misclassified as ruby +java misclassified as c +hs misclassified as py +scala misclassified as c +cs misclassified as c +c misclassified as ruby +java misclassified as c +js misclassified as ruby +ruby misclassified as py +php misclassified as c +java misclassified as ruby +java misclassified as c +php misclassified as c +py misclassified as ocaml +cs misclassified as c + + failure counts + wrongly classified: +c ########################################################################################################################### +ruby ####################################### +racket ####### +py ######## +hs ## +clj # +ocaml ### +php ## +pl ############# +scala ####### + failed to classify +ruby ############### +cs ###################### +scala ################ +ocaml ########### +pl ################## +clj ####### +c ########### +js ############ +php ############################# +hs ################## +py ######### +java ##################################### + +<_csv.reader object at 0x10a1a2c18> + number of testing file types: 11 +['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml'] + score_quest 1 0.59375 + pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php' + 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java' + 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 2 0.125 + pred 2 ['java' 'racket' 'racket' 'clj' 'c' 'c' 'ocaml' 'c' 'ocaml' 'ocaml' 'php' + 'ruby' 'php' 'pl' 'ruby' 'c' 'ruby' 'ruby' 'sbcl' 'ocaml' 'racket' 'c' 'c' + 'c' 'ruby' 'c' 'c' 'php' 'hs' 'ocaml' 'c' 'py'] + + score_quest 3 0.125 + pred 3 ['scala' 'ocaml' 'ocaml' 'scala' 'ruby' 'ruby' 'ocaml' 'ruby' 'cs' 'cs' + 'cs' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'scala' 'ocaml' + 'scala' 'ocaml' 'ruby' 'ruby' 'ruby' 'scala' 'ruby' 'ruby' 'cs' 'cs' + 'ruby' 'ocaml'] + + score_quest 4 0.0625 + pred 4 ['c' 'clj' 'clj' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' + 'c' 'sbcl' 'sbcl' 'clj' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c'] + + score_quest 5 0.125 + pred 5 ['clj' 'racket' 'racket' 'clj' 'c' 'c' 'ocaml' 'c' 'ocaml' 'ocaml' 'php' + 'ruby' 'php' 'pl' 'py' 'c' 'ruby' 'ruby' 'sbcl' 'ocaml' 'racket' 'c' + 'ruby' 'c' 'ruby' 'clj' 'ruby' 'php' 'hs' 'ocaml' 'c' 'py'] + +clj 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 571 142 142 0 +py 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 285 +js 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 428 285 142 428 +ruby 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 142 142 142 571 +haskell 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 285 +racket 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 142 142 +java 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 142 142 142 0 +scala 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 142 +tcl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 0 +php 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 428 +ocaml 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 142 diff --git a/lclassifier/bens_rules.py b/lclassifier/bens_rules.py new file mode 100644 index 0000000..37f8572 --- /dev/null +++ b/lclassifier/bens_rules.py @@ -0,0 +1,19 @@ + + + + elements = ['\bbegin\b', '\bend\b', '\bdo\b', '\bvar\b', '\bdefine\b', '\bdefn\b', '\bfunction\b', + '\bclass\b', '\bmy\b', '\brequire\b', '\bvoid\b', '\bval\b', '\bpublic\b', '\blet\b', + '\bwhere\b', '\busing\b', '\bextend\b', '\bfunction\b'] + results = [] + for element in elements: + results.append(len(re.findall(element, text))) + + elements = ['[)]+','[}]+', '[\]]+', '[=]+'] + + for element in elements: + runs = sorted(re.findall(element, text), key=len) + if runs: + results.append(len(runs[-1])) + else: + results.append(0) + return results diff --git a/lclassifier/lclassifier.py b/lclassifier/lclassifier.py index 884e14e..9e96436 100644 --- a/lclassifier/lclassifier.py +++ b/lclassifier/lclassifier.py @@ -176,6 +176,99 @@ def print_matrix(matrix, p_max=None): print("") #print([str(round(val, 3)) for val in vector]) +def ben_transform(X): + elements = ['\bbegin\b', '\bend\b', '\bdo\b', '\bvar\b', '\bdefine\b', '\bdefn\b', '\bfunction\b', + '\bclass\b', '\bmy\b', '\brequire\b', '\bvoid\b', '\bval\b', '\bpublic\b', '\blet\b', + '\bwhere\b', '\busing\b', '\bextend\b', '\bfunction\b'] + + elements2 = ['[)]+','[}]+', '[\]]+', '[=]+'] + + matrix = [] + for text in X: + results = [] + for element in elements: + results.append(len(re.findall(element, text))) + + for element in elements2: + runs = sorted(re.findall(element, text), key=len) + if runs: + results.append(len(runs[-1])) + else: + results.append(0) + matrix.append(results) + return matrix + + +def alan_transform(X): + cish = ["^[ \t]*\*", "^[ \t]*/\*\*"] + clojure = ["^\s*\(\w.*\s*\)$", "^[ \t]*;", "\(def(n)? "] + python = ["\):[ \t]*\n[ \t]*\w", "\s__\w*__\(", "(^from|^import)\s", + "def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"] + js = ["^[ \t]*var", "=\s*function", + "function\s*\w*\(\w*[\w\s,]*\)\s*\{"] + ruby = ["^[ \t]*end$", "^[ \t]*def *\w*(\(\w*\))?[ \t]*$", + "^[ \t]*include \w*[ \t]*$", "^[ \t]*@", "super"] + hs = ["&&&", "^\{-"] + clj = ["^\(define", "^[ \t]*;+"] + java = ["^[ \t]*public \w* \w*", "^import .*;$"] + scl = ["^[ \t]*object \w*", "^[ \t]*(final)?val \w* ="] + tcl = ["^[ \t]*proc \w*::\w* \{"] + php = ["^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*", + "^[ \t]*\$\w* ?=.*;$"] + ocaml = ["^[ \t]*let \w+", "^[ \t]*struct[ \t]*$"] + perl = ["^[ \t]*my ", "^[ \t]*sub \w* \{"] + gcc = ["^[ \t]*typedef \w* \w* ?\{", "^#include ?\<", + "^using .*;$", "sealed"] + + reg_list = clojure + python + js + ruby + hs + clj + java + scl\ + + tcl + php + ocaml + perl + gcc + cish + + matrix = [] + for text in X: + v = [0] * len(reg_list) + for i in range(len(reg_list)): + reg_expr = reg_list[i] + prog = re.compile(reg_expr, flags=re.MULTILINE) + val = len(prog.findall(text)) # /len(text) + # this was found to have best results over normalized forms + v[i] = val + matrix.append(v) + return matrix + + +def old_transform(X): + char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b", + "this\.", "^end", ";", "\*", "%", "^do", + "\<\$php", "/\*", "__", "=", "==", + "===", "\(\)", "\{\}", ":", "\+\+", "\+=", + "^#include", "^ \*", ":\s*$", "\<\<|\>\>", + "int", "\b\*\w", "\(&\w", "argv", "\[\]" + "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w", + "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{", + "\(\w+:", "@", "\b@\w"] + word_list = ["private", "static", "make","let", "def", "^\(defn", + "defn", "do", "class", "^function", "public", + "unset", "printf\(", "return", "NULL", "void", + "main\(", "main_", "void\s\*\w", "\{else\}", + "char", "array\(", "__init__", "__str__", "token", + "^import", "^from", "final", "val", "type", "package", + "object", "String", "string", "primitive", "fixnum", + "error", "try"] + + reg_list = char_list + word_list + + matrix = [] + for text in X: + v = [0] * len(reg_list) + for i in range(len(reg_list)): + reg_expr = reg_list[i] + prog = re.compile(reg_expr, flags=re.MULTILINE) + val = len(prog.findall(text)) # /len(text) + # this was found to have best results over normalized forms + v[i] = val + matrix.append(v) + return matrix + class CustomFeaturizer(TransformerMixin): @@ -189,56 +282,13 @@ def fit(self, X, y=None): return self def transform(self, X): - # char_list = ["^#", "\-\>", "\{", "\$", "\<", "\[", "func\b", - # "this\.", "^end", ";", "\*", "%", "^do", - # "\<\$php", "/\*", "__", "=", "==", - # "===", "\(\)", "\{\}", ":", "\+\+", "\+=", - # "^#include", "^ \*", ":\s*$", "\<\<|\>\>", - # "int", "\b\*\w", "\(&\w", "argv", "\[\]" - # "if\s", "if\(", "^\{", "^\}", ",\s*int\s\w", - # "\};", "\[\d*:\d*\]", "\]\s*\{", "^//", "\w\.\{", - # "\(\w+:", "@", "\b@\w"] - # word_list = ["private", "static", "make","let", "def", "^\(defn", - # "defn", "do", "class", "^function", "public", - # "unset", "printf\(", "return", "NULL", "void", - # "main\(", "main_", "void\s\*\w", "\{else\}", - # "char", "array\(", "__init__", "__str__", "token", - # "^import", "^from", "final", "val", "type", "package", - # "object", "String", "string", "primitive", "fixnum", - # "error", "try"] - cish = ["^[ \t]*\*", "^[ \t]*/\*\*"] - clojure = ["^\s*\(\w.*\s*\)$", "^[ \t]*;", "\(def(n)? "] - python = ["\):[ \t]*\n[ \t]*\w", "\s__\w*__\(", "(^from|^import)\s", - "def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"] - js = ["^[ \t]*var", "=\s*function", - "function\s*\w*\(\w*[\w\s,]*\)\s*\{"] - ruby = ["^[ \t]*end$", "^[ \t]*def *\w*(\(\w*\))?[ \t]*$", - "^[ \t]*include \w*[ \t]*$", "^[ \t]*@", "super"] - hs = ["&&&", "^\{-"] - clj = ["^\(define", "^[ \t]*;+"] - java = ["^[ \t]*public \w* \w*", "^import .*;$"] - scl = ["^[ \t]*object \w*", "^[ \t]*(final)?val \w* ="] - tcl = ["^[ \t]*proc \w*::\w* \{"] - php = ["^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*", - "^[ \t]*\$\w* ?=.*;$"] - ocaml = ["^[ \t]*let \w+", "^[ \t]*struct[ \t]*$"] - perl = ["^[ \t]*my ", "^[ \t]*sub \w* \{"] - gcc = ["^[ \t]*typedef \w* \w* ?\{", "^#include ?\<", - "^using .*;$", "sealed"] - - reg_list = clojure + python + js + ruby + hs + clj + java + scl\ - + tcl + php + ocaml + perl + gcc + cish - - matrix = [] - for text in X: - v = [0] * len(reg_list) - for i in range(len(reg_list)): - reg_expr = reg_list[i] - prog = re.compile(reg_expr, flags=re.MULTILINE) - val = len(prog.findall(text)) # /len(text) - # this was found to have best results over normalized forms - v[i] = val - matrix.append(v) + + #matrix = ben_transform(X) + + #matrix = old_transform(X) + + matrix = alan_transform(X) + return matrix diff --git a/lclassifier/old_output.txt b/lclassifier/old_output.txt new file mode 100644 index 0000000..be1c816 --- /dev/null +++ b/lclassifier/old_output.txt @@ -0,0 +1,96 @@ + total samples 931 + number of usable files 656 + + number of read file types: 32 + number of recognized types: 14 + summary of tile types +ats +clj 38 +cs 41 +dart +erlang +fpascal +fsharp +c 129 +hs 33 +gnat +go +php 55 +ifc +java 51 +js 25 +ruby 73 +lua +ocaml 35 +oz +pl 34 +py 36 +racket 29 +rust +sbcl 34 +scala 43 +vw +cint +javasteady +parrot +cc +txt +ozf + not included: + + score for training_set test_set +0 0.9863 0.9124 +1 1.0 0.9401 +2 0.82 0.7926 +3 0.9636 0.9447 +4 0.9977 0.9724 + + failed to classify +sbcl misclassified as racket + + failure counts + wrongly classified: +racket # + failed to classify +sbcl # + +<_csv.reader object at 0x1113abc18> + number of testing file types: 11 +['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml'] + score_quest 1 0.59375 + pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php' + 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java' + 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 2 0.375 + pred 2 ['clj' 'clj' 'js' 'clj' 'js' 'hs' 'js' 'scala' 'js' 'js' 'scala' 'js' 'hs' + 'hs' 'hs' 'hs' 'js' 'hs' 'js' 'js' 'racket' 'ocaml' 'js' 'pl' 'scala' + 'ocaml' 'ocaml' 'php' 'js' 'php' 'ocaml' 'ocaml'] + + score_quest 3 0.5625 + pred 3 ['clj' 'clj' 'cs' 'clj' 'py' 'py' 'sbcl' 'py' 'js' 'js' 'ruby' 'java' + 'ruby' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'c' 'c' + 'scala' 'scala' 'hs' 'hs' 'c' 'ruby' 'hs' 'ocaml' 'ocaml'] + + score_quest 4 0.59375 + pred 4 ['clj' 'clj' 'ocaml' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'scala' 'cs' + 'scala' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'sbcl' 'racket' 'racket' 'js' 'js' + 'scala' 'scala' 'php' 'php' 'sbcl' 'php' 'php' 'ocaml' 'ocaml'] + + score_quest 5 0.4375 + pred 5 ['clj' 'clj' 'ocaml' 'clj' 'js' 'hs' 'js' 'scala' 'js' 'js' 'scala' + 'racket' 'scala' 'ruby' 'ruby' 'hs' 'racket' 'hs' 'racket' 'racket' + 'ocaml' 'c' 'js' 'scala' 'scala' 'pl' 'php' 'php' 'racket' 'php' 'ocaml' + 'racket'] + +clj 0 1 2 0 0 23 0 0 0 5 0 1 0 0 0 0 0 0 0 0 0 20 0 0 0 0 1 1 3 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 10 9 9 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0 0 0 0 1 0 0 1 6 0 0 0 0 +py 3 0 19 0 6 97 0 0 0 6 9 44 0 0 0 75 366 11 0 66 14 217 0 0 0 0 198 0 13 0 0 0 0 0 0 1 0 0 3 0 0 0 0 0 0 0 0 2 0 91 0 0 41 22 0 0 0 0 26 2 0 0 0 0 0 31 0 29 2 0 10 2 0 7 5 0 6 0 0 0 0 0 5 +js 0 2 244 3 9 100 0 11 0 45 29 3 0 0 5 1 297 27 7 55 18 87 3 0 0 0 6 0 0 0 0 0 0 1 0 39 0 1 0 0 15 0 0 1 0 0 0 1 1 20 0 0 114 1 27 0 0 0 66 0 0 0 0 0 0 2 0 0 0 0 0 0 1 61 18 0 9 1 15 0 0 11 1 +ruby 0 0 5 0 10 7 0 0 3 0 5 0 0 0 0 0 44 7 3 0 1 33 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 7 19 0 1 0 1 0 20 0 0 5 9 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 +haskell 0 92 33 63 28 24 0 0 0 6 11 0 0 0 0 0 92 2 0 5 0 81 6 0 0 0 1 1 5 0 0 0 0 0 13 0 0 0 0 0 0 0 0 16 0 0 0 0 3 6 0 0 179 1 0 0 0 0 14 0 0 0 0 0 0 1 0 0 0 0 76 0 0 0 0 0 0 145 1 0 0 5 0 +racket 0 1 0 7 20 48 0 0 0 7 0 0 0 0 0 15 22 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 11 70 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 5 0 0 0 0 32 65 5 19 0 +java 0 0 6 0 3 1 0 0 0 16 136 0 0 0 18 0 0 0 0 2 0 1 0 0 0 32 0 0 10 0 0 0 0 0 0 2 0 0 0 0 0 0 0 37 0 0 0 0 3 5 0 0 2 0 0 11 0 0 7 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 11 1 0 0 0 3 +scala 0 3 33 32 0 28 0 0 0 0 6 0 0 0 1 0 71 0 0 0 0 57 0 0 0 2 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 23 2 0 0 0 0 0 16 0 0 2 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 15 37 9 2 9 13 0 0 0 0 1 +tcl 0 0 48 92 13 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 0 0 0 0 0 10 0 0 0 6 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 +php 0 93 53 185 1 14 0 0 0 74 999 7 0 0 33 5 27 3 2 28 0 18 0 0 0 0 0 0 5 0 0 0 0 0 1 2 0 0 0 0 0 0 0 36 0 0 0 0 0 0 0 0 6 5 0 23 0 0 35 0 0 0 0 0 0 0 3 0 0 0 0 0 0 20 0 0 1 0 18 0 0 0 0 +ocaml 0 83 19 0 2 11 0 0 0 13 76 0 0 0 0 0 75 0 0 3 2 35 0 0 0 0 1 0 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 57 3 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 22 0 0 20 9 0 0 0 2