diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..94840b3 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +layout python3 diff --git a/.gitignore b/.gitignore index f00dbf2..a7c3822 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,8 @@ docs/_build/ # PyBuilder target/ +.DS_store +benchmarksgame-2014-08-31/ + +.direnv/ +test_w_ext/ diff --git a/Lang_classifier_use.ipynb b/Lang_classifier_use.ipynb new file mode 100644 index 0000000..d66060b --- /dev/null +++ b/Lang_classifier_use.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from lclassifier.lclassifier import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demonstration of Language Classifier (lclassifier)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "acceptable_file(\"py\") # testing that import is functional" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'py'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean_ext(\"python3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " total samples 931\n", + " number of usable files 656\n", + " \n", + " number of read file types: 32\n", + " number of recognized types: 14\n", + " summary of tile types\n", + "ats \n", + "clj 38 \n", + "cs 41 \n", + "dart \n", + "erlang \n", + "fpascal \n", + "fsharp \n", + "c 129 \n", + "hs 33 \n", + "gnat \n", + "go \n", + "php 55 \n", + "ifc \n", + 
"java 51 \n", + "js 25 \n", + "ruby 73 \n", + "lua \n", + "ocaml 35 \n", + "oz \n", + "pl 34 \n", + "py 36 \n", + "racket 29 \n", + "rust \n", + "sbcl 34 \n", + "scala 43 \n", + "vw \n", + "cint \n", + "javasteady \n", + "parrot \n", + "cc \n", + "txt \n", + "ozf \n", + " not included: \n", + " \n" + ] + } + ], + "source": [ + "filelist, testlist = load_file_names()\n", + "contents, ltype, testcont = load_files(filelist, testlist)\n", + "\n", + "plist = [fit2, fit3, fit4, fit5, fit6]\n", + "\n", + "X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pipe = fit6(Xt, yt)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "testing set outcomes\n", + "scala scala \n", + "c c \n", + "c c \n", + "clj clj \n", + "java java \n", + "py py \n", + "clj clj \n", + "js js \n", + "c c \n", + "pl pl \n", + "cs cs \n", + "c c \n", + "c c \n", + "ocaml ocaml \n", + "hs hs \n", + "sbcl sbcl \n", + "racket racket \n", + "php php \n", + "pl pl \n", + "ocaml ocaml \n", + "\n", + " overall score: 1.0\n" + ] + } + ], + "source": [ + "M = pipe.predict(Xt)\n", + "print(\"testing set outcomes\")\n", + "for i in range(20):\n", + " print(M[i].ljust(8)+ \" \" + yt[i].ljust(8))\n", + "print(\"\")\n", + "print(\" overall score: \"+str(pipe.score(Xt, yt)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## Test Data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['php', 'php', 'php', 'clj', 'py', 'py', 'py', 'py', 'js', 'js',\n", + " 'js', 'js', 'ruby', 'ruby', 'ruby', 'hs', 'php', 'hs', 'racket',\n", + " 'php', 'racket', 'java', 'java', 'scala', 'scala', 
'php', 'php',\n", + " 'java', 'php', 'java', 'ocaml', 'php'], \n", + " dtype='\n", + " number of testing file types: 11\n", + " actual_file_type predicted_type\n", + "clj php \n", + "clj php \n", + "clj php \n", + "clj clj \n", + "py py \n", + "py py \n", + "py py \n", + "py py \n", + "js js \n", + "js js \n", + "js js \n", + "js js \n", + "ruby ruby \n", + "ruby ruby \n", + "ruby ruby \n", + "haskell hs \n", + "haskell php \n", + "haskell hs \n", + "racket racket \n", + "racket php \n", + "racket racket \n", + "java java \n", + "java java \n", + "scala scala \n", + "scala scala \n", + "tcl php \n", + "tcl php \n", + "php java \n", + "php php \n", + "php java \n", + "ocaml ocaml \n", + "ocaml php \n", + " \n", + " score: 0.625\n" + ] + } + ], + "source": [ + "ans = read_answers()\n", + "M = pipe.predict(testcont)\n", + "print(\" actual_file_type predicted_type\")\n", + "for i in range(len(ans)):\n", + " print(ans[i].ljust(10)+M[i].ljust(10))\n", + "print(\" \")\n", + "print(\" score: \"+str(pipe.score(testcont, ans)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This score isn't very good, but it would be difficult to match all these no matter what methods were being used due to the small quantity of training data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Single file demo" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "py_file = '''JOIN_RETRANSMIT = 0.7\n", + "CATCHUP_INTERVAL = 0.6\n", + "ACCEPT_RETRANSMIT = 1.0\n", + "PREPARE_RETRANSMIT = 1.0\n", + "INVOKE_RETRANSMIT = 0.5\n", + "LEADER_TIMEOUT = 1.0\n", + "NULL_BALLOT = Ballot(-1, -1) # sorts before all real ballots\n", + "NOOP_PROPOSAL = Proposal(None, None, None) # no-op to fill otherwise empty slots\n", + "\n", + "class Node(object):\n", + " unique_ids = itertools.count()\n", + "\n", + " def __init__(self, network, address):\n", + " self.network = network\n", + " self.address = address or 'N%d' % self.unique_ids.next()\n", + " self.logger = SimTimeLogger(logging.getLogger(self.address), {'network': self.network})\n", + " self.logger.info('starting')\n", + " self.roles = []\n", + " self.send = functools.partial(self.network.send, self)\n", + "\n", + " def register(self, roles):\n", + " self.roles.append(roles)\n", + "\n", + " def unregister(self, roles):\n", + " self.roles.remove(roles)\n", + "\n", + " def receive(self, sender, message):\n", + " handler_name = 'do_%s' % type(message).__name__\n", + "\n", + " for comp in self.roles[:]:\n", + " if not hasattr(comp, handler_name):\n", + " continue\n", + " comp.logger.debug(\"received %s from %s\", message, sender)\n", + " fn = getattr(comp, handler_name)\n", + " fn(sender=sender, **message._asdict())\n", + "\n", + "class Timer(object):\n", + "\n", + " def __init__(self, expires, address, callback):\n", + " self.expires = expires\n", + " self.address = address\n", + " self.callback = callback\n", + " self.cancelled = False'''" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['py'], \n", + " dtype=' + number of testing file types: 11 
+['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml'] + score_quest 1 0.59375 + pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php' + 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java' + 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 2 0.125 + pred 2 ['java' 'racket' 'racket' 'clj' 'c' 'c' 'ocaml' 'c' 'ocaml' 'ocaml' 'php' + 'ruby' 'php' 'pl' 'ruby' 'c' 'ruby' 'ruby' 'sbcl' 'ocaml' 'racket' 'c' 'c' + 'c' 'ruby' 'c' 'c' 'php' 'hs' 'ocaml' 'c' 'py'] + + score_quest 3 0.125 + pred 3 ['scala' 'ocaml' 'ocaml' 'scala' 'ruby' 'ruby' 'ocaml' 'ruby' 'cs' 'cs' + 'cs' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby' 'scala' 'ocaml' + 'scala' 'ocaml' 'ruby' 'ruby' 'ruby' 'scala' 'ruby' 'ruby' 'cs' 'cs' + 'ruby' 'ocaml'] + + score_quest 4 0.0625 + pred 4 ['c' 'clj' 'clj' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' + 'c' 'sbcl' 'sbcl' 'clj' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c'] + + score_quest 5 0.125 + pred 5 ['clj' 'racket' 'racket' 'clj' 'c' 'c' 'ocaml' 'c' 'ocaml' 'ocaml' 'php' + 'ruby' 'php' 'pl' 'py' 'c' 'ruby' 'ruby' 'sbcl' 'ocaml' 'racket' 'c' + 'ruby' 'c' 'ruby' 'clj' 'ruby' 'php' 'hs' 'ocaml' 'c' 'py'] + +clj 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 571 142 142 0 +py 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 285 +js 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 428 285 142 428 +ruby 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 142 142 142 571 +haskell 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 285 +racket 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 142 142 +java 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 142 142 142 0 +scala 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 142 +tcl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 0 +php 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 285 142 142 428 +ocaml 0 0 
# NOTE(review): this span of the patch adds two files.  The first,
# lclassifier/bens_rules.py, is a def-less indented fragment (it would raise
# an IndentationError / "return outside function" if imported); it is
# reconstructed below as the function bens_rules().  The remainder is
# lclassifier/lclassifier.py from its imports through fit6().

import csv
import re
import sys
from glob import glob

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Canonical extensions the classifier can label.
llist = ["c", "cs", "sbcl", "clj", "hs", "java", "js",
         "ocaml", "pl", "php", "py", "ruby", "scala", "racket"]

# Root of the training/test data tree.
# NOTE(review): machine-specific absolute path -- should come from
# configuration or an environment variable.
main_dir = "/Users/admin/Documents/week5/programming-language-classifier"

# Compiler / implementation names mapped to their canonical extension.
_EXT_ALIASES = {
    "gcc": "c", "h": "c", "gpp": "c",
    "hack": "php",
    "yarv": "ruby", "jruby": "ruby",
    "clojure": "clj",
    "python3": "py", "python": "py",
    "perl": "pl",
    "javascript": "js",
    "csharp": "cs",
    "ghc": "hs",
    "scheme": "racket",
}


def acceptable_file(text):
    """Return True if *text* is an extension the classifier recognizes."""
    return text in llist


def clean_ext(textp):
    """Normalize a raw extension or implementation name.

    Strips whitespace and maps known aliases (e.g. "python3" -> "py",
    "gcc" -> "c"); unknown names are returned unchanged.
    """
    text = textp.strip()
    return _EXT_ALIASES.get(text, text)


def list_uniques(alist):
    """Return the unique items of *alist*, preserving first-seen order."""
    rlist = []
    for item in alist:
        if item not in rlist:
            rlist.append(item)
    return rlist


def load_file_names():
    """Glob the benchmark tree (up to 4 directory levels deep) and the
    test directory.

    Returns (filelist, testlist) of path strings.
    """
    base = main_dir + "/benchmarksgame-2014-08-31/benchmarksgame/bench/"
    filelist = []
    for depth in range(4):
        filelist += glob(base + "*/" * depth + "*.*")
    testlist = glob(main_dir + "/test/*")

    print(" total samples " + str(len(filelist)))
    return filelist, testlist


def load_files(filelist, testlist):
    """Read every usable training file and all hold-out test files.

    Returns (contents, ltype, testcont): contents[i] is the text of a
    training sample whose normalized extension ltype[i] is recognized,
    and testcont[k] is the text of test file k+1 (test files are named
    by 1-based integers).
    """
    contents = []
    ltype = []
    ext_list = []
    for filename in filelist:
        i = filename.rfind(".")
        ext = clean_ext(filename[i + 1:])
        if ext == "tcl":  # flag unexpected tcl samples in the benchmark tree
            print(filename)
        if ext not in ext_list:
            ext_list.append(ext)
        if acceptable_file(ext):
            ltype.append(ext)
            # Benchmark sources are not all UTF-8; latin-1 never fails.
            with open(filename, encoding="ISO-8859-1") as file:
                contents.append(file.read())

    print(" number of usable files " + str(len(ltype)))
    print(" ")
    print(" number of read file types: " + str(len(ext_list)))
    print(" number of recognized types: " + str(len(llist)))
    # BUG FIX: message previously read "summary of tile types".
    print(" summary of file types")
    for ext in ext_list:
        print(ext.ljust(12) + " ", end=" ")
        if ext in llist:
            print(ltype.count(ext), end=" ")
        print(" ")
    print(" not included: ", end="")
    for ext in llist:
        if ext not in ext_list:
            print(ext, end=" : ")
    print(" ")

    # Test files are named "1".."32"; slot each into its 1-based position.
    testcont = [0] * 32
    for filename in testlist:
        with open(filename) as file:
            di = filename.rfind("/")
            i = int(filename[di + 1:])
            testcont[i - 1] = file.read()
    print(" ")
    return contents, ltype, testcont


def read_answers():
    """Read test.csv and return the normalized expected extension per row."""
    with open(main_dir + "/test.csv") as csvfile:
        ans_list = csv.reader(csvfile, delimiter=",")
        ans = []
        print(ans_list)
        for row in ans_list:
            ans.append(clean_ext(row[1]))
    print(" number of testing file types: " + str(len(list_uniques(ans))))
    return ans


def fit1(contents, ltype):
    """Bag-of-words + tf-idf + multinomial naive Bayes."""
    pipe = Pipeline([('bag_of_words', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('bayes', MultinomialNB())])
    pipe.fit(contents, ltype)
    return pipe


def fit2(contents, ltype):
    """Bag-of-words + multinomial naive Bayes (no tf-idf)."""
    pipe = Pipeline([('bag_of_words', CountVectorizer()),
                     ('bayes', MultinomialNB())])
    pipe.fit(contents, ltype)
    return pipe


def print_matrix(matrix, p_max=None):
    """Print up to *p_max* rows of *matrix* (all rows when p_max is None),
    each value rounded to 3 decimal places."""
    upper_limit = len(matrix) if p_max is None else p_max
    for vector in matrix[:upper_limit]:
        for val in vector:
            print(str(round(val, 3)).ljust(5) + ",", end="")
        print("")


# BUG FIX: these patterns were plain (non-raw) strings, so every '\b' was a
# literal backspace character (0x08) and the word-boundary matches could
# never fire; raw strings restore the intended regex \b.
_BEN_KEYWORDS = [r'\bbegin\b', r'\bend\b', r'\bdo\b', r'\bvar\b',
                 r'\bdefine\b', r'\bdefn\b', r'\bfunction\b', r'\bclass\b',
                 r'\bmy\b', r'\brequire\b', r'\bvoid\b', r'\bval\b',
                 r'\bpublic\b', r'\blet\b', r'\bwhere\b', r'\busing\b',
                 r'\bextend\b', r'\bfunction\b']
# Longest-run features: length of the longest run of ), }, ] and =.
_BEN_RUNS = [r'[)]+', r'[}]+', r'[\]]+', r'[=]+']


def bens_rules(text):
    """Feature vector for one document: keyword counts followed by the
    longest-run lengths of ), }, ] and =.

    (Reconstruction of the def-less fragment shipped as
    lclassifier/bens_rules.py.)
    """
    results = [len(re.findall(element, text)) for element in _BEN_KEYWORDS]
    for element in _BEN_RUNS:
        runs = sorted(re.findall(element, text), key=len)
        results.append(len(runs[-1]) if runs else 0)
    return results


def ben_transform(X):
    """Apply bens_rules() to every document in X; returns a list of rows."""
    return [bens_rules(text) for text in X]


def _count_matches(reg_list, X):
    """One row per document in X: match counts for each regex in reg_list.

    Patterns are compiled once, outside the per-document loop.  Raw counts
    (not normalized by length) were found to perform best.
    """
    progs = [re.compile(expr, flags=re.MULTILINE) for expr in reg_list]
    return [[len(prog.findall(text)) for prog in progs] for text in X]


def alan_transform(X):
    """Count matches of hand-written, language-indicative regexes."""
    cish = [r"^[ \t]*\*", r"^[ \t]*/\*\*"]
    clojure = [r"^\s*\(\w.*\s*\)$", r"^[ \t]*;", r"\(def(n)? "]
    python = [r"\):[ \t]*\n[ \t]*\w", r"\s__\w*__\(", r"(^from|^import)\s",
              r"def\s*\w*\([ \w,]*\):[ \t]*\n(( {4})+|\t+)\w"]
    js = [r"^[ \t]*var", r"=\s*function",
          r"function\s*\w*\(\w*[\w\s,]*\)\s*\{"]
    ruby = [r"^[ \t]*end$", r"^[ \t]*def *\w*(\(\w*\))?[ \t]*$",
            r"^[ \t]*include \w*[ \t]*$", r"^[ \t]*@", r"super"]
    hs = [r"&&&", r"^\{-"]
    clj = [r"^\(define", r"^[ \t]*;+"]
    java = [r"^[ \t]*public \w* \w*", r"^import .*;$"]
    scl = [r"^[ \t]*object \w*", r"^[ \t]*(final)?val \w* ="]
    tcl = [r"^[ \t]*proc \w*::\w* \{"]
    php = [r"^[ \t]*(\w*)?( )?function \w*( )?\(&?\$\w*",
           r"^[ \t]*\$\w* ?=.*;$"]
    ocaml = [r"^[ \t]*let \w+", r"^[ \t]*struct[ \t]*$"]
    perl = [r"^[ \t]*my ", r"^[ \t]*sub \w* \{"]
    gcc = [r"^[ \t]*typedef \w* \w* ?\{", r"^#include ?\<",
           r"^using .*;$", r"sealed"]

    # Feature order must stay stable across fit/predict.
    reg_list = (clojure + python + js + ruby + hs + clj + java + scl
                + tcl + php + ocaml + perl + gcc + cish)
    return _count_matches(reg_list, X)


def old_transform(X):
    """Older hand-rolled regex feature set (kept for comparison runs)."""
    char_list = [r"^#", r"\-\>", r"\{", r"\$", r"\<", r"\[", r"func\b",
                 r"this\.", r"^end", r";", r"\*", r"%", r"^do",
                 r"\<\$php", r"/\*", r"__", r"=", r"==",
                 r"===", r"\(\)", r"\{\}", r":", r"\+\+", r"\+=",
                 r"^#include", r"^ \*", r":\s*$", r"\<\<|\>\>",
                 r"int", r"\b\*\w", r"\(&\w", r"argv",
                 # BUG FIX: a missing comma made "\[\]" and "if\s"
                 # concatenate into the single nonsense pattern "\[\]if\s".
                 r"\[\]", r"if\s",
                 r"if\(", r"^\{", r"^\}", r",\s*int\s\w",
                 r"\};", r"\[\d*:\d*\]", r"\]\s*\{", r"^//", r"\w\.\{",
                 r"\(\w+:", r"@", r"\b@\w"]
    word_list = [r"private", r"static", r"make", r"let", r"def", r"^\(defn",
                 r"defn", r"do", r"class", r"^function", r"public",
                 r"unset", r"printf\(", r"return", r"NULL", r"void",
                 r"main\(", r"main_", r"void\s\*\w", r"\{else\}",
                 r"char", r"array\(", r"__init__", r"__str__", r"token",
                 r"^import", r"^from", r"final", r"val", r"type", r"package",
                 r"object", r"String", r"string", r"primitive", r"fixnum",
                 r"error", r"try"]
    return _count_matches(char_list + word_list, X)


class CustomFeaturizer(TransformerMixin):
    """Pluggable featurizer turning raw source text into count vectors.

    Currently delegates to alan_transform(); ben_transform() and
    old_transform() are kept as interchangeable alternatives.
    """

    def fit(self, X, y=None):
        """All scikit-learn compatible transformers share this interface;
        fit is stateless here and always returns self."""
        return self

    def transform(self, X):
        """Return one feature row per document in X."""
        return alan_transform(X)


def fit3(contents, ltype):
    """Custom regex features + decision tree."""
    pipe = make_pipeline(CustomFeaturizer(), DecisionTreeClassifier())
    pipe.fit(contents, ltype)
    return pipe


def fit4(contents, ltype):
    """Custom regex features + SGD linear classifier."""
    pipe = make_pipeline(CustomFeaturizer(), SGDClassifier())
    pipe.fit(contents, ltype)
    return pipe


def fit5(contents, ltype):
    """Custom regex features + multinomial naive Bayes."""
    pipe = make_pipeline(CustomFeaturizer(), MultinomialNB())
    pipe.fit(contents, ltype)
    return pipe


def fit6(contents, ltype):
    """Custom regex features + random forest.

    A random forest fits many decision trees on bootstrap samples and
    aggregates their votes for the final prediction (it does not pick a
    single "most common" tree, as the old docstring claimed).
    """
    pipe = make_pipeline(CustomFeaturizer(), RandomForestClassifier())
    pipe.fit(contents, ltype)
    return pipe
def demo_class(X, y):
    """Concatenate all samples of each label, featurize the concatenations,
    and print each label's feature vector scaled so the global max is 1000.

    X is a sequence of document strings; y the parallel label sequence.
    """
    types = []
    for ext in y:
        if ext not in types:
            types.append(ext)
    # One concatenated "document" per distinct label.
    typecont = [""] * len(types)
    for text, label in zip(X, y):
        typecont[types.index(label)] += text

    M = CustomFeaturizer().transform(typecont)
    peak = max(max(vt) for vt in M)
    # BUG FIX: guard against an all-zero feature matrix, which previously
    # raised ZeroDivisionError.
    ratio = 1000 / peak if peak else 0
    for label, vector in zip(types, M):
        print(label.ljust(8) + " ", end="")
        for val in vector:
            print(str(int(ratio * val)).ljust(5), end="")
        print("")


def default_action():
    """Train every candidate pipeline, report train/test scores, analyze
    misclassifications, then score the hold-out test files against test.csv.
    """
    filelist, testlist = load_file_names()
    contents, ltype, testcont = load_files(filelist, testlist)

    plist = [fit2, fit3, fit4, fit5, fit6]

    X, Xt, y, yt = train_test_split(contents, ltype, test_size=0.33)
    pipel = [0 for _ in plist]
    print(" score for training_set test_set")
    for i, fitter in enumerate(plist):
        pipe = fitter(X, y)
        print(str(i).ljust(4) + " "
              + str(round(pipe.score(X, y), 4)).ljust(8)
              + str(round(pipe.score(Xt, yt), 4)).ljust(8))
    print(" ")
    # Refit every pipeline on the full corpus for the final test scoring.
    for i, fitter in enumerate(plist):
        pipel[i] = fitter(contents, ltype)

    print(" failed to classify")
    # NOTE(review): `pipe` here is whichever pipeline the scoring loop
    # above bound last (fit6 fitted on the training split) -- presumably
    # intentional, but worth confirming.
    failed_to_classify = {}
    wrongly_classified = {}
    A = pipe.predict(X)
    for predicted, actual in zip(A, y):
        if predicted != actual:
            print(actual.ljust(6) + " misclassified as " + predicted)
            failed_to_classify[actual] = failed_to_classify.get(actual, 0) + 1
            wrongly_classified[predicted] = (
                wrongly_classified.get(predicted, 0) + 1)
    print("")
    print(" failure counts")
    print(" wrongly classified:")
    for ext in wrongly_classified:
        print(ext.ljust(7) + "#" * wrongly_classified[ext])
    print(" failed to classify")
    for ext in failed_to_classify:
        print(ext.ljust(7) + "#" * failed_to_classify[ext])
    print(" ")

    ans = read_answers()
    print(ans)

    for i, pipe in enumerate(pipel, start=1):
        print(" score_quest " + str(i) + " " + str(pipe.score(testcont, ans)))
        print(" pred " + str(i) + " " + str(pipe.predict(testcont)))
        print(" ")

    demo_class(testcont, ans)


if __name__ == "__main__":
    if len(sys.argv) == 1:
        default_action()
    elif len(sys.argv) == 2:
        # Single-file mode: classify the file named on the command line.
        test_file = sys.argv[1]
        print("Estimating file type of " + test_file)

        filelist, testlist = load_file_names()
        X, y, testcont = load_files(filelist, testlist)
        pipe = fit6(X, y)
        with open(test_file) as f:
            test_contents = f.read()
        est_ext = pipe.predict([test_contents])

        print("Predicted extension: " + str(est_ext))
    else:
        print("error: command line arguments not supported")
0.375 + pred 2 ['clj' 'clj' 'js' 'clj' 'js' 'hs' 'js' 'scala' 'js' 'js' 'scala' 'js' 'hs' + 'hs' 'hs' 'hs' 'js' 'hs' 'js' 'js' 'racket' 'ocaml' 'js' 'pl' 'scala' + 'ocaml' 'ocaml' 'php' 'js' 'php' 'ocaml' 'ocaml'] + + score_quest 3 0.5625 + pred 3 ['clj' 'clj' 'cs' 'clj' 'py' 'py' 'sbcl' 'py' 'js' 'js' 'ruby' 'java' + 'ruby' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'c' 'c' + 'scala' 'scala' 'hs' 'hs' 'c' 'ruby' 'hs' 'ocaml' 'ocaml'] + + score_quest 4 0.59375 + pred 4 ['clj' 'clj' 'ocaml' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'scala' 'cs' + 'scala' 'ruby' 'ruby' 'hs' 'hs' 'hs' 'sbcl' 'racket' 'racket' 'js' 'js' + 'scala' 'scala' 'php' 'php' 'sbcl' 'php' 'php' 'ocaml' 'ocaml'] + + score_quest 5 0.4375 + pred 5 ['clj' 'clj' 'ocaml' 'clj' 'js' 'hs' 'js' 'scala' 'js' 'js' 'scala' + 'racket' 'scala' 'ruby' 'ruby' 'hs' 'racket' 'hs' 'racket' 'racket' + 'ocaml' 'c' 'js' 'scala' 'scala' 'pl' 'php' 'php' 'racket' 'php' 'ocaml' + 'racket'] + +clj 0 1 2 0 0 23 0 0 0 5 0 1 0 0 0 0 0 0 0 0 0 20 0 0 0 0 1 1 3 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 10 9 9 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0 0 0 0 1 0 0 1 6 0 0 0 0 +py 3 0 19 0 6 97 0 0 0 6 9 44 0 0 0 75 366 11 0 66 14 217 0 0 0 0 198 0 13 0 0 0 0 0 0 1 0 0 3 0 0 0 0 0 0 0 0 2 0 91 0 0 41 22 0 0 0 0 26 2 0 0 0 0 0 31 0 29 2 0 10 2 0 7 5 0 6 0 0 0 0 0 5 +js 0 2 244 3 9 100 0 11 0 45 29 3 0 0 5 1 297 27 7 55 18 87 3 0 0 0 6 0 0 0 0 0 0 1 0 39 0 1 0 0 15 0 0 1 0 0 0 1 1 20 0 0 114 1 27 0 0 0 66 0 0 0 0 0 0 2 0 0 0 0 0 0 1 61 18 0 9 1 15 0 0 11 1 +ruby 0 0 5 0 10 7 0 0 3 0 5 0 0 0 0 0 44 7 3 0 1 33 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 7 19 0 1 0 1 0 20 0 0 5 9 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 +haskell 0 92 33 63 28 24 0 0 0 6 11 0 0 0 0 0 92 2 0 5 0 81 6 0 0 0 1 1 5 0 0 0 0 0 13 0 0 0 0 0 0 0 0 16 0 0 0 0 3 6 0 0 179 1 0 0 0 0 14 0 0 0 0 0 0 1 0 0 0 0 76 0 0 0 0 0 0 145 1 0 0 5 0 +racket 0 1 0 7 20 48 0 0 0 7 0 0 0 0 0 15 22 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 11 
70 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 5 0 0 0 0 32 65 5 19 0 +java 0 0 6 0 3 1 0 0 0 16 136 0 0 0 18 0 0 0 0 2 0 1 0 0 0 32 0 0 10 0 0 0 0 0 0 2 0 0 0 0 0 0 0 37 0 0 0 0 3 5 0 0 2 0 0 11 0 0 7 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 11 1 0 0 0 3 +scala 0 3 33 32 0 28 0 0 0 0 6 0 0 0 1 0 71 0 0 0 0 57 0 0 0 2 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 23 2 0 0 0 0 0 16 0 0 2 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 15 37 9 2 9 13 0 0 0 0 1 +tcl 0 0 48 92 13 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 0 0 0 0 0 10 0 0 0 6 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 +php 0 93 53 185 1 14 0 0 0 74 999 7 0 0 33 5 27 3 2 28 0 18 0 0 0 0 0 0 5 0 0 0 0 0 1 2 0 0 0 0 0 0 0 36 0 0 0 0 0 0 0 0 6 5 0 23 0 0 35 0 0 0 0 0 0 0 3 0 0 0 0 0 0 20 0 0 1 0 18 0 0 0 0 +ocaml 0 83 19 0 2 11 0 0 0 13 76 0 0 0 0 0 75 0 0 3 2 35 0 0 0 0 1 0 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 57 3 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 22 0 0 20 9 0 0 0 2 diff --git a/lclassifier/output.txt b/lclassifier/output.txt new file mode 100644 index 0000000..577a52b --- /dev/null +++ b/lclassifier/output.txt @@ -0,0 +1,92 @@ + total samples 931 + number of usable files 656 + + number of read file types: 32 + number of recognized types: 14 + summary of tile types +ats +clj 38 +cs 41 +dart +erlang +fpascal +fsharp +c 129 +hs 33 +gnat +go +php 55 +ifc +java 51 +js 25 +ruby 73 +lua +ocaml 35 +oz +pl 34 +py 36 +racket 29 +rust +sbcl 34 +scala 43 +vw +cint +javasteady +parrot +cc +txt +ozf + not included: + + score for training_set test_set +0 0.9818 0.871 +1 1.0 0.977 +2 0.9658 0.9355 +3 0.9795 0.9677 +4 1.0 0.977 + + failed to classify + + failure counts + wrongly classified: + failed to classify + +<_csv.reader object at 0x113420a58> + number of testing file types: 11 +['clj', 'clj', 'clj', 'clj', 'py', 'py', 'py', 'py', 'js', 'js', 'js', 'js', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'racket', 'racket', 'racket', 'java', 'java', 'scala', 
'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml'] + score_quest 1 0.59375 + pred 1 ['clj' 'clj' 'clj' 'clj' 'py' 'clj' 'ruby' 'py' 'js' 'js' 'clj' 'php' + 'ruby' 'clj' 'ruby' 'hs' 'hs' 'clj' 'racket' 'racket' 'racket' 'java' + 'clj' 'scala' 'scala' 'racket' 'py' 'c' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 2 0.65625 + pred 2 ['clj' 'clj' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'racket' 'sbcl' 'java' 'java' + 'scala' 'scala' 'ruby' 'ruby' 'java' 'java' 'java' 'ocaml' 'ruby'] + + score_quest 3 0.6875 + pred 3 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'racket' 'hs' 'racket' 'racket' 'racket' 'pl' 'pl' + 'scala' 'scala' 'racket' 'racket' 'pl' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 4 0.71875 + pred 4 ['clj' 'clj' 'sbcl' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'hs' 'hs' 'racket' 'racket' 'racket' 'java' 'c' 'scala' + 'scala' 'py' 'py' 'java' 'php' 'js' 'ocaml' 'ocaml'] + + score_quest 5 0.625 + pred 5 ['ruby' 'ruby' 'ruby' 'clj' 'py' 'py' 'py' 'py' 'js' 'js' 'js' 'js' 'ruby' + 'ruby' 'ruby' 'hs' 'ruby' 'hs' 'racket' 'ruby' 'ruby' 'java' 'java' + 'scala' 'scala' 'ruby' 'ruby' 'java' 'php' 'php' 'ocaml' 'ruby'] + +clj 79 31 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +py 0 0 0 587 142 79 476 0 0 0 0 0 0 0 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +js 7 15 0 0 0 0 0 333 158 682 0 0 0 0 0 0 0 0 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 0 +ruby 0 0 0 0 0 0 0 0 0 0 158 87 23 55 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +haskell 0 0 0 0 0 468 0 0 0 0 0 0 0 0 0 7 79 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 +racket 1000 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 412 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +java 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 619 103 +scala 7 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 55 0 0 0 0 0 0 0 0 0 0 15 15 0 +tcl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 15 0 0 0 0 0 0 0 0 0 0 0 0 +php 0 0 0 0 0 0 0 0 0 63 0 0 0 0 0 0 0 0 0 134 0 0 0 0 87 39 0 0 0 0 0 0 0 0 730 206 +ocaml 47 0 0 0 0 0 0 0 31 0 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 277 7 0 0 0 0 0 0 7 0 diff --git a/lclassifier/tests/test_lclassifier.py b/lclassifier/tests/test_lclassifier.py new file mode 100644 index 0000000..502b789 --- /dev/null +++ b/lclassifier/tests/test_lclassifier.py @@ -0,0 +1,40 @@ +from lclassifier import * + +def test_ext(): + ext = "cowboy" + assert acceptable_file(ext) == False + +def test_correct_ext(): + ext = "perl" + assert clean_ext(ext) == "pl" + +def test_reg_use(): + reg_expr = "\s__\w*__\(" + prog = re.compile(reg_expr) + text ='''import packlag +def __init__(self): + var = thing''' + val = prog.findall(text) + print(val) + assert len(val) == 1 + + reg_expr = "\):[ \t]*\n[ \t]*\w" + prog = re.compile(reg_expr) + val = prog.findall(text) + print(val) + assert len(val) == 1 + + reg_expr = "(^from|^import)\s" + prog = re.compile(reg_expr) + val = prog.findall(text) + print(val) + assert len(val) == 1 + + textjs = '''function noAction() { + } + ''' + reg_expr = "function\s*\w*\(\w*[\w\s,]*\)\s*\{" + prog = re.compile(reg_expr) + val = prog.findall(textjs) + print(val) + assert len(val) == 1 diff --git a/ref_program.py b/ref_program.py new file mode 100644 index 0000000..d6b7a6d --- /dev/null +++ b/ref_program.py @@ -0,0 +1,54 @@ +import csv +import re +import numpy as np +import random + +#from textblob import TextBlob +from collections import Counter + +from sklearn.pipeline import make_pipeline, make_union +from sklearn.base import TransformerMixin +from sklearn.tree import DecisionTreeClassifier +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.cross_validation import train_test_split +from sklearn.metrics import classification_report, confusion_matrix + + +class DumbFeaturizer(TransformerMixin): + def __init__(self): + pass + + def fit(self, X, y=None): + return self + + def transform(self, 
X): + matrix = [] + for i in range(len(X)): + vector = [] + for j in range(11): + if j == X[i]: + vector.append(1) + else: + vector.append(0) + matrix.append(vector) + return matrix + +N = 22 +y = [0] * N +X = [0] * N +for k in range(N): + val = random.randrange(11) + y[k] = val + X[k] = val + + +dumb = DumbFeaturizer() +print(dumb.transform(X)) + +pipe = make_pipeline(dumb, DecisionTreeClassifier()) +pipe.fit(X, y) +# Our baseline +print(pipe.score(X, y)) +print(" ") +print(" transform ") +print(pipe.transform(X))