Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Created by https://www.gitignore.io

.direnv
.envrc
benchmarks/
.DS_Store
### IPythonNotebook ###
# Temporary data
.ipynb_checkpoints/
Expand Down
53 changes: 53 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from lang_class.file_loader import load_files, get_names"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"programs = load_files()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
1 change: 1 addition & 0 deletions lang_class/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

24 changes: 24 additions & 0 deletions lang_class/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
To preprocess a folder of source files (each file's extension
identifies its language) into training and test data:
./preprocess.py --name=folder_name

To run the classifier on the reserved test data:
./classifier.py

To run the classifier on the reserved test data and compare to
the test answers:
./classifier.py --Ytest=matrix/Yte.npy

To run the classifier on specified test data:
./classifier.py --from_text=(filename or folder name)

To run the classifier on specified test data and compare
to specified answers (must be saved np.array format):
./classifier.py --from_text=(filename or folder name) --Ytest=answer_file_name.npy


By default the preprocessor uses TfidfVectorizer with an altered
token pattern; pass -cv to use the custom CodeVectorizer instead.

The classifier can be either a Decision Tree (default) or
you can specify Multinomial Naive Bayes by passing
-bayes to ./classifier.py
Empty file added lang_class/__init__.py
Empty file.
135 changes: 135 additions & 0 deletions lang_class/classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/bin/env python3

from sklearn import cross_validation
from sklearn.externals import joblib
import numpy as np
from scipy.sparse import csc_matrix
import argparse
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import os


def main(X_test, Y_test, data=None, model='dt'):
    """
    Train a classifier on the preprocessed training data and either
    score it against known answers or print its predictions.

    X_test: path to a saved sparse matrix (.npz) readable by
        load_matrix, or None to classify `data` instead.
    Y_test: path to a .npy file with the expected labels, or a falsy
        value when there are no answers to compare against.
    data: already-vectorized samples; used as the test set only when
        X_test is None.
    model: classifier name forwarded to new_classifier.

    With Y_test, prints the success rate and the number of correct
    predictions.  Without it, prints the language name of every
    prediction and, when `data` was supplied, the probability of each
    language for the first sample.  Returns None.

    Raises TypeError when both X_test and data are None.
    """
    if X_test is not None:
        X_test = load_matrix(X_test)
    elif data is not None:
        X_test = data
    else:
        raise TypeError('Bad input data')

    classifier = new_classifier(model)
    prediction = classifier.predict(X_test)

    if Y_test:
        Y_test = np.load(Y_test)
        # Count the mismatches between expected and predicted labels.
        result = sum(1 for idx, expected in enumerate(Y_test)
                     if expected != prediction[idx])

        print('success rate: ', round(100 - 100*result / len(Y_test), 3))
        print('or: ', len(Y_test) - result,' out of ', len(Y_test))
        return

    # data_keys.npy maps a numeric label to its language name.
    keys = np.load('data_keys.npy')

    for item in prediction:
        print(keys[item])

    if data is not None:
        proba = classifier.predict_proba(X_test)
        # predict_proba columns follow classifier.classes_, which only
        # lists labels seen during training, so map each column through
        # classes_ instead of assuming column i is label i.
        for label, probability in zip(classifier.classes_, proba[0]):
            print(keys[label], round(probability, 3))

def new_classifier(model):
    """
    Build a classifier and fit it on the preprocessed training data.

    model: 'dt' for a DecisionTreeClassifier using the entropy
        criterion, or 'bayes' for MultinomialNB.

    The training samples are read from matrix/Xtr.npz and the labels
    from matrix/Ytr.npy.  Returns the fitted classifier.

    Raises ValueError when `model` is not one of the names above.
    """
    # Reject unknown names before loading any data, so the caller gets
    # a clear error rather than an unbound-variable failure later on.
    if model not in ('dt', 'bayes'):
        raise ValueError(
            "Unknown model %r; expected 'dt' or 'bayes'" % (model,))

    Xtr = load_matrix('matrix/Xtr.npz')
    Ytr = np.load('matrix/Ytr.npy')

    if model == 'dt':
        classifier = DecisionTreeClassifier(criterion='entropy')
    else:
        classifier = MultinomialNB()

    classifier.fit(Xtr, Ytr)

    return classifier

def load_matrix(filename):
    """
    Load a sparse matrix from an .npz archive.

    filename: path to an archive holding the arrays 'data',
        'indices', 'indptr' and 'shape'.

    Returns a scipy csc_matrix rebuilt from those arrays.
    """
    # np.load keeps the archive open until it is closed; the context
    # manager releases the file handle once the arrays have been read.
    with np.load(filename) as loader:
        return csc_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=tuple(loader['shape']))


# Command-line entry point: pick the model, optionally vectorize raw
# source text with the pickled pipeline, then hand over to main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a classifier using \
                                     preprocessed files, classify \
                                     input file using the classifier')

    parser.add_argument('--Xtest', default='matrix/Xte.npz', type=str)

    parser.add_argument('--Ytest', default=None, type=str)

    parser.add_argument('--from_text', default=None, type=str)

    parser.add_argument('-bayes', help='Use Multinomial Naive Bayes',
                        action='store_true')

    args = parser.parse_args()

    if args.bayes:
        model = 'bayes'
    else:
        model = 'dt'

    if args.from_text:
        # NOTE: joblib.load unpickles the file; only load a pipeline
        # dump that this project produced itself.
        pipe = joblib.load('dumps/pipe.pkl')
        from_text = args.from_text

        if os.path.isfile(from_text):
            # A single file: classify just that program.
            with open(from_text, 'r') as fh:
                data = fh.read()

            data = pipe.transform([data])

        else:
            # A folder: collect the files of every directory that has
            # no sub-directories.
            f = []
            for filename in os.walk(from_text):
                if filename[1] == []:
                    for item in filename[2]:
                        f.append(filename[0] + '/' + item)

            data = []
            for item in f:
                with open(item, 'r') as fh:
                    data.append(fh.read())

            data = pipe.transform(data)

        main(None, args.Ytest, data, model)

    else:
        # Pass model by keyword: the third positional parameter of
        # main() is `data`, so a positional model was silently dropped.
        main(args.Xtest, args.Ytest, model=model)
Binary file added lang_class/data_keys.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_01.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_02.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_03.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_04.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_05.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_06.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_07.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_08.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_09.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_10.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_100.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_101.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_102.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_103.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_11.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_12.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_13.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_14.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_15.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_16.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_17.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_18.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_19.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_20.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_21.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_22.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_23.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_24.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_25.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_26.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_27.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_28.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_29.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_30.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_31.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_32.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_33.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_34.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_35.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_36.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_37.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_38.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_39.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_40.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_41.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_42.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_43.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_44.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_45.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_46.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_47.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_48.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_49.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_50.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_51.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_52.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_53.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_54.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_55.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_56.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_57.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_58.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_59.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_60.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_61.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_62.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_63.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_64.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_65.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_66.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_67.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_68.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_69.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_70.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_71.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_72.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_73.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_74.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_75.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_76.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_77.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_78.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_79.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_80.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_81.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_82.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_83.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_84.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_85.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_86.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_87.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_88.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_89.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_90.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_91.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_92.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_93.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_94.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_95.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_96.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_97.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_98.npy
Binary file not shown.
Binary file added lang_class/dumps/pipe.pkl_99.npy
Binary file not shown.
74 changes: 74 additions & 0 deletions lang_class/file_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os
import re
import numpy as np

# Maps a file extension (leading dot included) to the language label
# used for training.  The label spellings are persisted through
# data_keys, so they are kept exactly as they are.
names = {
    '.cs': 'C#',
    '.java': 'Java',
    '.c': 'C',
    '.gcc': 'C',
    '.js': 'JavaScipt',
    '.hack': 'PHP',
    '.php': 'PHP',
    '.racket': 'Scheme',
    '.sbcl': 'Common Lisp',
    '.jruby': 'Ruby',
    '.yarv': 'Ruby',
    '.rb': 'Ruby',
    '.hs': 'Haskell',
    '.lhs': 'Haskell',
    '.ocaml': 'Ocaml',
    '.prl': 'Perl',
    '.clojure': 'Clojure',
    '.clj': 'Clojure',
    '.ats': 'Ada',
    '.csharp': 'C#',
    '.dart': 'Dart',
    '.erlang': 'Erlang',
    '.fpascal': 'Pascal',
    '.fsharp': 'F#',
    '.ghc': 'Haskell',
    '.gnat': 'Ada',
    '.go': 'Go',
    '.gpp': 'C',
    '.ifc': 'Fortran',
    '.javascript': 'Javascript',
    '.lua': 'Lua',
    '.oz': 'Oz',
    '.perl': 'Perl',
    '.py': 'Python',
    '.python3': 'Python',
    '.rust': 'Rust',
    '.scala': 'Scala',
    '.vw': 'VW',
    '.cint': 'C',
    '.javasteady': 'Java',
    '.parrot': 'Perl',
    '.tcl': 'TCL',
}

# Position in this list is the numeric class label (see load_files),
# so the order must not change.
data_keys = [
    'Java',
    'Lua',
    'Scheme',
    'Common Lisp',
    'JavaScipt',
    'Haskell',
    'Javascript',
    'Fortran',
    'Pascal',
    'Go',
    'Rust',
    'C',
    'Scala',
    'Perl',
    'Clojure',
    'Erlang',
    'Oz',
    'PHP',
    'Ruby',
    'Ocaml',
    'Ada',
    'F#',
    'Dart',
    'Python',
    'VW',
    'C#',
    'TCL',
]

# Written on import so the label names can be looked up from the file.
key_array = np.array(data_keys)
np.save('data_keys.npy', key_array)

def load_files(name=None):
    """
    Load the text and language label of every program in a folder.

    name: folder to walk; when falsy, the default folder of
        get_names is used.

    Returns a two-item list [texts, labels]: texts holds the content
    of each file and labels holds, at the same position, the index of
    its language in data_keys.

    Assumes files have file extensions identifying the programming
    language in which they are written.  Files with no extension, or
    with an extension that is not listed in names, are skipped.
    """
    files = get_names(name) if name else get_names()

    programs = [[], []]

    for path in files:
        match = get_ext(path)
        if match is None:
            continue

        # group(0) includes the leading dot, matching the keys of names.
        language = names.get(match.group(0))
        if language is None:
            # An unrelated file (unknown extension) must not abort the
            # whole load with a KeyError.
            continue

        with open(path, 'r') as fh:
            programs[0].append(fh.read())
        programs[1].append(data_keys.index(language))

    return programs

def get_names(name='../benchmarks'):
    """
    Walk the directory tree rooted at `name` and return a list of
    file paths.

    Only directories that contain no sub-directories contribute
    files; each path is the directory path, a '/', and the file name.
    """
    collected = []

    for dirpath, subdirs, files in os.walk(name):
        if subdirs != []:
            continue
        collected.extend(dirpath + '/' + item for item in files)

    return collected


def get_ext(filename):
    """
    Look for a file extension at the end of `filename`.

    Returns the regex match object, or None when the name does not
    end in a dot followed by word characters.  group(0) of the match
    is the extension with its leading dot, group(1) without it.
    """
    extension_pattern = r'\.([\w]+)$'
    return re.search(extension_pattern, filename)
Binary file added lang_class/matrix/Xte.npz
Binary file not shown.
Binary file added lang_class/matrix/Xtr.npz
Binary file not shown.
Binary file added lang_class/matrix/Yte.npy
Binary file not shown.
Binary file added lang_class/matrix/Ytr.npy
Binary file not shown.
92 changes: 92 additions & 0 deletions lang_class/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3

from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
import os
import argparse
import numpy as np

from file_loader import load_files
from vectorizer import CodeVectorizer


def main(name=None, thresh=.3, vectorizer=None):
    """
    Split the loaded programs into train/test sets, fit the pipeline
    on the training set and save everything to disk.

    name: folder of programs, forwarded to load_files.
    thresh: fraction of the data reserved for testing (test_size).
    vectorizer: optional vectorizer forwarded to make_pipe.

    Writes the transformed samples to matrix/Xtr.npz and
    matrix/Xte.npz, the labels to matrix/Ytr.npy and matrix/Yte.npy,
    and the fitted pipeline to dumps/pipe.pkl.  Returns None.
    """
    data = load_files(name)
    pipe = make_pipe(vectorizer)

    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        data[0], data[1], test_size=thresh)

    # Create the output folders up front so a missing folder cannot
    # discard the result of the fit.
    for folder in ('matrix', 'dumps'):
        os.makedirs(folder, exist_ok=True)

    pipe.fit(X_train, Y_train)

    # NOTE(review): Pipeline.transform needs the last step (the random
    # forest) to provide transform -- confirm this holds for the
    # installed scikit-learn version.
    X_train = pipe.transform(X_train)
    X_test = pipe.transform(X_test)

    save_matrix('matrix/Xtr', X_train)
    np.save('matrix/Ytr', np.array(Y_train))

    save_matrix('matrix/Xte', X_test)
    np.save('matrix/Yte', np.array(Y_test))

    joblib.dump(pipe, 'dumps/pipe.pkl')


def make_pipe(vectorizer=None):
    '''
    Build the two-step pipeline: a vectorizer named 'tf' followed by
    a RandomForestClassifier named 'rf'.

    vectorizer: the vectorizer to use as the first step; when None a
        TfidfVectorizer with a custom token pattern is created.
    '''
    if vectorizer is not None:
        tf = vectorizer
    else:
        tf = TfidfVectorizer(
            sublinear_tf=True,
            token_pattern=r'\b[\w\.,:\(\)\[\]\{\}\'\";%#@!*&|\<\>]+\b',
            stop_words=None,
            binary=True)

    steps = [('tf', tf), ('rf', RandomForestClassifier())]
    return Pipeline(steps)

def save_matrix(filename, array):
    """
    Save a scipy sparse matrix to an .npz file.

    filename: target path; np.savez appends '.npz' when it is missing.
    array: the sparse matrix to store.

    The archive holds the arrays 'data', 'indices', 'indptr' and
    'shape' in CSC layout.
    """
    # The archive carries no format marker and is read back as a CSC
    # matrix, so normalise to CSC first; a matrix that is already CSC
    # is stored unchanged.
    array = array.tocsc()
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)


# Command-line entry point: parse the options, choose the vectorizer
# and run the preprocessing.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Preprocess a given folder \
                                     of programs and train a classifier on \
                                     them')

    # argparse %-formats help strings, so a literal percent sign has
    # to be written as '%%'; a bare '%' makes --help raise ValueError.
    parser.add_argument('--thresh', default=.5, type=float, help='specify \
                        a test/train split %%')

    parser.add_argument('--name', default=None, type=str, help='specify \
                        a source for the test data')

    parser.add_argument('-cv', help= 'Use CodeVectorizer rather than \
                        TfidfVectorizer with altered tokenizer',
                        action='store_true')

    vect = None
    args = parser.parse_args()

    if args.cv:
        vect = CodeVectorizer(TfidfVectorizer(binary=True, norm='l1'))

    main(args.name, args.thresh, vect)
Empty file added lang_class/tests/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file added lang_class/tests/test_array.npy
Binary file not shown.
Loading