diff --git a/.gitignore b/.gitignore index f00dbf2..9090d68 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ # Created by https://www.gitignore.io - +.direnv +.envrc +benchmarks/ +.DS_Store ### IPythonNotebook ### # Temporary data .ipynb_checkpoints/ diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..3155c12 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,53 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from lang_class.file_loader import load_files, get_names" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "programs = load_files()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/lang_class/.gitignore b/lang_class/.gitignore new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/lang_class/.gitignore @@ -0,0 +1 @@ + diff --git a/lang_class/README.txt b/lang_class/README.txt new file mode 100644 index 0000000..75b31b7 --- /dev/null +++ b/lang_class/README.txt @@ -0,0 +1,24 @@ +To train the classifier on files with a file extension: +./preprocess.py --name=filename + +To run the classifier on the reserved test data: +./classifier.py + +To run the classifier on the reserved test data and compare to +the test answers: +./classifier.py --Ytest=Yte.npy + +To run the classifier on specified test data: +./classifier.py --from_text=(filename or folder name) + +To run the classifier on specified test data and compare +to specified answers (must be saved 
np.array format): +./classifier.py --from_text=(filename or folder name) --Ytest=answer_file_name.npy + + +The preprocessor can use TfidfVectorizer with an altered tokenizer +or with the -cv argument will use the custom CodeVectorizer + +The classifier can be either a Decision Tree (default) or +you can specify Multinomial Naive Bayes by passing +-bayes to ./classifier.py diff --git a/lang_class/__init__.py b/lang_class/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lang_class/classifier.py b/lang_class/classifier.py new file mode 100755 index 0000000..d99ade5 --- /dev/null +++ b/lang_class/classifier.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +from sklearn import cross_validation +from sklearn.externals import joblib +import numpy as np +from scipy.sparse import csc_matrix +import argparse +from sklearn.tree import DecisionTreeClassifier +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import Pipeline +import os + + +def main(X_test, Y_test, data=None, model='dt'): + """ + Creates a classifier, trains the classifier + on preprocessed data, then predicts X_test + data. 
If Y_test data given, returns the + success rate and counts, otherwise + returns the predictions + """ + + if X_test is not None: + X_test = load_matrix(X_test) + else: + if data is not None: + X_test = data + else: + raise TypeError('Bad input data') + + classifier = new_classifier(model) + prediction = classifier.predict(X_test) + + proba = classifier.predict_proba(X_test) + + if Y_test: + + Y_test = np.load(Y_test) + result = sum(1 for idx in range(len(Y_test)) if Y_test[idx] != \ + prediction[idx]) + + print('success rate: ', round(100 - 100*result / len(Y_test), 3)) + print('or: ', len(Y_test) - result,' out of ', len(Y_test)) + + else: + keys = np.load('data_keys.npy') + + for item in prediction: + print(keys[item]) + + + if data is not None: + for idx in range(len(proba[0])): + print(keys[idx], round(proba[0][idx], 3)) + +def new_classifier(model): + """ + creates a classifier, either Decision Tree or linear + trains it on the preprocessed training data + and returns the classifier + """ + Xtr = load_matrix('matrix/Xtr.npz') + Ytr = np.load('matrix/Ytr.npy') + if model == 'dt': + nb = DecisionTreeClassifier(criterion='entropy') + if model == 'bayes': + nb = MultinomialNB() + + nb.fit(Xtr, Ytr) + + return nb + +def load_matrix(filename): + """ + imports the preprocessed training data + """ + filename = filename + loader = np.load(filename) + return csc_matrix((loader['data'], loader['indices'], loader['indptr']), + shape = loader['shape']) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train a classifier using \ + preprocessed files, classify \ + input file using the classifier') + + + parser.add_argument('--Xtest', default='matrix/Xte.npz', type=str) + + parser.add_argument('--Ytest', default=None, type=str) + + parser.add_argument('--from_text', default=None, type=str) + + parser.add_argument('-bayes', help='Use Multinomial Naive Bayes', + action='store_true') + + + args = parser.parse_args() + + if args.bayes: + model = 
'bayes' + else: + model = 'dt' + + data = [] + if args.from_text: + pipe = joblib.load('dumps/pipe.pkl') + from_text = args.from_text + + if os.path.isfile(from_text): + with open(from_text, 'r') as fh: + data = fh.read() + + data = pipe.transform([data]) + + else: + filenames = os.walk(from_text) + + f = [] + for filename in filenames: + if filename[1] == []: + for item in filename[2]: + f.append(filename[0] + '/' + item) + + for item in f: + with open(item, 'r') as fh: + data.append(fh.read()) + + data = pipe.transform(data) + + main(None, args.Ytest, data, model) + + else: + main(args.Xtest, args.Ytest, model) diff --git a/lang_class/data_keys.npy b/lang_class/data_keys.npy new file mode 100644 index 0000000..43d5000 Binary files /dev/null and b/lang_class/data_keys.npy differ diff --git a/lang_class/dumps/pipe.pkl b/lang_class/dumps/pipe.pkl new file mode 100644 index 0000000..442de5d Binary files /dev/null and b/lang_class/dumps/pipe.pkl differ diff --git a/lang_class/dumps/pipe.pkl_01.npy b/lang_class/dumps/pipe.pkl_01.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_01.npy differ diff --git a/lang_class/dumps/pipe.pkl_02.npy b/lang_class/dumps/pipe.pkl_02.npy new file mode 100644 index 0000000..93c264b Binary files /dev/null and b/lang_class/dumps/pipe.pkl_02.npy differ diff --git a/lang_class/dumps/pipe.pkl_03.npy b/lang_class/dumps/pipe.pkl_03.npy new file mode 100644 index 0000000..1a0fd06 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_03.npy differ diff --git a/lang_class/dumps/pipe.pkl_04.npy b/lang_class/dumps/pipe.pkl_04.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_04.npy differ diff --git a/lang_class/dumps/pipe.pkl_05.npy b/lang_class/dumps/pipe.pkl_05.npy new file mode 100644 index 0000000..3a4a053 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_05.npy differ diff --git a/lang_class/dumps/pipe.pkl_06.npy 
b/lang_class/dumps/pipe.pkl_06.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_06.npy differ diff --git a/lang_class/dumps/pipe.pkl_07.npy b/lang_class/dumps/pipe.pkl_07.npy new file mode 100644 index 0000000..c77fc66 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_07.npy differ diff --git a/lang_class/dumps/pipe.pkl_08.npy b/lang_class/dumps/pipe.pkl_08.npy new file mode 100644 index 0000000..9da81ec Binary files /dev/null and b/lang_class/dumps/pipe.pkl_08.npy differ diff --git a/lang_class/dumps/pipe.pkl_09.npy b/lang_class/dumps/pipe.pkl_09.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_09.npy differ diff --git a/lang_class/dumps/pipe.pkl_10.npy b/lang_class/dumps/pipe.pkl_10.npy new file mode 100644 index 0000000..93c5686 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_10.npy differ diff --git a/lang_class/dumps/pipe.pkl_100.npy b/lang_class/dumps/pipe.pkl_100.npy new file mode 100644 index 0000000..8c59ec1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_100.npy differ diff --git a/lang_class/dumps/pipe.pkl_101.npy b/lang_class/dumps/pipe.pkl_101.npy new file mode 100644 index 0000000..be18fd5 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_101.npy differ diff --git a/lang_class/dumps/pipe.pkl_102.npy b/lang_class/dumps/pipe.pkl_102.npy new file mode 100644 index 0000000..14e824c Binary files /dev/null and b/lang_class/dumps/pipe.pkl_102.npy differ diff --git a/lang_class/dumps/pipe.pkl_103.npy b/lang_class/dumps/pipe.pkl_103.npy new file mode 100644 index 0000000..25bc307 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_103.npy differ diff --git a/lang_class/dumps/pipe.pkl_11.npy b/lang_class/dumps/pipe.pkl_11.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_11.npy differ diff --git a/lang_class/dumps/pipe.pkl_12.npy b/lang_class/dumps/pipe.pkl_12.npy new file 
mode 100644 index 0000000..b6f6270 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_12.npy differ diff --git a/lang_class/dumps/pipe.pkl_13.npy b/lang_class/dumps/pipe.pkl_13.npy new file mode 100644 index 0000000..d60e6f1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_13.npy differ diff --git a/lang_class/dumps/pipe.pkl_14.npy b/lang_class/dumps/pipe.pkl_14.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_14.npy differ diff --git a/lang_class/dumps/pipe.pkl_15.npy b/lang_class/dumps/pipe.pkl_15.npy new file mode 100644 index 0000000..a6290e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_15.npy differ diff --git a/lang_class/dumps/pipe.pkl_16.npy b/lang_class/dumps/pipe.pkl_16.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_16.npy differ diff --git a/lang_class/dumps/pipe.pkl_17.npy b/lang_class/dumps/pipe.pkl_17.npy new file mode 100644 index 0000000..1448c65 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_17.npy differ diff --git a/lang_class/dumps/pipe.pkl_18.npy b/lang_class/dumps/pipe.pkl_18.npy new file mode 100644 index 0000000..bfe8f2a Binary files /dev/null and b/lang_class/dumps/pipe.pkl_18.npy differ diff --git a/lang_class/dumps/pipe.pkl_19.npy b/lang_class/dumps/pipe.pkl_19.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_19.npy differ diff --git a/lang_class/dumps/pipe.pkl_20.npy b/lang_class/dumps/pipe.pkl_20.npy new file mode 100644 index 0000000..5495b7c Binary files /dev/null and b/lang_class/dumps/pipe.pkl_20.npy differ diff --git a/lang_class/dumps/pipe.pkl_21.npy b/lang_class/dumps/pipe.pkl_21.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_21.npy differ diff --git a/lang_class/dumps/pipe.pkl_22.npy b/lang_class/dumps/pipe.pkl_22.npy new file mode 100644 index 0000000..1610993 Binary files /dev/null 
and b/lang_class/dumps/pipe.pkl_22.npy differ diff --git a/lang_class/dumps/pipe.pkl_23.npy b/lang_class/dumps/pipe.pkl_23.npy new file mode 100644 index 0000000..32efbff Binary files /dev/null and b/lang_class/dumps/pipe.pkl_23.npy differ diff --git a/lang_class/dumps/pipe.pkl_24.npy b/lang_class/dumps/pipe.pkl_24.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_24.npy differ diff --git a/lang_class/dumps/pipe.pkl_25.npy b/lang_class/dumps/pipe.pkl_25.npy new file mode 100644 index 0000000..2b5406d Binary files /dev/null and b/lang_class/dumps/pipe.pkl_25.npy differ diff --git a/lang_class/dumps/pipe.pkl_26.npy b/lang_class/dumps/pipe.pkl_26.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_26.npy differ diff --git a/lang_class/dumps/pipe.pkl_27.npy b/lang_class/dumps/pipe.pkl_27.npy new file mode 100644 index 0000000..4be1732 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_27.npy differ diff --git a/lang_class/dumps/pipe.pkl_28.npy b/lang_class/dumps/pipe.pkl_28.npy new file mode 100644 index 0000000..a8e838a Binary files /dev/null and b/lang_class/dumps/pipe.pkl_28.npy differ diff --git a/lang_class/dumps/pipe.pkl_29.npy b/lang_class/dumps/pipe.pkl_29.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_29.npy differ diff --git a/lang_class/dumps/pipe.pkl_30.npy b/lang_class/dumps/pipe.pkl_30.npy new file mode 100644 index 0000000..21c265c Binary files /dev/null and b/lang_class/dumps/pipe.pkl_30.npy differ diff --git a/lang_class/dumps/pipe.pkl_31.npy b/lang_class/dumps/pipe.pkl_31.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_31.npy differ diff --git a/lang_class/dumps/pipe.pkl_32.npy b/lang_class/dumps/pipe.pkl_32.npy new file mode 100644 index 0000000..0737716 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_32.npy differ diff --git 
a/lang_class/dumps/pipe.pkl_33.npy b/lang_class/dumps/pipe.pkl_33.npy new file mode 100644 index 0000000..334bc2c Binary files /dev/null and b/lang_class/dumps/pipe.pkl_33.npy differ diff --git a/lang_class/dumps/pipe.pkl_34.npy b/lang_class/dumps/pipe.pkl_34.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_34.npy differ diff --git a/lang_class/dumps/pipe.pkl_35.npy b/lang_class/dumps/pipe.pkl_35.npy new file mode 100644 index 0000000..27fc796 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_35.npy differ diff --git a/lang_class/dumps/pipe.pkl_36.npy b/lang_class/dumps/pipe.pkl_36.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_36.npy differ diff --git a/lang_class/dumps/pipe.pkl_37.npy b/lang_class/dumps/pipe.pkl_37.npy new file mode 100644 index 0000000..e5d8217 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_37.npy differ diff --git a/lang_class/dumps/pipe.pkl_38.npy b/lang_class/dumps/pipe.pkl_38.npy new file mode 100644 index 0000000..37a1d30 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_38.npy differ diff --git a/lang_class/dumps/pipe.pkl_39.npy b/lang_class/dumps/pipe.pkl_39.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_39.npy differ diff --git a/lang_class/dumps/pipe.pkl_40.npy b/lang_class/dumps/pipe.pkl_40.npy new file mode 100644 index 0000000..1b970cb Binary files /dev/null and b/lang_class/dumps/pipe.pkl_40.npy differ diff --git a/lang_class/dumps/pipe.pkl_41.npy b/lang_class/dumps/pipe.pkl_41.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_41.npy differ diff --git a/lang_class/dumps/pipe.pkl_42.npy b/lang_class/dumps/pipe.pkl_42.npy new file mode 100644 index 0000000..9087a06 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_42.npy differ diff --git a/lang_class/dumps/pipe.pkl_43.npy 
b/lang_class/dumps/pipe.pkl_43.npy new file mode 100644 index 0000000..2c48692 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_43.npy differ diff --git a/lang_class/dumps/pipe.pkl_44.npy b/lang_class/dumps/pipe.pkl_44.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_44.npy differ diff --git a/lang_class/dumps/pipe.pkl_45.npy b/lang_class/dumps/pipe.pkl_45.npy new file mode 100644 index 0000000..33f42bb Binary files /dev/null and b/lang_class/dumps/pipe.pkl_45.npy differ diff --git a/lang_class/dumps/pipe.pkl_46.npy b/lang_class/dumps/pipe.pkl_46.npy new file mode 100644 index 0000000..49967e1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_46.npy differ diff --git a/lang_class/dumps/pipe.pkl_47.npy b/lang_class/dumps/pipe.pkl_47.npy new file mode 100644 index 0000000..48bc6a6 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_47.npy differ diff --git a/lang_class/dumps/pipe.pkl_48.npy b/lang_class/dumps/pipe.pkl_48.npy new file mode 100644 index 0000000..e48e41f Binary files /dev/null and b/lang_class/dumps/pipe.pkl_48.npy differ diff --git a/lang_class/dumps/pipe.pkl_49.npy b/lang_class/dumps/pipe.pkl_49.npy new file mode 100644 index 0000000..4d194da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_49.npy differ diff --git a/lang_class/dumps/pipe.pkl_50.npy b/lang_class/dumps/pipe.pkl_50.npy new file mode 100644 index 0000000..a8195f8 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_50.npy differ diff --git a/lang_class/dumps/pipe.pkl_51.npy b/lang_class/dumps/pipe.pkl_51.npy new file mode 100644 index 0000000..4cf1513 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_51.npy differ diff --git a/lang_class/dumps/pipe.pkl_52.npy b/lang_class/dumps/pipe.pkl_52.npy new file mode 100644 index 0000000..afbb778 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_52.npy differ diff --git a/lang_class/dumps/pipe.pkl_53.npy b/lang_class/dumps/pipe.pkl_53.npy new file mode 100644 
index 0000000..d6227da Binary files /dev/null and b/lang_class/dumps/pipe.pkl_53.npy differ diff --git a/lang_class/dumps/pipe.pkl_54.npy b/lang_class/dumps/pipe.pkl_54.npy new file mode 100644 index 0000000..8c21c02 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_54.npy differ diff --git a/lang_class/dumps/pipe.pkl_55.npy b/lang_class/dumps/pipe.pkl_55.npy new file mode 100644 index 0000000..98a01fd Binary files /dev/null and b/lang_class/dumps/pipe.pkl_55.npy differ diff --git a/lang_class/dumps/pipe.pkl_56.npy b/lang_class/dumps/pipe.pkl_56.npy new file mode 100644 index 0000000..340a945 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_56.npy differ diff --git a/lang_class/dumps/pipe.pkl_57.npy b/lang_class/dumps/pipe.pkl_57.npy new file mode 100644 index 0000000..9db92bf Binary files /dev/null and b/lang_class/dumps/pipe.pkl_57.npy differ diff --git a/lang_class/dumps/pipe.pkl_58.npy b/lang_class/dumps/pipe.pkl_58.npy new file mode 100644 index 0000000..dff9a64 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_58.npy differ diff --git a/lang_class/dumps/pipe.pkl_59.npy b/lang_class/dumps/pipe.pkl_59.npy new file mode 100644 index 0000000..8c21c02 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_59.npy differ diff --git a/lang_class/dumps/pipe.pkl_60.npy b/lang_class/dumps/pipe.pkl_60.npy new file mode 100644 index 0000000..5861e26 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_60.npy differ diff --git a/lang_class/dumps/pipe.pkl_61.npy b/lang_class/dumps/pipe.pkl_61.npy new file mode 100644 index 0000000..340a945 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_61.npy differ diff --git a/lang_class/dumps/pipe.pkl_62.npy b/lang_class/dumps/pipe.pkl_62.npy new file mode 100644 index 0000000..673a6d2 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_62.npy differ diff --git a/lang_class/dumps/pipe.pkl_63.npy b/lang_class/dumps/pipe.pkl_63.npy new file mode 100644 index 0000000..98645dc Binary files /dev/null and 
b/lang_class/dumps/pipe.pkl_63.npy differ diff --git a/lang_class/dumps/pipe.pkl_64.npy b/lang_class/dumps/pipe.pkl_64.npy new file mode 100644 index 0000000..8c21c02 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_64.npy differ diff --git a/lang_class/dumps/pipe.pkl_65.npy b/lang_class/dumps/pipe.pkl_65.npy new file mode 100644 index 0000000..7eeedf6 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_65.npy differ diff --git a/lang_class/dumps/pipe.pkl_66.npy b/lang_class/dumps/pipe.pkl_66.npy new file mode 100644 index 0000000..340a945 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_66.npy differ diff --git a/lang_class/dumps/pipe.pkl_67.npy b/lang_class/dumps/pipe.pkl_67.npy new file mode 100644 index 0000000..893e89d Binary files /dev/null and b/lang_class/dumps/pipe.pkl_67.npy differ diff --git a/lang_class/dumps/pipe.pkl_68.npy b/lang_class/dumps/pipe.pkl_68.npy new file mode 100644 index 0000000..a514d44 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_68.npy differ diff --git a/lang_class/dumps/pipe.pkl_69.npy b/lang_class/dumps/pipe.pkl_69.npy new file mode 100644 index 0000000..8c21c02 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_69.npy differ diff --git a/lang_class/dumps/pipe.pkl_70.npy b/lang_class/dumps/pipe.pkl_70.npy new file mode 100644 index 0000000..c03249c Binary files /dev/null and b/lang_class/dumps/pipe.pkl_70.npy differ diff --git a/lang_class/dumps/pipe.pkl_71.npy b/lang_class/dumps/pipe.pkl_71.npy new file mode 100644 index 0000000..340a945 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_71.npy differ diff --git a/lang_class/dumps/pipe.pkl_72.npy b/lang_class/dumps/pipe.pkl_72.npy new file mode 100644 index 0000000..1f2d0a9 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_72.npy differ diff --git a/lang_class/dumps/pipe.pkl_73.npy b/lang_class/dumps/pipe.pkl_73.npy new file mode 100644 index 0000000..a65b282 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_73.npy differ diff --git 
a/lang_class/dumps/pipe.pkl_74.npy b/lang_class/dumps/pipe.pkl_74.npy new file mode 100644 index 0000000..8c21c02 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_74.npy differ diff --git a/lang_class/dumps/pipe.pkl_75.npy b/lang_class/dumps/pipe.pkl_75.npy new file mode 100644 index 0000000..0999fd4 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_75.npy differ diff --git a/lang_class/dumps/pipe.pkl_76.npy b/lang_class/dumps/pipe.pkl_76.npy new file mode 100644 index 0000000..340a945 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_76.npy differ diff --git a/lang_class/dumps/pipe.pkl_77.npy b/lang_class/dumps/pipe.pkl_77.npy new file mode 100644 index 0000000..23b1578 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_77.npy differ diff --git a/lang_class/dumps/pipe.pkl_78.npy b/lang_class/dumps/pipe.pkl_78.npy new file mode 100644 index 0000000..d575210 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_78.npy differ diff --git a/lang_class/dumps/pipe.pkl_79.npy b/lang_class/dumps/pipe.pkl_79.npy new file mode 100644 index 0000000..22862a2 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_79.npy differ diff --git a/lang_class/dumps/pipe.pkl_80.npy b/lang_class/dumps/pipe.pkl_80.npy new file mode 100644 index 0000000..8c59ec1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_80.npy differ diff --git a/lang_class/dumps/pipe.pkl_81.npy b/lang_class/dumps/pipe.pkl_81.npy new file mode 100644 index 0000000..bf6247b Binary files /dev/null and b/lang_class/dumps/pipe.pkl_81.npy differ diff --git a/lang_class/dumps/pipe.pkl_82.npy b/lang_class/dumps/pipe.pkl_82.npy new file mode 100644 index 0000000..1c94d6d Binary files /dev/null and b/lang_class/dumps/pipe.pkl_82.npy differ diff --git a/lang_class/dumps/pipe.pkl_83.npy b/lang_class/dumps/pipe.pkl_83.npy new file mode 100644 index 0000000..302b4c9 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_83.npy differ diff --git a/lang_class/dumps/pipe.pkl_84.npy 
b/lang_class/dumps/pipe.pkl_84.npy new file mode 100644 index 0000000..22862a2 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_84.npy differ diff --git a/lang_class/dumps/pipe.pkl_85.npy b/lang_class/dumps/pipe.pkl_85.npy new file mode 100644 index 0000000..8c59ec1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_85.npy differ diff --git a/lang_class/dumps/pipe.pkl_86.npy b/lang_class/dumps/pipe.pkl_86.npy new file mode 100644 index 0000000..e33b32f Binary files /dev/null and b/lang_class/dumps/pipe.pkl_86.npy differ diff --git a/lang_class/dumps/pipe.pkl_87.npy b/lang_class/dumps/pipe.pkl_87.npy new file mode 100644 index 0000000..ec49745 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_87.npy differ diff --git a/lang_class/dumps/pipe.pkl_88.npy b/lang_class/dumps/pipe.pkl_88.npy new file mode 100644 index 0000000..847cd16 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_88.npy differ diff --git a/lang_class/dumps/pipe.pkl_89.npy b/lang_class/dumps/pipe.pkl_89.npy new file mode 100644 index 0000000..22862a2 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_89.npy differ diff --git a/lang_class/dumps/pipe.pkl_90.npy b/lang_class/dumps/pipe.pkl_90.npy new file mode 100644 index 0000000..8c59ec1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_90.npy differ diff --git a/lang_class/dumps/pipe.pkl_91.npy b/lang_class/dumps/pipe.pkl_91.npy new file mode 100644 index 0000000..1ff9038 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_91.npy differ diff --git a/lang_class/dumps/pipe.pkl_92.npy b/lang_class/dumps/pipe.pkl_92.npy new file mode 100644 index 0000000..4d9a7c3 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_92.npy differ diff --git a/lang_class/dumps/pipe.pkl_93.npy b/lang_class/dumps/pipe.pkl_93.npy new file mode 100644 index 0000000..370adf1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_93.npy differ diff --git a/lang_class/dumps/pipe.pkl_94.npy b/lang_class/dumps/pipe.pkl_94.npy new file mode 100644 
index 0000000..22862a2 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_94.npy differ diff --git a/lang_class/dumps/pipe.pkl_95.npy b/lang_class/dumps/pipe.pkl_95.npy new file mode 100644 index 0000000..8c59ec1 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_95.npy differ diff --git a/lang_class/dumps/pipe.pkl_96.npy b/lang_class/dumps/pipe.pkl_96.npy new file mode 100644 index 0000000..56d7278 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_96.npy differ diff --git a/lang_class/dumps/pipe.pkl_97.npy b/lang_class/dumps/pipe.pkl_97.npy new file mode 100644 index 0000000..7fd3045 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_97.npy differ diff --git a/lang_class/dumps/pipe.pkl_98.npy b/lang_class/dumps/pipe.pkl_98.npy new file mode 100644 index 0000000..d24fe1e Binary files /dev/null and b/lang_class/dumps/pipe.pkl_98.npy differ diff --git a/lang_class/dumps/pipe.pkl_99.npy b/lang_class/dumps/pipe.pkl_99.npy new file mode 100644 index 0000000..22862a2 Binary files /dev/null and b/lang_class/dumps/pipe.pkl_99.npy differ diff --git a/lang_class/file_loader.py b/lang_class/file_loader.py new file mode 100644 index 0000000..2a1a00f --- /dev/null +++ b/lang_class/file_loader.py @@ -0,0 +1,74 @@ +import os +import re +import numpy as np + +names = {'.cs':'C#', '.java':'Java', '.c':'C', '.gcc':'C', '.js':'JavaScipt', + '.hack':'PHP', '.php':'PHP', '.racket':'Scheme', + '.sbcl':'Common Lisp', '.jruby':'Ruby', '.yarv':'Ruby', '.rb':'Ruby', + '.hs':'Haskell', '.lhs':'Haskell', '.ocaml':'Ocaml', '.prl':'Perl', + '.clojure':'Clojure', '.clj':'Clojure', '.ats':'Ada', '.csharp':'C#', + '.dart':'Dart', '.erlang':'Erlang', '.fpascal':'Pascal', + '.fsharp':'F#', '.ghc':'Haskell', '.gnat':'Ada', '.go':'Go', + '.gpp':'C', '.ifc':'Fortran', '.javascript':'Javascript', + '.lua':'Lua', '.oz':'Oz', '.perl':'Perl', '.py':'Python', + '.python3':'Python', '.rust':'Rust', '.scala':'Scala', '.vw':'VW', + '.cint':'C', '.javasteady':'Java', '.parrot':'Perl', 
'.tcl':'TCL'} + +data_keys = ['Java', 'Lua', 'Scheme', 'Common Lisp', 'JavaScipt', + 'Haskell', 'Javascript', 'Fortran', 'Pascal', 'Go', + 'Rust', 'C', 'Scala', 'Perl', 'Clojure', 'Erlang', 'Oz', + 'PHP', 'Ruby', 'Ocaml', 'Ada', 'F#', 'Dart', 'Python', + 'VW', 'C#', 'TCL'] + +np.save('data_keys.npy', np.array(data_keys)) + +def load_files(name=None): + """ + Loads all text from programs from a given folder + and their file extension into a list + and returns it + Assumes files have file extenstions identifying the + programming language in which they are written + """ + + if name: + files = get_names(name) + else: + files = get_names() + + programs = [[],[]] + + for name in files: + if get_ext(name): + ext = get_ext(name).group(0) + + with open(name, 'r') as fh: + programs[0].append(fh.read()) + programs[1].append(data_keys.index(names[ext])) + + return programs + +def get_names(name='../benchmarks'): + """ + Walks the specified directory and + returns the path to each file contained + within + """ + f = [] + + filenames = os.walk(name) + + for filename in filenames: + if filename[1] == []: + for item in filename[2]: + f.append(filename[0] + '/' + item) + + return f + + +def get_ext(filename): + """ + returns the file extension for + the given filename + """ + return re.search(r'\.([\w]+)$', filename) diff --git a/lang_class/matrix/Xte.npz b/lang_class/matrix/Xte.npz new file mode 100644 index 0000000..574da17 Binary files /dev/null and b/lang_class/matrix/Xte.npz differ diff --git a/lang_class/matrix/Xtr.npz b/lang_class/matrix/Xtr.npz new file mode 100644 index 0000000..aa35c8d Binary files /dev/null and b/lang_class/matrix/Xtr.npz differ diff --git a/lang_class/matrix/Yte.npy b/lang_class/matrix/Yte.npy new file mode 100644 index 0000000..4af3fe0 Binary files /dev/null and b/lang_class/matrix/Yte.npy differ diff --git a/lang_class/matrix/Ytr.npy b/lang_class/matrix/Ytr.npy new file mode 100644 index 0000000..18051dc Binary files /dev/null and 
b/lang_class/matrix/Ytr.npy differ diff --git a/lang_class/preprocess.py b/lang_class/preprocess.py new file mode 100755 index 0000000..1d5bed2 --- /dev/null +++ b/lang_class/preprocess.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +from sklearn import cross_validation +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.ensemble import RandomForestClassifier +from sklearn.pipeline import Pipeline +from sklearn.externals import joblib +import os +import argparse +import numpy as np + +from file_loader import load_files +from vectorizer import CodeVectorizer + + +def main(name=None, thresh=.3, vectorizer=None): + """ + breaks the data into test/train cases + and trains a vectorizer and random forest + on the test data, then saves the results + to Xtr.npz, Ytr.npy, Xte.npz, Yte.npy + and pickles the pipeline + """ + data = load_files(name) + pipe = make_pipe(vectorizer) + + cross_data = cross_validation.train_test_split( + data[0], data[1], test_size=thresh) + + pipe.fit(cross_data[0], cross_data[2]) + + cross_data[0] = pipe.transform(cross_data[0]) + cross_data[1] = pipe.transform(cross_data[1]) + + cross_data[2] = np.array(cross_data[2]) + cross_data[3] = np.array(cross_data[3]) + + save_matrix('matrix/Xtr', cross_data[0]) + np.save('matrix/Ytr', cross_data[2]) + + save_matrix('matrix/Xte', cross_data[1]) + np.save('matrix/Yte', cross_data[3]) + + joblib.dump(pipe, 'dumps/pipe.pkl') + + +def make_pipe(vectorizer=None): + ''' + creates a pipeline with the given + vectorizer or the Tfidf if none given + and a random forest classifier + ''' + if vectorizer is None: + tf = TfidfVectorizer(sublinear_tf=True, token_pattern= \ + r'\b[\w\.,:\(\)\[\]\{\}\'\";%#@!*&|\<\>]+\b', stop_words=None, + binary=True) + else: + tf = vectorizer + + rf = RandomForestClassifier() + return Pipeline([('tf', tf), ('rf', rf)]) + +def save_matrix(filename, array): + """ + saves the given matrix to a .npz file + """ + np.savez(filename, data = array.data 
,indices=array.indices, + indptr=array.indptr, shape=array.shape) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Preprocess a given folder \ + of programs and train a classifier on \ + them') + + parser.add_argument('--thresh', default=.5, type=float, help='specify \ + a test/train split %') + + parser.add_argument('--name', default=None, type=str, help='specify \ + a source for the test data') + + parser.add_argument('-cv', help= 'Use CodeVectorizer rather than \ + TfidfVectorizer with altered tokenizer', + action='store_true') + + vect=None + args = parser.parse_args() + + if args.cv: + vect = CodeVectorizer(TfidfVectorizer(binary=True, norm='l1')) + + main(args.name, args.thresh, vect) diff --git a/lang_class/tests/__init__.py b/lang_class/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test.csv b/lang_class/tests/test.csv similarity index 100% rename from test.csv rename to lang_class/tests/test.csv diff --git a/test/1 b/lang_class/tests/test/01 similarity index 100% rename from test/1 rename to lang_class/tests/test/01 diff --git a/test/2 b/lang_class/tests/test/02 similarity index 100% rename from test/2 rename to lang_class/tests/test/02 diff --git a/test/3 b/lang_class/tests/test/03 similarity index 100% rename from test/3 rename to lang_class/tests/test/03 diff --git a/test/4 b/lang_class/tests/test/04 similarity index 100% rename from test/4 rename to lang_class/tests/test/04 diff --git a/test/5 b/lang_class/tests/test/05 similarity index 100% rename from test/5 rename to lang_class/tests/test/05 diff --git a/test/6 b/lang_class/tests/test/06 similarity index 100% rename from test/6 rename to lang_class/tests/test/06 diff --git a/test/7 b/lang_class/tests/test/07 similarity index 100% rename from test/7 rename to lang_class/tests/test/07 diff --git a/test/8 b/lang_class/tests/test/08 similarity index 100% rename from test/8 rename to lang_class/tests/test/08 diff --git a/test/9 
b/lang_class/tests/test/09 similarity index 100% rename from test/9 rename to lang_class/tests/test/09 diff --git a/test/10 b/lang_class/tests/test/10 similarity index 100% rename from test/10 rename to lang_class/tests/test/10 diff --git a/test/11 b/lang_class/tests/test/11 similarity index 100% rename from test/11 rename to lang_class/tests/test/11 diff --git a/test/12 b/lang_class/tests/test/12 similarity index 100% rename from test/12 rename to lang_class/tests/test/12 diff --git a/test/13 b/lang_class/tests/test/13 similarity index 100% rename from test/13 rename to lang_class/tests/test/13 diff --git a/test/14 b/lang_class/tests/test/14 similarity index 100% rename from test/14 rename to lang_class/tests/test/14 diff --git a/test/15 b/lang_class/tests/test/15 similarity index 100% rename from test/15 rename to lang_class/tests/test/15 diff --git a/test/16 b/lang_class/tests/test/16 similarity index 100% rename from test/16 rename to lang_class/tests/test/16 diff --git a/test/17 b/lang_class/tests/test/17 similarity index 100% rename from test/17 rename to lang_class/tests/test/17 diff --git a/test/18 b/lang_class/tests/test/18 similarity index 100% rename from test/18 rename to lang_class/tests/test/18 diff --git a/test/19 b/lang_class/tests/test/19 similarity index 100% rename from test/19 rename to lang_class/tests/test/19 diff --git a/test/20 b/lang_class/tests/test/20 similarity index 100% rename from test/20 rename to lang_class/tests/test/20 diff --git a/test/21 b/lang_class/tests/test/21 similarity index 100% rename from test/21 rename to lang_class/tests/test/21 diff --git a/test/22 b/lang_class/tests/test/22 similarity index 100% rename from test/22 rename to lang_class/tests/test/22 diff --git a/test/23 b/lang_class/tests/test/23 similarity index 100% rename from test/23 rename to lang_class/tests/test/23 diff --git a/test/24 b/lang_class/tests/test/24 similarity index 100% rename from test/24 rename to lang_class/tests/test/24 diff --git a/test/25 
import re

import numpy as np
from scipy.sparse import csc_matrix, hstack, coo_matrix


class CodeVectorizer():
    """
    Wraps a text vectorizer and appends two groups of hand-made columns to
    its output: one bracket-derived column and one count column per
    punctuation token in `self.punc`.
    """

    def __init__(self, vectorizer):
        """
        vectorizer -- object providing fit(X, y) and transform(X); its
                      transform output must be convertible to csc_matrix
        """
        self.fit_X = None          # documents passed to the last fit()
        self.fit_Y = None          # labels passed to the last fit()
        self.punctuation = None    # find_punctuation(fit_X), set by process()
        self.lengths = None        # find_brackets(fit_X), set by process()
        self.fitted = False
        self.punc = '. , ; : ! # $ % * ? + - & ^ | = _'.split(' ')
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        """
        Remember the training data, compute its bracket/punctuation
        statistics and fit the wrapped vectorizer. Returns self.
        """
        self.fit_X = X
        self.fit_Y = y
        self.process()
        self.vectorizer.fit(X, y)
        self.fitted = True
        return self

    def find_brackets(self, X):
        """
        Describe the bracket structure of a document.

        For a single string the result is
            [number_of_delimiters, span_1, span_2, ...]
        where each span is the distance between a closing delimiter and the
        opener it closes, listed in the order the closers appear. The
        result is 0 when the delimiters cannot be paired (odd count, closer
        without an opener, or opener left unclosed). Delimiter kinds are not
        compared: ']' may close '('.

        For any other iterable, the result is a list with one entry per
        item.
        """
        if not isinstance(X, str):
            return [self.find_brackets(item) for item in X]

        found = [(match.group(1), match.start())
                 for match in re.finditer(r'([\{\(\[\]\)\}])', X)]

        if len(found) % 2 != 0:
            return 0

        spans = [len(found)]
        open_positions = []  # stack of offsets of still-unclosed openers
        for char, offset in found:
            if char in '([{':
                open_positions.append(offset)
            elif open_positions:
                spans.append(offset - open_positions.pop())
            else:
                # closer with nothing to close
                return 0

        return 0 if open_positions else spans

    def find_punctuation(self, X):
        """
        Count each token of `self.punc` in X.

        Returns one list of counts for a single string, or a list of such
        lists (one per item) for any other iterable.
        """
        if not isinstance(X, str):
            return [[line.count(item) for item in self.punc] for line in X]
        return [X.count(item) for item in self.punc]

    def process(self):
        """Cache bracket and punctuation statistics of the fitted data."""
        self.lengths = self.find_brackets(self.fit_X)
        self.punctuation = self.find_punctuation(self.fit_X)

    @staticmethod
    def _bracket_count(result):
        """Collapse one find_brackets result to a single number."""
        # NOTE(review): only the delimiter count (first list entry) is kept
        # so that every document yields exactly one column; the variable
        # length span list cannot be stacked into a matrix as-is. Confirm
        # this is the intended feature.
        return result[0] if isinstance(result, list) else result

    def transform(self, X):
        """
        Return a CSC matrix made of the wrapped vectorizer's output, one
        bracket column and len(self.punc) punctuation columns.

        Raises Exception when called before fit().
        """
        if not self.fitted:
            raise Exception('Did not fit before transforming')

        docs = [X] if isinstance(X, str) else X

        brackets = np.array(
            [self._bracket_count(item) for item in self.find_brackets(docs)]
        ).reshape(-1, 1)
        punctuation = np.array(
            self.find_punctuation(docs)).reshape(-1, len(self.punc))

        X_transformed = self.vectorizer.transform(X)
        if not isinstance(X_transformed, csc_matrix):
            X_transformed = csc_matrix(X_transformed)

        return hstack(
            [X_transformed, coo_matrix(brackets), coo_matrix(punctuation)],
            format='csc')

    def fit_transform(self, X, y=None):
        """Fit on X (and y), then return transform(X)."""
        self.fit(X, y)
        return self.transform(X)