"""Collect labelled code snippets for the language classifier.

Data sources (paths are relative to this package directory):
  * ../data/html/    -- HTML pages containing ``<pre class="<lang> highlighted_source">``
  * ../data/corpus/  -- per-benchmark directories of raw source files
  * ../data/test/    -- numbered snippet files used as held-out test data
"""
import glob
import os

import pandas as pd

# Maps a benchmark directory name (which doubles as the file extension)
# to the canonical language tag used for classification.
extension_dict = {'gcc': 'c', 'perl': 'perl', 'clojure': 'clojure', 'hs': 'haskell',
                  'java': 'java', 'javascript': 'javascript', 'jruby': 'ruby',
                  'yarv': 'ruby', 'ocaml': 'ocaml', 'sbcl': 'lisp', 'scala': 'scala',
                  'csharp': 'csharp', 'hack': 'php', 'php': 'php',
                  'python3': 'python', 'racket': 'scheme', 'tcl': 'tcl'}


def get_test_data():
    """Return a one-column DataFrame of the test snippets, in numeric filename order."""
    content = []
    for name in sorted(os.listdir("../data/test/"), key=int):
        with open(os.path.join("../data/test/", name)) as fh:
            content.append([fh.read()])
    return pd.DataFrame(content)


def get_code_from_html(lang):
    """Extract every highlighted source snippet tagged *lang* from the HTML pages.

    Returns ``(texts, tags)`` where ``tags`` is ``lang`` repeated once per snippet,
    keeping the two lists aligned for later DataFrame construction.
    """
    # bs4 is only needed by this function; a lazy import keeps the rest of
    # the module usable when BeautifulSoup is not installed.
    from bs4 import BeautifulSoup

    texts = []
    for path in glob.glob("../data/html/*.html"):
        # Fix: the original passed an open() handle straight to BeautifulSoup
        # and never closed it, leaking a file descriptor per page.
        with open(path) as fh:
            soup = BeautifulSoup(fh)
        found = soup.find_all('pre', {'class': '{} highlighted_source'.format(lang)})
        texts.extend(part.get_text() for part in found)
    tags = [lang] * len(texts)
    return texts, tags


def get_benchmark_code(directory):
    """Read every ``*.<directory>`` file under ``../data/corpus/<directory>/``.

    Returns ``(texts, tags)`` with one language tag (from ``extension_dict``)
    per file read.
    """
    texts = []
    tags = []
    for path in glob.glob("../data/corpus/{}/*.{}".format(directory, directory)):
        with open(path) as fh:
            texts.append(fh.read())
        tags.append(extension_dict[directory])
    return texts, tags


def get_snippet(filename):
    """Return the file's contents wrapped as ``[[text]]`` for DataFrame construction."""
    with open(filename) as fh:
        return [[fh.read()]]
# --------------------------------------------------------------------------
# lang_classifier.py -- feature extraction and training for the classifier.
# --------------------------------------------------------------------------
"""Train a Naive Bayes source-language classifier over hand-written features."""
import re
import itertools
import random
import pickle

import pandas as pd
from sklearn.pipeline import Pipeline
# Fix: sklearn.cross_validation was removed from scikit-learn;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import TransformerMixin

import gather_data as gd


def count_characters(text):
    """Total number of characters in the snippet."""
    return len(text)


def count_words(text):
    """Count occurrences of language-discriminating keywords.

    Returns a fixed-length list of counts, one per pattern.  NOTE: the
    duplicated patterns (``function``, ``val``) are kept deliberately so the
    feature-vector length matches any previously pickled model.
    """
    words = [r'\barray\b', r'\bbegin\b', r'\bend\b', r'\bdo\b', r'\bvar\b',
             r'\bdefn\b', r'\bfunction\b', r'\bclass\b', r'\brequire\b',
             r'\bval\b', r'\bpublic\b', r'\blet\b', r'\bwhere\b', r'\busing\b',
             r'\bextend\b', r'\bfunction\b', r'\bval\b', r'\btry\b']
    return [len(re.findall(word, text)) for word in words]


def char_runs(text):
    """Length of the longest consecutive run of each of ``) } ] =``."""
    chars = [r'[)]+', r'[}]+', r'[\]]+', r'[=]+']
    results = []
    for char in chars:
        runs = re.findall(char, text)
        results.append(max(len(run) for run in runs) if runs else 0)
    return results


def percent_characters(text):
    """Fraction of the snippet made up of each tracked punctuation character."""
    chars = ';!=.<>/\[]{}:_#%$&*'
    # Hoisted out of the loop (it is invariant); max(1, ...) guards
    # against division by zero on an empty snippet.
    total = max(1, len(text))
    return [text.count(char) / total for char in chars]


def endings(text):
    """Count lines terminated by each characteristic end-of-line token."""
    ends = [r'[)]$', r';$', r'}$', r']$', r'\):$']
    return [len(re.findall(end, text, re.MULTILINE)) for end in ends]


class FunctionFeaturizer(TransformerMixin):
    """Applies each featurizer function to every snippet, concatenating the
    resulting lists into one flat feature vector per snippet."""

    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        feature_vectors = []
        for datum in X:
            vec = list(itertools.chain.from_iterable(
                featurize(datum) for featurize in self.featurizers))
            feature_vectors.append(vec)
        return feature_vectors


class PipelineDebugger(TransformerMixin):
    """Pass-through pipeline stage that prints one random sample for inspection."""

    def __init__(self, name):
        self.name = name

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        print(self.name)
        print("=" * 40)
        x = X[random.randrange(0, len(X))]
        print("len:", len(x))
        print(x)
        return X


if __name__ == '__main__':
    texts = []
    tags = []

    languages = ['c', 'perl', 'clojure', 'haskell', 'java', 'javascript',
                 'ruby', 'ocaml', 'lisp', 'scala', 'csharp', 'php', 'python',
                 'scheme', 'tcl']
    for language in languages:
        # Fix: the original called get_code_from_html twice per language,
        # re-parsing every HTML page just to fetch the tags separately.
        lang_texts, lang_tags = gd.get_code_from_html(language)
        texts.extend(lang_texts)
        tags.extend(lang_tags)

    folders = ['clojure', 'csharp', 'gcc', 'hack', 'hs', 'java', 'javascript',
               'jruby', 'ocaml', 'perl', 'php', 'python3', 'racket', 'sbcl',
               'scala', 'yarv']
    for folder in folders:
        # Same fix: read each corpus directory once, not twice.
        folder_texts, folder_tags = gd.get_benchmark_code(folder)
        texts.extend(folder_texts)
        tags.extend(folder_tags)

    # texts and tags are index-aligned, so a direct two-column DataFrame
    # replaces the original DataFrame-merge-on-index dance.
    merged = pd.DataFrame({'Snippet': texts, 'Language': tags})
    print(merged['Snippet'].head())

    train_X, test_X, train_y, test_y = train_test_split(
        merged['Snippet'], merged['Language'], test_size=0.33)

    # count_characters is intentionally excluded: the featurizer expects
    # list-valued features and count_characters returns a bare int.
    classifier = Pipeline([
        ('features', FunctionFeaturizer(count_words, percent_characters,
                                        char_runs, endings)),
        ('bayes', MultinomialNB()),
    ])
    classifier.fit(train_X, train_y)

    with open("./classifier", "wb") as file:
        pickle.dump(classifier, file)


# --------------------------------------------------------------------------
# predict.py -- load the pickled classifier and label one snippet.
# --------------------------------------------------------------------------
import sys


def predict(classifier, data):
    """Print the classifier's predicted language tag(s) for *data*."""
    print(classifier.predict(data))


if __name__ == '__main__':
    content = gd.get_snippet(sys.argv[1])
    df = pd.DataFrame(content)
    # NOTE(review): pickle.load executes arbitrary code from the file --
    # only load classifier files you produced yourself.
    with open("./classifier", "rb") as file:
        predictor = pickle.load(file)
    predict(predictor, df)
"""Unit tests for the feature-extraction helpers in lang_classifier.

The original tests called helpers (``percent_character``, ``count_vars``,
``count_word_chars``) that lang_classifier never defines, so every test
failed with NameError before reaching an assertion.  These tests exercise
the real API with self-contained inputs and expected values.
"""
from language_classifier.lang_classifier import *

with open("language_classifier/tests/feature_test.txt") as fh:
    test_file = fh.read()


def test_total_characters():
    # count_characters is len() of the snippet.
    assert count_characters(test_file) == len(test_file)
    assert count_characters("") == 0


def test_count_words():
    # index 4 of the keyword pattern list is r'\bvar\b'
    assert count_words("var x; var y;")[4] == 2
    assert count_words("")[0] == 0


def test_char_runs():
    # index 0 tracks the longest run of ')'
    assert char_runs("f(g(h(x)))")[0] == 3
    assert char_runs("no parens here")[0] == 0


def test_percent_characters():
    # index 0 tracks ';' -- two out of four characters below.
    assert percent_characters("a;b;")[0] == 0.5
    # empty input must not divide by zero
    assert percent_characters("")[0] == 0


def test_endings():
    # index 1 counts lines ending in ';'
    assert endings("x = 1;\ny = 2;")[1] == 2


def test_feature_vectors_on_fixture():
    # The fixture file should produce fixed-length, non-negative vectors.
    assert all(v >= 0 for v in count_words(test_file))
    assert all(0 <= v <= 1 for v in percent_characters(test_file))