diff --git a/.gitignore b/.gitignore index f00dbf2..1f808f3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # Temporary data .ipynb_checkpoints/ +### PyCharm ### +.idea + # Created by https://www.gitignore.io ### Python ### diff --git a/Programming Language Guesser.ipynb b/Programming Language Guesser.ipynb new file mode 100644 index 0000000..3122ca9 --- /dev/null +++ b/Programming Language Guesser.ipynb @@ -0,0 +1,277 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.cross_validation import train_test_split\n", + "from programming_language_classifier import get_data as gd\n", + "from programming_language_classifier import plc_trainer as plc\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "content_list = gd.get_content(\"programming_language_classifier/train/\")\n", + "train_data = gd.make_dataframe(content_list)\n", + "x_train, x_test, y_train, y_test = train_test_split(train_data[1], train_data[0], test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "classifier = Pipeline([('features', plc.Featurizer(plc.percent_elements, plc.number_elements,\n", + " plc.longest_run, plc.line_enders)),\n", + " ('bayes', MultinomialNB())])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('features', ), ('bayes', 
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix for Training Data\n", + "\n", + "[[46 0 0 0 0 0 0 0 4 0 0 0 0 0 1]\n", + " [ 0 31 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 31 0 0 0 0 0 0 0 0 0 0 1 0]\n", + " [ 0 0 0 21 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 29 0 1 0 0 0 0 0 0 0 1]\n", + " [ 0 7 0 0 0 45 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 17 0 1 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 23 0 0 0 0 0 0 0]\n", + " [ 1 0 0 0 0 0 1 0 34 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 26 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 32 0 0 0 1]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 56 0 0 0]\n", + " [ 0 0 0 0 0 0 2 0 0 0 0 0 34 0 1]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 0 0 18 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 1 0 0 38]]\n", + "\n", + "Train Score: 0.954365079365\n" + ] + } + ], + "source": [ + "print(\"Confusion Matrix for Training Data\\n\")\n", + "print(confusion_matrix(classifier.predict(x_train), y_train))\n", + "print(\"\\nTrain Score: \" + str(classifier.score(x_train, y_train)))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix for Test Data\n", + "\n", + "[[11 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 3 0 0 0 1 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 13 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 4 0 0 0 0 0 1 0 0]\n", + " [ 0 0 0 0 0 0 0 11 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 14 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 
0 0 4 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 15 0 0 1]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 0 7 0 1]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 0 0 10 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 1 0 0 6]]\n", + "\n", + "Test Score: 0.96062992126\n" + ] + } + ], + "source": [ + "print(\"Confusion Matrix for Test Data\\n\")\n", + "print(confusion_matrix(classifier.predict(x_test), y_test))\n", + "print(\"\\nTest Score: \" + str(classifier.score(x_test, y_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "content = []\n", + "for file in sorted(os.listdir(\"test/\"), key=int):\n", + " with open(\"test/\" + file) as fh:\n", + " content.append([fh.read()])\n", + "test_data = gd.make_dataframe(content)\n", + "test_labels = ['Clojure', 'Clojure', 'Clojure', 'Clojure', 'Python', 'Python',\n", + " 'Python', 'Python', 'JavaScript', 'JavaScript', 'JavaScript',\n", + " 'JavaScript', 'Ruby', 'Ruby', 'Ruby', 'Haskell', 'Haskell',\n", + " 'Haskell', 'Scheme', 'Scheme', 'Scheme', 'Java', 'Java', 'Scala',\n", + " 'Scala', 'TCL', 'TCL', 'PHP', 'PHP', 'PHP', 'OCaml', 'OCaml']" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix for New Test Data\n", + "\n", + "[[3 0 0 0 0 0 0 0 0 0 0]\n", + " [0 3 0 0 0 0 0 0 0 0 0]\n", + " [0 0 2 0 0 0 0 0 0 0 0]\n", + " [0 0 0 4 0 0 0 0 0 0 0]\n", + " [0 0 0 0 2 0 0 0 0 0 0]\n", + " [0 0 0 0 0 3 0 0 0 0 0]\n", + " [1 0 0 0 0 0 4 0 0 0 0]\n", + " [0 0 0 0 0 0 0 3 0 0 0]\n", + " [0 0 0 0 0 0 0 0 2 0 0]\n", + " [0 0 0 0 0 0 0 0 0 3 0]\n", + " [0 0 0 0 0 0 0 0 0 0 2]]\n", + "\n", + "New Test Score: 0.96875\n" + ] + } + ], + "source": [ + "print(\"Confusion Matrix for New Test Data\\n\")\n", + "print(confusion_matrix(classifier.predict(test_data[0]), test_labels))\n", + "print(\"\\nNew Test Score: \" + str(classifier.score(test_data[0], 
test_labels)))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " Clojure 0.75 1.00 0.86 3\n", + " Haskell 1.00 1.00 1.00 3\n", + " Java 1.00 1.00 1.00 2\n", + " JavaScript 1.00 1.00 1.00 4\n", + " OCaml 1.00 1.00 1.00 2\n", + " PHP 1.00 1.00 1.00 3\n", + " Python 1.00 0.80 0.89 5\n", + " Ruby 1.00 1.00 1.00 3\n", + " Scala 1.00 1.00 1.00 2\n", + " Scheme 1.00 1.00 1.00 3\n", + " TCL 1.00 1.00 1.00 2\n", + "\n", + "avg / total 0.98 0.97 0.97 32\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(classifier.predict(test_data[0]), test_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/programming_language_classifier/__init__.py b/programming_language_classifier/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/programming_language_classifier/classifier b/programming_language_classifier/classifier new file mode 100644 index 0000000..988a290 Binary files /dev/null and b/programming_language_classifier/classifier differ diff --git a/programming_language_classifier/crawler_scraper.py b/programming_language_classifier/crawler_scraper.py new file mode 100644 index 0000000..67535d4 --- /dev/null +++ b/programming_language_classifier/crawler_scraper.py @@ -0,0 +1,48 @@ +import bs4 +import requests +import sys +import re + + +languages = 
{'javascript': '.js', + 'haskell': '.haskell', + 'scala': '.scala', + 'ocaml': '.ocaml', + 'ruby': '.jruby', + 'php': '.php', + 'clojure': '.clojure', + 'perl': '.perl', + 'csharp': '.csharp', + 'java': '.java', + 'c': '.gcc', + 'scheme': '.racket', + 'python': '.py', + 'lisp': '.sbcl', + 'tcl': '.tcl'} + +def rosetta_scraper(seed, path): + response = requests.get(seed) + soup = bs4.BeautifulSoup(response.text) + divs = soup.select("div") + for div in divs: + if div.attrs.get("id") and div.attrs.get("id") == "mw-pages": + all_a = div.select('a') + links = ["http://rosettacode.org" + a.attrs.get("href") + for a in all_a + if a.attrs.get("href") and "wiki" in a.attrs.get("href")] + count = 1 + for link in links: + response = requests.get(link) + soup = bs4.BeautifulSoup(response.text) + code = soup.select('pre') + for block in code: + for key in languages: + if block.attrs.get('class') is not None and key in block.attrs.get('class'): + soup = bs4.BeautifulSoup(re.sub(r'
import os

import pandas as pd

# Maps the extension of a training file to the language label it is filed under.
extensions = {".gcc": "C",
              ".c": "C",
              ".csharp": "C#",
              ".sbcl": "Common Lisp",
              ".clojure": "Clojure",
              ".ghc": "Haskell",
              ".java": "Java",
              ".javascript": "JavaScript",
              ".js": "JavaScript",
              ".ocaml": "OCaml",
              ".perl": "Perl",
              ".hack": "PHP",
              ".php": "PHP",
              ".py": "Python",
              ".python3": "Python",
              ".jruby": "Ruby",
              ".yarv": "Ruby",
              ".scala": "Scala",
              ".racket": "Scheme",
              ".tcl": "TCL"}


def get_content(directory):
    """Read every file in *directory* whose extension is a known language.

    Args:
        directory: Path of the folder to scan, with or without a trailing
            path separator.

    Returns:
        A list of ``[language, text]`` pairs, one per recognised file,
        ordered by file name. Files whose extension is not a key of
        ``extensions`` are ignored.
    """
    content = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting rows reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        extension = os.path.splitext(filename)[1]
        if extension not in extensions:
            continue
        # os.path.join copes with a missing trailing separator, which plain
        # string concatenation does not.
        with open(os.path.join(directory, filename)) as fh:
            content.append([extensions[extension], fh.read()])
    return content


def make_dataframe(content_list):
    """Wrap a list of rows in a pandas DataFrame with default integer columns.

    For the output of ``get_content`` column 0 holds the language label and
    column 1 the file text.
    """
    return pd.DataFrame(content_list)
import itertools
import re

# Characters whose relative frequency is measured by percent_elements().
# The backslash is escaped explicitly; the string is the same 21 characters
# the unescaped "\/" spelling produced, without the invalid-escape warning.
_PERCENT_ELEMENTS = ")}];:.,\\/-_#*!$%|<>& "

# Whole-word keywords counted by number_elements(). r'\bfunction\b' appears
# twice; the duplicate is kept so the feature vector stays 18 entries long,
# which the existing tests assert.
_KEYWORD_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'\bbegin\b', r'\bend\b', r'\bdo\b', r'\bvar\b', r'\bdefine\b',
    r'\bdefn\b', r'\bfunction\b', r'\bclass\b', r'\bmy\b', r'\brequire\b',
    r'\bvoid\b', r'\bval\b', r'\bpublic\b', r'\blet\b', r'\bwhere\b',
    r'\busing\b', r'\bextend\b', r'\bfunction\b',
))

# Runs of a repeated closing bracket or of '=' signs, for longest_run().
_RUN_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'[)]+', r'[}]+', r'[\]]+', r'[=]+',
))

# Line-final tokens counted by line_enders(); MULTILINE makes '$' match at
# the end of every line rather than only at the end of the text.
_LINE_END_PATTERNS = tuple(re.compile(pattern, re.MULTILINE) for pattern in (
    r'[)]$', r';$', r'}$', r']$', r'\):$',
))


def percent_elements(text):
    """Return the share of *text* taken up by each tracked character.

    The result has one entry per character of ``_PERCENT_ELEMENTS``, in that
    order. The divisor is clamped to 1 so an empty string yields all zeros
    instead of dividing by zero.
    """
    total = max(1, len(text))  # invariant, so computed once
    return [text.count(element) / total for element in _PERCENT_ELEMENTS]


def number_elements(text):
    """Count whole-word occurrences of each tracked keyword in *text*.

    Returns a list of 18 integers, one per entry of ``_KEYWORD_PATTERNS``.
    """
    return [len(pattern.findall(text)) for pattern in _KEYWORD_PATTERNS]


def longest_run(text):
    """Return the longest run length of ``)``, ``}``, ``]`` and ``=`` in *text*.

    A character that never occurs contributes 0.
    """
    return [max(map(len, pattern.findall(text)), default=0)
            for pattern in _RUN_PATTERNS]


def line_enders(text):
    """Count the lines of *text* ending in ``)``, ``;``, ``}``, ``]`` or ``):``."""
    return [len(pattern.findall(text)) for pattern in _LINE_END_PATTERNS]


class Featurizer:
    """Pipeline step that turns source texts into numeric feature vectors.

    Each feature maker is a callable taking one text and returning a list of
    numbers; the vector for a text is the concatenation of those lists in the
    order the makers were passed.
    """

    def __init__(self, *feature_makers):
        self.feature_makers = feature_makers

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn.

        ``y`` is optional because it is never used.
        """
        return self

    def transform(self, X):
        """Return one concatenated feature list per item of *X*."""
        return [
            list(itertools.chain.from_iterable(
                maker(item) for maker in self.feature_makers))
            for item in X
        ]
def test_get_content():
    """get_content labels each fixture file by its extension and returns its text."""
    expected = [
        ["C", "This is a C file\n"],
        ["JavaScript", "This is a javascript file\n"],
        ["Ruby", "This is a Ruby file\n"],
        ["Python", "This is a Python file\n"],
    ]
    loaded = gd.get_content("tests/function_testfiles/")
    assert loaded == expected


def test_make_dataframe():
    """make_dataframe puts labels in column 0 and file text in column 1."""
    rows = gd.get_content("tests/function_testfiles/")
    frame = gd.make_dataframe(rows)
    labels, texts = frame[0], frame[1]
    assert labels[0] == "C"
    assert texts[0] == "This is a C file\n"
    assert texts[2] == "This is a Ruby file\n"
def test_percent_elements():
    """Each tracked character's share of the text lands in its own slot."""
    cases = {
        "..oooooOO}": [0, 0.1, 0, 0, 0, .2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        "]]]*%!,,:M": [0, 0, 0.3, 0, 0.1, 0, 0.2, 0, 0, 0, 0, 0, 0.1, 0.1, 0, 0.1, 0, 0, 0, 0, 0],
        "": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    }
    for sample, shares in cases.items():
        assert plc.percent_elements(sample) == shares


def test_number_elements():
    """Keywords are counted only as whole words; slots start begin, end, do."""
    sample = "begin: words!!! end begin itbeginq"
    assert plc.number_elements(sample) == [2, 1] + [0] * 16
    sample = "dobeginend do do end, Mend :begin:"
    assert plc.number_elements(sample) == [1, 1, 2] + [0] * 15
    sample = ""
    assert plc.number_elements(sample) == [0] * 18


def test_longest_run():
    """Longest runs are reported in the order ) } ] =."""
    assert plc.longest_run(")))))[]]]}]]]]}}=") == [5, 2, 4, 1]
    assert plc.longest_run("Adn;ksenfas]]]]]((()====") == [1, 0, 5, 4]


def test_line_enders():
    """One line ends in ')' and two end in ';'."""
    sample = "....)\n ....;\n....;\n"
    counts = plc.line_enders(sample)
    assert counts == [1, 2, 0, 0, 0]


def test_featurizer_transform():
    """transform concatenates the output of every feature maker per text."""
    featurizer = plc.Featurizer(plc.percent_elements, plc.number_elements, plc.longest_run)
    texts = ["begin }}} . end", "do end %%__=====", ""]
    produced = featurizer.transform(texts)
    wanted = [
        [0, 0.2, 0, 0, 0, 0.06666667] + [0] * 14 + [0.2]
        + [1, 1] + [0] * 16
        + [0, 3, 0, 0],
        [0] * 10 + [0.125] + [0] * 4 + [0.125] + [0] * 4 + [0.125]
        + [0, 1, 1] + [0] * 15
        + [0, 0, 0, 5],
        [0] * 21 + [0] * 18 + [0] * 4,
    ]

    def rounded(rows):
        # Compare at three decimals so float noise cannot fail the test.
        return [[round(value, 3) for value in row] for row in rows]

    assert rounded(produced) == rounded(wanted)