tiyd-python-2015-05 · benred42 · Jun 6, 2015 · Jun 6, 2015 · Jun 6, 2015 · Jun 6, 2015
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,9 @@
 # Temporary data
 .ipynb_checkpoints/
 
+### PyCharm ###
+.idea
+
 # Created by https://www.gitignore.io
 
 ### Python ###

diff --git a/Programming Language Guesser.ipynb b/Programming Language Guesser.ipynb
@@ -0,0 +1,277 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.cross_validation import train_test_split\n",
+    "from programming_language_classifier import get_data as gd\n",
+    "from programming_language_classifier import plc_trainer as plc\n",
+    "from sklearn.metrics import classification_report, confusion_matrix\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "content_list = gd.get_content(\"programming_language_classifier/train/\")\n",
+    "train_data = gd.make_dataframe(content_list)\n",
+    "x_train, x_test, y_train, y_test = train_test_split(train_data[1], train_data[0], test_size=0.2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "classifier = Pipeline([('features', plc.Featurizer(plc.percent_elements, plc.number_elements,\n",
+    "                                                   plc.longest_run, plc.line_enders)),\n",
+    "                       ('bayes', MultinomialNB())])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Pipeline(steps=[('features', <programming_language_classifier.plc_trainer.Featurizer object at 0x10cfc9d30>), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "classifier.fit(x_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Confusion Matrix for Training Data\n",
+      "\n",
+      "[[46  0  0  0  0  0  0  0  4  0  0  0  0  0  1]\n",
+      " [ 0 31  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  0 31  0  0  0  0  0  0  0  0  0  0  1  0]\n",
+      " [ 0  0  0 21  0  0  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0 29  0  1  0  0  0  0  0  0  0  1]\n",
+      " [ 0  7  0  0  0 45  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0 17  0  1  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0  0 23  0  0  0  0  0  0  0]\n",
+      " [ 1  0  0  0  0  0  1  0 34  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0  0  0  0 26  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0 32  0  0  0  1]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0  0 56  0  0  0]\n",
+      " [ 0  0  0  0  0  0  2  0  0  0  0  0 34  0  1]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0 18  0]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0 38]]\n",
+      "\n",
+      "Train Score: 0.954365079365\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Confusion Matrix for Training Data\\n\")\n",
+    "print(confusion_matrix(classifier.predict(x_train), y_train))\n",
+    "print(\"\\nTrain Score: \" + str(classifier.score(x_train, y_train)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Confusion Matrix for Test Data\n",
+      "\n",
+      "[[11  0  0  0  0  0  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  3  0  0  0  1  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  0  7  0  0  0  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  0  0 13  0  0  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0  4  0  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  5  0  0  0  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0  4  0  0  0  0  0  1  0  0]\n",
+      " [ 0  0  0  0  0  0  0 11  0  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0  0  0 14  0  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0  0  0  0  8  0  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0  4  0  0  0  0]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0  0 15  0  0  1]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0  0  0  7  0  1]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0  0  0  0 10  0]\n",
+      " [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  6]]\n",
+      "\n",
+      "Test Score: 0.96062992126\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Confusion Matrix for Test Data\\n\")\n",
+    "print(confusion_matrix(classifier.predict(x_test), y_test))\n",
+    "print(\"\\nTest Score: \" + str(classifier.score(x_test, y_test)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "content = []\n",
+    "for file in sorted(os.listdir(\"test/\"), key=int):\n",
+    "    with open(\"test/\" + file) as fh:\n",
+    "        content.append([fh.read()])\n",
+    "test_data = gd.make_dataframe(content)\n",
+    "test_labels = ['Clojure', 'Clojure', 'Clojure', 'Clojure', 'Python', 'Python',\n",
+    "       'Python', 'Python', 'JavaScript', 'JavaScript', 'JavaScript',\n",
+    "       'JavaScript', 'Ruby', 'Ruby', 'Ruby', 'Haskell', 'Haskell',\n",
+    "       'Haskell', 'Scheme', 'Scheme', 'Scheme', 'Java', 'Java', 'Scala',\n",
+    "       'Scala', 'TCL', 'TCL', 'PHP', 'PHP', 'PHP', 'OCaml', 'OCaml']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Confusion Matrix for New Test Data\n",
+      "\n",
+      "[[3 0 0 0 0 0 0 0 0 0 0]\n",
+      " [0 3 0 0 0 0 0 0 0 0 0]\n",
+      " [0 0 2 0 0 0 0 0 0 0 0]\n",
+      " [0 0 0 4 0 0 0 0 0 0 0]\n",
+      " [0 0 0 0 2 0 0 0 0 0 0]\n",
+      " [0 0 0 0 0 3 0 0 0 0 0]\n",
+      " [1 0 0 0 0 0 4 0 0 0 0]\n",
+      " [0 0 0 0 0 0 0 3 0 0 0]\n",
+      " [0 0 0 0 0 0 0 0 2 0 0]\n",
+      " [0 0 0 0 0 0 0 0 0 3 0]\n",
+      " [0 0 0 0 0 0 0 0 0 0 2]]\n",
+      "\n",
+      "New Test Score: 0.96875\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Confusion Matrix for New Test Data\\n\")\n",
+    "print(confusion_matrix(classifier.predict(test_data[0]), test_labels))\n",
+    "print(\"\\nNew Test Score: \" + str(classifier.score(test_data[0], test_labels)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "             precision    recall  f1-score   support\n",
+      "\n",
+      "    Clojure       0.75      1.00      0.86         3\n",
+      "    Haskell       1.00      1.00      1.00         3\n",
+      "       Java       1.00      1.00      1.00         2\n",
+      " JavaScript       1.00      1.00      1.00         4\n",
+      "      OCaml       1.00      1.00      1.00         2\n",
+      "        PHP       1.00      1.00      1.00         3\n",
+      "     Python       1.00      0.80      0.89         5\n",
+      "       Ruby       1.00      1.00      1.00         3\n",
+      "      Scala       1.00      1.00      1.00         2\n",
+      "     Scheme       1.00      1.00      1.00         3\n",
+      "        TCL       1.00      1.00      1.00         2\n",
+      "\n",
+      "avg / total       0.98      0.97      0.97        32\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(classification_report(classifier.predict(test_data[0]), test_labels))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/programming_language_classifier/__init__.py b/programming_language_classifier/__init__.py
diff --git a/programming_language_classifier/classifier b/programming_language_classifier/classifier
diff --git a/programming_language_classifier/crawler_scraper.py b/programming_language_classifier/crawler_scraper.py
@@ -0,0 +1,48 @@
+import bs4
+import requests
+import sys
+import re
+
+
+languages = {'javascript': '.js',
+             'haskell': '.haskell',
+             'scala': '.scala',
+             'ocaml': '.ocaml',
+             'ruby': '.jruby',
+             'php': '.php',
+             'clojure': '.clojure',
+             'perl': '.perl',
+             'csharp': '.csharp',
+             'java': '.java',
+             'c': '.gcc',
+             'scheme': '.racket',
+             'python': '.py',
+             'lisp': '.sbcl',
+             'tcl': '.tcl'}
+
+def rosetta_scraper(seed, path):
+    response = requests.get(seed)
+    soup = bs4.BeautifulSoup(response.text)
+    divs = soup.select("div")
+    for div in divs:
+        if div.attrs.get("id") and div.attrs.get("id") == "mw-pages":
+            all_a = div.select('a')
+    links = ["http://rosettacode.org" + a.attrs.get("href")
+             for a in all_a
+             if a.attrs.get("href") and "wiki" in a.attrs.get("href")]
+    count = 1
+    for link in links:
+        response = requests.get(link)
+        soup = bs4.BeautifulSoup(response.text)
+        code = soup.select('pre')
+        for block in code:
+            for key in languages:
+                if block.attrs.get('class') is not None and key in block.attrs.get('class'):
+                    soup = bs4.BeautifulSoup(re.sub(r'<br/>', "\n", str(block)))
+                    with open(path + str(count) + languages[key], "w+") as file:
+                        file.write(soup.text)
+                        count += 1
+
+
+if __name__ == '__main__':
+    rosetta_scraper(sys.argv[1], sys.argv[2])
diff --git a/programming_language_classifier/get_data.py b/programming_language_classifier/get_data.py
@@ -0,0 +1,42 @@
+import os
+import sys
+import pandas as pd
+
+extensions = {".gcc": "C",
+              ".c": "C",
+              ".csharp": "C#",
+              ".sbcl": "Common Lisp",
+              ".clojure": "Clojure",
+              ".ghc": "Haskell",
+              ".java": "Java",
+              ".javascript": "JavaScript",
+              ".js": "JavaScript",
+              ".ocaml": "OCaml",
+              ".perl": "Perl",
+              ".hack": "PHP",
+              ".php": "PHP",
+              ".py": "Python",
+              ".python3": "Python",
+              ".jruby": "Ruby",
+              ".yarv": "Ruby",
+              ".scala": "Scala",
+              ".racket": "Scheme",
+              ".tcl": "TCL"}
+
+def get_content(directory):
+    content = []
+    for file in os.listdir(directory):
+        extension = os.path.splitext(file)[1]
+        if extension in extensions:
+            with open(directory + file) as fh:
+                content.append([extensions[extension], fh.read()])
+    return content
+
+
+def make_dataframe(content_list):
+    return pd.DataFrame(content_list)
+
+
+if __name__ == '__main__':
+    content_list = get_content(sys.argv[1])
+    print(make_dataframe(content_list))