Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
# Temporary data
.ipynb_checkpoints/

### PyCharm ###
.idea

# Created by https://www.gitignore.io

### Python ###
Expand Down
277 changes: 277 additions & 0 deletions Programming Language Guesser.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.cross_validation import train_test_split\n",
"from programming_language_classifier import get_data as gd\n",
"from programming_language_classifier import plc_trainer as plc\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"content_list = gd.get_content(\"programming_language_classifier/train/\")\n",
"train_data = gd.make_dataframe(content_list)\n",
"x_train, x_test, y_train, y_test = train_test_split(train_data[1], train_data[0], test_size=0.2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"classifier = Pipeline([('features', plc.Featurizer(plc.percent_elements, plc.number_elements,\n",
" plc.longest_run, plc.line_enders)),\n",
" ('bayes', MultinomialNB())])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('features', <programming_language_classifier.plc_trainer.Featurizer object at 0x10cfc9d30>), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion Matrix for Training Data\n",
"\n",
"[[46 0 0 0 0 0 0 0 4 0 0 0 0 0 1]\n",
" [ 0 31 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
" [ 0 0 31 0 0 0 0 0 0 0 0 0 0 1 0]\n",
" [ 0 0 0 21 0 0 0 0 0 0 0 0 0 0 0]\n",
" [ 0 0 0 0 29 0 1 0 0 0 0 0 0 0 1]\n",
" [ 0 7 0 0 0 45 0 0 0 0 0 0 0 0 0]\n",
" [ 0 0 0 0 0 0 17 0 1 0 0 0 0 0 0]\n",
" [ 0 0 0 0 0 0 0 23 0 0 0 0 0 0 0]\n",
" [ 1 0 0 0 0 0 1 0 34 0 0 0 0 0 0]\n",
" [ 0 0 0 0 0 0 0 0 0 26 0 0 0 0 0]\n",
" [ 0 0 0 0 0 0 0 0 0 0 32 0 0 0 1]\n",
" [ 0 0 0 0 0 0 0 0 0 0 0 56 0 0 0]\n",
" [ 0 0 0 0 0 0 2 0 0 0 0 0 34 0 1]\n",
" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 18 0]\n",
" [ 0 0 0 0 0 0 0 0 0 0 0 1 0 0 38]]\n",
"\n",
"Train Score: 0.954365079365\n"
]
}
],
"source": [
"print(\"Confusion Matrix for Training Data\\n\")\n",
"print(confusion_matrix(classifier.predict(x_train), y_train))\n",
"print(\"\\nTrain Score: \" + str(classifier.score(x_train, y_train)))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion Matrix for Test Data\n",
"\n",
"[[11 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
" [ 0 3 0 0 0 1 0 0 0 0 0 0 0 0 0]\n",
" [ 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0]\n",
" [ 0 0 0 13 0 0 0 0 0 0 0 0 0 0 0]\n",
" [ 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0]\n",
" [ 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0]\n",
" [ 0 0 0 0 0 0 4 0 0 0 0 0 1 0 0]\n",
" [ 0 0 0 0 0 0 0 11 0 0 0 0 0 0 0]\n",
" [ 0 0 0 0 0 0 0 0 14 0 0 0 0 0 0]\n",
" [ 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0]\n",
" [ 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0]\n",
" [ 0 0 0 0 0 0 0 0 0 0 0 15 0 0 1]\n",
" [ 0 0 0 0 0 0 0 0 0 0 0 0 7 0 1]\n",
" [ 0 0 0 0 0 0 0 0 0 0 0 0 0 10 0]\n",
" [ 0 0 0 0 0 0 0 0 0 0 0 1 0 0 6]]\n",
"\n",
"Test Score: 0.96062992126\n"
]
}
],
"source": [
"print(\"Confusion Matrix for Test Data\\n\")\n",
"print(confusion_matrix(classifier.predict(x_test), y_test))\n",
"print(\"\\nTest Score: \" + str(classifier.score(x_test, y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"content = []\n",
"for file in sorted(os.listdir(\"test/\"), key=int):\n",
" with open(\"test/\" + file) as fh:\n",
" content.append([fh.read()])\n",
"test_data = gd.make_dataframe(content)\n",
"test_labels = ['Clojure', 'Clojure', 'Clojure', 'Clojure', 'Python', 'Python',\n",
" 'Python', 'Python', 'JavaScript', 'JavaScript', 'JavaScript',\n",
" 'JavaScript', 'Ruby', 'Ruby', 'Ruby', 'Haskell', 'Haskell',\n",
" 'Haskell', 'Scheme', 'Scheme', 'Scheme', 'Java', 'Java', 'Scala',\n",
" 'Scala', 'TCL', 'TCL', 'PHP', 'PHP', 'PHP', 'OCaml', 'OCaml']"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion Matrix for New Test Data\n",
"\n",
"[[3 0 0 0 0 0 0 0 0 0 0]\n",
" [0 3 0 0 0 0 0 0 0 0 0]\n",
" [0 0 2 0 0 0 0 0 0 0 0]\n",
" [0 0 0 4 0 0 0 0 0 0 0]\n",
" [0 0 0 0 2 0 0 0 0 0 0]\n",
" [0 0 0 0 0 3 0 0 0 0 0]\n",
" [1 0 0 0 0 0 4 0 0 0 0]\n",
" [0 0 0 0 0 0 0 3 0 0 0]\n",
" [0 0 0 0 0 0 0 0 2 0 0]\n",
" [0 0 0 0 0 0 0 0 0 3 0]\n",
" [0 0 0 0 0 0 0 0 0 0 2]]\n",
"\n",
"New Test Score: 0.96875\n"
]
}
],
"source": [
"print(\"Confusion Matrix for New Test Data\\n\")\n",
"print(confusion_matrix(classifier.predict(test_data[0]), test_labels))\n",
"print(\"\\nNew Test Score: \" + str(classifier.score(test_data[0], test_labels)))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" Clojure 0.75 1.00 0.86 3\n",
" Haskell 1.00 1.00 1.00 3\n",
" Java 1.00 1.00 1.00 2\n",
" JavaScript 1.00 1.00 1.00 4\n",
" OCaml 1.00 1.00 1.00 2\n",
" PHP 1.00 1.00 1.00 3\n",
" Python 1.00 0.80 0.89 5\n",
" Ruby 1.00 1.00 1.00 3\n",
" Scala 1.00 1.00 1.00 2\n",
" Scheme 1.00 1.00 1.00 3\n",
" TCL 1.00 1.00 1.00 2\n",
"\n",
"avg / total 0.98 0.97 0.97 32\n",
"\n"
]
}
],
"source": [
"print(classification_report(classifier.predict(test_data[0]), test_labels))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Empty file.
Binary file added programming_language_classifier/classifier
Binary file not shown.
48 changes: 48 additions & 0 deletions programming_language_classifier/crawler_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import bs4
import requests
import sys
import re


languages = {'javascript': '.js',
'haskell': '.haskell',
'scala': '.scala',
'ocaml': '.ocaml',
'ruby': '.jruby',
'php': '.php',
'clojure': '.clojure',
'perl': '.perl',
'csharp': '.csharp',
'java': '.java',
'c': '.gcc',
'scheme': '.racket',
'python': '.py',
'lisp': '.sbcl',
'tcl': '.tcl'}

def rosetta_scraper(seed, path):
response = requests.get(seed)
soup = bs4.BeautifulSoup(response.text)
divs = soup.select("div")
for div in divs:
if div.attrs.get("id") and div.attrs.get("id") == "mw-pages":
all_a = div.select('a')
links = ["http://rosettacode.org" + a.attrs.get("href")
for a in all_a
if a.attrs.get("href") and "wiki" in a.attrs.get("href")]
count = 1
for link in links:
response = requests.get(link)
soup = bs4.BeautifulSoup(response.text)
code = soup.select('pre')
for block in code:
for key in languages:
if block.attrs.get('class') is not None and key in block.attrs.get('class'):
soup = bs4.BeautifulSoup(re.sub(r'<br/>', "\n", str(block)))
with open(path + str(count) + languages[key], "w+") as file:
file.write(soup.text)
count += 1


if __name__ == '__main__':
rosetta_scraper(sys.argv[1], sys.argv[2])
42 changes: 42 additions & 0 deletions programming_language_classifier/get_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
import sys
import pandas as pd

extensions = {".gcc": "C",
".c": "C",
".csharp": "C#",
".sbcl": "Common Lisp",
".clojure": "Clojure",
".ghc": "Haskell",
".java": "Java",
".javascript": "JavaScript",
".js": "JavaScript",
".ocaml": "OCaml",
".perl": "Perl",
".hack": "PHP",
".php": "PHP",
".py": "Python",
".python3": "Python",
".jruby": "Ruby",
".yarv": "Ruby",
".scala": "Scala",
".racket": "Scheme",
".tcl": "TCL"}

def get_content(directory):
content = []
for file in os.listdir(directory):
extension = os.path.splitext(file)[1]
if extension in extensions:
with open(directory + file) as fh:
content.append([extensions[extension], fh.read()])
return content


def make_dataframe(content_list):
return pd.DataFrame(content_list)


if __name__ == '__main__':
content_list = get_content(sys.argv[1])
print(make_dataframe(content_list))
Loading