diff --git a/Untitled.ipynb b/Untitled.ipynb
new file mode 100644
index 0000000..35b4d17
--- /dev/null
+++ b/Untitled.ipynb
@@ -0,0 +1,725 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import glob"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "file_ext = {\"C\": [\"gcc\", \"c\", \"h\"],\n",
+ " \"C#\": [\"csharp\"],\n",
+ " \"Clojure\": [\"clj\", \"cljs\", \"cljs\", \"edn\", \"clojure\"],\n",
+ " \"Common Lisp\": [\"sbcl\"],\n",
+ " \"Haskell\": [\"hs\", \"lhs\", \"ghc\"],\n",
+ " \"Java\": [\"java\", \"class\", \"jar\"],\n",
+ " \"Javascript\": [\"js\", \"javascript\"],\n",
+ " \"OCaml\": [\"ocaml\", \"ml\"],\n",
+ " \"Perl\": [\"pl\", \"pm\", \"t\", \"pod\", \"perl\"],\n",
+ " \"PHP\": [\"php\", \"phtml\", \"php4\", \"php3\", \"php5\", \"phps\", \"hack\"],\n",
+ " \"Python\": [\"py\", \"pyw\", \"pyc\", \"pyo\", \"pyd\", \"python3\", \"Python2\"],\n",
+ " \"Ruby\": [\"rb\", \"rbw\", \"jruby\", \"yarv\"],\n",
+ " \"Scala\": [\"scala\"],\n",
+ " \"Scheme\": [\"scm\", \"ss\", \"racket\"],\n",
+ " \"Tcl\": [\"tcl\"]}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def read_bench_files():\n",
+ " files = glob.glob(\"benchmarksgame/benchmarksgame/bench/*/*.*\")\n",
+ " texts = []\n",
+ " for file in files:\n",
+ " ext = get_ext(file.split(\".\")[-1])\n",
+ " with open(file) as fh:\n",
+ " if ext != None:\n",
+ " texts.append((fh.read(), ext))\n",
+ " return texts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def get_ext(ext):\n",
+ " for key, value in file_ext.items():\n",
+ " if ext in value:\n",
+ " return key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Code | \n",
+ " Language | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " ;; The Computer Language Benchmarks Game\\n;; h... | \n",
+ " Clojure | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " ;; The Computer Language Benchmarks Game\\n;; h... | \n",
+ " Clojure | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Code Language\n",
+ "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... C\n",
+ "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... C\n",
+ "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... C\n",
+ "3 ;; The Computer Language Benchmarks Game\\n;; h... Clojure\n",
+ "4 ;; The Computer Language Benchmarks Game\\n;; h... Clojure"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = read_bench_files()\n",
+ "data = pd.DataFrame(data, columns = [\"Code\", \"Language\"])\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Ruby 73\n",
+ "C 61\n",
+ "PHP 55\n",
+ "Java 51\n",
+ "Scala 43\n",
+ "C# 41\n",
+ "Clojure 38\n",
+ "Python 36\n",
+ "Common Lisp 34\n",
+ "OCaml 34\n",
+ "Perl 34\n",
+ "Haskell 33\n",
+ "Scheme 29\n",
+ "Javascript 25\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.Language.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 C\n",
+ "1 C\n",
+ "2 C\n",
+ "3 Clojure\n",
+ "4 Clojure\n",
+ "Name: Language, dtype: object"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y = data.loc[:,\"Language\"]\n",
+ "y.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Code | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " ;; The Computer Language Benchmarks Game\\n;; h... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " ;; The Computer Language Benchmarks Game\\n;; h... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Code\n",
+ "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n",
+ "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n",
+ "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n",
+ "3 ;; The Computer Language Benchmarks Game\\n;; h...\n",
+ "4 ;; The Computer Language Benchmarks Game\\n;; h..."
+ ]
+ },
+ "execution_count": 83,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X = data.loc[:,[\"Code\"]]\n",
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.cross_validation import train_test_split\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.pipeline import make_pipeline\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "from sklearn.base import TransformerMixin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def char_count(char, code):\n",
+ " return code.count(char)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def char_percent(char, code):\n",
+ " return code.count(char) / len(code)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def string_count(string, code):\n",
+ " value = len(re.findall(string, code))\n",
+ " return value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "class CodeVectorizer(TransformerMixin):\n",
+ " def __init__(self):\n",
+ " self.keywords = [\"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n",
+ " \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n",
+ " \"and\", \"var\", \"loop\", \"array\", \"local\"]\n",
+ " self.symbols = [\":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\"]\n",
+ " \n",
+ " def fit(self, X, y=None):\n",
+ " return self\n",
+ " \n",
+ " def transform(self, X):\n",
+ " feature_list = []\n",
+ " for code in X[\"Code\"]:\n",
+ " features = {}\n",
+ " for keyword in keywords:\n",
+ " features[keyword] = string_count(keyword, code)\n",
+ " for symbol in symbols:\n",
+ " features[symbol] = char_percent(symbol, code)\n",
+ " feature_list.append(features)\n",
+ " return pd.DataFrame(feature_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Code | \n",
+ " Language | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " ;; The Computer Language Benchmarks Game\\n;; h... | \n",
+ " Clojure | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " ;; The Computer Language Benchmarks Game\\n;; h... | \n",
+ " Clojure | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Code Language\n",
+ "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... C\n",
+ "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... C\n",
+ "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... C\n",
+ "3 ;; The Computer Language Benchmarks Game\\n;; h... Clojure\n",
+ "4 ;; The Computer Language Benchmarks Game\\n;; h... Clojure"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " # | \n",
+ " ( | \n",
+ " ) | \n",
+ " , | \n",
+ " : | \n",
+ " ; | \n",
+ " [ | \n",
+ " ] | \n",
+ " and | \n",
+ " array | \n",
+ " ... | \n",
+ " loop | \n",
+ " private | \n",
+ " public | \n",
+ " return | \n",
+ " static | \n",
+ " var | \n",
+ " void | \n",
+ " while | \n",
+ " { | \n",
+ " } | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.004087 | \n",
+ " 0.017938 | \n",
+ " 0.017938 | \n",
+ " 0.010899 | \n",
+ " 0.002044 | \n",
+ " 0.020209 | \n",
+ " 0.002271 | \n",
+ " 0.002271 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 15 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 0.006585 | \n",
+ " 0.006585 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.004470 | \n",
+ " 0.020019 | \n",
+ " 0.020019 | \n",
+ " 0.011273 | \n",
+ " 0.001944 | \n",
+ " 0.018465 | \n",
+ " 0.002332 | \n",
+ " 0.002332 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 18 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 0.006414 | \n",
+ " 0.006414 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.005647 | \n",
+ " 0.015586 | \n",
+ " 0.015586 | \n",
+ " 0.010843 | \n",
+ " 0.002259 | \n",
+ " 0.017619 | \n",
+ " 0.002259 | \n",
+ " 0.002259 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 13 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 0.005421 | \n",
+ " 0.005421 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.000000 | \n",
+ " 0.035950 | \n",
+ " 0.035950 | \n",
+ " 0.000826 | \n",
+ " 0.002066 | \n",
+ " 0.009091 | \n",
+ " 0.008264 | \n",
+ " 0.008264 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.000386 | \n",
+ " 0.035852 | \n",
+ " 0.035852 | \n",
+ " 0.000771 | \n",
+ " 0.002313 | \n",
+ " 0.008096 | \n",
+ " 0.007710 | \n",
+ " 0.007710 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 32 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " # ( ) , : ; [ \\\n",
+ "0 0.004087 0.017938 0.017938 0.010899 0.002044 0.020209 0.002271 \n",
+ "1 0.004470 0.020019 0.020019 0.011273 0.001944 0.018465 0.002332 \n",
+ "2 0.005647 0.015586 0.015586 0.010843 0.002259 0.017619 0.002259 \n",
+ "3 0.000000 0.035950 0.035950 0.000826 0.002066 0.009091 0.008264 \n",
+ "4 0.000386 0.035852 0.035852 0.000771 0.002313 0.008096 0.007710 \n",
+ "\n",
+ " ] and array ... loop private public return static var \\\n",
+ "0 0.002271 0 0 ... 0 0 0 15 1 0 \n",
+ "1 0.002332 0 0 ... 0 0 0 18 1 0 \n",
+ "2 0.002259 0 0 ... 0 0 0 13 3 0 \n",
+ "3 0.008264 1 0 ... 0 0 0 0 0 0 \n",
+ "4 0.007710 2 0 ... 2 0 0 0 0 0 \n",
+ "\n",
+ " void while { } \n",
+ "0 1 5 0.006585 0.006585 \n",
+ "1 1 6 0.006414 0.006414 \n",
+ "2 1 5 0.005421 0.005421 \n",
+ "3 0 0 0.000000 0.000000 \n",
+ "4 0 0 0.000000 0.000000 \n",
+ "\n",
+ "[5 rows x 32 columns]"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cv = CodeVectorizer()\n",
+ "cv.fit(data)\n",
+ "cv.transform(data).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "y = data.loc[:,(\"Language\")]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 C\n",
+ "1 C\n",
+ "2 C\n",
+ "3 Clojure\n",
+ "4 Clojure\n",
+ "Name: Language, dtype: object"
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.87234042553191493"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)\n",
+ "pipe = make_pipeline(CodeVectorizer(), DecisionTreeClassifier())\n",
+ "pipe.fit(X_train, y_train)\n",
+ "pipe.score(X_test, y_test)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.4.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/read_data.py b/read_data.py
new file mode 100644
index 0000000..416df47
--- /dev/null
+++ b/read_data.py
@@ -0,0 +1,39 @@
+import numpy as np
+import pandas as pd
+import glob
+
+file_ext = {"C": ["gcc", "c", "h"],
+ "C#": ["csharp"],
+ "Clojure": ["clj", "cljs", "cljs", "edn", "clojure"],
+ "Common Lisp": ["sbcl"],
+ "Haskell": ["hs", "lhs", "ghc"],
+ "Java": ["java", "class", "jar"],
+ "Javascript": ["js", "javascript"],
+ "OCaml": ["ocaml", "ml"],
+ "Perl": ["pl", "pm", "t", "pod", "perl"],
+ "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"],
+ "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"],
+ "Ruby": ["rb", "rbw", "jruby", "yarv"],
+ "Scala": ["scala"],
+ "Scheme": ["scm", "ss", "racket"],
+ "Tcl": ["tcl"]}
+
+def read_bench_files():
+ files = glob.glob("benchmarksgame/benchmarksgame/bench/*/*.*")
+ texts = []
+ for file in files:
+ ext = get_ext(file.split(".")[-1])
+ with open(file) as fh:
+ if ext != None:
+ texts.append((fh.read(), ext))
+ return texts
+
+def get_ext(ext):
+ for key, value in file_ext.items():
+ if ext in value:
+ return key
+
+
+data = read_bench_files()
+data = pd.DataFrame(data, columns = ["code", "language"])
+print(data)