import glob

import numpy as np
import pandas as pd

# Language label -> file extensions (without the leading dot) that identify it.
# get_ext() compares extensions case-sensitively, so "Python2" only matches an
# extension spelled with a capital "P".
file_ext = {
    "C": ["gcc", "c", "h"],
    "C#": ["csharp"],
    "Clojure": ["clj", "cljs", "edn", "clojure"],
    "Common Lisp": ["sbcl"],
    "Haskell": ["hs", "lhs", "ghc"],
    "Java": ["java", "class", "jar"],
    "Javascript": ["js", "javascript"],
    "OCaml": ["ocaml", "ml"],
    "Perl": ["pl", "pm", "t", "pod", "perl"],
    "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"],
    "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"],
    "Ruby": ["rb", "rbw", "jruby", "yarv"],
    "Scala": ["scala"],
    "Scheme": ["scm", "ss", "racket"],
    "Tcl": ["tcl"],
}


def get_ext(ext):
    """Return the language label registered for a file extension.

    Despite its name, this maps an extension *to a language*: it scans
    ``file_ext`` and returns the first key whose list contains ``ext``.

    Args:
        ext: File extension without the leading dot, e.g. ``"py"``.

    Returns:
        The language label (a key of ``file_ext``), or ``None`` when the
        extension is not listed for any language.
    """
    for language, extensions in file_ext.items():
        if ext in extensions:
            return language
    return None


def read_bench_files(pattern="benchmarksgame/benchmarksgame/bench/*/*.*"):
    """Read every benchmark source file whose extension maps to a language.

    Args:
        pattern: Glob pattern selecting candidate files. The default is the
            benchmark checkout layout this notebook was written against.

    Returns:
        A list of ``(source_text, language)`` tuples, one per matched file
        whose extension is known to ``get_ext``. Files with an unrecognised
        extension are skipped.
    """
    texts = []
    for path in glob.glob(pattern):
        # The text after the last dot is the extension.
        language = get_ext(path.rsplit(".", 1)[-1])
        if language is None:
            # Decide before opening, so irrelevant files are never touched.
            continue
        with open(path) as fh:
            texts.append((fh.read(), language))
    return texts
"execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CodeLanguage
0/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...C
1/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...C
2/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...C
3;; The Computer Language Benchmarks Game\\n;; h...Clojure
4;; The Computer Language Benchmarks Game\\n;; h...Clojure
\n", + "
# In[64]: load every recognised benchmark file into a two-column frame
# (raw source text, language label) and preview it.
data = pd.DataFrame(read_bench_files(), columns=["Code", "Language"])
data.head()

# In[65]: how many samples each language contributes.
data["Language"].value_counts()

# In[66]: the language labels are the classification target.
y = data["Language"]
y.head()
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Code
0/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...
1/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...
2/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...
3;; The Computer Language Benchmarks Game\\n;; h...
4;; The Computer Language Benchmarks Game\\n;; h...
\n", + "
# In[83]: keep the features as a one-column DataFrame (not a Series), because
# CodeVectorizer.transform() selects the "Code" column by name.
X = data.loc[:, ["Code"]]
X.head()

# In[84]: sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; prefer its replacement and fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)

# In[85]
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

# In[86]
import re


def char_count(char, code):
    """Return how many times ``char`` occurs in ``code``.

    Uses ``str.count``, so occurrences are non-overlapping.
    """
    return code.count(char)


def char_percent(char, code):
    """Return the share of ``code`` made up of ``char``.

    The value is a fraction in ``[0, 1]`` (occurrences divided by the length
    of ``code``), not a percentage. Empty ``code`` yields ``0.0`` instead of
    raising ``ZeroDivisionError``.
    """
    if not code:
        return 0.0
    return code.count(char) / len(code)


def string_count(string, code):
    """Return the number of non-overlapping matches of ``string`` in ``code``.

    ``string`` is handed to ``re.findall`` unchanged, so it is interpreted as
    a regular expression and matches anywhere in the text, including inside
    longer identifiers (``"int"`` also matches within ``"print"``).
    """
    return len(re.findall(string, code))


class CodeVectorizer(TransformerMixin, BaseEstimator):
    """Turn source-code text into keyword-count and symbol-frequency features.

    For every row the transformer emits one column per entry of
    ``self.keywords`` (match count from ``string_count``) and one column per
    entry of ``self.symbols`` (fraction of characters from ``char_percent``).

    ``BaseEstimator`` supplies ``get_params``/``set_params`` so that
    ``sklearn.base.clone`` -- and therefore the cross-validation helpers --
    can copy the transformer.
    """

    def __init__(self):
        self.keywords = ["public", "private", "static", "if", "else", "elif", "def", "void", "int",
                         "float", "for", "while", "import", "define", "function", "return", "format",
                         "and", "var", "loop", "array", "local"]
        self.symbols = [":", ";", "{", "}", "(", ")", "#", "[", "]", ","]

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Build the feature table for the ``"Code"`` column of ``X``.

        Args:
            X: Object indexable by ``"Code"`` that yields source-code strings
                (the notebook passes a pandas DataFrame).

        Returns:
            A ``pandas.DataFrame`` with one row per input string and one
            column per keyword and symbol.
        """
        feature_list = []
        for code in X["Code"]:
            # Read the vocab from the instance: the bare names ``keywords``
            # and ``symbols`` are not defined anywhere in the notebook.
            features = {keyword: string_count(keyword, code)
                        for keyword in self.keywords}
            features.update((symbol, char_percent(symbol, code))
                            for symbol in self.symbols)
            feature_list.append(features)
        return pd.DataFrame(feature_list)
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CodeLanguage
0/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...C
1/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...C
2/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...C
3;; The Computer Language Benchmarks Game\\n;; h...Clojure
4;; The Computer Language Benchmarks Game\\n;; h...Clojure
\n", + "
# In[104]: re-display the first rows of `data` (the "Code" and "Language"
# columns built when the benchmark files were loaded).
data.head()
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#(),:;[]andarray...loopprivatepublicreturnstaticvarvoidwhile{}
00.0040870.0179380.0179380.0108990.0020440.0202090.0022710.00227100...0001510150.0065850.006585
10.0044700.0200190.0200190.0112730.0019440.0184650.0023320.00233200...0001810160.0064140.006414
20.0056470.0155860.0155860.0108430.0022590.0176190.0022590.00225900...0001330150.0054210.005421
30.0000000.0359500.0359500.0008260.0020660.0090910.0082640.00826410...000000000.0000000.000000
40.0003860.0358520.0358520.0007710.0023130.0080960.0077100.00771020...200000000.0000000.000000
\n", + "

5 rows × 32 columns

\n", + "
# In[105]: smoke-test the vectorizer on the full data set and preview the
# resulting feature table.
cv = CodeVectorizer()
cv.fit(data)
cv.transform(data).head()

# In[106]: target labels.
y = data.loc[:, "Language"]

# In[107]
y.head()

# In[108]: hold out 40% of the samples, fit vectorizer + decision tree on the
# rest, and report mean accuracy on the held-out part.
# DecisionTreeClassifier is not imported by any other cell of the notebook,
# so bring it into scope here.
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)
pipe = make_pipeline(CodeVectorizer(), DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
import glob

import numpy as np
import pandas as pd

# Language label -> file extensions (without the leading dot) that identify it.
# get_ext() compares extensions case-sensitively, so "Python2" only matches an
# extension spelled with a capital "P".
file_ext = {
    "C": ["gcc", "c", "h"],
    "C#": ["csharp"],
    "Clojure": ["clj", "cljs", "edn", "clojure"],
    "Common Lisp": ["sbcl"],
    "Haskell": ["hs", "lhs", "ghc"],
    "Java": ["java", "class", "jar"],
    "Javascript": ["js", "javascript"],
    "OCaml": ["ocaml", "ml"],
    "Perl": ["pl", "pm", "t", "pod", "perl"],
    "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"],
    "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"],
    "Ruby": ["rb", "rbw", "jruby", "yarv"],
    "Scala": ["scala"],
    "Scheme": ["scm", "ss", "racket"],
    "Tcl": ["tcl"],
}


def read_bench_files(pattern="benchmarksgame/benchmarksgame/bench/*/*.*"):
    """Read every benchmark source file whose extension maps to a language.

    Args:
        pattern: Glob pattern selecting candidate files. The default is the
            benchmark checkout layout this script was written against.

    Returns:
        A list of ``(source_text, language)`` tuples, one per matched file
        whose extension is known to ``get_ext``. Files with an unrecognised
        extension are skipped.
    """
    texts = []
    for path in glob.glob(pattern):
        # The text after the last dot is the extension.
        language = get_ext(path.rsplit(".", 1)[-1])
        if language is None:
            # Decide before opening, so irrelevant files are never touched.
            continue
        with open(path) as fh:
            texts.append((fh.read(), language))
    return texts


def get_ext(ext):
    """Return the language label registered for a file extension.

    Despite its name, this maps an extension *to a language*: it scans
    ``file_ext`` and returns the first key whose list contains ``ext``.

    Args:
        ext: File extension without the leading dot, e.g. ``"py"``.

    Returns:
        The language label (a key of ``file_ext``), or ``None`` when the
        extension is not listed for any language.
    """
    for language, extensions in file_ext.items():
        if ext in extensions:
            return language
    return None


# Script behaviour: load the corpus into a two-column frame and print it.
data = pd.DataFrame(read_bench_files(), columns=["code", "language"])
print(data)