From e1f5ac90964d3897a2a8ce135c0b8ca7e7dc9261 Mon Sep 17 00:00:00 2001 From: Manish Patel Date: Fri, 5 Jun 2015 19:48:15 -0400 Subject: [PATCH 1/3] Read code files --- read_data.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 read_data.py diff --git a/read_data.py b/read_data.py new file mode 100644 index 0000000..416df47 --- /dev/null +++ b/read_data.py @@ -0,0 +1,39 @@ +import numpy as np +import pandas as pd +import glob + +file_ext = {"C": ["gcc", "c", "h"], + "C#": ["csharp"], + "Clojure": ["clj", "cljs", "cljs", "edn", "clojure"], + "Common Lisp": ["sbcl"], + "Haskell": ["hs", "lhs", "ghc"], + "Java": ["java", "class", "jar"], + "Javascript": ["js", "javascript"], + "OCaml": ["ocaml", "ml"], + "Perl": ["pl", "pm", "t", "pod", "perl"], + "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"], + "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"], + "Ruby": ["rb", "rbw", "jruby", "yarv"], + "Scala": ["scala"], + "Scheme": ["scm", "ss", "racket"], + "Tcl": ["tcl"]} + +def read_bench_files(): + files = glob.glob("benchmarksgame/benchmarksgame/bench/*/*.*") + texts = [] + for file in files: + ext = get_ext(file.split(".")[-1]) + with open(file) as fh: + if ext != None: + texts.append((fh.read(), ext)) + return texts + +def get_ext(ext): + for key, value in file_ext.items(): + if ext in value: + return key + + +data = read_bench_files() +data = pd.DataFrame(data, columns = ["code", "language"]) +print(data) From 689566adb408e2c9de5d35cf35335de172770fa4 Mon Sep 17 00:00:00 2001 From: Manish Patel Date: Sun, 7 Jun 2015 22:28:57 -0400 Subject: [PATCH 2/3] Added ipython notebook --- Untitled.ipynb | 923 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 923 insertions(+) create mode 100644 Untitled.ipynb diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..fef59ba --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,923 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "file_ext = {\"C\": [\"gcc\", \"c\", \"h\"],\n", + " \"C#\": [\"csharp\"],\n", + " \"Clojure\": [\"clj\", \"cljs\", \"cljs\", \"edn\", \"clojure\"],\n", + " \"Common Lisp\": [\"sbcl\"],\n", + " \"Haskell\": [\"hs\", \"lhs\", \"ghc\"],\n", + " \"Java\": [\"java\", \"class\", \"jar\"],\n", + " \"Javascript\": [\"js\", \"javascript\"],\n", + " \"OCaml\": [\"ocaml\", \"ml\"],\n", + " \"Perl\": [\"pl\", \"pm\", \"t\", \"pod\", \"perl\"],\n", + " \"PHP\": [\"php\", \"phtml\", \"php4\", \"php3\", \"php5\", \"phps\", \"hack\"],\n", + " \"Python\": [\"py\", \"pyw\", \"pyc\", \"pyo\", \"pyd\", \"python3\", \"Python2\"],\n", + " \"Ruby\": [\"rb\", \"rbw\", \"jruby\", \"yarv\"],\n", + " \"Scala\": [\"scala\"],\n", + " \"Scheme\": [\"scm\", \"ss\", \"racket\"],\n", + " \"Tcl\": [\"tcl\"]}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def read_bench_files():\n", + " files = glob.glob(\"benchmarksgame/benchmarksgame/bench/*/*.*\")\n", + " texts = []\n", + " for file in files:\n", + " ext = get_ext(file.split(\".\")[-1])\n", + " with open(file) as fh:\n", + " if ext != None:\n", + " texts.append((fh.read(), ext))\n", + " return texts" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_ext(ext):\n", + " for key, value in file_ext.items():\n", + " if ext in value:\n", + " return key" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CodeLanguage
0/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...C
1/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...C
2/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...C
3;; The Computer Language Benchmarks Game\\n;; h...Clojure
4;; The Computer Language Benchmarks Game\\n;; h...Clojure
\n", + "
" + ], + "text/plain": [ + " Code Language\n", + "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... C\n", + "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... C\n", + "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... C\n", + "3 ;; The Computer Language Benchmarks Game\\n;; h... Clojure\n", + "4 ;; The Computer Language Benchmarks Game\\n;; h... Clojure" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = read_bench_files()\n", + "data = pd.DataFrame(data, columns = [\"Code\", \"Language\"])\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Ruby 73\n", + "C 61\n", + "PHP 55\n", + "Java 51\n", + "Scala 43\n", + "C# 41\n", + "Clojure 38\n", + "Python 36\n", + "Common Lisp 34\n", + "OCaml 34\n", + "Perl 34\n", + "Haskell 33\n", + "Scheme 29\n", + "Javascript 25\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.Language.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 C\n", + "1 C\n", + "2 C\n", + "3 Clojure\n", + "4 Clojure\n", + "Name: Language, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = data.loc[:,\"Language\"]\n", + "y.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n", + "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n", + "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n", + "3 ;; The Computer Language Benchmarks Game\\n;; h...\n", + "4 ;; The Computer Language Benchmarks Game\\n;; h...\n", + "Name: Code, dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = data.loc[:,\"Code\"]\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.cross_validation import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.base import TransformerMixin\n", + "class DumbFeaturizer(TransformerMixin):\n", + " def __init__(self):\n", + " pass\n", + " \n", + " def fit(self, X, y=None):\n", + " return self\n", + " \n", + " def transform(self, X):\n", + " return [[1] for _ in X]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.11914893617021277" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())\n", + "pipe.fit(X_train, y_train)\n", + "pipe.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def char_count(text, char):\n", + " return text.count(char)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def string_count(string, code):\n", + " value = len(re.findall(string, code))\n", + " return value" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "keywords = [\"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n", + " \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n", + " \"and\", \"var\", \"loop\", \"array\", \"local\"]\n", + "symbols = [\":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(22, 10)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(keywords), len(symbols)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def add_features():\n", + " df = data\n", + " my_dict = {}\n", + " for index, row in df.iterrows():\n", + " for keyword in keywords:\n", + " value = string_count(keyword, row[\"Code\"])\n", + " if keyword not in my_dict:\n", + " my_dict[keyword] = [value]\n", + " else:\n", + " my_dict[keyword].append(value)\n", + " for symbol in symbols:\n", + " count = char_count(symbol, row[\"Code\"])\n", + " if symbol not in my_dict:\n", + " my_dict[symbol] = [count]\n", + " else:\n", + " my_dict[symbol].append(count)\n", + " \n", + " return my_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "features = add_features()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [], + "source": [ + "fdf = pd.DataFrame.from_dict(features)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = fdf.join(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 587 entries, 0 to 586\n", + "Data columns (total 34 columns):\n", + "# 587 non-null int64\n", + "( 587 non-null int64\n", + ") 587 non-null int64\n", + ", 587 non-null int64\n", + ": 587 non-null int64\n", + "; 587 non-null int64\n", + "[ 587 non-null int64\n", + "] 587 non-null int64\n", + "and 587 non-null int64\n", + "array 587 non-null int64\n", + "def 587 non-null int64\n", + "define 587 non-null int64\n", + "elif 587 non-null int64\n", + "else 587 non-null int64\n", + "float 587 non-null int64\n", + "for 587 non-null int64\n", + "format 587 non-null int64\n", + "function 587 non-null int64\n", + "if 587 non-null int64\n", + "import 587 non-null int64\n", + "int 587 non-null int64\n", + "local 587 non-null int64\n", + "loop 587 non-null int64\n", + "private 587 non-null int64\n", + "public 587 non-null int64\n", + "return 587 non-null int64\n", + "static 587 non-null int64\n", + "var 587 non-null int64\n", + "void 587 non-null int64\n", + "while 587 non-null int64\n", + "{ 587 non-null int64\n", + "} 587 non-null int64\n", + "Code 587 non-null object\n", + "Language 587 non-null object\n", + "dtypes: int64(32), object(2)\n", + "memory usage: 160.5+ KB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [], + "source": [ + "y = data.loc[:,(\"Language\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 C\n", + "1 C\n", + "2 C\n", + "3 Clojure\n", + "4 Clojure\n", + "Name: Language, dtype: object" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'Series' object has no attribute 'indo'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 2081\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2082\u001b[0m raise AttributeError(\"'%s' object has no attribute '%s'\" %\n\u001b[0;32m-> 2083\u001b[0;31m (type(self).__name__, name))\n\u001b[0m\u001b[1;32m 2084\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2085\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'indo'" + ] + } + ], + "source": [ + "y.indo()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X = data.loc[:, (\"Code\", \"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n", + " \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n", + " \"and\", \"var\", \"loop\", \"array\", \"local\", \":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Codepublicprivatestaticifelseelifdefvoidint...:;{}()#[],
0/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...00125208121...0000000000
1/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...001293012122...0000000000
2/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...003262013125...0000000000
3;; The Computer Language Benchmarks Game\\n;; h...0008001104...0000000000
4;; The Computer Language Benchmarks Game\\n;; h...0001100904...0000000000
\n", + "

5 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " Code public private static \\\n", + "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... 0 0 1 \n", + "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... 0 0 1 \n", + "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... 0 0 3 \n", + "3 ;; The Computer Language Benchmarks Game\\n;; h... 0 0 0 \n", + "4 ;; The Computer Language Benchmarks Game\\n;; h... 0 0 0 \n", + "\n", + " if else elif def void int ... : ; { } ( ) # [ ] , \n", + "0 25 2 0 8 1 21 ... 0 0 0 0 0 0 0 0 0 0 \n", + "1 29 3 0 12 1 22 ... 0 0 0 0 0 0 0 0 0 0 \n", + "2 26 2 0 13 1 25 ... 0 0 0 0 0 0 0 0 0 0 \n", + "3 8 0 0 11 0 4 ... 0 0 0 0 0 0 0 0 0 0 \n", + "4 11 0 0 9 0 4 ... 0 0 0 0 0 0 0 0 0 0 \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 587 entries, 0 to 586\n", + "Data columns (total 33 columns):\n", + "Code 587 non-null object\n", + "public 587 non-null int64\n", + "private 587 non-null int64\n", + "static 587 non-null int64\n", + "if 587 non-null int64\n", + "else 587 non-null int64\n", + "elif 587 non-null int64\n", + "def 587 non-null int64\n", + "void 587 non-null int64\n", + "int 587 non-null int64\n", + "float 587 non-null int64\n", + "for 587 non-null int64\n", + "while 587 non-null int64\n", + "import 587 non-null int64\n", + "define 587 non-null int64\n", + "function 587 non-null int64\n", + "return 587 non-null int64\n", + "format 587 non-null int64\n", + "and 587 non-null int64\n", + "var 587 non-null int64\n", + "loop 587 non-null int64\n", + "array 587 non-null int64\n", + "local 587 non-null int64\n", + ": 587 non-null int64\n", + "; 587 non-null int64\n", + "{ 587 non-null int64\n", + "} 587 non-null int64\n", + "( 587 non-null int64\n", + ") 587 non-null int64\n", + "# 587 non-null int64\n", + "[ 587 non-null int64\n", + "] 587 non-null int64\n", + ", 587 non-null int64\n", + "dtypes: int64(32), object(1)\n", + "memory usage: 155.9+ KB\n" + ] + } + ], + "source": [ + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Number of labels=352 does not match number of samples=33", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.40\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mpipe\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_pipeline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDumbFeaturizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDecisionTreeClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpipe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mpipe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 139\u001b[0m \"\"\"\n\u001b[1;32m 140\u001b[0m \u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pre_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 141\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 142\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/tree/tree.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mn_samples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m raise ValueError(\"Number of labels=%d does not match \"\n\u001b[0;32m--> 221\u001b[0;31m \"number of samples=%d\" % (len(y), n_samples))\n\u001b[0m\u001b[1;32m 222\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin_samples_split\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"min_samples_split must be greater than zero.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Number of labels=352 does not match number of samples=33" + ] + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)\n", + "pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())\n", + "pipe.fit(X_train, y_train)\n", + "pipe.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From f4c64c5ad80aca816b38ccc79b9c23c577bb2d05 Mon Sep 17 00:00:00 2001 From: Manish Patel Date: Mon, 8 Jun 2015 14:49:17 -0400 Subject: [PATCH 3/3] Added file after help from Clinton --- Untitled.ipynb | 762 ++++++++++++++++++------------------------------- 1 file changed, 282 insertions(+), 480 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index fef59ba..35b4d17 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 60, "metadata": { "collapsed": true }, @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 61, "metadata": { "collapsed": true }, @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 62, "metadata": { "collapsed": true }, @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 63, "metadata": { "collapsed": true }, @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 64, "metadata": { "collapsed": false }, @@ -129,7 +129,7 @@ "4 ;; The Computer Language Benchmarks Game\\n;; h... Clojure" ] }, - "execution_count": 5, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -142,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 65, "metadata": { "collapsed": false }, @@ -167,7 +167,7 @@ "dtype: int64" ] }, - "execution_count": 6, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -178,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 66, "metadata": { "collapsed": false }, @@ -194,7 +194,7 @@ "Name: Language, dtype: object" ] }, - "execution_count": 7, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } @@ -206,35 +206,69 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Code
0/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...
1/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...
2/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...
3;; The Computer Language Benchmarks Game\\n;; h...
4;; The Computer Language Benchmarks Game\\n;; h...
\n", + "
" + ], "text/plain": [ - "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n", - "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n", - "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n", - "3 ;; The Computer Language Benchmarks Game\\n;; h...\n", - "4 ;; The Computer Language Benchmarks Game\\n;; h...\n", - "Name: Code, dtype: object" + " Code\n", + "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n", + "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n", + "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n", + "3 ;; The Computer Language Benchmarks Game\\n;; h...\n", + "4 ;; The Computer Language Benchmarks Game\\n;; h..." ] }, - "execution_count": 8, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X = data.loc[:,\"Code\"]\n", + "X = data.loc[:,[\"Code\"]]\n", "X.head()" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 84, "metadata": { "collapsed": false }, @@ -246,73 +280,20 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.pipeline import make_pipeline\n", - "from sklearn.feature_extraction.text import CountVectorizer" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from sklearn.base import TransformerMixin\n", - "class DumbFeaturizer(TransformerMixin):\n", - " def __init__(self):\n", - " pass\n", - " \n", - " def fit(self, X, y=None):\n", - " return self\n", - " \n", - " def transform(self, X):\n", - " return [[1] for _ in X]" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.11914893617021277" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.tree import DecisionTreeClassifier\n", - "pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())\n", - "pipe.fit(X_train, y_train)\n", - "pipe.score(X_test, y_test)" + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.base import TransformerMixin" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": 86, "metadata": { "collapsed": false }, @@ -323,273 +304,141 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 92, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "def char_count(text, char):\n", - " return text.count(char)" + "def char_count(char, code):\n", + " return code.count(char)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 101, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "def string_count(string, code):\n", - " value = len(re.findall(string, code))\n", - " return value" + "def char_percent(char, code):\n", + " return code.count(char) / len(code)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 102, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "keywords = [\"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n", - " \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n", - " \"and\", \"var\", \"loop\", \"array\", \"local\"]\n", - "symbols = [\":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(22, 10)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(keywords), len(symbols)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def add_features():\n", - " df = data\n", - " my_dict = {}\n", - " for index, row in df.iterrows():\n", - " for keyword in keywords:\n", - " value = string_count(keyword, row[\"Code\"])\n", - " if keyword not in my_dict:\n", - " my_dict[keyword] = [value]\n", - " else:\n", - " my_dict[keyword].append(value)\n", - " for symbol in symbols:\n", - " count = char_count(symbol, row[\"Code\"])\n", - " if symbol not in my_dict:\n", - " my_dict[symbol] = [count]\n", - " else:\n", - " my_dict[symbol].append(count)\n", - " \n", - " return my_dict" + "def string_count(string, code):\n", + " value = len(re.findall(string, code))\n", + " return value" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 103, "metadata": { "collapsed": true }, "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "features = add_features()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false, - "scrolled": true - }, - "outputs": [], - "source": [ - "fdf = pd.DataFrame.from_dict(features)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [], "source": [ - "data = fdf.join(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Int64Index: 587 entries, 0 to 586\n", - "Data columns (total 34 columns):\n", - "# 587 non-null int64\n", - "( 587 non-null int64\n", - ") 587 non-null int64\n", - ", 587 non-null int64\n", - ": 587 non-null int64\n", - "; 587 non-null int64\n", - "[ 587 non-null int64\n", - "] 587 non-null int64\n", - "and 587 non-null int64\n", - "array 587 non-null int64\n", - "def 587 non-null int64\n", - "define 587 non-null int64\n", - "elif 587 non-null int64\n", - "else 587 non-null int64\n", - "float 587 non-null int64\n", - "for 587 non-null int64\n", - "format 587 non-null int64\n", - "function 587 non-null int64\n", - "if 587 non-null int64\n", - "import 587 non-null int64\n", - "int 587 non-null int64\n", - "local 587 non-null int64\n", - "loop 587 non-null int64\n", - "private 587 non-null int64\n", - "public 587 non-null int64\n", - "return 587 non-null int64\n", - "static 587 non-null int64\n", - "var 587 non-null int64\n", - "void 587 non-null int64\n", - "while 587 non-null int64\n", - "{ 587 non-null int64\n", - "} 587 non-null int64\n", - "Code 587 non-null object\n", - "Language 587 non-null object\n", - "dtypes: int64(32), object(2)\n", - "memory usage: 160.5+ KB\n" - ] - } - ], - "source": [ - "data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": false, - "scrolled": true - }, - "outputs": [], - "source": [ - "y = data.loc[:,(\"Language\")]" + "class CodeVectorizer(TransformerMixin):\n", + " def __init__(self):\n", + " self.keywords = [\"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n", + " \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n", + " \"and\", \"var\", \"loop\", \"array\", \"local\"]\n", + " self.symbols = [\":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\"]\n", + " \n", + " def fit(self, X, y=None):\n", + " return self\n", + " \n", + " def transform(self, X):\n", + " feature_list = []\n", + " for code in X[\"Code\"]:\n", + " features = {}\n", + " for keyword in keywords:\n", + " features[keyword] = string_count(keyword, code)\n", + " for symbol in symbols:\n", + " features[symbol] = char_percent(symbol, code)\n", + " feature_list.append(features)\n", + " return pd.DataFrame(feature_list)" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 104, "metadata": { "collapsed": false }, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CodeLanguage
0/*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...C
1/*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...C
2/*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...C
3;; The Computer Language Benchmarks Game\\n;; h...Clojure
4;; The Computer Language Benchmarks Game\\n;; h...Clojure
\n", + "
" + ], "text/plain": [ - "0 C\n", - "1 C\n", - "2 C\n", - "3 Clojure\n", - "4 Clojure\n", - "Name: Language, dtype: object" + " Code Language\n", + "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... C\n", + "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... C\n", + "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... C\n", + "3 ;; The Computer Language Benchmarks Game\\n;; h... Clojure\n", + "4 ;; The Computer Language Benchmarks Game\\n;; h... Clojure" ] }, - "execution_count": 47, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "y.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'Series' object has no attribute 'indo'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 2081\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2082\u001b[0m raise AttributeError(\"'%s' object has no attribute '%s'\" %\n\u001b[0;32m-> 2083\u001b[0;31m (type(self).__name__, name))\n\u001b[0m\u001b[1;32m 2084\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2085\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'indo'" - ] - } - ], - "source": [ - "y.indo()" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "X = data.loc[:, (\"Code\", \"public\", \"private\", \"static\", \"if\", \"else\", \"elif\", \"def\", \"void\", \"int\", \n", - " \"float\", \"for\", \"while\", \"import\", \"define\", \"function\", \"return\", \"format\", \n", - " \"and\", \"var\", \"loop\", \"array\", \"local\", \":\", \";\", \"{\", \"}\", \"(\", \")\", \"#\", \"[\", \"]\", \",\")]" + "data.head()" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 105, "metadata": { "collapsed": false }, @@ -602,114 +451,114 @@ " \n", " \n", " \n", - " Code\n", - " public\n", + " #\n", + " (\n", + " )\n", + " ,\n", + " :\n", + " ;\n", + " [\n", + " ]\n", + " and\n", + " array\n", + " ...\n", + " loop\n", " private\n", + " public\n", + " return\n", " static\n", - " if\n", - " else\n", - " elif\n", - " def\n", + " var\n", " void\n", - " int\n", - " ...\n", - " :\n", - " ;\n", + " while\n", " {\n", " }\n", - " (\n", - " )\n", - " #\n", - " [\n", - " ]\n", - " ,\n", " \n", " \n", " \n", " \n", " 0\n", - " /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:...\n", - " 0\n", + " 0.004087\n", + " 0.017938\n", + " 0.017938\n", + " 0.010899\n", + " 0.002044\n", + " 0.020209\n", + " 0.002271\n", + " 0.002271\n", " 0\n", - " 1\n", - " 25\n", - " 2\n", " 0\n", - " 8\n", - " 1\n", - " 21\n", " ...\n", " 0\n", " 0\n", " 0\n", + " 15\n", + " 1\n", " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " 1\n", + " 5\n", + " 0.006585\n", + " 0.006585\n", " \n", " \n", " 1\n", - " /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04...\n", + " 0.004470\n", + " 0.020019\n", + " 0.020019\n", + " 0.011273\n", + " 0.001944\n", + " 0.018465\n", + " 0.002332\n", + " 0.002332\n", " 0\n", " 0\n", - " 1\n", - " 29\n", - " 3\n", - " 0\n", - " 12\n", - " 1\n", - " 22\n", " ...\n", " 0\n", " 0\n", " 0\n", + " 18\n", + " 1\n", " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " 1\n", + " 6\n", + " 0.006414\n", + " 0.006414\n", " \n", " \n", " 2\n", - " /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04...\n", - " 0\n", + " 0.005647\n", + " 0.015586\n", + " 0.015586\n", + " 0.010843\n", + " 0.002259\n", + " 0.017619\n", + " 0.002259\n", + " 0.002259\n", " 0\n", - " 3\n", - " 26\n", - " 2\n", " 0\n", - " 13\n", - " 1\n", - " 25\n", " ...\n", " 0\n", " 0\n", " 0\n", + " 13\n", + " 3\n", " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " 1\n", + " 5\n", + " 0.005421\n", + " 0.005421\n", " \n", " \n", " 3\n", - " ;; The Computer Language Benchmarks Game\\n;; h...\n", - " 0\n", - " 0\n", - " 0\n", - " 8\n", - " 0\n", - " 0\n", - " 11\n", + " 0.000000\n", + " 0.035950\n", + " 0.035950\n", + " 0.000826\n", + " 0.002066\n", + " 0.009091\n", + " 0.008264\n", + " 0.008264\n", + " 1\n", " 0\n", - " 4\n", " ...\n", " 0\n", " 0\n", @@ -719,22 +568,23 @@ " 0\n", " 0\n", " 0\n", - " 0\n", - " 0\n", + " 0.000000\n", + " 0.000000\n", " \n", " \n", " 4\n", - " ;; The Computer Language Benchmarks Game\\n;; h...\n", - " 0\n", - " 0\n", - " 0\n", - " 11\n", - " 0\n", - " 0\n", - " 9\n", + " 0.000386\n", + " 0.035852\n", + " 0.035852\n", + " 0.000771\n", + " 0.002313\n", + " 0.008096\n", + " 0.007710\n", + " 0.007710\n", + " 2\n", " 0\n", - " 4\n", " ...\n", + " 2\n", " 0\n", " 0\n", " 0\n", @@ -742,161 +592,113 @@ " 0\n", " 0\n", " 0\n", - " 0\n", - " 0\n", - " 0\n", + " 0.000000\n", + " 0.000000\n", " \n", " \n", "\n", - "

5 rows × 33 columns

\n", + "

5 rows × 32 columns

\n", "" ], "text/plain": [ - " Code public private static \\\n", - "0 /*\\n * $Id: simple_hash.h,v 1.1 2013/01/02 04:... 0 0 1 \n", - "1 /*\\n * $Id: simple_hash2.h,v 1.1 2013/01/02 04... 0 0 1 \n", - "2 /*\\n * $Id: simple_hash3.h,v 1.1 2013/01/02 04... 0 0 3 \n", - "3 ;; The Computer Language Benchmarks Game\\n;; h... 0 0 0 \n", - "4 ;; The Computer Language Benchmarks Game\\n;; h... 0 0 0 \n", + " # ( ) , : ; [ \\\n", + "0 0.004087 0.017938 0.017938 0.010899 0.002044 0.020209 0.002271 \n", + "1 0.004470 0.020019 0.020019 0.011273 0.001944 0.018465 0.002332 \n", + "2 0.005647 0.015586 0.015586 0.010843 0.002259 0.017619 0.002259 \n", + "3 0.000000 0.035950 0.035950 0.000826 0.002066 0.009091 0.008264 \n", + "4 0.000386 0.035852 0.035852 0.000771 0.002313 0.008096 0.007710 \n", + "\n", + " ] and array ... loop private public return static var \\\n", + "0 0.002271 0 0 ... 0 0 0 15 1 0 \n", + "1 0.002332 0 0 ... 0 0 0 18 1 0 \n", + "2 0.002259 0 0 ... 0 0 0 13 3 0 \n", + "3 0.008264 1 0 ... 0 0 0 0 0 0 \n", + "4 0.007710 2 0 ... 2 0 0 0 0 0 \n", "\n", - " if else elif def void int ... : ; { } ( ) # [ ] , \n", - "0 25 2 0 8 1 21 ... 0 0 0 0 0 0 0 0 0 0 \n", - "1 29 3 0 12 1 22 ... 0 0 0 0 0 0 0 0 0 0 \n", - "2 26 2 0 13 1 25 ... 0 0 0 0 0 0 0 0 0 0 \n", - "3 8 0 0 11 0 4 ... 0 0 0 0 0 0 0 0 0 0 \n", - "4 11 0 0 9 0 4 ... 0 0 0 0 0 0 0 0 0 0 \n", + " void while { } \n", + "0 1 5 0.006585 0.006585 \n", + "1 1 6 0.006414 0.006414 \n", + "2 1 5 0.005421 0.005421 \n", + "3 0 0 0.000000 0.000000 \n", + "4 0 0 0.000000 0.000000 \n", "\n", - "[5 rows x 33 columns]" + "[5 rows x 32 columns]" ] }, - "execution_count": 52, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X.head()" + "cv = CodeVectorizer()\n", + "cv.fit(data)\n", + "cv.transform(data).head()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [], + "source": [ + "y = data.loc[:,(\"Language\")]" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 107, "metadata": { "collapsed": false }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Int64Index: 587 entries, 0 to 586\n", - "Data columns (total 33 columns):\n", - "Code 587 non-null object\n", - "public 587 non-null int64\n", - "private 587 non-null int64\n", - "static 587 non-null int64\n", - "if 587 non-null int64\n", - "else 587 non-null int64\n", - "elif 587 non-null int64\n", - "def 587 non-null int64\n", - "void 587 non-null int64\n", - "int 587 non-null int64\n", - "float 587 non-null int64\n", - "for 587 non-null int64\n", - "while 587 non-null int64\n", - "import 587 non-null int64\n", - "define 587 non-null int64\n", - "function 587 non-null int64\n", - "return 587 non-null int64\n", - "format 587 non-null int64\n", - "and 587 non-null int64\n", - "var 587 non-null int64\n", - "loop 587 non-null int64\n", - "array 587 non-null int64\n", - "local 587 non-null int64\n", - ": 587 non-null int64\n", - "; 587 non-null int64\n", - "{ 587 non-null int64\n", - "} 587 non-null int64\n", - "( 587 non-null int64\n", - ") 587 non-null int64\n", - "# 587 non-null int64\n", - "[ 587 non-null int64\n", - "] 587 non-null int64\n", - ", 587 non-null int64\n", - "dtypes: int64(32), object(1)\n", - "memory usage: 155.9+ KB\n" - ] + "data": { + "text/plain": [ + "0 C\n", + "1 C\n", + "2 C\n", + "3 Clojure\n", + "4 Clojure\n", + "Name: Language, dtype: object" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "X.info()" + "y.head()" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 108, "metadata": { "collapsed": false }, "outputs": [ { - "ename": "ValueError", - "evalue": "Number of labels=352 does not match number of samples=33", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.40\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mpipe\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_pipeline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDumbFeaturizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDecisionTreeClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpipe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mpipe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 139\u001b[0m \"\"\"\n\u001b[1;32m 140\u001b[0m \u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pre_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 141\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 142\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/Manish/TIY/programming-language-classifier/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/tree/tree.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight, check_input)\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mn_samples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m raise ValueError(\"Number of labels=%d does not match \"\n\u001b[0;32m--> 221\u001b[0;31m \"number of samples=%d\" % (len(y), n_samples))\n\u001b[0m\u001b[1;32m 222\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin_samples_split\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"min_samples_split must be greater than zero.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Number of labels=352 does not match number of samples=33" - ] + "data": { + "text/plain": [ + "0.87234042553191493" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40)\n", - "pipe = make_pipeline(DumbFeaturizer(), DecisionTreeClassifier())\n", + "pipe = make_pipeline(CodeVectorizer(), DecisionTreeClassifier())\n", "pipe.fit(X_train, y_train)\n", "pipe.score(X_test, y_test)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": {