diff --git a/Classifier.ipynb b/Classifier.ipynb new file mode 100644 index 0000000..139c251 --- /dev/null +++ b/Classifier.ipynb @@ -0,0 +1,146 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from Scrapper import scrape\n", + "from Scrapper import load_data\n", + "from Learner import Learner" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df2 = load_data(200)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "classifier = Learner(df2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.36491387126019947" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.test_score()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.57978241160471444" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.train_score()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from FeatureVectorizer import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[0]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "longest_run_of_character_feature('hello . work.dl. in. 
the junghle.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/FeatureVectorizer.py b/FeatureVectorizer.py
new file mode 100644
index 0000000..e17ad50
--- /dev/null
+++ b/FeatureVectorizer.py
@@ -0,0 +1,77 @@
+from sklearn.pipeline import make_pipeline, make_union
+from sklearn.base import TransformerMixin
+import re
+import itertools
+
+
+def longest_run_of_capital_letters_feature(text):
+    """Find the longest run of capital letters and return its length."""
+    runs = sorted(re.findall(r"[A-Z]+", text), key=len)
+    if runs:
+        return [len(runs[-1])]
+    else:
+        return [0]
+
+
+def longest_run_of_character_feature(text):
+    """Return the length of the longest run of each special-character class."""
+    chars = ['~+', '\.+', '\|+', ';+', '\:+', '\$+', '\(+', '\)+', '\-+', '\s+', '\t+']
+    runs = []
+    for i in chars:
+        run = sorted(re.findall(i, text), key=len)
+        if run:
+            runs.append(len(run[-1]))
+        else:
+            runs.append(0)
+    return runs
+
+
+def percent_character_feature(text):
+    """Return the frequency of each character relative to total text length."""
+    chars = [".", "|", "$", "_", "!", "#", "@", "%", "^", "&", "*", "(", ")", "+", "=", "{", "}", "[", "]", ":", ";", "?", "<", ">"]
+
+    return [text.count(i) / max(len(text), 1) for i in chars]
+
+
+def percent_character_combinations(text):
+    """Return the frequency of each character combination relative to total text length."""
+    chars = ["==", "\->+", ":\-+", "\+=", "\n\t+if", "\n+", "\n\$+", "\n\t+", "\ndef", "%{", "~=", "\|\|", "\n\t+\(\w+", "^\$", "\.=", "\{:", "===", "!==", "\*\w+", "__", "__name__", "__main__", "^\#", "^def", "^@w+", "^@end", "^begin", "^end", "^functions", "^loop\n", "^procedure", "^func", "\+\+"]
+    runs = []
+    for i in chars:
+        run = re.findall(i, text)
+        if run:
+            runs.append(len(run) / len(text))
+        else:
+            runs.append(0)
+    return runs
+
+def binary_character_combinations(text):
+    """Return 1 for each character combination that occurs in the text, 0 otherwise."""
+    chars = ["==", "\->+", ":\-+", "\+=", "\n\t+if", "\n+", "\n\$+", "\n\t+", "\ndef", "%{", "~=", "\|\|", "\n\t+\(\w+", "^\$", "\.=", "\{:", "===", "!==", "\*\w+", "__", "__name__", "__main__", "^\#", "^def", "^@w+", "^@end", "^begin", "^end", "^functions", "^loop\n", "^procedure", "^func", "\+\+"]
+    runs = []
+    for i in chars:
+        run = re.findall(i, text)
+        if run:
+            runs.append(1)
+        else:
+            runs.append(0)
+    return runs
+
+
+class FunctionFeaturizer(TransformerMixin):
+    def __init__(self, *featurizers):
+        self.featurizers = featurizers
+
+    def fit(self, X, y=None):
+        """All SciKit-Learn compatible transformers and classifiers have the
+        same interface.
`fit` always returns the same object.""" + return self + + def transform(self, X): + """Given a list of original data, return a list of feature vectors.""" + fvs = [] + for datum in X: + fv = [f(datum) for f in self.featurizers] + a = list(itertools.chain(*fv)) + fvs.append(a) + return fvs diff --git a/Featurizer.ipynb b/Featurizer.ipynb new file mode 100644 index 0000000..7c7e747 --- /dev/null +++ b/Featurizer.ipynb @@ -0,0 +1,417 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from Scrapper import load_data\n", + "from Scrapper import *\n", + "from Learner import Learner\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from FeatureVectorizer import *\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = load_data(500)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "y = df.loc[:,0]\n", + "X = df.loc[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1 #include #include #include ...\n", + "2  #include #include #include...\n", + "4 import std.stdio, std.array, std.conv, std.alg...\n", + "5 import std.stdio, std.conv, std.ascii, std.arr...\n", + "6  -module( solve_hidato_puzzle ). 
-export( [cre...\n", + "Name: 1, dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "language_featurizer = make_union(CountVectorizer(), FunctionFeaturizer(longest_run_of_capital_letters_feature, longest_run_of_character_feature,\n", + " percent_character_combinations, binary_character_combinations,\n", + " percent_character_feature))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pipe = make_pipeline(language_featurizer, RandomForestClassifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.71267707954958226" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['go'], dtype=object)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.predict(['~='])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "multi_pipe = make_pipeline(language_featurizer, MultinomialNB())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...ormer_weights=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multi_pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.59898292771521977" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multi_pipe.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "language_featurizer2 = make_union(CountVectorizer(), FunctionFeaturizer(longest_run_of_capital_letters_feature, longest_run_of_character_feature,\n", + " percent_character_combinations, binary_character_combinations,\n", + " percent_character_feature))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pipe2 = make_pipeline(language_featurizer2, RandomForestClassifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))])" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe2.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.71758082092262987" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe2.score(X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Making a smaller data set with smaller number of languages" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dfilter = scrape_filter(500, 50, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df_small= scraper_filter_small(700, 100, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "URLError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mdo_open\u001b[0;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[1;32m 1181\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1182\u001b[0;31m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselector\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1183\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# timeout error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1087\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1088\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1089\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1125\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'iso-8859-1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1126\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 1083\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1084\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1085\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 921\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 922\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 923\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 856\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_open\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 857\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 858\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 833\u001b[0m self.sock = self._create_connection((self.host,self.port),\n\u001b[0;32m--> 834\u001b[0;31m self.timeout, self.source_address)\n\u001b[0m\u001b[1;32m 835\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/socket.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address)\u001b[0m\n\u001b[1;32m 511\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 512\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 513\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/socket.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address)\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mOSError\u001b[0m: [Errno 50] Network is down", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mURLError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscrape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mscrape\u001b[0;34m(num_links, drop_less_than, save)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscrape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_links\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdrop_less_than\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msave\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmake_links_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_links\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0mndf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m 
\u001b[0mndf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mndf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mdrop_less_than\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mmake_data\u001b[0;34m(url_list)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mcode_snippets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0murl\u001b[0m \u001b[0;32min\u001b[0m \u001b[0murl_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0msoup_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscrape_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0murl_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpull_code_from_soup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msoup_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0mcode_snippets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcode_snippets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mscrape_data\u001b[0;34m(url)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscrape_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'User-Agent'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Mozilla/5.0'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m\"pre\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclass_\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"highlighted_source\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(url, data, timeout, cafile, capath, cadefault, context)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0mopener\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_opener\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 161\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopener\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 462\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 463\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 464\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;31m# post-process response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36m_open\u001b[0;34m(self, req, data)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 480\u001b[0m result = self._call_chain(self.handle_open, protocol, protocol +\n\u001b[0;32m--> 481\u001b[0;31m '_open', req)\n\u001b[0m\u001b[1;32m 482\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36m_call_chain\u001b[0;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhandler\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mhandlers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mhttp_open\u001b[0;34m(self, req)\u001b[0m\n\u001b[1;32m 1208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1209\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mhttp_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1210\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhttp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1211\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1212\u001b[0m \u001b[0mhttp_request\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAbstractHTTPHandler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_request_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mdo_open\u001b[0;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[1;32m 1182\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselector\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1183\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# timeout error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1184\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mURLError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1185\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1186\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mURLError\u001b[0m: " + ] + } + ], + "source": [ + "scrape(700, 100, True)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Learner.py b/Learner.py new file mode 100644 index 0000000..4734ac0 --- /dev/null +++ b/Learner.py @@ -0,0 +1,66 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from sklearn import linear_model +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.cross_validation import train_test_split +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.naive_bayes import GaussianNB +from sklearn.naive_bayes import BernoulliNB +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.tree import DecisionTreeClassifier + + +class Learner(object): + algorithms = {'m':MultinomialNB(), + 'b':BernoulliNB(), + 'r':RandomForestClassifier(), + 'f':RandomForestClassifier(), + 'k':KNeighborsClassifier(), + 'n':KNeighborsClassifier(), + 'p':MultinomialNB(), + 'd':DecisionTreeClassifier() + } + + '''Takes a dataframe with outcomes on first column and predictor second column + Makes available the score and predict methods + ''' + def __init__(self, dataframe, alg='NBayes'): + self.outcome, self.predictor = self.split_data(dataframe) + self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.predictor, self.outcome, test_size=0.33) + self.pipe = Pipeline([('bag_of_words', CountVectorizer()), ('bayes', self.get_algorithm(alg))]) + self.fit() + + + def fit(self): + self.pipe.fit(self.X_train, self.y_train) + + + def test_score(self): + return self.pipe.score(self.X_test, self.y_test) + + + def train_score(self): + return self.pipe.score(self.X_train, self.y_train) + + + def predict(self, string): + return self.pipe.predict([string]) + + + def split_data(self, data): + return data.loc[:,0], data.loc[:,1] + + + def classification_report(self): + return classification_report(self.pipe.predict(self.X_test), self.y_test) + + def get_algorithm(self, algorithmchoice): + return self.algorithms[algorithmchoice[0].lower()] + diff --git a/README.md b/README.md index 394f93b..543101e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,40 @@ +## Programming Language Classifier + +Given a code snippet, the classifier will try to predict what language it is. + +## Steps taken to develop the classifier +### Scrapper +1. Created a scrapper that would collect different code snippets from the web, specifically from [Rosetta Code] (www.rosettacode.org/wiki/Rosetta_Code) +2. The scrapper collects these code snippets by different tasks for each of the programing languages. +3. For the scrapper to work, give it a minimum number of links to scrape and if you want to limit the languages retrieved by frequency, you can give it the minimum number of observations for the language to be in the classifier. 
+
+The general call is:
+
+`from Scrapper import scrape`
+
+`scrape(num_links, drop_less_than, save)`
+
+You can keep the returned dataframe in a variable for the next steps, or reload a saved `.pkl` file later with `load_data`, passing the number of links used as the file name.
+
+### Classifier
+To use the classifier, import the `Learner` class and instantiate it with a dataframe:
+
+`from Learner import Learner`
+
+`from Scrapper import load_data`
+
+`dataframe = load_data(500)`
+
+`classifier = Learner(dataframe, alg)`
+
+The first letter of `alg` selects the algorithm: `m` or `p` for MultinomialNB, `b` for BernoulliNB, `r` or `f` for RandomForestClassifier, `k` or `n` for KNeighborsClassifier, and `d` for DecisionTreeClassifier.
+
+Learner splits the data (`test_size=0.33`), fits a bag-of-words pipeline, and makes available the methods `test_score()`, `train_score()`, `predict(string)`, and `classification_report()`, which you can call on the learner object directly.
+
+### VectorFeaturizer
+The feature vectorizer builds more specific feature vectors to increase the accuracy of the classifier. It mainly adds character-class features that distinguish languages more explicitly, because most general features are common across languages. A sketch of how it plugs into a pipeline follows.
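+
+A minimal sketch, mirroring Featurizer.ipynb, that unions the hand-built features with a bag of words (it assumes `data/500.pkl` exists):
+
+```python
+from sklearn.pipeline import make_pipeline, make_union
+from sklearn.cross_validation import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.ensemble import RandomForestClassifier
+from FeatureVectorizer import (FunctionFeaturizer,
+                               longest_run_of_capital_letters_feature,
+                               percent_character_feature)
+from Scrapper import load_data
+
+df = load_data(500)
+X_train, X_test, y_train, y_test = train_test_split(df.loc[:, 1], df.loc[:, 0])
+
+featurizer = make_union(CountVectorizer(),
+                        FunctionFeaturizer(longest_run_of_capital_letters_feature,
+                                           percent_character_feature))
+pipe = make_pipeline(featurizer, RandomForestClassifier())
+pipe.fit(X_train, y_train)
+print(pipe.score(X_test, y_test))
+```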
+
+### Passing the tests
+Still to do: import the test files and run them against the data scraped from Rosetta Code.
 # Classify code snippets into programming languages
 
 ## Description
diff --git a/Scrapper.py b/Scrapper.py
new file mode 100644
index 0000000..9281b81
--- /dev/null
+++ b/Scrapper.py
@@ -0,0 +1,105 @@
+from bs4 import BeautifulSoup
+import urllib.request
+import pandas as pd
+import random
+
+
+def get_text(url):
+    """Take a url and return the page text."""
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+    content = urllib.request.urlopen(req).read()
+    page_text = BeautifulSoup(content)
+    return page_text.get_text()
+
+
+def scrape_data(url):
+    """Return soup objects for every <pre> tag with class highlighted_source."""
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+    content = urllib.request.urlopen(req).read()
+    soup = BeautifulSoup(content)
+    return soup.find_all("pre", class_="highlighted_source")
+
+
+def pull_code_from_soup(soup_list):
+    """Return [language, snippet] pairs; the language is the tag's first CSS class."""
+    return [[tag['class'][0], tag.get_text()] for tag in soup_list]
+
+
+def make_data(url_list):
+    """Scrape each url and stack the snippets into a single dataframe."""
+    code_snippets = pd.DataFrame(columns=[0, 1])
+    for url in url_list:
+        soup_list = scrape_data(url)
+        url_df = pd.DataFrame(pull_code_from_soup(soup_list))
+        code_snippets = code_snippets.append(url_df, ignore_index=True)
+    return code_snippets
+
+
+def scrape_links():
+    """Collect task links from the Rosetta Code programming-tasks index."""
+    req = urllib.request.Request('http://rosettacode.org/wiki/Category:Programming_Tasks', headers={'User-Agent': 'Mozilla/5.0'})
+    content = urllib.request.urlopen(req).read()
+    soup = BeautifulSoup(content)
+    link_list = [link.get('href') for link in soup.find_all('a')]
+    return ["http://www.rosettacode.org{}".format(link) for link in link_list[1:] if link.startswith('/wiki/')]
+
+
+def make_links_list(num_links=30):
+    return random.sample(scrape_links(), num_links)
+
+
+def scrape(num_links=30, drop_less_than=0, save=False):
+    """Scrape num_links random task pages, drop languages with fewer than
+    drop_less_than snippets, and optionally pickle the result."""
+    df = make_data(make_links_list(num_links))
+    ndf = df[df[0] != 'text']
+    ndf = ndf.groupby(0).filter(lambda x: len(x) >= drop_less_than)
+    if save:
+        ndf.to_pickle('data/{}.pkl'.format(num_links))
+    return ndf
+
+
+def scrape_filter(num_links=30, drop_less_than=0, save=False):
+    """Like scrape, but keep only a fixed set of languages."""
+    languages = ['ada', 'clojure', 'algol68', 'awk', 'bash', 'haskell', 'java',
+                 'javascript', 'lisp', 'objc', 'ocaml', 'php', 'python', 'ruby',
+                 'scala', 'scheme', 'tcl']
+    df = make_data(make_links_list(num_links))
+    df = df[df[0] != 'text']
+    df = df[df[0].isin(languages)]
+    df = df.groupby(0).filter(lambda x: len(x) >= drop_less_than)
+    if save:
+        name = "data/filtered_{}_of_{}.pkl".format(drop_less_than, num_links)
+        df.to_pickle(name)
+    return df
+
+
+def scraper_filter_small(num_links=30, drop_less_than=0, save=False):
+    """Like scrape_filter, but with a smaller language set."""
+    languages = ['clojure', 'haskell', 'java', 'javascript', 'ocaml', 'php',
+                 'python', 'ruby', 'scala', 'scheme', 'tcl']
+    df = make_data(make_links_list(num_links))
+    df = df[df[0] != 'text']
+    df = df[df[0].isin(languages)]
+    df = df.groupby(0).filter(lambda x: len(x) >= drop_less_than)
+    if save:
+        name = "data/smaller_{}_of_{}.pkl".format(drop_less_than, num_links)
+        df.to_pickle(name)
+    return df
+
+
+def load_data(file_name):
+    """Load a dataframe pickled by scrape; file_name is the num_links that was used."""
+    return pd.read_pickle('data/{}.pkl'.format(file_name))
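+
+
+if __name__ == '__main__':
+    # Illustrative smoke test (assumes network access): scrape a small
+    # sample, train the default classifier from Learner.py, and report
+    # held-out accuracy. The counts here are arbitrary.
+    from Learner import Learner
+
+    sample = scrape(num_links=50, drop_less_than=10)
+    classifier = Learner(sample)
+    print('test accuracy:', classifier.test_score())
+    print(classifier.predict('def add(a, b):\n    return a + b'))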