diff --git a/Classifier.ipynb b/Classifier.ipynb new file mode 100644 index 0000000..139c251 --- /dev/null +++ b/Classifier.ipynb @@ -0,0 +1,146 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from Scrapper import scrape\n", + "from Scrapper import load_data\n", + "from Learner import Learner" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df2 = load_data(200)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "classifier = Learner(df2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.36491387126019947" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.test_score()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.57978241160471444" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.train_score()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from FeatureVectorizer import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[0]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "longest_run_of_character_feature('hello . work.dl. in. 
the junghle.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/FeatureVectorizer.py b/FeatureVectorizer.py
new file mode 100644
index 0000000..e17ad50
--- /dev/null
+++ b/FeatureVectorizer.py
@@ -0,0 +1,77 @@
+from sklearn.pipeline import make_pipeline, make_union
+from sklearn.base import TransformerMixin
+import re
+import itertools
+
+
+def longest_run_of_capital_letters_feature(text):
+    """Find the longest run of capital letters and return its length."""
+    runs = sorted(re.findall(r"[A-Z]+", text), key=len)
+    if runs:
+        return [len(runs[-1])]
+    else:
+        return [0]
+
+
+def longest_run_of_character_feature(text):
+    """Return the length of the longest run of each special-character class."""
+    chars = ['~+', '\.+', '\|+', ';+', '\:+', '\$+', '\(+', '\)+', '\-+', '\s+', '\t+']
+    runs = []
+    for i in chars:
+        run = sorted(re.findall(i, text), key=len)
+        if run:
+            runs.append(len(run[-1]))
+        else:
+            runs.append(0)
+    return runs
+
+
+def percent_character_feature(text):
+    """Return the frequency of each character relative to total text length."""
+    chars = [".", "|", "$", "_", "!", "#", "@", "%", "^", "&", "*", "(", ")", "+", "=", "{", "}", "[", "]", ":", ";", "?", "<", ">"]
+
+    return [text.count(i) / max(len(text), 1) for i in chars]
+
+
+def percent_character_combinations(text):
+    """Return the frequency of each character combination relative to total text length."""
+    chars = ["==", "\->+", ":\-+", "\+=", "\n\t+if", "\n+", "\n\$+", "\n\t+", "\ndef", "%{", "~=", "\|\|", "\n\t+\(\w+", "^\$", "\.=", "\{:", "===", "!==", "\*\w+", "__", "__name__", "__main__", "^\#", "^def", "^@w+", "^@end", "^begin", "^end", "^functions", "^loop\n", "^procedure", "^func", "\+\+"]
+    runs = []
+    for i in chars:
+        run = re.findall(i, text)
+        if run:
+            runs.append(len(run) / len(text))
+        else:
+            runs.append(0)
+    return runs
+
+def binary_character_combinations(text):
+    """Return 1 for each character combination that occurs in the text, 0 otherwise."""
+    chars = ["==", "\->+", ":\-+", "\+=", "\n\t+if", "\n+", "\n\$+", "\n\t+", "\ndef", "%{", "~=", "\|\|", "\n\t+\(\w+", "^\$", "\.=", "\{:", "===", "!==", "\*\w+", "__", "__name__", "__main__", "^\#", "^def", "^@w+", "^@end", "^begin", "^end", "^functions", "^loop\n", "^procedure", "^func", "\+\+"]
+    runs = []
+    for i in chars:
+        run = re.findall(i, text)
+        if run:
+            runs.append(1)
+        else:
+            runs.append(0)
+    return runs
+
+
+class FunctionFeaturizer(TransformerMixin):
+    def __init__(self, *featurizers):
+        self.featurizers = featurizers
+
+    def fit(self, X, y=None):
+        """All SciKit-Learn compatible transformers and classifiers have the
+        same interface.
`fit` always returns the same object.""" + return self + + def transform(self, X): + """Given a list of original data, return a list of feature vectors.""" + fvs = [] + for datum in X: + fv = [f(datum) for f in self.featurizers] + a = list(itertools.chain(*fv)) + fvs.append(a) + return fvs diff --git a/Featurizer.ipynb b/Featurizer.ipynb new file mode 100644 index 0000000..7c7e747 --- /dev/null +++ b/Featurizer.ipynb @@ -0,0 +1,417 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from Scrapper import load_data\n", + "from Scrapper import *\n", + "from Learner import Learner\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from FeatureVectorizer import *\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = load_data(500)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "y = df.loc[:,0]\n", + "X = df.loc[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1 #include #include #include ...\n", + "2  #include #include #include...\n", + "4 import std.stdio, std.array, std.conv, std.alg...\n", + "5 import std.stdio, std.conv, std.ascii, std.arr...\n", + "6  -module( solve_hidato_puzzle ). 
-export( [cre...\n", + "Name: 1, dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "language_featurizer = make_union(CountVectorizer(), FunctionFeaturizer(longest_run_of_capital_letters_feature, longest_run_of_character_feature,\n", + " percent_character_combinations, binary_character_combinations,\n", + " percent_character_feature))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pipe = make_pipeline(language_featurizer, RandomForestClassifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.71267707954958226" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['go'], dtype=object)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.predict(['~='])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "multi_pipe = make_pipeline(language_featurizer, MultinomialNB())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...ormer_weights=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multi_pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.59898292771521977" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multi_pipe.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "language_featurizer2 = make_union(CountVectorizer(), FunctionFeaturizer(longest_run_of_capital_letters_feature, longest_run_of_character_feature,\n", + " percent_character_combinations, binary_character_combinations,\n", + " percent_character_feature))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pipe2 = make_pipeline(language_featurizer2, RandomForestClassifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))])" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe2.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.71758082092262987" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe2.score(X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Making a smaller data set with smaller number of languages" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dfilter = scrape_filter(500, 50, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df_small= scraper_filter_small(700, 100, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "URLError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mdo_open\u001b[0;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[1;32m 1181\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1182\u001b[0;31m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselector\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1183\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# timeout error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1087\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1088\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1089\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1125\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'iso-8859-1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1126\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36mendheaders\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 1083\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1084\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1085\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 921\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 922\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 923\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 856\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_open\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 857\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 858\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 833\u001b[0m self.sock = self._create_connection((self.host,self.port),\n\u001b[0;32m--> 834\u001b[0;31m self.timeout, self.source_address)\n\u001b[0m\u001b[1;32m 835\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/socket.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address)\u001b[0m\n\u001b[1;32m 511\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 512\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 513\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/socket.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address)\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mOSError\u001b[0m: [Errno 50] Network is down", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mURLError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mscrape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mscrape\u001b[0;34m(num_links, drop_less_than, save)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscrape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_links\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdrop_less_than\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msave\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmake_links_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_links\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0mndf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m 
\u001b[0mndf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mndf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mdrop_less_than\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mmake_data\u001b[0;34m(url_list)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mcode_snippets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0murl\u001b[0m \u001b[0;32min\u001b[0m \u001b[0murl_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0msoup_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscrape_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0murl_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpull_code_from_soup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msoup_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0mcode_snippets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcode_snippets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mscrape_data\u001b[0;34m(url)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscrape_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'User-Agent'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Mozilla/5.0'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m\"pre\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclass_\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"highlighted_source\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(url, data, timeout, cafile, capath, cadefault, context)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0mopener\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_opener\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 161\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopener\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 462\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 463\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 464\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;31m# post-process response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36m_open\u001b[0;34m(self, req, data)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 480\u001b[0m result = self._call_chain(self.handle_open, protocol, protocol +\n\u001b[0;32m--> 481\u001b[0;31m '_open', req)\n\u001b[0m\u001b[1;32m 482\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36m_call_chain\u001b[0;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhandler\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mhandlers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mhttp_open\u001b[0;34m(self, req)\u001b[0m\n\u001b[1;32m 1208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1209\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mhttp_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1210\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhttp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1211\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1212\u001b[0m \u001b[0mhttp_request\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAbstractHTTPHandler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_request_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mdo_open\u001b[0;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[1;32m 1182\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselector\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1183\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# timeout error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1184\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mURLError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1185\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1186\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mURLError\u001b[0m: " + ] + } + ], + "source": [ + "scrape(700, 100, True)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Learner.py b/Learner.py new file mode 100644 index 0000000..4734ac0 --- /dev/null +++ b/Learner.py @@ -0,0 +1,66 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from sklearn import linear_model +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.cross_validation import train_test_split +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.naive_bayes import GaussianNB +from sklearn.naive_bayes import BernoulliNB +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.tree import DecisionTreeClassifier + + +class Learner(object): + algorithms = {'m':MultinomialNB(), + 'b':BernoulliNB(), + 'r':RandomForestClassifier(), + 'f':RandomForestClassifier(), + 'k':KNeighborsClassifier(), + 'n':KNeighborsClassifier(), + 'p':MultinomialNB(), + 'd':DecisionTreeClassifier() + } + + '''Takes a dataframe with outcomes on first column and predictor second column + Makes available the score and predict methods + ''' + def __init__(self, dataframe, alg='NBayes'): + self.outcome, self.predictor = self.split_data(dataframe) + self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.predictor, self.outcome, test_size=0.33) + self.pipe = Pipeline([('bag_of_words', CountVectorizer()), ('bayes', self.get_algorithm(alg))]) + self.fit() + + + def fit(self): + self.pipe.fit(self.X_train, self.y_train) + + + def test_score(self): + return self.pipe.score(self.X_test, self.y_test) + + + def train_score(self): + return self.pipe.score(self.X_train, self.y_train) + + + def predict(self, string): + return self.pipe.predict([string]) + + + def split_data(self, data): + return data.loc[:,0], data.loc[:,1] + + + def classification_report(self): + return classification_report(self.pipe.predict(self.X_test), self.y_test) + + def get_algorithm(self, algorithmchoice): + return self.algorithms[algorithmchoice[0].lower()] + diff --git a/README.md b/README.md index 394f93b..543101e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,40 @@ +## Programming Language Classifier + +Given a code snippet, the classifier will try to predict what language it is. + +## Steps taken to develop the classifier +### Scrapper +1. Created a scrapper that would collect different code snippets from the web, specifically from [Rosetta Code] (www.rosettacode.org/wiki/Rosetta_Code) +2. The scrapper collects these code snippets by different tasks for each of the programing languages. +3. For the scrapper to work, give it a minimum number of links to scrape and if you want to limit the languages retrieved by frequency, you can give it the minimum number of observations for the language to be in the classifier. 
+
+The general call is:
+
+`from Scrapper import scrape`
+
+`scrape(num_links, drop_less_than, save)`
+
+You can keep the returned dataframe in a variable for the next steps, or reload a saved `.pkl` file later with `load_data`, passing the number of links used as the file name.
+
+### Classifier
+To use the classifier, import the `Learner` class and instantiate it with a dataframe:
+
+`from Learner import Learner`
+
+`from Scrapper import load_data`
+
+`dataframe = load_data(500)`
+
+`classifier = Learner(dataframe, alg)`
+
+The first letter of `alg` selects the algorithm: `m` or `p` for MultinomialNB, `b` for BernoulliNB, `r` or `f` for RandomForestClassifier, `k` or `n` for KNeighborsClassifier, and `d` for DecisionTreeClassifier.
+
+Learner splits the data (`test_size=0.33`), fits a bag-of-words pipeline, and makes available the methods `test_score()`, `train_score()`, `predict(string)`, and `classification_report()`, which you can call on the learner object directly.
+
+### VectorFeaturizer
+The feature vectorizer builds more specific feature vectors to increase the accuracy of the classifier. It mainly adds character-class features that distinguish languages more explicitly, because most general features are common across languages. A sketch of how it plugs into a pipeline follows.
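+
+A minimal sketch, mirroring Featurizer.ipynb, that unions the hand-built features with a bag of words (it assumes `data/500.pkl` exists):
+
+```python
+from sklearn.pipeline import make_pipeline, make_union
+from sklearn.cross_validation import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.ensemble import RandomForestClassifier
+from FeatureVectorizer import (FunctionFeaturizer,
+                               longest_run_of_capital_letters_feature,
+                               percent_character_feature)
+from Scrapper import load_data
+
+df = load_data(500)
+X_train, X_test, y_train, y_test = train_test_split(df.loc[:, 1], df.loc[:, 0])
+
+featurizer = make_union(CountVectorizer(),
+                        FunctionFeaturizer(longest_run_of_capital_letters_feature,
+                                           percent_character_feature))
+pipe = make_pipeline(featurizer, RandomForestClassifier())
+pipe.fit(X_train, y_train)
+print(pipe.score(X_test, y_test))
+```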
+
+### Passing the tests
+Still to do: import the test files and run them against the data scraped from Rosetta Code.
 # Classify code snippets into programming languages
 
 ## Description
diff --git a/Scrapper.py b/Scrapper.py
new file mode 100644
index 0000000..9281b81
--- /dev/null
+++ b/Scrapper.py
@@ -0,0 +1,105 @@
+from bs4 import BeautifulSoup
+import urllib.request
+import pandas as pd
+import random
+
+
+def get_text(url):
+    """Take a url and return the page text."""
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+    content = urllib.request.urlopen(req).read()
+    page_text = BeautifulSoup(content)
+    return page_text.get_text()
+
+
+def scrape_data(url):
+    """Return soup objects for every <pre> tag with class highlighted_source."""
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+    content = urllib.request.urlopen(req).read()
+    soup = BeautifulSoup(content)
+    return soup.find_all("pre", class_="highlighted_source")
+
+
+def pull_code_from_soup(soup_list):
+    """Return [language, snippet] pairs; the language is the tag's first CSS class."""
+    return [[tag['class'][0], tag.get_text()] for tag in soup_list]
+
+
+def make_data(url_list):
+    """Scrape each url and stack the snippets into a single dataframe."""
+    code_snippets = pd.DataFrame(columns=[0, 1])
+    for url in url_list:
+        soup_list = scrape_data(url)
+        url_df = pd.DataFrame(pull_code_from_soup(soup_list))
+        code_snippets = code_snippets.append(url_df, ignore_index=True)
+    return code_snippets
+
+
+def scrape_links():
+    """Collect task links from the Rosetta Code programming-tasks index."""
+    req = urllib.request.Request('http://rosettacode.org/wiki/Category:Programming_Tasks', headers={'User-Agent': 'Mozilla/5.0'})
+    content = urllib.request.urlopen(req).read()
+    soup = BeautifulSoup(content)
+    link_list = [link.get('href') for link in soup.find_all('a')]
+    return ["http://www.rosettacode.org{}".format(link) for link in link_list[1:] if link.startswith('/wiki/')]
+
+
+def make_links_list(num_links=30):
+    return random.sample(scrape_links(), num_links)
+
+
+def scrape(num_links=30, drop_less_than=0, save=False):
+    """Scrape num_links random task pages, drop languages with fewer than
+    drop_less_than snippets, and optionally pickle the result."""
+    df = make_data(make_links_list(num_links))
+    ndf = df[df[0] != 'text']
+    ndf = ndf.groupby(0).filter(lambda x: len(x) >= drop_less_than)
+    if save:
+        ndf.to_pickle('data/{}.pkl'.format(num_links))
+    return ndf
+
+
+def scrape_filter(num_links=30, drop_less_than=0, save=False):
+    """Like scrape, but keep only a fixed set of languages."""
+    languages = ['ada', 'clojure', 'algol68', 'awk', 'bash', 'haskell', 'java',
+                 'javascript', 'lisp', 'objc', 'ocaml', 'php', 'python', 'ruby',
+                 'scala', 'scheme', 'tcl']
+    df = make_data(make_links_list(num_links))
+    df = df[df[0] != 'text']
+    df = df[df[0].isin(languages)]
+    df = df.groupby(0).filter(lambda x: len(x) >= drop_less_than)
+    if save:
+        name = "data/filtered_{}_of_{}.pkl".format(drop_less_than, num_links)
+        df.to_pickle(name)
+    return df
+
+
+def scraper_filter_small(num_links=30, drop_less_than=0, save=False):
+    """Like scrape_filter, but with a smaller language set."""
+    languages = ['clojure', 'haskell', 'java', 'javascript', 'ocaml', 'php',
+                 'python', 'ruby', 'scala', 'scheme', 'tcl']
+    df = make_data(make_links_list(num_links))
+    df = df[df[0] != 'text']
+    df = df[df[0].isin(languages)]
+    df = df.groupby(0).filter(lambda x: len(x) >= drop_less_than)
+    if save:
+        name = "data/smaller_{}_of_{}.pkl".format(drop_less_than, num_links)
+        df.to_pickle(name)
+    return df
+
+
+def load_data(file_name):
+    """Load a dataframe pickled by scrape; file_name is the num_links that was used."""
+    return pd.read_pickle('data/{}.pkl'.format(file_name))
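+
+
+if __name__ == '__main__':
+    # Illustrative smoke test (assumes network access): scrape a small
+    # sample, train the default classifier from Learner.py, and report
+    # held-out accuracy. The counts here are arbitrary.
+    from Learner import Learner
+
+    sample = scrape(num_links=50, drop_less_than=10)
+    classifier = Learner(sample)
+    print('test accuracy:', classifier.test_score())
+    print(classifier.predict('def add(a, b):\n    return a + b'))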