tiyd-python-2015-05 · sovello · Jun 5, 2015 · Jun 6, 2015 · Jun 8, 2015 · Jun 8, 2015
diff --git a/Classifier.ipynb b/Classifier.ipynb
@@ -0,0 +1,146 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from Scrapper import scrape\n",
+    "from Scrapper import load_data\n",
+    "from Learner import Learner"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df2 = load_data(200)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "classifier = Learner(df2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.36491387126019947"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "classifier.test_score()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.57978241160471444"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "classifier.train_score()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from FeatureVectorizer import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[0]"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "longest_run_of_character_feature('hello . work.dl. in. the junghle.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/FeatureVectorizer.py b/FeatureVectorizer.py
@@ -0,0 +1,76 @@
+from sklearn.pipeline import make_pipeline, make_union
+from sklearn.base import TransformerMixin
+import re
+import itertools
+
+
+def longest_run_of_capital_letters_feature(text):
+    """Find the longest run of capital letters and return their length."""
+    runs = sorted(re.findall(r"[A-Z]+", text), key=len)
+    if runs:
+        return [len(runs[-1])]
+    else:
+        return [0]
+
+
+def longest_run_of_character_feature(text):
+    chars = ['~+', '\.+', '\|+', ';+', '\:+', ';+', '\$+', '\(+', '\)+', '\-+', '\s+', '\t+']
+    runs = []
+    for i in chars:
+        run = sorted(re.findall(r'{}'.format(i), text), key=len)
+    if runs:
+        runs.append(len(run[-1]))
+    else:
+        runs.append(0)
+    return runs
+
+
+def percent_character_feature(text):
+    """Return percentage of text that is a particular char compared to total text length."""
+    chars = [".", "|", "$", "_", "!", "#", "@", "%", "^", "&", "*", "(", ")","+", "=", "{", "}", "[", "]", ":", ";", "?", "<", ">"]
+
+    return [text.count(i)/len(text) for i in chars]
+
+
+def percent_character_combinations(text):
+    """Return percentage of text that is a particular char compared to total text length."""
+    chars = ["==", "\->+", ":\-+", "\+=", "\n\t+if", "\n+", "\n\$+", "\n\t+", "\ndef", "%{", "~=", "\|\|", "\n\t+\(\w+", "^\$", "\.=", "\{:", "===", "!==", "\*\w+", "__", "__name__", "__main__", "^\#", "^def", "^@w+", "^@end", "^begin", "^end", "^functions", "^loop\n", "^procedure", "^func","\+\+"]
+    runs = []
+    for i in chars:
+        run = re.findall(r'{}'.format(i), text)
+        if run:
+            runs.append(len(run)/len(text))
+        else:
+            runs.append(0)
+    return runs
+
+def binary_character_combinations(text):
+    '''Return binary of text that is particular char to total length of text'''
+    chars = ["==", "\->+", ":\-+", "\+=", "\n\t+if", "\n+", "\n\$+", "\n\t+", "\ndef", "%{", "~=", "\|\|","\n\t+\(\w+", "^\$", "\.=", "\{:", "===", "!==", "\*\w+", "__", "__name__", "__main__", "^\#", "^def", "^@w+", "^@end", "^begin", "^end", "^functions", "^loop\n", "^procedure", "^func","\+\+"]
+    runs = []
+    for i in chars:
+        run = re.findall(r'{}'.format(i), text)
+        if run:
+            runs.append(1)
+        else:
+            runs.append(0)
+    return runs
+
+
+class FunctionFeaturizer(TransformerMixin):
+    def __init__(self, *featurizers):
+        self.featurizers = featurizers
+
+    def fit(self, X, y=None):
+        """All SciKit-Learn compatible transformers and classifiers have the
+        same interface. `fit` always returns the same object."""
+        return self
+
+    def transform(self, X):
+        """Given a list of original data, return a list of feature vectors."""
+        fvs = []
+        for datum in X:
+            fv = [f(datum) for f in self.featurizers]
+            a = list(itertools.chain(*fv))
+            fvs.append(a)
+        return fvs