Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions Classifier.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from Scrapper import scrape\n",
"from Scrapper import load_data\n",
"from Learner import Learner"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df2 = load_data(200)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"classifier = Learner(df2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.36491387126019947"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.test_score()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.57978241160471444"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classifier.train_score()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from FeatureVectorizer import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"longest_run_of_character_feature('hello . work.dl. in. the junghle.')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
76 changes: 76 additions & 0 deletions FeatureVectorizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
import re
import itertools


def longest_run_of_capital_letters_feature(text):
    """Return the length of the longest run of ASCII capital letters.

    The result is wrapped in a one-element list so that it can be
    concatenated with the output of other featurizers.  When ``text``
    contains no capital letter at all, ``[0]`` is returned.
    """
    # max() with a default replaces the sort-then-take-last approach and
    # handles the "no match" case without a separate branch.
    longest = max((len(run) for run in re.findall(r"[A-Z]+", text)), default=0)
    return [longest]


def longest_run_of_character_feature(text):
    """Return the longest run length of each tracked character in ``text``.

    One regular expression per tracked character is applied to ``text``.
    The returned list holds, in pattern order, the length of the longest
    match of that pattern, or 0 when the pattern never matches.
    """
    # Raw strings keep the regex escapes (``\.``, ``\|``, ...) from being
    # parsed as (invalid) Python string escapes.
    # NOTE(review): ';+' appears twice; both entries are kept so the feature
    # vector keeps its length of 12 -- confirm whether another character
    # was intended for the second one.
    patterns = [
        r'~+', r'\.+', r'\|+', r';+', r'\:+', r';+',
        r'\$+', r'\(+', r'\)+', r'\-+', r'\s+', r'\t+',
    ]
    # The emptiness check must apply to the matches of the *current*
    # pattern; max(..., default=0) does exactly that.
    return [
        max((len(match) for match in re.findall(pattern, text)), default=0)
        for pattern in patterns
    ]


def percent_character_feature(text):
    """Return, for each tracked character, its share of ``text``.

    Each entry is ``text.count(char) / len(text)``, i.e. a fraction
    between 0 and 1, listed in the order of the tracked characters.
    Empty ``text`` yields all zeros instead of raising
    ``ZeroDivisionError``.
    """
    chars = [
        ".", "|", "$", "_", "!", "#", "@", "%", "^", "&", "*", "(",
        ")", "+", "=", "{", "}", "[", "]", ":", ";", "?", "<", ">",
    ]
    total = len(text)
    if not total:
        # Nothing to count; avoid dividing by zero.
        return [0.0] * len(chars)
    return [text.count(char) / total for char in chars]


def percent_character_combinations(text):
    """Return the match density of each tracked pattern in ``text``.

    For every regular expression in the pattern list, the entry is the
    number of non-overlapping matches divided by ``len(text)``, or 0 when
    the pattern does not match.  Empty ``text`` yields all zeros.

    Patterns are applied without ``re.MULTILINE``, so a leading ``^``
    anchors only at the very start of ``text``.
    """
    # Raw strings keep regex escapes such as ``\-`` or ``\$`` from being
    # treated as invalid Python string escapes; ``\n`` and ``\t`` are
    # resolved by the regex engine to the same newline/tab characters.
    # NOTE(review): '^@w+' matches a literal 'w'; '^@\w+' may have been
    # intended -- left unchanged pending confirmation.
    patterns = [
        r"==", r"\->+", r":\-+", r"\+=", r"\n\t+if", r"\n+", r"\n\$+",
        r"\n\t+", r"\ndef", r"%{", r"~=", r"\|\|", r"\n\t+\(\w+", r"^\$",
        r"\.=", r"\{:", r"===", r"!==", r"\*\w+", r"__", r"__name__",
        r"__main__", r"^\#", r"^def", r"^@w+", r"^@end", r"^begin",
        r"^end", r"^functions", r"^loop\n", r"^procedure", r"^func",
        r"\+\+",
    ]
    total = len(text)
    if not total:
        # No pattern can match an empty string, and there is nothing to
        # divide by.
        return [0] * len(patterns)
    return [len(re.findall(pattern, text)) / total for pattern in patterns]

def binary_character_combinations(text):
    """Return a 0/1 presence flag for each tracked pattern in ``text``.

    For every regular expression in the pattern list, the entry is 1 when
    the pattern matches somewhere in ``text`` and 0 otherwise.

    Patterns are applied without ``re.MULTILINE``, so a leading ``^``
    anchors only at the very start of ``text``.
    """
    # Raw strings keep regex escapes such as ``\-`` or ``\$`` from being
    # treated as invalid Python string escapes; ``\n`` and ``\t`` are
    # resolved by the regex engine to the same newline/tab characters.
    # NOTE(review): '^@w+' matches a literal 'w'; '^@\w+' may have been
    # intended -- left unchanged pending confirmation.
    patterns = [
        r"==", r"\->+", r":\-+", r"\+=", r"\n\t+if", r"\n+", r"\n\$+",
        r"\n\t+", r"\ndef", r"%{", r"~=", r"\|\|", r"\n\t+\(\w+", r"^\$",
        r"\.=", r"\{:", r"===", r"!==", r"\*\w+", r"__", r"__name__",
        r"__main__", r"^\#", r"^def", r"^@w+", r"^@end", r"^begin",
        r"^end", r"^functions", r"^loop\n", r"^procedure", r"^func",
        r"\+\+",
    ]
    # Only presence matters, so search() (stops at the first hit) is
    # enough; collecting every match is unnecessary.
    return [1 if re.search(pattern, text) else 0 for pattern in patterns]


class FunctionFeaturizer(TransformerMixin):
    """Transformer that builds one flat feature vector per datum.

    It is constructed from any number of featurizer callables.  Each
    callable receives a single datum and returns an iterable of feature
    values; the outputs of all callables are concatenated in order.
    """

    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        """Do nothing and return this object; there is no state to learn."""
        return self

    def transform(self, X):
        """Return a list holding one flat feature list for every item of X."""
        vectors = []
        for sample in X:
            # Run every featurizer on the sample, then flatten the pieces.
            pieces = [extract(sample) for extract in self.featurizers]
            vectors.append(list(itertools.chain.from_iterable(pieces)))
        return vectors
Loading