Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,13 @@ docs/_build/
# PyBuilder
target/

.direnv/
.envrc
.idea/
.DS_Store
scraper_500x100.pkl
scraper_50x10.pkl
scraper_50x1_17.pkl
scraper_700x1_17.pkl
test_X_values.pkl
test_y_values.pkl
6,549 changes: 6,549 additions & 0 deletions bs4_scratch_work.ipynb

Large diffs are not rendered by default.

337 changes: 337 additions & 0 deletions classification_accuracy_by_dataframe_size.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from scraper import pipeline_runner\n",
"from feature_vectors import *\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Small Dataframe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Only using CountVectorizer, no additional regex feature vectors"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_50x10 = pd.read_pickle('scraper_50x10.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_mnb = pipeline_runner(df_50x10, 'Multinomial')\n",
"df_knn = pipeline_runner(df_50x10, 'KNeighbors')\n",
"df_forest = pipeline_runner(df_50x10, 'Forest')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"df_mnb: (0.89317507418397624, 0.51333333333333331)\n",
"df_knn: (0.49406528189910981, 0.31777777777777777)\n",
"df_forest: (0.99183976261127593, 0.62666666666666671)\n"
]
}
],
"source": [
"print(\"df_mnb: {}\".format(df_mnb))\n",
"print(\"df_knn: {}\".format(df_knn))\n",
"print(\"df_forest: {}\".format(df_forest))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Scores are low. Random Forest is the most accurate, with a mean score of about 0.63."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Larger Dataframe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Still no additional feature vectorizers"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#see scraper.py for how the 500x100 dataframe was pulled from RosettaCode. \n",
"df_500x100 = pd.read_pickle('scraper_500x100.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_mnb_500 = pipeline_runner(df_500x100, 'Multinomial')\n",
"df_knn_500 = pipeline_runner(df_500x100, 'KNeighbors')\n",
"df_forest_500 = pipeline_runner(df_500x100, 'Forest')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"df_mnb_500: (0.80047166883882581, 0.62267311988086371)\n",
"df_knn_500: (0.60628064295910133, 0.40171258376768426)\n",
"df_forest_500: (0.98907714267982372, 0.72040208488458679)\n"
]
}
],
"source": [
"print(\"df_mnb_500: {}\".format(df_mnb_500))\n",
"print(\"df_knn_500: {}\".format(df_knn_500))\n",
"print(\"df_forest_500: {}\".format(df_forest_500))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Forest, again, produces the best mean accuracy."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Adding additional feature vectors to improve accuracy"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X = df_500x100.loc[:, 1]\n",
"y = df_500x100.loc[:, 0]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y) "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"language_featurizer = make_union(CountVectorizer(),\n",
" FunctionFeaturizer(longest_run_of_capital_letters_feature,\n",
" longest_run_of_character_feature,\n",
" percent_character_combinations,\n",
" percent_character_feature,\n",
" binary_character_combinations))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"<21485x67251 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 833585 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"language_featurizer.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pipe = make_pipeline(language_featurizer, RandomForestClassifier())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n",
" transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ...n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False))])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.76954579300074455"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe.score(X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"The mean test accuracy is about 0.77 using only the data pulled from RosettaCode (not using the test data from the file)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Accuracy improved by about 0.05 (from 0.72 to 0.77) by adding the additional feature vectors."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading