From 399d7bd22f93934c7792803598dd2639e154e23e Mon Sep 17 00:00:00 2001 From: CarlosJun Date: Tue, 19 Dec 2017 20:32:47 -0300 Subject: [PATCH] =?UTF-8?q?solu=C3=A7=C3=A3o=20decision=20tree=20-=20Franc?= =?UTF-8?q?isco=20Carlos=20Freire?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adicionando minha solução do decision tree. Aluno: Francisco Carlos Freire Nunes Junior --- .../decision_tree_Francisco_Carlos.ipynb | 712 ++++++++++++++++++ 1 file changed, 712 insertions(+) create mode 100644 2017/07-decision-tree/decision_tree_Francisco_Carlos.ipynb diff --git a/2017/07-decision-tree/decision_tree_Francisco_Carlos.ipynb b/2017/07-decision-tree/decision_tree_Francisco_Carlos.ipynb new file mode 100644 index 0000000..45fade1 --- /dev/null +++ b/2017/07-decision-tree/decision_tree_Francisco_Carlos.ipynb @@ -0,0 +1,712 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### Atividades\n", + "\n", + "1. Utilizamos a medida de Entropia como fator de decisão (medida de impureza de um nó). Teste o mesmo conjunto \n", + "randômico de dados para a medida Gini e compare os resultados.\n", + "Ref1.: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier\n", + "Ref2.: https://en.wikipedia.org/wiki/Decision_tree_learning\n", + "\n", + "2. Faça o balanceamento dos dados contidos em \"train.csv\", aplique o algoritmo de Decision Tree e faça a submissão no kaggle. Tente melhorar o resultado obtido em sala de aula (posição 3100 no leaderboard).\n", + "Dataset: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction\n", + "\n", + "3. (Opcional) Execute uma Random Forest na competição do Kaggle e veja se a acurácia melhora. Utilize 10, 100 ou 1000 árvores (dependendo de quanto o seu computador aguentar =]): http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Atividade 1" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [], + "source": [ + "#Importa as bibliotecas\n", + "import pandas as pd\n", + "import math\n", + "import numpy as np\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.model_selection import train_test_split \n", + "from sklearn.ensemble import RandomForestClassifier\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtargetps_ind_01ps_ind_02_catps_ind_03ps_ind_04_catps_ind_05_catps_ind_06_binps_ind_07_binps_ind_08_bin...ps_calc_11ps_calc_12ps_calc_13ps_calc_14ps_calc_15_binps_calc_16_binps_calc_17_binps_calc_18_binps_calc_19_binps_calc_20_bin
07022510010...9158011001
19011700001...3119011010
213054910001...4277011010
316001200100...2249000000
417002010100...3113000110
\n", + "

5 rows × 59 columns

\n", + "
" + ], + "text/plain": [ + " id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat \\\n", + "0 7 0 2 2 5 1 \n", + "1 9 0 1 1 7 0 \n", + "2 13 0 5 4 9 1 \n", + "3 16 0 0 1 2 0 \n", + "4 17 0 0 2 0 1 \n", + "\n", + " ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ... \\\n", + "0 0 0 1 0 ... \n", + "1 0 0 0 1 ... \n", + "2 0 0 0 1 ... \n", + "3 0 1 0 0 ... \n", + "4 0 1 0 0 ... \n", + "\n", + " ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin \\\n", + "0 9 1 5 8 0 \n", + "1 3 1 1 9 0 \n", + "2 4 2 7 7 0 \n", + "3 2 2 4 9 0 \n", + "4 3 1 1 3 0 \n", + "\n", + " ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin \\\n", + "0 1 1 0 0 \n", + "1 1 1 0 1 \n", + "2 1 1 0 1 \n", + "3 0 0 0 0 \n", + "4 0 0 1 1 \n", + "\n", + " ps_calc_20_bin \n", + "0 1 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + "[5 rows x 59 columns]" + ] + }, + "execution_count": 187, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Essa atividade utiliza o dataset da porto seguro, \n", + "#caso seja necessário reproduzir os resultados, por favor \n", + "#baixar o dataset em kaggle.com\n", + "data = pd.read_csv('train.csv')\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Separa o dataset em classes e features\n", + "data = data.sample(frac=1).reset_index(drop=True)\n", + "X_data = data.iloc[:,2:]\n", + "y_data = data.iloc[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": {}, + "outputs": [], + "source": [ + "#Separa os dados em treino e teste e validação\n", + "X_train, X_test, y_train, y_test = train_test_split(x_data,y_data.values.flatten(), test_size=0.25)" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,\n", + " max_features=None, max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n", + " splitter='best')" + ] + }, + "execution_count": 190, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Carrega o decision tree com entropia\n", + "dec_tree_entropy = DecisionTreeClassifier(criterion='entropy')\n", + "dec_tree_entropy.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": {}, + "outputs": [], + "source": [ + "#Armazena o resultado para entropia\n", + "result_entropy = dec_tree_entropy.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", + " max_features=None, max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n", + " splitter='best')" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Carrega o decision tree com gini\n", + "dec_tree_gini = DecisionTreeClassifier(criterion='gini')\n", + "dec_tree_gini.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Armazena o resultado para gini\n", + "result_gini = dec_tree_gini.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "O resulta para decision tree com fator de decisão\n", + "entropy foi 0.9222932333353494, e para o gini 0.9167153888026451\n" + ] + } + ], + "source": [ + "print('O resulta para decision tree com fator de decisão\\nentropy foi {0}, e para o gini {1}'.format(result_entropy,result_gini))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Em geral, o resultado para o fator de decisão entropia apresentou melhores resultados em comparação com gini. Os valores variam de acordo com a execução, então em algumas execuções o fator gini pode apresentar melhores resultados. Os valores obtidos são apresentados acima." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Atividade 2" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Número de dados de classe 0 = 573518 e classe 1 = 21694\n" + ] + } + ], + "source": [ + "#Verifica o número de dados de cada classe\n", + "print('Número de dados de classe 0 = {0} e classe 1 = {1}'.format(sum(y_data == 0),sum(y_data == 1)))" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Função de balanceamento das classes\n", + "def balanced_subsample(x,y,subsample_size=1.0):\n", + "\n", + " class_xs = []\n", + " min_elems = None\n", + "\n", + " for yi in np.unique(y):\n", + " elems = x[(y == yi)]\n", + " class_xs.append((yi, elems))\n", + " if min_elems == None or elems.shape[0] < min_elems:\n", + " min_elems = elems.shape[0]\n", + "\n", + " use_elems = min_elems\n", + " if subsample_size < 1:\n", + " use_elems = int(min_elems*subsample_size)\n", + "\n", + " xs = []\n", + " ys = []\n", + "\n", + " for ci,this_xs in class_xs:\n", + " if len(this_xs) > use_elems:\n", + " this_xs = this_xs.reindex(np.random.permutation(this_xs.index))\n", + "\n", + " x_ = this_xs[:use_elems]\n", + " y_ = np.empty(use_elems)\n", + " y_.fill(ci)\n", + "\n", + " xs.append(x_)\n", + " ys.append(y_)\n", + "\n", + " xs = pd.concat(xs)\n", + " ys = pd.Series(data=np.concatenate(ys),name='target')\n", + "\n", + " return xs,ys" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": {}, + "outputs": [], + "source": [ + "#Balanceamento dos dados em relação a classe\n", + "X_train_balanced,y_train_balanced = balanced_subsample(X_data,y_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Número de dados de classe 0 = 21694 e classe 1 = 21694\n" + ] + } + ], + "source": [ + "#Verifica novamente o número de dados de cada classe\n", + "print('Número de dados de classe 0 = {0} e classe 1 = {1}'.format(sum(y_train_balanced == 0),sum(y_train_balanced == 1)))" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Separa os dados em treino e teste e validação\n", + "X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_train_balanced,y_train_balanced.values.flatten(), test_size=0.25)" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,\n", + " max_features=None, max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n", + " splitter='best')" + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Executa o decision tree para os novos dados balanceados\n", + "dec_tree_entropy = DecisionTreeClassifier(criterion='entropy')\n", + "dec_tree_entropy.fit(X_train_b, y_train_b)" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.520604775513967" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Resultado após o balanceamento\n", + "dec_tree_entropy.score(X_test_b, y_test_b)" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "metadata": {}, + "outputs": [], + "source": [ + "#Carrega e prepara o dataset para submissão \n", + "test_data = pd.read_csv('test.csv')\n", + "pred = dec_tree_entropy.predict(test_data.iloc[:,1:])\n", + "submission = pd.DataFrame()\n", + "submission['id'] = test_data.iloc[:, 0]\n", + "submission['target'] = pred\n", + "submission.to_csv('kaggle_submission.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Após a submissão no kaggle, a posição no rank geral foi de 4889 com score de 0.04633. O balanceamento entre as classes piorou a posição, que anteriormente era de 3100. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Atividade 3" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [], + "source": [ + "#Carrega o random forest com várias árvores\n", + "accuracy = []\n", + "for i in range(1,20):\n", + " rdn_forest = RandomForestClassifier(n_estimators=i, criterion='entropy')\n", + " rdn_forest.fit(X_train_b, y_train_b)\n", + " accuracy.append(rdn_forest.score(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAD8CAYAAAB3u9PLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3X2cXVV97/HPd55CSGIeZsY0JQkE\njA+xxIAjUiwG8WqD9RIebBvgKmpvqdX0yRdew9XS3rSU2tL6yO0tKgK+VMRcH6iNBZoS+7pWKQN5\nIsGEMQqZBGFOEsJMIjOZmd/946yT7JycyZzJTGYm2d/363Ves/daa++z9snJ/p219l5rKyIwMzOr\nGesKmJnZ+OCAYGZmgAOCmZklDghmZgY4IJiZWeKAYGZmgAOCmZklVQUESUskbZXUJmlFhfwzJa2R\ntFHSWkmzM3lzJT0o6UlJWySdldLvkvRTSevTa9FIHZSZmQ2dBhuYJqkW2Aa8DWgHHgWuiYgtmTLf\nAL4bEXdLuhR4X0S8O+WtBW6JiIckTQb6I+KApLvSNqtOwHGZmdkQ1VVR5gKgLSK2A0i6F1gKbMmU\nWQD8SVp+GPh2KrsAqIuIhwAioms4lW1qaoqzzjprOLswM8udxx57rBARzYOVqyYgnAHsyKy3A28s\nK7MBuBr4NHAlMEVSI/BK4AVJ3wTmAf8KrIiIvrTdLZJuBtak9O5jVeSss86itbW1iiqbmVmJpKer\nKVfNNQRVSCvvZ7oRWCxpHbAY2An0Ugw4F6f8NwBnA+9N29wEvDqlzwA+WvHNpRsktUpq7ejoqKK6\nZmZ2PKoJCO3AnMz6bGBXtkBE7IqIqyLiPOBjKW1f2nZdRGyPiF6KXUnnp/xno6gb+BLFrqmjRMQd\nEdESES3NzYO2eMzM7DhVExAeBeZLmiepAVgG3J8tIKlJUmlfNwF3ZradLql0Jr+UdO1B0qz0V8AV\nwBPDORAzMxueQQNC+mW/HHgAeBK4LyI2S1op6fJU7BJgq6RtwEzglrRtH8XuojWSNlHsfvp82uYr\nKW0T0AT85YgdlZmZDdmgt52OJy0tLeGLymZmQyPpsYhoGaycRyqbmRnggGBmZkk14xDMDokIdu17\niS27XqSvv5+6mhrqanXob32tqK2poa5G1NfWUFtTTKurLabV1RSX62tFjcTBvn56evs52Bf09PbT\n09dHd+/RaT29/XRn03r76OkrrtdI1AhqJJT+1ghqa4Skyvk1pXLFtP4oHlsEBEF/P/RHEBxO749K\naUH/CPW6lrpv44j3Kdan1LN7uI5H5kVpQztlXX/RWTROnnBC38MBwY7ppYN9bNq5j3XP7OXxp19g\n3Y69PPfiMccP2hhSpVFDdkq4fNEZDgh2WFd3L3/w1ccpdPUwd8bpzG08nbkzTufMtDxr6kRqa47/\njBAR7NjzC9bt2Mu6Z17g8Wf2smXXi/Smn8BzZ5zOhWc3cv7c6Zw7eyqn1dXS1x8c7O+nty/o7eun\ntz/o7S/+cu/rDw72FfOOKNd/uGxDbQ0NdTXUp78NdTU01NYwoULaEX/Tqy4db+mXen8E0Q99ESnt\nyF/y/f3F9b5DecX0GoEkxJEtB6X0Qy0MUrlMS0So8vDN46Aj3qe479JJPrt+qB4crqPZcDkgnCQi\ngo9/axPf39bBRec0sXnXPh7Y/PNDJ2uA+loxe/rpzCkFiUzQmDvjdCZNOPKf+0BPLxt27GPdjuKv\n//U79lLo6gFgYn0tr5szld9989mcP3c6i+ZMo3nKif11YmZjywHhJPGN1na+vX4XH37bK/nDt84H\noLevn2f3vcSOPQd4es8BntlzgGd2F/+uf2YvL77Ue8Q+miY3MHfG6cyaNpGfduxn63Od9KWAMq9p\nEm9+ZTPnzZ3O+XOn8aqZU6ir9T0HZnnigHAS2PZcJzff/wQXndPIh97yikPpdbU1zJlRbBFcVGG7\nfQcO8vSe/Tyz5wBP7z5QDBy7D/DEzn3MmX46H7zkHM6bO43z5kxn+qSG0TsgMxuXHBDGuQM9vXzw\nK48zeUIdn1q2aEjXCKaeXs/C06excPa0E1hDMztVOCCMc3/2nc38pKOLL7//jbx8ymljXR0zO4W5\nk7hK39v0LA9s/vmovuc3H2/nG4+1s/wtr+DX5jeN6nubWf44IFTp7x/axu99+TG++sgzo/J+bc93\n8fFvP8EFZ83gj9JFZDOzE8kBoUqFrm7qa8X//NYm7vrBT0/oe710sI/lX32c0+pr+cw15/luHzMb\nFb6GUIWDff3sPXCQD73lHJ56ros//6ct9PT1c8Obzzkh77fyu1v48c87+dL73sAvTfV1AzMbHf7p\nWYU9+4uDtWZNncjt153PbyycxV+t/jGf+7enRvy9/mnDLr76yDP83uKzecurXj7i+zczG4hbCFXo\n6CzO3dM0eQL1tTV8+rcX0VBbw20PbqOnt58/edsrR2TqgJ8V9nPTNzdx/txp3Pj2Vw17f2ZmQ+GA\nUIXdqYXQPKU4eKuutobbfvN11NeKz/xbG919/axY8uphBYXu3j6Wf+1xamvEZ689n3pfNzCzUVbV\nWUfSEklbJbVJWlEh/0xJayRtlLRW0uxM3lxJD0p6UtIWSWel9HmSHpH0lKSvp+c1j0uFTAuhpLZG\n/PVVC/lvF87lH7+/nZXf3cJwnj73V//8JE/sfJHbfvN1nDFt4rDrbGY2VIMGBEm1wO3AZcAC4BpJ\nC8qK3QbcExELgZXArZm8e4C/jYjXABcAz6f0TwCfjIj5wF7gd4ZzICdSoasYEMqnnq2pEX+x9Fd4\n35vO4ks/+Bkf//YT9B/H5Pj/8sSz3P3Dp3n/m+bxtgUzR6TOZmZDVU0L4QKgLSK2R0QPcC+wtKzM\nAmBNWn64lJ8CR11EPAQQEV0RcUDFvpVLgVVpm7uBK4Z1JCdQoaub0+prmNRQe1SeJG5+5wI+sPgc\nvvLIM6z45sZDE8ZVY8eeA3xk1UYWzp7KistePZLVNjMbkmoCwhnAjsx6e0rL2gBcnZavBKZIagRe\nCbwg6ZuS1kn629TiaAReiIjeY+wTAEk3SGqV1NrR0VHdUY2wQlcPTZMnDHiNQBIfXfIq/vCt87mv\ntZ0bv7GB3r7+Qffb09vP8q+tg4DPXXM+DXW+bmBmY6eaM1Cls2D5T+AbgcWS1gGLgZ1AL8WL1hen\n/DcAZwPvrXKfxcSIOyKiJSJampubq6juyCt0dR9x/aASSXz4ba/kI7/+Kr61bid/dO96Dg4SFP7m\nX37Mhh0v8Il3LWRu4+kjWWUzsyGr5i6jdmBOZn02sCtbICJ2AVcBSJoMXB0R+yS1A+siYnvK+zZw\nIXAnME1SXWolHLXP8aTQ1VP1hd4PveUVNNTWcMvqJznY189nrz2PCXVHdzX965bn+ML/+ynvvvBM\n3nHurJGuspnZkFXTQngUmJ/uCmoAlgH3ZwtIapJU2tdNFE/4pW2nSyr9tL8U2BLF23EeBt6V0q8H\nvnP8h3FiFbq6D91yWo3fffPZ/K/LX8uDW57jA19+jJcO9h2Rv+uFX3Djqg0smPUyPvYbrxnp6pqZ\nHZdBA0L6Bb8ceAB4ErgvIjZLWinp8lTsEmCrpG3ATOCWtG0fxe6iNZI2Uewq+nza5qPAhyW1Ubym\n8MURO6oR1N8f7NnfQ+OkoT0+8vqLzuKvrjyXtds6+O93t/KLnmJQONjXzx98bR0He/u5/brzOa3+\n6NaDmdlYqGpgWkSsBlaXpd2cWV7F4TuGyrd9CFhYIX07xTuYxrW9B3ro6w+aJg99mMS1b5xLQ10N\n/2PVBt5313/yxevfwOcebuOxp/fy6WWLmNc06QTU2Mzs+Hik8iBKD51vOs4HzL/r9bOprxUfvm8D\nV/7vH7DtuS6WvWEOSxdVvKnKzGzMOCAMYnfX0aOUh2rpojNoqK3hD762jlfNnMKf/dfXjlT1zMxG\njAPCIDpGICAAXHbuLL738sk0TZ7AxAoD3MzMxpoDwiAOdRkdxzWEcvNnThn2PszMThQPjR1E6Ulp\nUyfWj3VVzMxOKAeEQRQ6u2mcNPC0FWZmpwoHhEHs3t9D0xAGpZmZnawcEAZRzTxGZmanAgeEQZS6\njMzMTnUOCMcQEcWpr91lZGY54IBwDC++1EtPXz/N7jIysxxwQDiGwggNSjMzOxk4IBzD7kOD0hwQ\nzOzU54BwDKUWQuMIjFI2MxvvHBCOwV1GZpYnDgjHUOjspkYwY5JbCGZ26qsqIEhaImmrpDZJKyrk\nnylpjaSNktZKmp3J65O0Pr3uz6TfJemnmbxFI3NII6ejq4cZkxqorfG0FWZ26ht0tlNJtcDtwNuA\nduBRSfdHxJZMsduAeyLibkmXArcC7055v4iIgU72H0lPWxuXdnuUspnlSDUthAuAtojYHhE9wL3A\n0rIyC4A1afnhCvknJU9bYWZ5Uk1AOAPYkVlvT2lZG4Cr0/KVwBRJjWn9NEmtkn4k6Yqy7W5J3Uyf\nlDTuzryFrh7fYWRmuVFNQKjUgR5l6zcCiyWtAxYDO4HelDc3IlqAa4FPSTonpd8EvBp4AzAD+GjF\nN5duSAGltaOjo4rqjhy3EMwsT6oJCO3AnMz6bGBXtkBE7IqIqyLiPOBjKW1fKS/93Q6sBc5L689G\nUTfwJYpdU0eJiDsioiUiWpqbm4dybMNyoKeXAz19DghmlhvVBIRHgfmS5klqAJYB92cLSGqSVNrX\nTcCdKX16qStIUhPwJmBLWp+V/gq4Anhi+IczcnaP4KMzzcxOBoPeZRQRvZKWAw8AtcCdEbFZ0kqg\nNSLuBy4BbpUUwL8DH0qbvwb4R0n9FIPPX2fuTvqKpGaKXVLrgQ+M4HENW0dpUNoUtxDMLB8GDQgA\nEbEaWF2WdnNmeRVw1O2jEfEfwLkD7PPSIdV0lBU6U0DwsxDMLCc8UnkAhVKXkZ+FYGY54YAwgEMT\n27mFYGY54YAwgEJXN1Mn1tNQ54/IzPLBZ7sB7O7q8R1GZpYrDggD6OjqptFjEMwsRxwQBlDo6vaz\nlM0sVxwQBlDo7HaXkZnligNCBd29fbz4Uq+nrTCzXHFAqGDP/tIYBAcEM8sPB4QKCp2leYwcEMws\nPxwQKjg0KM3XEMwsRxwQKihNbOe7jMwsTxwQKii1ENxlZGZ54oBQwe6uHiY11DKxoXasq2JmNmoc\nECoodHX7DiMzyx0HhAoKXd00TvIFZTPLFweECgqdPb5+YGa5U1VAkLRE0lZJbZJWVMg/U9IaSRsl\nrZU0O5PXJ2l9et2fSZ8n6RFJT0n6enpe87jgLiMzy6NBA4KkWuB24DJgAXCNpAVlxW4D7omIhcBK\n4NZM3i8iYlF6XZ5J/wTwyYiYD+wFfmcYxzFievv62XPALQQzy59qWggXAG0RsT0ieoB7gaVlZRYA\na9LywxXyjyBJwKUcfg7z3cAV1Vb6RNp74CAR0OxBaWaWM9UEhDOAHZn19pSWtQG4Oi1fCUyR1JjW\nT5PUKulHkkon/UbghYjoPcY+x8ThUcpuIZhZvlQTEFQhLcrWbwQWS1oHLAZ2AqWT/dyIaAGuBT4l\n6Zwq91l8c+mGFFBaOzo6qqju8HhQmpnlVTUBoR2Yk1mfDezKFoiIXRFxVUScB3wspe0r5aW/24G1\nwHlAAZgmqW6gfWb2fUdEtERES3Nzc7XHddwOBwR3GZlZvlQTEB4F5qe7ghqAZcD92QKSmiSV9nUT\ncGdKny5pQqkM8CZgS0QExWsN70rbXA98Z7gHMxIOzXTqu4zMLGcGDQipn3858ADwJHBfRGyWtFJS\n6a6hS4CtkrYBM4FbUvprgFZJGygGgL+OiC0p76PAhyW1Ubym8MUROqZhKezvpqGuhikT6gYvbGZ2\nCqnqrBcRq4HVZWk3Z5ZXcfiOoWyZ/wDOHWCf2ynewTSuFDp7aJrUQPFGKDOz/PBI5TIelGZmeeWA\nUKbQ1e07jMwslxwQyhQDgu8wMrP8cUDIiAh2d3naCjPLJweEjH2/OEhvfzggmFkuOSBkHJ62wl1G\nZpY/DggZHWlQWrNbCGaWQw4IGYemrfBtp2aWQw4IGZ7YzszyzAEhY3dXD7U1YtrE+rGuipnZqHNA\nyCh0dTNjUgM1NZ62wszyxwEhw6OUzSzPHBAyOrp6PErZzHLLASGj0NntW07NLLccEJKIYPd+z3Rq\nZvnlgJDs7+njpYP9NE5yl5GZ5ZMDQlLo9BgEM8u3qgKCpCWStkpqk7SiQv6ZktZI2ihpraTZZfkv\nk7RT0ucyaWvTPten18uHfzjHz6OUzSzvBg0IkmqB24HLgAXANZIWlBW7DbgnIhYCK4Fby/L/Avh+\nhd1fFxGL0uv5Idd+BB0epewuIzPLp2paCBcAbRGxPSJ6gHuBpWVlFgBr0vLD2XxJrwdmAg8Ov7on\nTqHLE9uZWb5VExDOAHZk1ttTWtYG4Oq0fCUwRVKjpBrg74CPDLDvL6Xuoj/VAE+1l3SDpFZJrR0d\nHVVU9/iUWggzfFHZzHKqmoBQ6UQdZes3AoslrQMWAzuBXuCDwOqI2MHRrouIc4GL0+vdld48Iu6I\niJaIaGlubq6iusen0NXN9NPrqav1dXYzy6e6Ksq0A3My67OBXdkCEbELuApA0mTg6ojYJ+lXgYsl\nfRCYDDRI6oqIFRGxM23bKemrFLum7hn2ER2nQqcfnWlm+VZNQHgUmC9pHsVf/suAa7MFJDUBeyKi\nH7gJuBMgIq7LlHkv0BIRKyTVAdMioiCpHngn8K8jcDzHzfMYmVneDdo/EhG9wHLgAeBJ4L6I2Cxp\npaTLU7FLgK2StlG8gHzLILudADwgaSOwnmKg+fzxHcLIKHR5lLKZ5Vs1LQQiYjWwuizt5szyKmDV\nIPu4C7grLe8HXj+0qp5Yuz2xnZnlnK+gAi8d7KOzu9ddRmaWaw4IeFCamRk4IACHB6W5hWBmeeaA\ngCe2MzMDBwQAdu/3xHZmZg4IHO4y8rMQzCzPHBCAjs5upkyo47T62rGuipnZmHFAwIPSzMzAAQEo\nTVvh7iIzyzcHBEqjlN1CMLN8c0Cg2EJodAvBzHIu9wHhYF8/ew8cdAvBzHIv9wFhz36PUjYzAwcE\nOjxK2cwMcEBgd2ohNE/xNQQzy7fcBwTPY2RmVlRVQJC0RNJWSW2SVlTIP1PSGkkbJa2VNLss/2WS\ndkr6XCbt9ZI2pX1+RpKGfzhDV5r6utEBwcxybtCAIKkWuB24DFgAXCNpQVmx24B7ImIhsBK4tSz/\nL4Dvl6X9A3ADMD+9lgy59iOg0NXNafU1TGrwtBVmlm/VtBAuANoiYntE9AD3AkvLyiwA1qTlh7P5\nkl5P8TnLD2bSZgEvi4gfRkQA9wBXHPdRDEMhDUobowaKmdm4UU1AOAPYkVlvT2lZG4Cr0/KVwBRJ\njZJqgL8DPlJhn+2D7HNUFKetcHeRmVk1AaHST+coW78RWCxpHbAY2An0Ah8EVkfEjrLy1eyzWFC6\nQVKrpNaOjo4qqjs0BU9bYWYGQF0VZdqBOZn12cCubIGI2AVcBSBpMnB1ROyT9KvAxZI+CEwGGiR1\nAZ9O+xlwn5l93wHcAdDS0lIxaAxHoaub182eOtK7NTM76VQTEB4F5kuaR/GX/zLg2mwBSU3Anojo\nB24C7gSIiOsyZd4LtETEirTeKelC4BHgPcBnh300Q9TfH+zZ7xaCmRlU0WUUEb3AcuAB4EngvojY\nLGmlpMtTsUuArZK2UbyAfEsV7/37wBeANuAnwPeGXv3h2Xugh77+8NTXZmZU10IgIlYDq8vSbs4s\nrwJWDbKPu4C7MuutwK9UX9WRV3p0ph+OY2aW85HKu7s8StnMrCTXAaHjUEBwl5GZWa4DwqEuI7cQ\nzMzyHhC6qa8VUyfWj3VVzMzGXL4DQmc3jZM8bYWZGeQ8IOze30OTn4NgZgbkPCB4HiMzs8PyHRBS\nl5GZmeU4IEREcWI7dxmZmQE5DggvvtRLT18/ze4yMjMDchwQCh6lbGZ2hNwGhN0elGZmdoTcBoRS\nC6HR01aYmQEOCG4hmJkl+Q0Ind3UCGZMcgvBzAxyHBA6unqYMamB2hpPW2FmBjkOCLs9StnM7AhV\nBQRJSyRtldQmaUWF/DMlrZG0UdJaSbMz6Y9JWi9ps6QPZLZZm/a5Pr1ePnKHNbhCV7cvKJuZZQwa\nECTVArcDlwELgGskLSgrdhtwT0QsBFYCt6b0Z4GLImIR8EZghaRfzmx3XUQsSq/nh3ksQ1Lo6nEL\nwcwso5oWwgVAW0Rsj4ge4F5gaVmZBcCatPxwKT8ieiKiO6VPqPL9RoUntjMzO1I1J+gzgB2Z9faU\nlrUBuDotXwlMkdQIIGmOpI1pH5+IiF2Z7b6Uuov+VKP4UIIDPb0c6OlzQDAzy6gmIFQ6UUfZ+o3A\nYknrgMXATqAXICJ2pK6kVwDXS5qZtrkuIs4FLk6vd1d8c+kGSa2SWjs6Oqqo7uAOj1L2NQQzs5Jq\nAkI7MCezPhvI/sonInZFxFURcR7wsZS2r7wMsJniyZ+I2Jn+dgJfpdg1dZSIuCMiWiKipbm5uaqD\nGkyHB6WZmR2lmoDwKDBf0jxJDcAy4P5sAUlNkkr7ugm4M6XPljQxLU8H3gRslVQnqSml1wPvBJ4Y\niQOqRqHTAcHMrNygASEieoHlwAPAk8B9EbFZ0kpJl6dil1A80W8DZgK3pPTXAI9I2gB8H7gtIjZR\nvMD8QLq2sJ5iF9PnR+6wjq1Q6jLysxDMzA6pq6ZQRKwGVpel3ZxZXgWsqrDdQ8DCCun7gdcPtbIj\n5dDEdn5ampnZIePmNtDRVOjqZurEehrqcnn4ZmYV5fKMuLurx3cYmZmVyWVA6OjqptEXlM3MjpDL\ngFDo6vazlM3MyuQzIHR2u8vIzKxM7gJCd28fL77U6zEIZmZlchcQ9uwvjUFwQDAzy8pdQCh0FgNC\nox+daWZ2hPwFhNI8Rm4hmJkdIXcBoTSxne8yMjM7Uu4CQsEznZqZVZS7gLC7q4dJDbVMbKgd66qY\nmY0ruQsIBY9SNjOrKJcBwYPSzMyOlr+A0Nnj6wdmZhXkLyB0dfuWUzOzCnIVEHr7+tlzwC0EM7NK\nqgoIkpZI2iqpTdKKCvlnSlojaaOktZJmZ9Ifk7Re0mZJH8hs83pJm9I+PyNJI3dYle09cJAIaPY1\nBDOzowwaECTVArcDlwELgGskLSgrdhtwT0QsBFYCt6b0Z4GLImIR8EZghaRfTnn/ANwAzE+vJcM8\nlkEdenSmWwhmZkeppoVwAdAWEdsjoge4F1haVmYBsCYtP1zKj4ieiOhO6RNK7ydpFvCyiPhhRARw\nD3DFsI6kCh6UZmY2sGoCwhnAjsx6e0rL2gBcnZavBKZIagSQNEfSxrSPT0TErrR9+yD7JG1/g6RW\nSa0dHR1VVHdghwOCu4zMzMpVExAq9e1H2fqNwGJJ64DFwE6gFyAidqSupFcA10uaWeU+SdvfEREt\nEdHS3NxcRXUHVprp1HcZmZkdra6KMu3AnMz6bGBXtkD61X8VgKTJwNURsa+8jKTNwMXAD9J+Btzn\niVDY301DXQ1TJlRz2GZm+VJNC+FRYL6keZIagGXA/dkCkpoklfZ1E3BnSp8taWJang68CdgaEc8C\nnZIuTHcXvQf4zogc0TEUOntomtTAKNzQZGZ20hk0IEREL7AceAB4ErgvIjZLWinp8lTsEmCrpG3A\nTOCWlP4a4BFJG4DvA7dFxKaU9/vAF4A24CfA90bmkAbmQWlmZgOrqu8kIlYDq8vSbs4srwJWVdju\nIWDhAPtsBX5lKJUdrkJXNzNfdtpovqWZ2UkjVyOVPbGdmdnAchMQIoLdXZ62wsxsILkJCPt+cZDe\n/vAoZTOzAeQmIHhQmpnZseUmIHSkQWnNbiGYmVWUm4BwqIXg207NzCrKX0BwC8HMrKLcBITdXT3U\n1ohpE+vHuipmZuNSbgJCoaubGZMaqKnxtBVmZpXkKiC4u8jMbGC5CQgdXT2+5dTM7BhyExAKnd2+\n5dTM7BhyERAigt37PdOpmdmx5CIg7O/p46WD/TROcpeRmdlAchEQCp0eg2BmNph8BASPUjYzG1RV\nAUHSEklbJbVJWlEh/0xJayRtlLRW0uyUvkjSDyVtTnm/ndnmLkk/lbQ+vRaN3GEdyRPbmZkNbtCA\nIKkWuB24DFgAXCNpQVmx24B7ImIhsBK4NaUfAN4TEa8FlgCfkjQts91HImJReq0f5rEMqNDlie3M\nzAZTTQvhAqAtIrZHRA9wL7C0rMwCYE1afriUHxHbIuKptLwLeB5oHomKD0WphTDdF5XNzAZUTUA4\nA9iRWW9PaVkbgKvT8pXAFEmN2QKSLgAagJ9kkm9JXUmflHTCfr4XurqZfno99bW5uGRiZnZcqjlD\nVpr8J8rWbwQWS1oHLAZ2Ar2HdiDNAr4MvC8i+lPyTcCrgTcAM4CPVnxz6QZJrZJaOzo6qqju0Qqd\nfnSmmdlg6qoo0w7MyazPBnZlC6TuoKsAJE0Gro6IfWn9ZcA/Ax+PiB9ltnk2LXZL+hLFoHKUiLgD\nuAOgpaWlPBBV5dzZU5nXPOl4NjUzy41qAsKjwHxJ8yj+8l8GXJstIKkJ2JN+/d8E3JnSG4BvUbzg\n/I2ybWZFxLOSBFwBPDHcgxnIh97yihO1azOzU8agXUYR0QssBx4AngTui4jNklZKujwVuwTYKmkb\nMBO4JaX/FvBm4L0Vbi/9iqRNwCagCfjLkTooMzMbOkUcVy/MmGhpaYnW1taxroaZ2UlF0mMR0TJY\nOd92Y2ZmgAOCmZklDghmZgY4IJiZWeKAYGZmgAOCmZklJ9Vtp5I6gKePc/MmoDCC1TlRTpZ6wslT\nV9dzZJ0s9YSTp64nup5nRsSgE4ueVAFhOCS1VnMf7lg7WeoJJ09dXc+RdbLUE06euo6XerrLyMzM\nAAcEMzNL8hQQ7hjrClTpZKknnDx1dT1H1slSTzh56jou6pmbawhmZnZseWohmJnZMZxyAUHSEklb\nJbVJWlEhf4Kkr6f8RySdNQZ1nCPpYUlPStos6Y8qlLlE0r7MtOE3j3Y9Uz1+JmlTqsNRU82q6DPp\n89wo6fwxquerMp/VekkvSvrjsjJj8plKulPS85KeyKTNkPSQpKfS3+kDbHt9KvOUpOvHoJ5/K+nH\n6d/2W5KmDbDtMb8no1TXP5fDAcsoAAAEKElEQVS0M/Pv+44Btj3mOWIU6vn1TB1/Jmn9ANuO6mcK\nQEScMi+gluIzm8+m+PzmDcCCsjIfBP5PWl4GfH0M6jkLOD8tTwG2VajnJcB3x8Fn+jOg6Rj57wC+\nR/FRqxcCj4yDOtcCP6d47/WYf6YUnwlyPvBEJu1vgBVpeQXwiQrbzQC2p7/T0/L0Ua7n24G6tPyJ\nSvWs5nsySnX9c+DGKr4bxzxHnOh6luX/HXDzePhMI+KUayFcALRFxPaI6AHuBZaWlVkK3J2WVwFv\nTU9tGzUR8WxEPJ6WOyk+eOiM0azDCFpK8Yl4EcVHpE5Lz9AeS28FfhIRxzuIcURFxL8De8qSs9/D\nuyk+NbDcrwMPRcSeiNgLPAQsGc16RsSDUXxIFsCPKD5Cd8wN8JlWo5pzxIg5Vj3Teee3gK+dqPcf\nqlMtIJwB7Mist3P0ifZQmfRF3wc0jkrtKkhdVucBj1TI/lVJGyR9T9JrR7VihwXwoKTHJN1QIb+a\nz3y0LWPg/2Tj4TMFmBnpueLp78srlBlvn+37KbYGKxnsezJalqfurTsH6IYbT5/pxcBzEfHUAPmj\n/pmeagGh0i/98tuoqikzKiRNBv4v8McR8WJZ9uMUuzxeB3wW+PZo1y95U0ScD1wGfEjSm8vyx83n\nCYee43058I0K2ePlM63WuPlsJX0M6AW+MkCRwb4no+EfgHOARcCzFLtjyo2bzxS4hmO3Dkb9Mz3V\nAkI7MCezPhvYNVAZSXXAVI6v6TkskuopBoOvRMQ3y/Mj4sWI6ErLq4F6SU2jXE0iYlf6+zzwLYpN\n7qxqPvPRdBnweEQ8V54xXj7T5LlS11r6+3yFMuPis00Xs98JXBepc7tcFd+TEy4inouIvojoBz4/\nQB3Gy2daB1wFfH2gMmPxmZ5qAeFRYL6keemX4jLg/rIy9wOluzXeBfzbQF/yEyX1HX4ReDIi/n6A\nMr9UurYh6QKK/1a7R6+WIGmSpCmlZYoXGJ8oK3Y/8J50t9GFwL5SV8gYGfBX13j4TDOy38Prge9U\nKPMA8HZJ01P3x9tT2qiRtAT4KHB5RBwYoEw135MTruza1ZUD1KGac8Ro+C/AjyOivVLmmH2mo3kF\nezReFO962UbxToKPpbSVFL/QAKdR7E5oA/4TOHsM6vhrFJupG4H16fUO4APAB1KZ5cBmindB/Ai4\naAzqeXZ6/w2pLqXPM1tPAbenz3sT0DKG//anUzzBT82kjflnSjFAPQscpPgL9XcoXrdaAzyV/s5I\nZVuAL2S2fX/6rrYB7xuDerZR7HMvfU9Ld+j9MrD6WN+TMajrl9N3cCPFk/ys8rqm9aPOEaNZz5R+\nV+l7mSk7pp9pRHikspmZFZ1qXUZmZnacHBDMzAxwQDAzs8QBwczMAAcEMzNLHBDMzAxwQDAzs8QB\nwczMAPj/hibXpNTBfoQAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Melhor resultado 0.9633071913872704 utilizando 11 árvores\n" + ] + } + ], + "source": [ + "#Plota os resultados\n", + "plt.plot(accuracy)\n", + "plt.show()\n", + "\n", + "best_result = np.argmax(accuracy)\n", + "print (\"Melhor resultado {0} utilizando {1} árvores\".format(accuracy[np.argmax(accuracy)], best_result))" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n", + " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=11, n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 204, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Executa o random forest para o melhor resultado\n", + "rdn_forest = RandomForestClassifier(n_estimators=best_result, criterion='entropy')\n", + "rdn_forest.fit(X_train_b, y_train_b)" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Carrega e prepara o dataset para submissão \n", + "test_data = pd.read_csv('test.csv')\n", + "pred = rdn_forest.predict(test_data.iloc[:,1:])\n", + "submission = pd.DataFrame()\n", + "submission['id'] = test_data.iloc[:, 0]\n", + "submission['target'] = pred\n", + "submission.to_csv('kaggle_submission.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Utilizando o random forest com entropia para os mesmo dados balancedados. O score do kaggle passou de 0.04633 para 0.09799. Tendo uma melhoria significativa. " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}