diff --git a/findingCorrelation.ipynb b/findingCorrelation.ipynb index 6f7eab3..b40fc6f 100644 --- a/findingCorrelation.ipynb +++ b/findingCorrelation.ipynb @@ -2,11 +2,42 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.5.2)\n", + "Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.23.5)\n", + "Requirement already satisfied: statsmodels in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (0.13.5)\n", + "Requirement already satisfied: matplotlib in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (3.7.0)\n", + "Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.1.3)\n", + "Requirement already satisfied: seaborn in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (0.12.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (2022.6)\n", + "Requirement already satisfied: patsy>=0.5.2 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from statsmodels) (0.5.3)\n", + "Requirement already satisfied: scipy>=1.3 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from statsmodels) (1.9.3)\n", + "Requirement already satisfied: packaging>=21.3 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from statsmodels) (21.3)\n", + "Requirement already satisfied: cycler>=0.10 in 
/Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (0.11.0)\n", + "Requirement already satisfied: pillow>=6.2.0 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (9.4.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (4.38.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (1.0.7)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (3.0.9)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (1.4.4)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n", + "Requirement already satisfied: joblib>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from scikit-learn) (1.2.0)\n", + "Requirement already satisfied: six in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ - "%pip install pandas numpy statsmodels matplotlib seaborn --user" + "%pip install pandas numpy statsmodels matplotlib scikit-learn seaborn --user" ] }, { @@ -17,7 +48,9 @@ 
"source": [ "import pandas as pd\n", "import numpy as np\n", - "import seaborn as sns" + "import seaborn as sns\n", + "from sklearn.feature_selection import mutual_info_classif\n", + "from sklearn.model_selection import train_test_split" ] }, { @@ -38,18 +71,9 @@ "#dropping unecessary columns\n", "df.drop(['Nacionality', 'GDP', 'Application order', 'International', 'Inflation rate', 'Unemployment rate', 'Displaced'], axis = 1, inplace = True)\n", "#creating dataframe with only graduate and dropout students\n", - "df = df[df.Target != 'Enrolled']" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ + "df = df[df.Target != 'Enrolled']\n", "#data cleaning - changing categorical column to numerical column\n", - "df['Target'].replace(['Graduate', 'Dropout'], [0, 1], inplace=True)\n", - "target = df['Target']" + "df['Target'].replace(['Graduate', 'Dropout'], [0, 1], inplace=True)" ] }, { @@ -58,6 +82,9 @@ "metadata": {}, "outputs": [], "source": [ + "#creating dataframe with only target values\n", + "target = df['Target']\n", + "#creating dataframe with all features (excluding target)\n", "features = df.drop('Target', axis='columns')" ] }, @@ -88,12 +115,13 @@ } ], "source": [ + "#heatmap that shows correlation between features\n", "sns.heatmap(features.corr())" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -110,10 +138,56 @@ } ], "source": [ + "#Using Pearson correlation coefficient to find correlation of variables to target \n", "correlation = features.corrwith(target).abs().sort_values(ascending=False)\n", + "#getting top 5 correlation values\n", "top5Correlation = correlation[:5]\n", "print(top5Correlation)" ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "X_train,X_test,y_train,y_test=train_test_split(df.drop(labels=['Target'], axis=1),\n", + " df['Target'],\n", + " 
test_size=0.4,\n", + " random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Curricular units 2nd sem (approved) 0.321944\n", + "Curricular units 1st sem (approved) 0.279852\n", + "Curricular units 2nd sem (grade) 0.258264\n", + "Curricular units 1st sem (grade) 0.203101\n", + "Tuition fees up to date 0.116928\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "#measuring the dependency between each feature and the target\n", + "mutualIndepence = mutual_info_classif(X_train,y_train)\n", + "mutualIndepence = pd.Series(mutualIndepence)\n", + "#setting index to feature columns\n", + "mutualIndepence.index=X_train.columns\n", + "#Ranking the estimated mutual information between each feature and the target\n", + "mutualIndepence = mutualIndepence.sort_values(ascending=False)\n", + "\n", + "#getting top 5 mutual information values\n", + "top5MutualIndepence = mutualIndepence[:5]\n", + "print(top5MutualIndepence)" + ] } ], "metadata": {