Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 90 additions & 16 deletions findingCorrelation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,42 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.5.2)\n",
"Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.23.5)\n",
"Requirement already satisfied: statsmodels in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (0.13.5)\n",
"Requirement already satisfied: matplotlib in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (3.7.0)\n",
"Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.1.3)\n",
"Requirement already satisfied: seaborn in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (0.12.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (2022.6)\n",
"Requirement already satisfied: patsy>=0.5.2 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from statsmodels) (0.5.3)\n",
"Requirement already satisfied: scipy>=1.3 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from statsmodels) (1.9.3)\n",
"Requirement already satisfied: packaging>=21.3 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from statsmodels) (21.3)\n",
"Requirement already satisfied: cycler>=0.10 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (0.11.0)\n",
"Requirement already satisfied: pillow>=6.2.0 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (9.4.0)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (4.38.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (1.0.7)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (3.0.9)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (1.4.4)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
"Requirement already satisfied: joblib>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from scikit-learn) (1.2.0)\n",
"Requirement already satisfied: six in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install pandas numpy statsmodels matplotlib seaborn --user"
"%pip install pandas numpy statsmodels matplotlib scikit-learn seaborn --user"
]
},
{
Expand All @@ -17,7 +48,9 @@
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns"
"import seaborn as sns\n",
"from sklearn.feature_selection import mutual_info_classif\n",
"from sklearn.model_selection import train_test_split"
]
},
{
Expand All @@ -38,18 +71,9 @@
"#dropping unnecessary columns\n",
"df.drop(['Nacionality', 'GDP', 'Application order', 'International', 'Inflation rate', 'Unemployment rate', 'Displaced'], axis = 1, inplace = True)\n",
"#creating dataframe with only graduate and dropout students\n",
"df = df[df.Target != 'Enrolled']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df = df[df.Target != 'Enrolled']\n",
"#data cleaning - changing categorical column to numerical column\n",
"df['Target'].replace(['Graduate', 'Dropout'], [0, 1], inplace=True)\n",
"target = df['Target']"
"df['Target'].replace(['Graduate', 'Dropout'], [0, 1], inplace=True)"
]
},
{
Expand All @@ -58,6 +82,9 @@
"metadata": {},
"outputs": [],
"source": [
"#creating series with only the target values\n",
"target = df['Target']\n",
"#creating dataframe with all features (excluding target)\n",
"features = df.drop('Target', axis='columns')"
]
},
Expand Down Expand Up @@ -88,12 +115,13 @@
}
],
"source": [
"#heatmap that shows correlation between features\n",
"sns.heatmap(features.corr())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand All @@ -110,10 +138,56 @@
}
],
"source": [
"#Using the absolute Pearson correlation coefficient to rank features by their correlation with the target\n",
"correlation = features.corrwith(target).abs().sort_values(ascending=False)\n",
"#getting top 5 correlation values\n",
"top5Correlation = correlation[:5]\n",
"print(top5Correlation)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"X_train,X_test,y_train,y_test=train_test_split(df.drop(labels=['Target'], axis=1),\n",
" df['Target'],\n",
" test_size=0.4,\n",
" random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Curricular units 2nd sem (approved) 0.321944\n",
"Curricular units 1st sem (approved) 0.279852\n",
"Curricular units 2nd sem (grade) 0.258264\n",
"Curricular units 1st sem (grade) 0.203101\n",
"Tuition fees up to date 0.116928\n",
"dtype: float64\n"
]
}
],
"source": [
"#measuring the dependency (mutual information) between each feature and the target\n",
"mutualIndepence = mutual_info_classif(X_train,y_train)\n",
"mutualIndepence = pd.Series(mutualIndepence)\n",
"#setting index to feature columns\n",
"mutualIndepence.index=X_train.columns\n",
"#Ranking the estimated mutual information between each feature and the target\n",
"mutualIndepence = mutualIndepence.sort_values(ascending=False)\n",
"\n",
"#getting top 5 mutual information values\n",
"top5MutualIndepence = mutualIndepence[:5]\n",
"print(top5MutualIndepence)"
]
}
],
"metadata": {
Expand Down