CS-UCR · Jeana-T · Mar 11, 2023 · Mar 13, 2023
diff --git a/findingCorrelation.ipynb b/findingCorrelation.ipynb
@@ -2,11 +2,42 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.5.2)\n",
+      "Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.23.5)\n",
+      "Requirement already satisfied: statsmodels in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (0.13.5)\n",
+      "Requirement already satisfied: matplotlib in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (3.7.0)\n",
+      "Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (1.1.3)\n",
+      "Requirement already satisfied: seaborn in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (0.12.2)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from pandas) (2.8.2)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from pandas) (2022.6)\n",
+      "Requirement already satisfied: patsy>=0.5.2 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from statsmodels) (0.5.3)\n",
+      "Requirement already satisfied: scipy>=1.3 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from statsmodels) (1.9.3)\n",
+      "Requirement already satisfied: packaging>=21.3 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from statsmodels) (21.3)\n",
+      "Requirement already satisfied: cycler>=0.10 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (0.11.0)\n",
+      "Requirement already satisfied: pillow>=6.2.0 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (9.4.0)\n",
+      "Requirement already satisfied: fonttools>=4.22.0 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (4.38.0)\n",
+      "Requirement already satisfied: contourpy>=1.0.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (1.0.7)\n",
+      "Requirement already satisfied: pyparsing>=2.3.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (3.0.9)\n",
+      "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from matplotlib) (1.4.4)\n",
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
+      "Requirement already satisfied: joblib>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages (from scikit-learn) (1.2.0)\n",
+      "Requirement already satisfied: six in /Users/jeanatijerina/Library/Python/3.10/lib/python/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)\n",
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
-    "%pip install pandas numpy statsmodels matplotlib seaborn --user"
+    "%pip install pandas numpy statsmodels matplotlib scikit-learn seaborn --user"
    ]
   },
   {
@@ -17,7 +48,9 @@
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
-    "import seaborn as sns"
+    "import seaborn as sns\n",
+    "from sklearn.feature_selection import mutual_info_classif\n",
+    "from sklearn.model_selection import train_test_split"
    ]
   },
   {
@@ -38,18 +71,9 @@
     "#dropping unnecessary columns\n",
     "df.drop(['Nacionality', 'GDP', 'Application order', 'International', 'Inflation rate', 'Unemployment rate', 'Displaced'], axis = 1, inplace = True)\n",
     "#creating dataframe with only graduate and dropout students\n",
-    "df = df[df.Target != 'Enrolled']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "df = df[df.Target != 'Enrolled']\n",
     "#data cleaning - changing categorical column to numerical column\n",
-    "df['Target'].replace(['Graduate', 'Dropout'], [0, 1], inplace=True)\n",
-    "target = df['Target']"
+    "df['Target'].replace(['Graduate', 'Dropout'], [0, 1], inplace=True)"
    ]
   },
   {
@@ -58,6 +82,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "#extracting the target column (labels) on its own\n",
+    "target = df['Target']\n",
+    "#creating dataframe with all features (excluding target)\n",
     "features = df.drop('Target', axis='columns')"
    ]
   },
@@ -88,12 +115,13 @@
     }
    ],
    "source": [
+    "#heatmap that shows correlation between features\n",
     "sns.heatmap(features.corr())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -110,10 +138,56 @@
     }
    ],
    "source": [
+    "#Using the absolute Pearson correlation coefficient to rank features by their correlation with the target\n",
     "correlation = features.corrwith(target).abs().sort_values(ascending=False)\n",
+    "#getting top 5 correlation values\n",
     "top5Correlation = correlation[:5]\n",
     "print(top5Correlation)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train,X_test,y_train,y_test=train_test_split(df.drop(labels=['Target'], axis=1),\n",
+    "    df['Target'],\n",
+    "    test_size=0.4,\n",
+    "    random_state=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Curricular units 2nd sem (approved)    0.321944\n",
+      "Curricular units 1st sem (approved)    0.279852\n",
+      "Curricular units 2nd sem (grade)       0.258264\n",
+      "Curricular units 1st sem (grade)       0.203101\n",
+      "Tuition fees up to date                0.116928\n",
+      "dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "#estimating the mutual information (dependency) between each feature and the target\n",
+    "mutualIndepence = mutual_info_classif(X_train,y_train)\n",
+    "mutualIndepence = pd.Series(mutualIndepence)\n",
+    "#setting index to feature columns\n",
+    "mutualIndepence.index=X_train.columns\n",
+    "#Ranking the estimated mutual information between each feature and the target (highest first)\n",
+    "mutualIndepence = mutualIndepence.sort_values(ascending=False)\n",
+    "\n",
+    "#getting top 5 mutual information values\n",
+    "top5MutualIndepence = mutualIndepence[:5]\n",
+    "print(top5MutualIndepence)"
+   ]
   }
  ],
  "metadata": {