From 3c9f3d109dabf1d7e8ddf6543652bd3b434aff95 Mon Sep 17 00:00:00 2001
From: iweanzz <51839645+iweanzz@users.noreply.github.com>
Date: Sat, 30 Nov 2019 23:26:49 -0500
Subject: [PATCH] Data Wrangling Code
---
Data_wrangling.ipynb | 1682 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 1682 insertions(+)
create mode 100644 Data_wrangling.ipynb
diff --git a/Data_wrangling.ipynb b/Data_wrangling.ipynb
new file mode 100644
index 0000000..abcf063
--- /dev/null
+++ b/Data_wrangling.ipynb
@@ -0,0 +1,1682 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import psycopg2\n",
+ "import pandas as pd\n",
+ "import pandas.io.sql as pd_sql\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def connectDB(DB):\n",
+ "    \"\"\"Open a psycopg2 connection to database `DB` on the project's PostgreSQL host.\n",
+ "\n",
+ "    No password is passed on purpose: credentials must not be committed with\n",
+ "    the notebook. When the password argument is omitted, libpq (which psycopg2\n",
+ "    wraps) falls back to the PGPASSWORD environment variable or a ~/.pgpass\n",
+ "    entry, so configure one of those before calling this function.\n",
+ "    NOTE: the password that used to be hard-coded here should be rotated.\n",
+ "\n",
+ "    Parameters:\n",
+ "        DB: name of the database to connect to.\n",
+ "\n",
+ "    Returns:\n",
+ "        The connection object returned by psycopg2.connect(); the caller is\n",
+ "        responsible for closing it.\n",
+ "    \"\"\"\n",
+ "    return psycopg2.connect(\n",
+ "        database=DB,\n",
+ "        user=\"postgres\",\n",
+ "        host=\"database-1.c5vispb5ezxg.us-east-1.rds.amazonaws.com\",\n",
+ "        port='5432',\n",
+ "    )\n",
+ "\n",
+ "def disconnectDB(conn):\n",
+ "    \"\"\"Close the database connection `conn` by calling its close() method.\"\"\"\n",
+ "    conn.close()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " id_student | \n",
+ " code_module | \n",
+ " code_presentation | \n",
+ " module_domain | \n",
+ " module_presentation_length | \n",
+ " term | \n",
+ " year | \n",
+ " num_of_prev_attempts | \n",
+ " final_result | \n",
+ " pass_fail_ind | \n",
+ " reg_period | \n",
+ " date_registration | \n",
+ " date_unregistration | \n",
+ " disability | \n",
+ " gender | \n",
+ " age_band | \n",
+ " region | \n",
+ " highest_education | \n",
+ " imd_band | \n",
+ " studied_credits | \n",
+ " b4_sum_clicks | \n",
+ " qtr_sum_clicks | \n",
+ " half_sum_clicks | \n",
+ " threeqtr_sum_clicks | \n",
+ " qtr_half_sum_clicks | \n",
+ " half_threeqtr_sum_clicks | \n",
+ " thrd_sum_clicks | \n",
+ " twothrd_sum_clicks | \n",
+ " thrd_twothrd_sum_clicks | \n",
+ " allclicks | \n",
+ " std_half_score | \n",
+ " std_total_weight | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 3733 | \n",
+ " DDD | \n",
+ " 2013J | \n",
+ " STEM | \n",
+ " 261 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Withdrawn | \n",
+ " None | \n",
+ " QUARTERB4 | \n",
+ " -68.0 | \n",
+ " -8.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " South Region | \n",
+ " HE Qualification | \n",
+ " 90-100% | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 6516 | \n",
+ " AAA | \n",
+ " 2014J | \n",
+ " SocialScience | \n",
+ " 269 | \n",
+ " J | \n",
+ " 2014 | \n",
+ " 0 | \n",
+ " Pass | \n",
+ " PASS | \n",
+ " QUARTERB4 | \n",
+ " -52.0 | \n",
+ " NaN | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " Scotland | \n",
+ " HE Qualification | \n",
+ " 80-90% | \n",
+ " 60 | \n",
+ " 256.0 | \n",
+ " 862.0 | \n",
+ " 1347.0 | \n",
+ " 2066.0 | \n",
+ " 513.0 | \n",
+ " 719.0 | \n",
+ " 1040.0 | \n",
+ " 1875.0 | \n",
+ " 307.0 | \n",
+ " 2791.0 | \n",
+ " 56.400000 | \n",
+ " 100.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " 8462 | \n",
+ " DDD | \n",
+ " 2013J | \n",
+ " STEM | \n",
+ " 261 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Withdrawn | \n",
+ " None | \n",
+ " LONGB4 | \n",
+ " -137.0 | \n",
+ " 119.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " London Region | \n",
+ " HE Qualification | \n",
+ " 30-40% | \n",
+ " 90 | \n",
+ " 81.0 | \n",
+ " 446.0 | \n",
+ " 565.0 | \n",
+ " 565.0 | \n",
+ " 119.0 | \n",
+ " 0.0 | \n",
+ " 525.0 | \n",
+ " 565.0 | \n",
+ " 40.0 | \n",
+ " 646.0 | \n",
+ " 58.166667 | \n",
+ " 40.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 8462 | \n",
+ " DDD | \n",
+ " 2014J | \n",
+ " STEM | \n",
+ " 262 | \n",
+ " J | \n",
+ " 2014 | \n",
+ " 1 | \n",
+ " Withdrawn | \n",
+ " None | \n",
+ " QUARTERB4 | \n",
+ " -38.0 | \n",
+ " 18.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " London Region | \n",
+ " HE Qualification | \n",
+ " 30-40% | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " 11391 | \n",
+ " AAA | \n",
+ " 2013J | \n",
+ " SocialScience | \n",
+ " 268 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Pass | \n",
+ " PASS | \n",
+ " LONGB4 | \n",
+ " -159.0 | \n",
+ " NaN | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " East Anglian Region | \n",
+ " HE Qualification | \n",
+ " 90-100% | \n",
+ " 240 | \n",
+ " 98.0 | \n",
+ " 447.0 | \n",
+ " 612.0 | \n",
+ " 650.0 | \n",
+ " 165.0 | \n",
+ " 38.0 | \n",
+ " 489.0 | \n",
+ " 615.0 | \n",
+ " 123.0 | \n",
+ " 934.0 | \n",
+ " 81.600000 | \n",
+ " 100.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 95 | \n",
+ " 95 | \n",
+ " 33897 | \n",
+ " DDD | \n",
+ " 2013J | \n",
+ " STEM | \n",
+ " 261 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Withdrawn | \n",
+ " None | \n",
+ " QUARTERB4 | \n",
+ " -58.0 | \n",
+ " 107.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 0-35 | \n",
+ " West Midlands Region | \n",
+ " Lower Than A Level | \n",
+ " 30-40% | \n",
+ " 60 | \n",
+ " 0.0 | \n",
+ " 322.0 | \n",
+ " 322.0 | \n",
+ " 322.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 322.0 | \n",
+ " 322.0 | \n",
+ " 0.0 | \n",
+ " 322.0 | \n",
+ " 8.333333 | \n",
+ " 10.0 | \n",
+ "
\n",
+ " \n",
+ " | 96 | \n",
+ " 96 | \n",
+ " 33897 | \n",
+ " DDD | \n",
+ " 2014J | \n",
+ " STEM | \n",
+ " 262 | \n",
+ " J | \n",
+ " 2014 | \n",
+ " 1 | \n",
+ " Withdrawn | \n",
+ " None | \n",
+ " MONTHB4 | \n",
+ " -18.0 | \n",
+ " 41.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 0-35 | \n",
+ " West Midlands Region | \n",
+ " Lower Than A Level | \n",
+ " 30-40% | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 97 | \n",
+ " 97 | \n",
+ " 33915 | \n",
+ " FFF | \n",
+ " 2013B | \n",
+ " STEM | \n",
+ " 240 | \n",
+ " B | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Distinction | \n",
+ " PASS | \n",
+ " QUARTERB4 | \n",
+ " -64.0 | \n",
+ " NaN | \n",
+ " N | \n",
+ " F | \n",
+ " 0-35 | \n",
+ " South East Region | \n",
+ " A Level or Equivalent | \n",
+ " 30-40% | \n",
+ " 60 | \n",
+ " 144.0 | \n",
+ " 1169.0 | \n",
+ " 2145.0 | \n",
+ " 3206.0 | \n",
+ " 1012.0 | \n",
+ " 1061.0 | \n",
+ " 1456.0 | \n",
+ " 3137.0 | \n",
+ " 689.0 | \n",
+ " 4121.0 | \n",
+ " 91.500000 | \n",
+ " 100.0 | \n",
+ "
\n",
+ " \n",
+ " | 98 | \n",
+ " 98 | \n",
+ " 33930 | \n",
+ " DDD | \n",
+ " 2013J | \n",
+ " STEM | \n",
+ " 261 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Pass | \n",
+ " PASS | \n",
+ " LONGB4 | \n",
+ " -143.0 | \n",
+ " NaN | \n",
+ " N | \n",
+ " F | \n",
+ " 0-35 | \n",
+ " South West Region | \n",
+ " A Level or Equivalent | \n",
+ " 20-30% | \n",
+ " 90 | \n",
+ " 64.0 | \n",
+ " 175.0 | \n",
+ " 213.0 | \n",
+ " 410.0 | \n",
+ " 38.0 | \n",
+ " 197.0 | \n",
+ " 185.0 | \n",
+ " 394.0 | \n",
+ " 28.0 | \n",
+ " 544.0 | \n",
+ " 84.625000 | \n",
+ " 80.0 | \n",
+ "
\n",
+ " \n",
+ " | 99 | \n",
+ " 99 | \n",
+ " 34068 | \n",
+ " CCC | \n",
+ " 2014J | \n",
+ " STEM | \n",
+ " 269 | \n",
+ " J | \n",
+ " 2014 | \n",
+ " 0 | \n",
+ " Withdrawn | \n",
+ " None | \n",
+ " QUARTERB4 | \n",
+ " -74.0 | \n",
+ " 108.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 0-35 | \n",
+ " Scotland | \n",
+ " Lower Than A Level | \n",
+ " 50-60% | \n",
+ " 30 | \n",
+ " 30.0 | \n",
+ " 544.0 | \n",
+ " 583.0 | \n",
+ " 583.0 | \n",
+ " 147.0 | \n",
+ " 0.0 | \n",
+ " 579.0 | \n",
+ " 583.0 | \n",
+ " 4.0 | \n",
+ " 613.0 | \n",
+ " 33.050000 | \n",
+ " 18.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
100 rows × 33 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index id_student code_module code_presentation module_domain \\\n",
+ "0 0 3733 DDD 2013J STEM \n",
+ "1 1 6516 AAA 2014J SocialScience \n",
+ "2 2 8462 DDD 2013J STEM \n",
+ "3 3 8462 DDD 2014J STEM \n",
+ "4 4 11391 AAA 2013J SocialScience \n",
+ ".. ... ... ... ... ... \n",
+ "95 95 33897 DDD 2013J STEM \n",
+ "96 96 33897 DDD 2014J STEM \n",
+ "97 97 33915 FFF 2013B STEM \n",
+ "98 98 33930 DDD 2013J STEM \n",
+ "99 99 34068 CCC 2014J STEM \n",
+ "\n",
+ " module_presentation_length term year num_of_prev_attempts final_result \\\n",
+ "0 261 J 2013 0 Withdrawn \n",
+ "1 269 J 2014 0 Pass \n",
+ "2 261 J 2013 0 Withdrawn \n",
+ "3 262 J 2014 1 Withdrawn \n",
+ "4 268 J 2013 0 Pass \n",
+ ".. ... ... ... ... ... \n",
+ "95 261 J 2013 0 Withdrawn \n",
+ "96 262 J 2014 1 Withdrawn \n",
+ "97 240 B 2013 0 Distinction \n",
+ "98 261 J 2013 0 Pass \n",
+ "99 269 J 2014 0 Withdrawn \n",
+ "\n",
+ " pass_fail_ind reg_period date_registration date_unregistration \\\n",
+ "0 None QUARTERB4 -68.0 -8.0 \n",
+ "1 PASS QUARTERB4 -52.0 NaN \n",
+ "2 None LONGB4 -137.0 119.0 \n",
+ "3 None QUARTERB4 -38.0 18.0 \n",
+ "4 PASS LONGB4 -159.0 NaN \n",
+ ".. ... ... ... ... \n",
+ "95 None QUARTERB4 -58.0 107.0 \n",
+ "96 None MONTHB4 -18.0 41.0 \n",
+ "97 PASS QUARTERB4 -64.0 NaN \n",
+ "98 PASS LONGB4 -143.0 NaN \n",
+ "99 None QUARTERB4 -74.0 108.0 \n",
+ "\n",
+ " disability gender age_band region highest_education \\\n",
+ "0 N M 55<= South Region HE Qualification \n",
+ "1 N M 55<= Scotland HE Qualification \n",
+ "2 N M 55<= London Region HE Qualification \n",
+ "3 N M 55<= London Region HE Qualification \n",
+ "4 N M 55<= East Anglian Region HE Qualification \n",
+ ".. ... ... ... ... ... \n",
+ "95 N M 0-35 West Midlands Region Lower Than A Level \n",
+ "96 N M 0-35 West Midlands Region Lower Than A Level \n",
+ "97 N F 0-35 South East Region A Level or Equivalent \n",
+ "98 N F 0-35 South West Region A Level or Equivalent \n",
+ "99 N M 0-35 Scotland Lower Than A Level \n",
+ "\n",
+ " imd_band studied_credits b4_sum_clicks qtr_sum_clicks half_sum_clicks \\\n",
+ "0 90-100% 60 NaN NaN NaN \n",
+ "1 80-90% 60 256.0 862.0 1347.0 \n",
+ "2 30-40% 90 81.0 446.0 565.0 \n",
+ "3 30-40% 60 NaN NaN NaN \n",
+ "4 90-100% 240 98.0 447.0 612.0 \n",
+ ".. ... ... ... ... ... \n",
+ "95 30-40% 60 0.0 322.0 322.0 \n",
+ "96 30-40% 60 NaN NaN NaN \n",
+ "97 30-40% 60 144.0 1169.0 2145.0 \n",
+ "98 20-30% 90 64.0 175.0 213.0 \n",
+ "99 50-60% 30 30.0 544.0 583.0 \n",
+ "\n",
+ " threeqtr_sum_clicks qtr_half_sum_clicks half_threeqtr_sum_clicks \\\n",
+ "0 NaN NaN NaN \n",
+ "1 2066.0 513.0 719.0 \n",
+ "2 565.0 119.0 0.0 \n",
+ "3 NaN NaN NaN \n",
+ "4 650.0 165.0 38.0 \n",
+ ".. ... ... ... \n",
+ "95 322.0 0.0 0.0 \n",
+ "96 NaN NaN NaN \n",
+ "97 3206.0 1012.0 1061.0 \n",
+ "98 410.0 38.0 197.0 \n",
+ "99 583.0 147.0 0.0 \n",
+ "\n",
+ " thrd_sum_clicks twothrd_sum_clicks thrd_twothrd_sum_clicks allclicks \\\n",
+ "0 NaN NaN NaN NaN \n",
+ "1 1040.0 1875.0 307.0 2791.0 \n",
+ "2 525.0 565.0 40.0 646.0 \n",
+ "3 NaN NaN NaN NaN \n",
+ "4 489.0 615.0 123.0 934.0 \n",
+ ".. ... ... ... ... \n",
+ "95 322.0 322.0 0.0 322.0 \n",
+ "96 NaN NaN NaN NaN \n",
+ "97 1456.0 3137.0 689.0 4121.0 \n",
+ "98 185.0 394.0 28.0 544.0 \n",
+ "99 579.0 583.0 4.0 613.0 \n",
+ "\n",
+ " std_half_score std_total_weight \n",
+ "0 NaN NaN \n",
+ "1 56.400000 100.0 \n",
+ "2 58.166667 40.0 \n",
+ "3 NaN NaN \n",
+ "4 81.600000 100.0 \n",
+ ".. ... ... \n",
+ "95 8.333333 10.0 \n",
+ "96 NaN NaN \n",
+ "97 91.500000 100.0 \n",
+ "98 84.625000 80.0 \n",
+ "99 33.050000 18.0 \n",
+ "\n",
+ "[100 rows x 33 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Open a connection to the \"Dataset\" database\n",
+ "conn = connectDB(\"Dataset\")\n",
+ "\n",
+ "# Load every row of public.\"analysisFeatures\" into a dataframe; reset_index()\n",
+ "# adds the row position as an explicit 'index' column\n",
+ "df = pd_sql.read_sql('select * from public.\"analysisFeatures\" ', con=conn).reset_index()\n",
+ "\n",
+ "# Show up to 50 columns so that no column is elided when the dataframe is displayed\n",
+ "# (full option name instead of relying on abbreviated option matching)\n",
+ "pd.set_option('display.max_columns', 50)\n",
+ "\n",
+ "# Preview the first 100 rows\n",
+ "df.head(100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "22521"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of rows whose date_unregistration is missing\n",
+ "df['date_unregistration'].isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " id_student | \n",
+ " code_module | \n",
+ " module_domain | \n",
+ " module_presentation_length | \n",
+ " term | \n",
+ " year | \n",
+ " num_of_prev_attempts | \n",
+ " final_result | \n",
+ " date_registration | \n",
+ " disability | \n",
+ " gender | \n",
+ " age_band | \n",
+ " region | \n",
+ " highest_education | \n",
+ " imd_band | \n",
+ " studied_credits | \n",
+ " b4_sum_clicks | \n",
+ " qtr_sum_clicks | \n",
+ " half_sum_clicks | \n",
+ " threeqtr_sum_clicks | \n",
+ " qtr_half_sum_clicks | \n",
+ " half_threeqtr_sum_clicks | \n",
+ " thrd_sum_clicks | \n",
+ " twothrd_sum_clicks | \n",
+ " thrd_twothrd_sum_clicks | \n",
+ " allclicks | \n",
+ " std_half_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 3733 | \n",
+ " DDD | \n",
+ " STEM | \n",
+ " 261 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Withdrawn | \n",
+ " -68.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " South Region | \n",
+ " HE Qualification | \n",
+ " 90-100% | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 6516 | \n",
+ " AAA | \n",
+ " SocialScience | \n",
+ " 269 | \n",
+ " J | \n",
+ " 2014 | \n",
+ " 0 | \n",
+ " Pass | \n",
+ " -52.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " Scotland | \n",
+ " HE Qualification | \n",
+ " 80-90% | \n",
+ " 60 | \n",
+ " 256.0 | \n",
+ " 862.0 | \n",
+ " 1347.0 | \n",
+ " 2066.0 | \n",
+ " 513.0 | \n",
+ " 719.0 | \n",
+ " 1040.0 | \n",
+ " 1875.0 | \n",
+ " 307.0 | \n",
+ " 2791.0 | \n",
+ " 56.400000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " 8462 | \n",
+ " DDD | \n",
+ " STEM | \n",
+ " 261 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Withdrawn | \n",
+ " -137.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " London Region | \n",
+ " HE Qualification | \n",
+ " 30-40% | \n",
+ " 90 | \n",
+ " 81.0 | \n",
+ " 446.0 | \n",
+ " 565.0 | \n",
+ " 565.0 | \n",
+ " 119.0 | \n",
+ " 0.0 | \n",
+ " 525.0 | \n",
+ " 565.0 | \n",
+ " 40.0 | \n",
+ " 646.0 | \n",
+ " 58.166667 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 8462 | \n",
+ " DDD | \n",
+ " STEM | \n",
+ " 262 | \n",
+ " J | \n",
+ " 2014 | \n",
+ " 1 | \n",
+ " Withdrawn | \n",
+ " -38.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " London Region | \n",
+ " HE Qualification | \n",
+ " 30-40% | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " 11391 | \n",
+ " AAA | \n",
+ " SocialScience | \n",
+ " 268 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Pass | \n",
+ " -159.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 55<= | \n",
+ " East Anglian Region | \n",
+ " HE Qualification | \n",
+ " 90-100% | \n",
+ " 240 | \n",
+ " 98.0 | \n",
+ " 447.0 | \n",
+ " 612.0 | \n",
+ " 650.0 | \n",
+ " 165.0 | \n",
+ " 38.0 | \n",
+ " 489.0 | \n",
+ " 615.0 | \n",
+ " 123.0 | \n",
+ " 934.0 | \n",
+ " 81.600000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 5 | \n",
+ " 23629 | \n",
+ " BBB | \n",
+ " SocialScience | \n",
+ " 240 | \n",
+ " B | \n",
+ " 2013 | \n",
+ " 2 | \n",
+ " Fail | \n",
+ " -47.0 | \n",
+ " N | \n",
+ " F | \n",
+ " 0-35 | \n",
+ " East Anglian Region | \n",
+ " Lower Than A Level | \n",
+ " 20-30% | \n",
+ " 60 | \n",
+ " 14.0 | \n",
+ " 105.0 | \n",
+ " 147.0 | \n",
+ " 147.0 | \n",
+ " 42.0 | \n",
+ " 0.0 | \n",
+ " 117.0 | \n",
+ " 147.0 | \n",
+ " 30.0 | \n",
+ " 161.0 | \n",
+ " 38.813953 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 6 | \n",
+ " 23632 | \n",
+ " BBB | \n",
+ " SocialScience | \n",
+ " 268 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Withdrawn | \n",
+ " -194.0 | \n",
+ " N | \n",
+ " F | \n",
+ " 0-35 | \n",
+ " East Anglian Region | \n",
+ " A Level or Equivalent | \n",
+ " 40-50% | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7 | \n",
+ " 23698 | \n",
+ " CCC | \n",
+ " STEM | \n",
+ " 269 | \n",
+ " J | \n",
+ " 2014 | \n",
+ " 0 | \n",
+ " Pass | \n",
+ " -110.0 | \n",
+ " N | \n",
+ " F | \n",
+ " 0-35 | \n",
+ " East Anglian Region | \n",
+ " A Level or Equivalent | \n",
+ " 50-60% | \n",
+ " 120 | \n",
+ " 181.0 | \n",
+ " 325.0 | \n",
+ " 475.0 | \n",
+ " 587.0 | \n",
+ " 153.0 | \n",
+ " 112.0 | \n",
+ " 325.0 | \n",
+ " 565.0 | \n",
+ " 150.0 | \n",
+ " 910.0 | \n",
+ " 74.325000 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 8 | \n",
+ " 23798 | \n",
+ " BBB | \n",
+ " SocialScience | \n",
+ " 268 | \n",
+ " J | \n",
+ " 2013 | \n",
+ " 0 | \n",
+ " Distinction | \n",
+ " -27.0 | \n",
+ " N | \n",
+ " M | \n",
+ " 0-35 | \n",
+ " Wales | \n",
+ " A Level or Equivalent | \n",
+ " 50-60% | \n",
+ " 60 | \n",
+ " 6.0 | \n",
+ " 212.0 | \n",
+ " 365.0 | \n",
+ " 490.0 | \n",
+ " 153.0 | \n",
+ " 125.0 | \n",
+ " 255.0 | \n",
+ " 459.0 | \n",
+ " 110.0 | \n",
+ " 590.0 | \n",
+ " 89.612903 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 9 | \n",
+ " 24186 | \n",
+ " GGG | \n",
+ " SocialScience | \n",
+ " 241 | \n",
+ " B | \n",
+ " 2014 | \n",
+ " 0 | \n",
+ " Pass | \n",
+ " -25.0 | \n",
+ " Y | \n",
+ " F | \n",
+ " 0-35 | \n",
+ " Yorkshire Region | \n",
+ " Lower Than A Level | \n",
+ " 10-20 | \n",
+ " 30 | \n",
+ " 12.0 | \n",
+ " 15.0 | \n",
+ " 42.0 | \n",
+ " 109.0 | \n",
+ " 27.0 | \n",
+ " 67.0 | \n",
+ " 38.0 | \n",
+ " 91.0 | \n",
+ " 4.0 | \n",
+ " 184.0 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index id_student code_module module_domain module_presentation_length \\\n",
+ "0 0 3733 DDD STEM 261 \n",
+ "1 1 6516 AAA SocialScience 269 \n",
+ "2 2 8462 DDD STEM 261 \n",
+ "3 3 8462 DDD STEM 262 \n",
+ "4 4 11391 AAA SocialScience 268 \n",
+ "5 5 23629 BBB SocialScience 240 \n",
+ "6 6 23632 BBB SocialScience 268 \n",
+ "7 7 23698 CCC STEM 269 \n",
+ "8 8 23798 BBB SocialScience 268 \n",
+ "9 9 24186 GGG SocialScience 241 \n",
+ "\n",
+ " term year num_of_prev_attempts final_result date_registration disability \\\n",
+ "0 J 2013 0 Withdrawn -68.0 N \n",
+ "1 J 2014 0 Pass -52.0 N \n",
+ "2 J 2013 0 Withdrawn -137.0 N \n",
+ "3 J 2014 1 Withdrawn -38.0 N \n",
+ "4 J 2013 0 Pass -159.0 N \n",
+ "5 B 2013 2 Fail -47.0 N \n",
+ "6 J 2013 0 Withdrawn -194.0 N \n",
+ "7 J 2014 0 Pass -110.0 N \n",
+ "8 J 2013 0 Distinction -27.0 N \n",
+ "9 B 2014 0 Pass -25.0 Y \n",
+ "\n",
+ " gender age_band region highest_education imd_band \\\n",
+ "0 M 55<= South Region HE Qualification 90-100% \n",
+ "1 M 55<= Scotland HE Qualification 80-90% \n",
+ "2 M 55<= London Region HE Qualification 30-40% \n",
+ "3 M 55<= London Region HE Qualification 30-40% \n",
+ "4 M 55<= East Anglian Region HE Qualification 90-100% \n",
+ "5 F 0-35 East Anglian Region Lower Than A Level 20-30% \n",
+ "6 F 0-35 East Anglian Region A Level or Equivalent 40-50% \n",
+ "7 F 0-35 East Anglian Region A Level or Equivalent 50-60% \n",
+ "8 M 0-35 Wales A Level or Equivalent 50-60% \n",
+ "9 F 0-35 Yorkshire Region Lower Than A Level 10-20 \n",
+ "\n",
+ " studied_credits b4_sum_clicks qtr_sum_clicks half_sum_clicks \\\n",
+ "0 60 NaN NaN NaN \n",
+ "1 60 256.0 862.0 1347.0 \n",
+ "2 90 81.0 446.0 565.0 \n",
+ "3 60 NaN NaN NaN \n",
+ "4 240 98.0 447.0 612.0 \n",
+ "5 60 14.0 105.0 147.0 \n",
+ "6 60 NaN NaN NaN \n",
+ "7 120 181.0 325.0 475.0 \n",
+ "8 60 6.0 212.0 365.0 \n",
+ "9 30 12.0 15.0 42.0 \n",
+ "\n",
+ " threeqtr_sum_clicks qtr_half_sum_clicks half_threeqtr_sum_clicks \\\n",
+ "0 NaN NaN NaN \n",
+ "1 2066.0 513.0 719.0 \n",
+ "2 565.0 119.0 0.0 \n",
+ "3 NaN NaN NaN \n",
+ "4 650.0 165.0 38.0 \n",
+ "5 147.0 42.0 0.0 \n",
+ "6 NaN NaN NaN \n",
+ "7 587.0 153.0 112.0 \n",
+ "8 490.0 153.0 125.0 \n",
+ "9 109.0 27.0 67.0 \n",
+ "\n",
+ " thrd_sum_clicks twothrd_sum_clicks thrd_twothrd_sum_clicks allclicks \\\n",
+ "0 NaN NaN NaN NaN \n",
+ "1 1040.0 1875.0 307.0 2791.0 \n",
+ "2 525.0 565.0 40.0 646.0 \n",
+ "3 NaN NaN NaN NaN \n",
+ "4 489.0 615.0 123.0 934.0 \n",
+ "5 117.0 147.0 30.0 161.0 \n",
+ "6 NaN NaN NaN NaN \n",
+ "7 325.0 565.0 150.0 910.0 \n",
+ "8 255.0 459.0 110.0 590.0 \n",
+ "9 38.0 91.0 4.0 184.0 \n",
+ "\n",
+ " std_half_score \n",
+ "0 NaN \n",
+ "1 56.400000 \n",
+ "2 58.166667 \n",
+ "3 NaN \n",
+ "4 81.600000 \n",
+ "5 38.813953 \n",
+ "6 NaN \n",
+ "7 74.325000 \n",
+ "8 89.612903 \n",
+ "9 0.000000 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Columns removed from the dataframe before the categorical columns are encoded\n",
+ "drop_list = [\n",
+ "    'reg_period',\n",
+ "    'code_presentation',\n",
+ "    'date_unregistration',\n",
+ "    'pass_fail_ind',\n",
+ "    'std_total_weight',\n",
+ "]\n",
+ "\n",
+ "df = df.drop(columns=drop_list)\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "STEM 21402\n",
+ "SocialScience 11191\n",
+ "Name: module_domain, dtype: int64"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Row count for each module_domain value\n",
+ "domain_counts = df['module_domain'].value_counts()\n",
+ "domain_counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Row count for each code_module value\n",
+ "module_counts = df['code_module'].value_counts()\n",
+ "module_counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "\n",
+ "# Integer code for every category of each categorical column.\n",
+ "# Series.map() turns any value that is missing from its lookup into NaN, so\n",
+ "# each lookup has to list every category that occurs in the data.\n",
+ "encodings = {\n",
+ "    # '10-20' has no % sign because that is how the value appears in the data\n",
+ "    'imd_band': {'0-10%': 0, '10-20': 1, '20-30%': 2, '30-40%': 3, '40-50%': 4,\n",
+ "                 '50-60%': 5, '60-70%': 6, '70-80%': 7, '80-90%': 8, '90-100%': 9},\n",
+ "    'module_domain': {'SocialScience': 0, 'STEM': 1},\n",
+ "    # 'FFF' occurs in the data but had no code, which turned those rows into NaN;\n",
+ "    # it takes the next free code so the codes already in use stay unchanged\n",
+ "    'code_module': {'AAA': 0, 'BBB': 1, 'CCC': 2, 'DDD': 3, 'EEE': 4, 'GGG': 5, 'FFF': 6},\n",
+ "    'term': {'J': 0, 'B': 1},\n",
+ "    # keys are strings -- assumes year is read from the database as text; TODO confirm\n",
+ "    'year': {'2013': 0, '2014': 1},\n",
+ "    'gender': {'M': 0, 'F': 1},\n",
+ "    'age_band': {'0-35': 0, '35-55': 1, '55<=': 2},\n",
+ "    'region': {'Scotland': 0, 'East Anglian Region': 1, 'London Region': 2,\n",
+ "               'South Region': 3, 'North Western Region': 4, 'West Midlands Region': 5,\n",
+ "               'South West Region': 6, 'East Midlands Region': 7, 'South East Region': 8,\n",
+ "               'Wales': 9, 'Yorkshire Region': 10, 'North Region': 11, 'Ireland': 12},\n",
+ "    # outcome collapsed to two classes: Withdrawn/Fail -> 0, Pass/Distinction -> 1\n",
+ "    'final_result': {'Withdrawn': 0, 'Fail': 0, 'Pass': 1, 'Distinction': 1},\n",
+ "    'disability': {'N': 0, 'Y': 1},\n",
+ "    'highest_education': {'No Formal quals': 0, 'Lower Than A Level': 1,\n",
+ "                          'A Level or Equivalent': 2, 'HE Qualification': 3,\n",
+ "                          'Post Graduate Qualification': 4},\n",
+ "}\n",
+ "\n",
+ "for column, codes in encodings.items():\n",
+ "    # Report categories that have no code instead of letting them silently become NaN\n",
+ "    # (this also fires if the cell is run a second time on already-encoded data)\n",
+ "    unmapped = set(df[column].dropna().unique()) - set(codes)\n",
+ "    if unmapped:\n",
+ "        warnings.warn(f'{column}: no code for {sorted(map(str, unmapped))}; these rows become NaN')\n",
+ "    df[column] = df[column].map(codes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " id_student | \n",
+ " code_module | \n",
+ " module_domain | \n",
+ " module_presentation_length | \n",
+ " term | \n",
+ " year | \n",
+ " num_of_prev_attempts | \n",
+ " final_result | \n",
+ " date_registration | \n",
+ " disability | \n",
+ " gender | \n",
+ " age_band | \n",
+ " region | \n",
+ " highest_education | \n",
+ " imd_band | \n",
+ " studied_credits | \n",
+ " b4_sum_clicks | \n",
+ " qtr_sum_clicks | \n",
+ " half_sum_clicks | \n",
+ " threeqtr_sum_clicks | \n",
+ " qtr_half_sum_clicks | \n",
+ " half_threeqtr_sum_clicks | \n",
+ " thrd_sum_clicks | \n",
+ " twothrd_sum_clicks | \n",
+ " thrd_twothrd_sum_clicks | \n",
+ " allclicks | \n",
+ " std_half_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 3733 | \n",
+ " 3.0 | \n",
+ " 1 | \n",
+ " 261 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " -68.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 9.0 | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 6516 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 269 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " -52.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 8.0 | \n",
+ " 60 | \n",
+ " 256.0 | \n",
+ " 862.0 | \n",
+ " 1347.0 | \n",
+ " 2066.0 | \n",
+ " 513.0 | \n",
+ " 719.0 | \n",
+ " 1040.0 | \n",
+ " 1875.0 | \n",
+ " 307.0 | \n",
+ " 2791.0 | \n",
+ " 56.400000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " 8462 | \n",
+ " 3.0 | \n",
+ " 1 | \n",
+ " 261 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " -137.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3.0 | \n",
+ " 90 | \n",
+ " 81.0 | \n",
+ " 446.0 | \n",
+ " 565.0 | \n",
+ " 565.0 | \n",
+ " 119.0 | \n",
+ " 0.0 | \n",
+ " 525.0 | \n",
+ " 565.0 | \n",
+ " 40.0 | \n",
+ " 646.0 | \n",
+ " 58.166667 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 8462 | \n",
+ " 3.0 | \n",
+ " 1 | \n",
+ " 262 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " -38.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3.0 | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " 11391 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 268 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " -159.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 9.0 | \n",
+ " 240 | \n",
+ " 98.0 | \n",
+ " 447.0 | \n",
+ " 612.0 | \n",
+ " 650.0 | \n",
+ " 165.0 | \n",
+ " 38.0 | \n",
+ " 489.0 | \n",
+ " 615.0 | \n",
+ " 123.0 | \n",
+ " 934.0 | \n",
+ " 81.600000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 5 | \n",
+ " 23629 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 240 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " -47.0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ " 60 | \n",
+ " 14.0 | \n",
+ " 105.0 | \n",
+ " 147.0 | \n",
+ " 147.0 | \n",
+ " 42.0 | \n",
+ " 0.0 | \n",
+ " 117.0 | \n",
+ " 147.0 | \n",
+ " 30.0 | \n",
+ " 161.0 | \n",
+ " 38.813953 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 6 | \n",
+ " 23632 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 268 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " -194.0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 4.0 | \n",
+ " 60 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7 | \n",
+ " 23698 | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 269 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " -110.0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 5.0 | \n",
+ " 120 | \n",
+ " 181.0 | \n",
+ " 325.0 | \n",
+ " 475.0 | \n",
+ " 587.0 | \n",
+ " 153.0 | \n",
+ " 112.0 | \n",
+ " 325.0 | \n",
+ " 565.0 | \n",
+ " 150.0 | \n",
+ " 910.0 | \n",
+ " 74.325000 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 8 | \n",
+ " 23798 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ " 268 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " -27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 9 | \n",
+ " 2 | \n",
+ " 5.0 | \n",
+ " 60 | \n",
+ " 6.0 | \n",
+ " 212.0 | \n",
+ " 365.0 | \n",
+ " 490.0 | \n",
+ " 153.0 | \n",
+ " 125.0 | \n",
+ " 255.0 | \n",
+ " 459.0 | \n",
+ " 110.0 | \n",
+ " 590.0 | \n",
+ " 89.612903 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 9 | \n",
+ " 24186 | \n",
+ " 5.0 | \n",
+ " 0 | \n",
+ " 241 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " -25.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 30 | \n",
+ " 12.0 | \n",
+ " 15.0 | \n",
+ " 42.0 | \n",
+ " 109.0 | \n",
+ " 27.0 | \n",
+ " 67.0 | \n",
+ " 38.0 | \n",
+ " 91.0 | \n",
+ " 4.0 | \n",
+ " 184.0 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index id_student code_module module_domain module_presentation_length \\\n",
+ "0 0 3733 3.0 1 261 \n",
+ "1 1 6516 0.0 0 269 \n",
+ "2 2 8462 3.0 1 261 \n",
+ "3 3 8462 3.0 1 262 \n",
+ "4 4 11391 0.0 0 268 \n",
+ "5 5 23629 1.0 0 240 \n",
+ "6 6 23632 1.0 0 268 \n",
+ "7 7 23698 2.0 1 269 \n",
+ "8 8 23798 1.0 0 268 \n",
+ "9 9 24186 5.0 0 241 \n",
+ "\n",
+ " term year num_of_prev_attempts final_result date_registration \\\n",
+ "0 0 0 0 0 -68.0 \n",
+ "1 0 1 0 1 -52.0 \n",
+ "2 0 0 0 0 -137.0 \n",
+ "3 0 1 1 0 -38.0 \n",
+ "4 0 0 0 1 -159.0 \n",
+ "5 1 0 2 0 -47.0 \n",
+ "6 0 0 0 0 -194.0 \n",
+ "7 0 1 0 1 -110.0 \n",
+ "8 0 0 0 1 -27.0 \n",
+ "9 1 1 0 1 -25.0 \n",
+ "\n",
+ " disability gender age_band region highest_education imd_band \\\n",
+ "0 0 0 2 3 3 9.0 \n",
+ "1 0 0 2 0 3 8.0 \n",
+ "2 0 0 2 2 3 3.0 \n",
+ "3 0 0 2 2 3 3.0 \n",
+ "4 0 0 2 1 3 9.0 \n",
+ "5 0 1 0 1 1 2.0 \n",
+ "6 0 1 0 1 2 4.0 \n",
+ "7 0 1 0 1 2 5.0 \n",
+ "8 0 0 0 9 2 5.0 \n",
+ "9 1 1 0 10 1 1.0 \n",
+ "\n",
+ " studied_credits b4_sum_clicks qtr_sum_clicks half_sum_clicks \\\n",
+ "0 60 NaN NaN NaN \n",
+ "1 60 256.0 862.0 1347.0 \n",
+ "2 90 81.0 446.0 565.0 \n",
+ "3 60 NaN NaN NaN \n",
+ "4 240 98.0 447.0 612.0 \n",
+ "5 60 14.0 105.0 147.0 \n",
+ "6 60 NaN NaN NaN \n",
+ "7 120 181.0 325.0 475.0 \n",
+ "8 60 6.0 212.0 365.0 \n",
+ "9 30 12.0 15.0 42.0 \n",
+ "\n",
+ " threeqtr_sum_clicks qtr_half_sum_clicks half_threeqtr_sum_clicks \\\n",
+ "0 NaN NaN NaN \n",
+ "1 2066.0 513.0 719.0 \n",
+ "2 565.0 119.0 0.0 \n",
+ "3 NaN NaN NaN \n",
+ "4 650.0 165.0 38.0 \n",
+ "5 147.0 42.0 0.0 \n",
+ "6 NaN NaN NaN \n",
+ "7 587.0 153.0 112.0 \n",
+ "8 490.0 153.0 125.0 \n",
+ "9 109.0 27.0 67.0 \n",
+ "\n",
+ " thrd_sum_clicks twothrd_sum_clicks thrd_twothrd_sum_clicks allclicks \\\n",
+ "0 NaN NaN NaN NaN \n",
+ "1 1040.0 1875.0 307.0 2791.0 \n",
+ "2 525.0 565.0 40.0 646.0 \n",
+ "3 NaN NaN NaN NaN \n",
+ "4 489.0 615.0 123.0 934.0 \n",
+ "5 117.0 147.0 30.0 161.0 \n",
+ "6 NaN NaN NaN NaN \n",
+ "7 325.0 565.0 150.0 910.0 \n",
+ "8 255.0 459.0 110.0 590.0 \n",
+ "9 38.0 91.0 4.0 184.0 \n",
+ "\n",
+ " std_half_score \n",
+ "0 NaN \n",
+ "1 56.400000 \n",
+ "2 58.166667 \n",
+ "3 NaN \n",
+ "4 81.600000 \n",
+ "5 38.813953 \n",
+ "6 NaN \n",
+ "7 74.325000 \n",
+ "8 89.612903 \n",
+ "9 0.000000 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Inspect the first ten rows of the dataframe\n",
+ "df.iloc[:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write the dataframe to the 'featureSTG5' table, replacing the table if it already exists\n",
+ "from sqlalchemy import create_engine\n",
+ "\n",
+ "# No password in the URL: credentials must not be committed with the notebook.\n",
+ "# With none given, libpq falls back to the PGPASSWORD environment variable or a\n",
+ "# ~/.pgpass entry, so configure one of those before running this cell.\n",
+ "# NOTE: the password that used to be embedded in this URL should be rotated.\n",
+ "engine = create_engine('postgresql://postgres@database-1.c5vispb5ezxg.us-east-1.rds.amazonaws.com:5432/Dataset')\n",
+ "try:\n",
+ "    # NOTE(review): the dataframe index is written as well (to_sql default index=True),\n",
+ "    # in addition to the existing 'index' column -- confirm this is wanted\n",
+ "    df.to_sql('featureSTG5', engine, if_exists='replace')\n",
+ "finally:\n",
+ "    # Release the engine's pooled connections even if the write fails\n",
+ "    engine.dispose()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Close the psycopg2 connection held in `conn`\n",
+ "disconnectDB(conn=conn)"
+ ]
+ }
+ ],
+ "metadata": {
+ "celltoolbar": "Raw Cell Format",
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}