diff --git a/Data_wrangling.ipynb b/Data_wrangling.ipynb new file mode 100644 index 0000000..abcf063 --- /dev/null +++ b/Data_wrangling.ipynb @@ -0,0 +1,1682 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import psycopg2\n", + "import pandas as pd\n", + "import pandas.io.sql as pd_sql\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "def connectDB(DB):\n", + "    \"\"\"Open a psycopg2 connection to database DB on the PostgreSQL server.\n", + "\n", + "    The password is read from the PGPASSWORD environment variable so that it\n", + "    is not stored in the notebook; a KeyError is raised if it is not set.\n", + "    \"\"\"\n", + "    return psycopg2.connect(\n", + "        database=DB,\n", + "        user=\"postgres\",\n", + "        password=os.environ[\"PGPASSWORD\"],\n", + "        host=\"database-1.c5vispb5ezxg.us-east-1.rds.amazonaws.com\",\n", + "        port='5432')\n", + "\n", + "def disconnectDB(conn):\n", + "    \"\"\"Close the given database connection.\"\"\"\n", + "    conn.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexid_studentcode_modulecode_presentationmodule_domainmodule_presentation_lengthtermyearnum_of_prev_attemptsfinal_resultpass_fail_indreg_perioddate_registrationdate_unregistrationdisabilitygenderage_bandregionhighest_educationimd_bandstudied_creditsb4_sum_clicksqtr_sum_clickshalf_sum_clicksthreeqtr_sum_clicksqtr_half_sum_clickshalf_threeqtr_sum_clicksthrd_sum_clickstwothrd_sum_clicksthrd_twothrd_sum_clicksallclicksstd_half_scorestd_total_weight
003733DDD2013JSTEM261J20130WithdrawnNoneQUARTERB4-68.0-8.0NM55<=South RegionHE Qualification90-100%60NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
116516AAA2014JSocialScience269J20140PassPASSQUARTERB4-52.0NaNNM55<=ScotlandHE Qualification80-90%60256.0862.01347.02066.0513.0719.01040.01875.0307.02791.056.400000100.0
228462DDD2013JSTEM261J20130WithdrawnNoneLONGB4-137.0119.0NM55<=London RegionHE Qualification30-40%9081.0446.0565.0565.0119.00.0525.0565.040.0646.058.16666740.0
338462DDD2014JSTEM262J20141WithdrawnNoneQUARTERB4-38.018.0NM55<=London RegionHE Qualification30-40%60NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4411391AAA2013JSocialScience268J20130PassPASSLONGB4-159.0NaNNM55<=East Anglian RegionHE Qualification90-100%24098.0447.0612.0650.0165.038.0489.0615.0123.0934.081.600000100.0
......................................................................................................
959533897DDD2013JSTEM261J20130WithdrawnNoneQUARTERB4-58.0107.0NM0-35West Midlands RegionLower Than A Level30-40%600.0322.0322.0322.00.00.0322.0322.00.0322.08.33333310.0
969633897DDD2014JSTEM262J20141WithdrawnNoneMONTHB4-18.041.0NM0-35West Midlands RegionLower Than A Level30-40%60NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
979733915FFF2013BSTEM240B20130DistinctionPASSQUARTERB4-64.0NaNNF0-35South East RegionA Level or Equivalent30-40%60144.01169.02145.03206.01012.01061.01456.03137.0689.04121.091.500000100.0
989833930DDD2013JSTEM261J20130PassPASSLONGB4-143.0NaNNF0-35South West RegionA Level or Equivalent20-30%9064.0175.0213.0410.038.0197.0185.0394.028.0544.084.62500080.0
999934068CCC2014JSTEM269J20140WithdrawnNoneQUARTERB4-74.0108.0NM0-35ScotlandLower Than A Level50-60%3030.0544.0583.0583.0147.00.0579.0583.04.0613.033.05000018.0
\n", + "

100 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " index id_student code_module code_presentation module_domain \\\n", + "0 0 3733 DDD 2013J STEM \n", + "1 1 6516 AAA 2014J SocialScience \n", + "2 2 8462 DDD 2013J STEM \n", + "3 3 8462 DDD 2014J STEM \n", + "4 4 11391 AAA 2013J SocialScience \n", + ".. ... ... ... ... ... \n", + "95 95 33897 DDD 2013J STEM \n", + "96 96 33897 DDD 2014J STEM \n", + "97 97 33915 FFF 2013B STEM \n", + "98 98 33930 DDD 2013J STEM \n", + "99 99 34068 CCC 2014J STEM \n", + "\n", + " module_presentation_length term year num_of_prev_attempts final_result \\\n", + "0 261 J 2013 0 Withdrawn \n", + "1 269 J 2014 0 Pass \n", + "2 261 J 2013 0 Withdrawn \n", + "3 262 J 2014 1 Withdrawn \n", + "4 268 J 2013 0 Pass \n", + ".. ... ... ... ... ... \n", + "95 261 J 2013 0 Withdrawn \n", + "96 262 J 2014 1 Withdrawn \n", + "97 240 B 2013 0 Distinction \n", + "98 261 J 2013 0 Pass \n", + "99 269 J 2014 0 Withdrawn \n", + "\n", + " pass_fail_ind reg_period date_registration date_unregistration \\\n", + "0 None QUARTERB4 -68.0 -8.0 \n", + "1 PASS QUARTERB4 -52.0 NaN \n", + "2 None LONGB4 -137.0 119.0 \n", + "3 None QUARTERB4 -38.0 18.0 \n", + "4 PASS LONGB4 -159.0 NaN \n", + ".. ... ... ... ... \n", + "95 None QUARTERB4 -58.0 107.0 \n", + "96 None MONTHB4 -18.0 41.0 \n", + "97 PASS QUARTERB4 -64.0 NaN \n", + "98 PASS LONGB4 -143.0 NaN \n", + "99 None QUARTERB4 -74.0 108.0 \n", + "\n", + " disability gender age_band region highest_education \\\n", + "0 N M 55<= South Region HE Qualification \n", + "1 N M 55<= Scotland HE Qualification \n", + "2 N M 55<= London Region HE Qualification \n", + "3 N M 55<= London Region HE Qualification \n", + "4 N M 55<= East Anglian Region HE Qualification \n", + ".. ... ... ... ... ... 
\n", + "95 N M 0-35 West Midlands Region Lower Than A Level \n", + "96 N M 0-35 West Midlands Region Lower Than A Level \n", + "97 N F 0-35 South East Region A Level or Equivalent \n", + "98 N F 0-35 South West Region A Level or Equivalent \n", + "99 N M 0-35 Scotland Lower Than A Level \n", + "\n", + " imd_band studied_credits b4_sum_clicks qtr_sum_clicks half_sum_clicks \\\n", + "0 90-100% 60 NaN NaN NaN \n", + "1 80-90% 60 256.0 862.0 1347.0 \n", + "2 30-40% 90 81.0 446.0 565.0 \n", + "3 30-40% 60 NaN NaN NaN \n", + "4 90-100% 240 98.0 447.0 612.0 \n", + ".. ... ... ... ... ... \n", + "95 30-40% 60 0.0 322.0 322.0 \n", + "96 30-40% 60 NaN NaN NaN \n", + "97 30-40% 60 144.0 1169.0 2145.0 \n", + "98 20-30% 90 64.0 175.0 213.0 \n", + "99 50-60% 30 30.0 544.0 583.0 \n", + "\n", + " threeqtr_sum_clicks qtr_half_sum_clicks half_threeqtr_sum_clicks \\\n", + "0 NaN NaN NaN \n", + "1 2066.0 513.0 719.0 \n", + "2 565.0 119.0 0.0 \n", + "3 NaN NaN NaN \n", + "4 650.0 165.0 38.0 \n", + ".. ... ... ... \n", + "95 322.0 0.0 0.0 \n", + "96 NaN NaN NaN \n", + "97 3206.0 1012.0 1061.0 \n", + "98 410.0 38.0 197.0 \n", + "99 583.0 147.0 0.0 \n", + "\n", + " thrd_sum_clicks twothrd_sum_clicks thrd_twothrd_sum_clicks allclicks \\\n", + "0 NaN NaN NaN NaN \n", + "1 1040.0 1875.0 307.0 2791.0 \n", + "2 525.0 565.0 40.0 646.0 \n", + "3 NaN NaN NaN NaN \n", + "4 489.0 615.0 123.0 934.0 \n", + ".. ... ... ... ... \n", + "95 322.0 322.0 0.0 322.0 \n", + "96 NaN NaN NaN NaN \n", + "97 1456.0 3137.0 689.0 4121.0 \n", + "98 185.0 394.0 28.0 544.0 \n", + "99 579.0 583.0 4.0 613.0 \n", + "\n", + " std_half_score std_total_weight \n", + "0 NaN NaN \n", + "1 56.400000 100.0 \n", + "2 58.166667 40.0 \n", + "3 NaN NaN \n", + "4 81.600000 100.0 \n", + ".. ... ... 
\n", + "95 8.333333 10.0 \n", + "96 NaN NaN \n", + "97 91.500000 100.0 \n", + "98 84.625000 80.0 \n", + "99 33.050000 18.0 \n", + "\n", + "[100 rows x 33 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# connect to \"Dataset\" DB \n", + "conn = connectDB(\"Dataset\")\n", + "\n", + "# extract everything from 'table_name' into a dataframe\n", + "df = pd_sql.read_sql(f\"select * from public.\\\"analysisFeatures\\\" \", con=conn).reset_index()\n", + "\n", + "#make sure that all columns are displayed in our dataframe\n", + "pd.set_option('display.max_column',50)\n", + "\n", + "#check dataframe\n", + "df.head(100)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "22521" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#count null values of date_unregistration\n", + "df['date_unregistration'].isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexid_studentcode_modulemodule_domainmodule_presentation_lengthtermyearnum_of_prev_attemptsfinal_resultdate_registrationdisabilitygenderage_bandregionhighest_educationimd_bandstudied_creditsb4_sum_clicksqtr_sum_clickshalf_sum_clicksthreeqtr_sum_clicksqtr_half_sum_clickshalf_threeqtr_sum_clicksthrd_sum_clickstwothrd_sum_clicksthrd_twothrd_sum_clicksallclicksstd_half_score
003733DDDSTEM261J20130Withdrawn-68.0NM55<=South RegionHE Qualification90-100%60NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
116516AAASocialScience269J20140Pass-52.0NM55<=ScotlandHE Qualification80-90%60256.0862.01347.02066.0513.0719.01040.01875.0307.02791.056.400000
228462DDDSTEM261J20130Withdrawn-137.0NM55<=London RegionHE Qualification30-40%9081.0446.0565.0565.0119.00.0525.0565.040.0646.058.166667
338462DDDSTEM262J20141Withdrawn-38.0NM55<=London RegionHE Qualification30-40%60NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4411391AAASocialScience268J20130Pass-159.0NM55<=East Anglian RegionHE Qualification90-100%24098.0447.0612.0650.0165.038.0489.0615.0123.0934.081.600000
5523629BBBSocialScience240B20132Fail-47.0NF0-35East Anglian RegionLower Than A Level20-30%6014.0105.0147.0147.042.00.0117.0147.030.0161.038.813953
6623632BBBSocialScience268J20130Withdrawn-194.0NF0-35East Anglian RegionA Level or Equivalent40-50%60NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
7723698CCCSTEM269J20140Pass-110.0NF0-35East Anglian RegionA Level or Equivalent50-60%120181.0325.0475.0587.0153.0112.0325.0565.0150.0910.074.325000
8823798BBBSocialScience268J20130Distinction-27.0NM0-35WalesA Level or Equivalent50-60%606.0212.0365.0490.0153.0125.0255.0459.0110.0590.089.612903
9924186GGGSocialScience241B20140Pass-25.0YF0-35Yorkshire RegionLower Than A Level10-203012.015.042.0109.027.067.038.091.04.0184.00.000000
\n", + "
" + ], + "text/plain": [ + " index id_student code_module module_domain module_presentation_length \\\n", + "0 0 3733 DDD STEM 261 \n", + "1 1 6516 AAA SocialScience 269 \n", + "2 2 8462 DDD STEM 261 \n", + "3 3 8462 DDD STEM 262 \n", + "4 4 11391 AAA SocialScience 268 \n", + "5 5 23629 BBB SocialScience 240 \n", + "6 6 23632 BBB SocialScience 268 \n", + "7 7 23698 CCC STEM 269 \n", + "8 8 23798 BBB SocialScience 268 \n", + "9 9 24186 GGG SocialScience 241 \n", + "\n", + " term year num_of_prev_attempts final_result date_registration disability \\\n", + "0 J 2013 0 Withdrawn -68.0 N \n", + "1 J 2014 0 Pass -52.0 N \n", + "2 J 2013 0 Withdrawn -137.0 N \n", + "3 J 2014 1 Withdrawn -38.0 N \n", + "4 J 2013 0 Pass -159.0 N \n", + "5 B 2013 2 Fail -47.0 N \n", + "6 J 2013 0 Withdrawn -194.0 N \n", + "7 J 2014 0 Pass -110.0 N \n", + "8 J 2013 0 Distinction -27.0 N \n", + "9 B 2014 0 Pass -25.0 Y \n", + "\n", + " gender age_band region highest_education imd_band \\\n", + "0 M 55<= South Region HE Qualification 90-100% \n", + "1 M 55<= Scotland HE Qualification 80-90% \n", + "2 M 55<= London Region HE Qualification 30-40% \n", + "3 M 55<= London Region HE Qualification 30-40% \n", + "4 M 55<= East Anglian Region HE Qualification 90-100% \n", + "5 F 0-35 East Anglian Region Lower Than A Level 20-30% \n", + "6 F 0-35 East Anglian Region A Level or Equivalent 40-50% \n", + "7 F 0-35 East Anglian Region A Level or Equivalent 50-60% \n", + "8 M 0-35 Wales A Level or Equivalent 50-60% \n", + "9 F 0-35 Yorkshire Region Lower Than A Level 10-20 \n", + "\n", + " studied_credits b4_sum_clicks qtr_sum_clicks half_sum_clicks \\\n", + "0 60 NaN NaN NaN \n", + "1 60 256.0 862.0 1347.0 \n", + "2 90 81.0 446.0 565.0 \n", + "3 60 NaN NaN NaN \n", + "4 240 98.0 447.0 612.0 \n", + "5 60 14.0 105.0 147.0 \n", + "6 60 NaN NaN NaN \n", + "7 120 181.0 325.0 475.0 \n", + "8 60 6.0 212.0 365.0 \n", + "9 30 12.0 15.0 42.0 \n", + "\n", + " threeqtr_sum_clicks qtr_half_sum_clicks 
half_threeqtr_sum_clicks \\\n", + "0 NaN NaN NaN \n", + "1 2066.0 513.0 719.0 \n", + "2 565.0 119.0 0.0 \n", + "3 NaN NaN NaN \n", + "4 650.0 165.0 38.0 \n", + "5 147.0 42.0 0.0 \n", + "6 NaN NaN NaN \n", + "7 587.0 153.0 112.0 \n", + "8 490.0 153.0 125.0 \n", + "9 109.0 27.0 67.0 \n", + "\n", + " thrd_sum_clicks twothrd_sum_clicks thrd_twothrd_sum_clicks allclicks \\\n", + "0 NaN NaN NaN NaN \n", + "1 1040.0 1875.0 307.0 2791.0 \n", + "2 525.0 565.0 40.0 646.0 \n", + "3 NaN NaN NaN NaN \n", + "4 489.0 615.0 123.0 934.0 \n", + "5 117.0 147.0 30.0 161.0 \n", + "6 NaN NaN NaN NaN \n", + "7 325.0 565.0 150.0 910.0 \n", + "8 255.0 459.0 110.0 590.0 \n", + "9 38.0 91.0 4.0 184.0 \n", + "\n", + " std_half_score \n", + "0 NaN \n", + "1 56.400000 \n", + "2 58.166667 \n", + "3 NaN \n", + "4 81.600000 \n", + "5 38.813953 \n", + "6 NaN \n", + "7 74.325000 \n", + "8 89.612903 \n", + "9 0.000000 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drop_list = ['reg_period','code_presentation','date_unregistration','pass_fail_ind','std_total_weight']\n", + "\n", + "df = df.drop(drop_list, axis=1)\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "STEM 21402\n", + "SocialScience 11191\n", + "Name: module_domain, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['module_domain'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df['code_module'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# encode the categorical columns as integers; Series.map yields NaN for any value not listed in its dict\n", + "df['imd_band'] = df['imd_band'].map({'0-10%':0,'10-20':1,'20-30%':2,'30-40%':3,'40-50%':4,'50-60%':5,'60-70%':6,'70-80%':7,'80-90%':8,'90-100%':9})\n", +
"df['module_domain'] = df['module_domain'].map({'SocialScience': 0,'STEM': 1})\n", + "df['code_module'] = df['code_module'].map({'AAA': 0,'BBB': 1,'CCC':2,'DDD':3,'EEE':4,'GGG':5,'FFF':6})  # 'FFF' occurs in the data but was missing here, so those rows became NaN; appended as 6 so the existing codes are unchanged\n", + "df['term'] = df['term'].map({'J': 0,'B': 1})\n", + "df['year'] = df['year'].map({'2013': 0,'2014': 1})\n", + "df['gender'] = df['gender'].map({'M': 0,'F': 1})\n", + "df['age_band'] = df['age_band'].map({'0-35': 0,'35-55': 1,'55<=':2})\n", + "df['region'] = df['region'].map({'Scotland': 0,'East Anglian Region': 1,'London Region':2,'South Region': 3,'North Western Region': 4,'West Midlands Region':5,'South West Region': 6,'East Midlands Region': 7,'South East Region':8,'Wales': 9,'Yorkshire Region': 10,'North Region':11,'Ireland':12})\n", + "df['final_result'] = df['final_result'].map({'Withdrawn':0, 'Fail':0,'Pass':1,'Distinction':1})\n", + "df['disability'] = df['disability'].map({'N':0,'Y':1})\n", + "df['highest_education'] = df['highest_education'].map({'No Formal quals':0,'Lower Than A Level':1,'A Level or Equivalent':2,'HE Qualification':3,'Post Graduate Qualification':4})" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexid_studentcode_modulemodule_domainmodule_presentation_lengthtermyearnum_of_prev_attemptsfinal_resultdate_registrationdisabilitygenderage_bandregionhighest_educationimd_bandstudied_creditsb4_sum_clicksqtr_sum_clickshalf_sum_clicksthreeqtr_sum_clicksqtr_half_sum_clickshalf_threeqtr_sum_clicksthrd_sum_clickstwothrd_sum_clicksthrd_twothrd_sum_clicksallclicksstd_half_score
0037333.012610000-68.0002339.060NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1165160.002690101-52.0002038.060256.0862.01347.02066.0513.0719.01040.01875.0307.02791.056.400000
2284623.012610000-137.0002233.09081.0446.0565.0565.0119.00.0525.0565.040.0646.058.166667
3384623.012620110-38.0002233.060NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
44113910.002680001-159.0002139.024098.0447.0612.0650.0165.038.0489.0615.0123.0934.081.600000
55236291.002401020-47.0010112.06014.0105.0147.0147.042.00.0117.0147.030.0161.038.813953
66236321.002680000-194.0010124.060NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
77236982.012690101-110.0010125.0120181.0325.0475.0587.0153.0112.0325.0565.0150.0910.074.325000
88237981.002680001-27.0000925.0606.0212.0365.0490.0153.0125.0255.0459.0110.0590.089.612903
99241865.002411101-25.01101011.03012.015.042.0109.027.067.038.091.04.0184.00.000000
\n", + "
" + ], + "text/plain": [ + " index id_student code_module module_domain module_presentation_length \\\n", + "0 0 3733 3.0 1 261 \n", + "1 1 6516 0.0 0 269 \n", + "2 2 8462 3.0 1 261 \n", + "3 3 8462 3.0 1 262 \n", + "4 4 11391 0.0 0 268 \n", + "5 5 23629 1.0 0 240 \n", + "6 6 23632 1.0 0 268 \n", + "7 7 23698 2.0 1 269 \n", + "8 8 23798 1.0 0 268 \n", + "9 9 24186 5.0 0 241 \n", + "\n", + " term year num_of_prev_attempts final_result date_registration \\\n", + "0 0 0 0 0 -68.0 \n", + "1 0 1 0 1 -52.0 \n", + "2 0 0 0 0 -137.0 \n", + "3 0 1 1 0 -38.0 \n", + "4 0 0 0 1 -159.0 \n", + "5 1 0 2 0 -47.0 \n", + "6 0 0 0 0 -194.0 \n", + "7 0 1 0 1 -110.0 \n", + "8 0 0 0 1 -27.0 \n", + "9 1 1 0 1 -25.0 \n", + "\n", + " disability gender age_band region highest_education imd_band \\\n", + "0 0 0 2 3 3 9.0 \n", + "1 0 0 2 0 3 8.0 \n", + "2 0 0 2 2 3 3.0 \n", + "3 0 0 2 2 3 3.0 \n", + "4 0 0 2 1 3 9.0 \n", + "5 0 1 0 1 1 2.0 \n", + "6 0 1 0 1 2 4.0 \n", + "7 0 1 0 1 2 5.0 \n", + "8 0 0 0 9 2 5.0 \n", + "9 1 1 0 10 1 1.0 \n", + "\n", + " studied_credits b4_sum_clicks qtr_sum_clicks half_sum_clicks \\\n", + "0 60 NaN NaN NaN \n", + "1 60 256.0 862.0 1347.0 \n", + "2 90 81.0 446.0 565.0 \n", + "3 60 NaN NaN NaN \n", + "4 240 98.0 447.0 612.0 \n", + "5 60 14.0 105.0 147.0 \n", + "6 60 NaN NaN NaN \n", + "7 120 181.0 325.0 475.0 \n", + "8 60 6.0 212.0 365.0 \n", + "9 30 12.0 15.0 42.0 \n", + "\n", + " threeqtr_sum_clicks qtr_half_sum_clicks half_threeqtr_sum_clicks \\\n", + "0 NaN NaN NaN \n", + "1 2066.0 513.0 719.0 \n", + "2 565.0 119.0 0.0 \n", + "3 NaN NaN NaN \n", + "4 650.0 165.0 38.0 \n", + "5 147.0 42.0 0.0 \n", + "6 NaN NaN NaN \n", + "7 587.0 153.0 112.0 \n", + "8 490.0 153.0 125.0 \n", + "9 109.0 27.0 67.0 \n", + "\n", + " thrd_sum_clicks twothrd_sum_clicks thrd_twothrd_sum_clicks allclicks \\\n", + "0 NaN NaN NaN NaN \n", + "1 1040.0 1875.0 307.0 2791.0 \n", + "2 525.0 565.0 40.0 646.0 \n", + "3 NaN NaN NaN NaN \n", + "4 489.0 615.0 123.0 934.0 \n", + "5 117.0 147.0 30.0 
161.0 \n", + "6 NaN NaN NaN NaN \n", + "7 325.0 565.0 150.0 910.0 \n", + "8 255.0 459.0 110.0 590.0 \n", + "9 38.0 91.0 4.0 184.0 \n", + "\n", + " std_half_score \n", + "0 NaN \n", + "1 56.400000 \n", + "2 58.166667 \n", + "3 NaN \n", + "4 81.600000 \n", + "5 38.813953 \n", + "6 NaN \n", + "7 74.325000 \n", + "8 89.612903 \n", + "9 0.000000 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# write dataframe to database\n", + "import os\n", + "from urllib.parse import quote_plus\n", + "from sqlalchemy import create_engine\n", + "# the password is read from the PGPASSWORD environment variable (KeyError if unset) and URL-escaped,\n", + "# so it is not stored in the notebook and special characters do not break the connection URL\n", + "db_password = quote_plus(os.environ['PGPASSWORD'])\n", + "engine = create_engine(f'postgresql://postgres:{db_password}@database-1.c5vispb5ezxg.us-east-1.rds.amazonaws.com:5432/Dataset')\n", + "df.to_sql('featureSTG5', engine, if_exists='replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "disconnectDB(conn)" + ] + } + ], + "metadata": { + "celltoolbar": "Raw Cell Format", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}