From 411cf4dfaf848ff5cf5ddfcbc681ffda20f50930 Mon Sep 17 00:00:00 2001 From: Shu-Yi Date: Thu, 19 Dec 2019 23:48:40 -0500 Subject: [PATCH] assignment 7 --- Assignment7.Rmd | 80 ++++++++++++++++++++++++++++++++++++++---------- tree2.ps | Bin 0 -> 24337 bytes 2 files changed, 64 insertions(+), 16 deletions(-) create mode 100644 tree2.ps diff --git a/Assignment7.Rmd b/Assignment7.Rmd index 105cbdf..984b710 100644 --- a/Assignment7.Rmd +++ b/Assignment7.Rmd @@ -1,7 +1,7 @@ --- title: "Assignment 7 - Answers" -author: "Charles Lang" -date: "11/30/2016" +author: "Shu-Yi Hsu" +date: "12/17/2016" output: html_document --- @@ -11,42 +11,88 @@ In the following assignment you will be looking at data from an one level of an #Upload data ```{r} - +D1 <- read.csv("online.data.csv") +View(D1) ``` #Visualization ```{r} #Start by creating histograms of the distributions for all variables (#HINT: look up "facet" in the ggplot documentation) -#Then visualize the relationships between variables +library(ggplot2) +library(tidyr) + + +D1$level.up <- ifelse(D1$level.up == 'yes', 1, 0) +View(D1) +D2 <- D1[,-1] +View(D2) +Dlong <- gather(D2, 'vars', 'value', 1:6) +Dlong$value <- as.numeric(Dlong$value) +ggplot(Dlong,aes(value, ..density..)) + geom_histogram(binwidth = .1) + facet_wrap(~vars, scales ='free') + +#Then visualize the relationships between variables +cor(D2) +pairs(D2) +qplot(x = forum.posts, y=pre.test.score, data =D2, main = "pre-test", margins =TRUE) +qplot(x = forum.posts, y=post.test.score, data =D2, main = "post-test", margins =TRUE) +qplot(x = forum.posts, y=av.assignment.score, data =D2, main = "Average assignment", margins =TRUE) +qplot(x= av.assignment.score, y=post.test.score, data =D2, main = "post-test", margins =TRUE) #Try to capture an intution about the data and the relationships ``` #Classification tree ```{r} #Create a classification tree that predicts whether a student "levels up" in the online course using three variables of your choice (As we did last time, set all controls to their minimums) - +install.packages("rpart.plot") +library(rpart) +library(rpart.plot) +tree1<-rpart(level.up~ messages+ forum.posts+pre.test.score, method ="class", data=D1) +str(D1) +D1$messages <- as.numeric(D1$messages) +D1$forum.posts <- as.numeric(D1$forum.posts) + +printcp(tree1) #Plot and generate a CP table for your tree - +rpart.plot(tree1) #Generate a probability value that represents the probability that a student levels up based your classification tree -D1$pred <- predict(rp, type = "prob")[,2]#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on. +D1$pred <- predict(tree1, type = "prob") [,2] +#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on. ``` ## Part II #Now you can generate the ROC curve for your model. You will need to install the package ROCR to do this. ```{r} library(ROCR) - #Plot the curve + pred.detail <- prediction(D1$pred, D1$level.up) plot(performance(pred.detail, "tpr", "fpr")) abline(0, 1, lty = 2) +pred.detail1 <- prediction(D1$pred1, D1$level.up) +plot(performance(pred.detail1, "tpr", "fpr")) +abline(0, 1, lty = 2) + #Calculate the Area Under the Curve -unlist(slot(performance(Pred2,"auc"), "y.values"))#Unlist liberates the AUC value from the "performance" object created by ROCR +unlist(slot(performance(Pred2,"auc"), "y.values")) + +unlist(slot(performance(pred.detail1,"auc"), "y.values")) + +#Unlist liberates the AUC value from the "performance" object created by ROCR #Now repeat this process, but using the variables you did not use for the previous model and compare the plots & results of your two models. Which one do you think was the better model? Why? +tree2 <- rpart(level.up ~ forum.posts + av.assignment.score + pre.test.score, method ='class', control = rpart.control(minsplit =1, minbucket =1, cp =0.001), data =D1 ) +printcp(tree2) +post(tree2, file = "tree2.ps", title = "Level Up - Tree 2") +D1$pred2 <- predict(tree2, type = "prob")[,2] +pred.detail2 <- prediction(D1$pred2, D1$level.up) +plot(performance(pred.detail2, "tpr", "fpr")) +abline(0, 1, lty = 2) + +unlist(slot(performance(pred.detail2,"auc"), "y.values")) #0.997 + ``` ## Part III #Thresholds @@ -54,28 +100,30 @@ unlist(slot(performance(Pred2,"auc"), "y.values"))#Unlist liberates the AUC valu #Look at the ROC plot for your first model. Based on this plot choose a probability threshold that balances capturing the most correct predictions against false positives. Then generate a new variable in your data set that classifies each student according to your chosen threshold. threshold.pred1 <- - +D1$threshold.pred1 <- ifelse(D1$pred > 0.1, 1, 0) #Now generate three diagnostics: -D1$accuracy.model1 <- +D1$accuracy.model1 <- (571+398)/(571+29+2+398) #0.969 -D1$precision.model1 <- +D1$precision.model1 <- 571/(571+2) #0.99 -D1$recall.model1 <- +D1$recall.model1 <- 571/(571+29) #0.9517 #Finally, calculate Kappa for your model according to: +table1 <- table(D1$level.up, D1$threshold.pred1) #First generate the table of comparisons -table1 <- table(D1$level.up, D1$threshold.pred1) +matrix1 <- as.matrix(table1) #Convert to matrix -matrix1 <- as.matrix(table1) +kappa(matrix1, exact = TRUE)/kappa(matrix1) #Calculate kappa -kappa(matrix1, exact = TRUE)/kappa(matrix1) +kappa(matrix1, exact = TRUE)/kappa(matrix1) # =1.103 #Now choose a different threshold value and repeat these diagnostics. What conclusions can you draw about your two thresholds? + ``` ### To Submit Your Assignment diff --git a/tree2.ps b/tree2.ps new file mode 100644 index 0000000000000000000000000000000000000000..3d80f7981c0aa243a16d71c4cbfaee109baa3a49 GIT binary patch literal 24337 zcmeHPTaVjDvVPaE=zTCS!D1+q-Mq-K4aB)^fFus$EcPMDgGQpq+SN!}N#i8&`oHg2 z)pwGTC^?QD#OGu^BAeCSRb5|Qy2*O*x9^`{e>z{^G}kgoM-Lu+zMkFRHLLx1O*3!i zKQz1b{dU&u9*P(1)n5FgS-xuai`f)U{=IeU`ssQ(@4Nr(X1Vz9ebd)OB%S8xzun!e zm%&`VZRU&VLorpNm=vN`nMlReC-@J~{J7XJn}^~D@zr*^dAXSF#IyVT=6;W+&$i8U zzur>g^Yx4UFVii};rr<=v2!)uH>)}I@NByPztjCposK3(6h=8M(s z)Ag?pMOM>)MOBG9S7`iVHUDgVN0Qpnqs{GN_5F6eT;GnyHya^di`$*p{qWUOnsSn+ znDKr)UG0__R!mZ{U1P+C9(p6@&5O|kaYJ2*WV1^)rXMtNKhswYp1N7A=KSdRcFXs- z%?{kfO$`sdd%6B)12zSo#D5{}ogmt~HCAyQH7x1=cDY!!8&=%qszHzYml4T~TV~7k zuIaSB;Bj6omdl7_&9`^^?fO6Q#An=KM@fY&Q!(AnB4V(OIVzDudg#|Mk%`%Q^P7<3 zI@9-?UuQ2NZvJB#g1`-Hc3M=Vw2H;J+s~9)Ns3_mASbU%;v3K56eU-_KiG3 z&^JICJ>RSc3{f2==^13t@+`!4EKtpsR?rb1q5sPSw6FPA z2LdJwkA+5pZOm)&lNf*Y^%qa4yJr6B)6eXr@a&^UV*GhCThE(szWpw^k0t@C*J6IZ z5owajNnVR((|&>?$E9`fh-#`xPQbn{m*DSk>5mF23az*nB-WhIHhY$BzKX&iI-ktVXJimJqq zDl4)g$B!(VWRv_DO;)E$(l}|AXH|h8lOmOA)}GwIUM}{{_v-~*C}xLe3K9?fub!mQ z!%v?0@qQ*{VNKa>rnA^!?wmGtRO!KkuUE6>{k*Z>yW_ma&X2=(t>&a>BhrIqv$_Sc z=Zo21`y2e8*@fR!T8|;Mm`uVeO z;8<6gyF#*KU^M-!u;ekP)3KcDVijxyVZNZeoi8}@UQ!PfGF zcd)1k>hNc@+Kgm6Nf4hasS>Hg-AJhjxKxNkbyI-=ZxrS-oe{Vx5r|I8d_#}$r|7HCZlq`|^eHLv=>p*?*j;ekMp;18B&z|h{r zNI1S40R~7RzJYRk@;EXva4-;@goj0Vb$oQP%7KZI8G^w=pfg-U2*Ab1)XQ7BCO;JF7TW{fT9*ObiR|9M+TA1NTS z5eg9zB2R@d1*>w$ferj0T`cJWW)T+@j9$aO=d@E->5Cc;maGR_IM9HHw1&`v5CS($uq#A z32moo8W>@m`4A_d;S0>&Ug0@h+0GHh;Y# zEWon@CUK>?1plD30)S}HW@Q5k+SfK>V}asIOO$1eD=lo8c+AYKgS#M^y5~5%Fx_(( zQSZ^fZcYjv@S51^I!;Ky5tF=U|y%HGh;E9B4gSoRl>8X1~vl>XuhN=nqIGqiZ*=jinltm4(lcaHcwULzn z10LAWKo)@1kv9*(B9_Oj4lZJp&>;ONA^bR$OgboG{DfS+N0dNM1C&7b2ciU88=!=7 z0Y48^I<>P_;uuPBSQns#uMVIDi1jGZ7kw!4U9c9|-iM>abHXZUnLLUTEj5D@EzrX# zA#JrN(Z9WYdq*f?4Kyfms@Xz`Q)a8>hfzXl7e}IIn~YF`5b1y~aUDaH9+$K*ZYsEd zEw!f9`jU?*VF^Tf3>hKvu?i^QHFU&Nd;Xz#GEs_qh z5M%M6&efZ%R>b8^A0j>Y_^vN8BkPDuPbxCAU<NJNp}6M3#-XkaW`(^LsW}VcC zd#s5UXn*#8E=qK_I(ygS@!k$E#K7$RZSs;nlo(%P5C8rGDCvv13}WH>kq zrihE?d~U==wlCUjh+@YmtnAO;4N82R+V4w9(Ouw6u(!TM|JJdx0a<5{>upd%>7R|b zh}m-@E)uA95+rHwOB7xSfPr?eZ4gRaV1|qLn3udOU-D-`Nw}>A|4zVoA1L9S@b*q{ z4@w9>{l4T7N@(BTh6fIPX2a1Yr^K<@3Fe8M@+H3mO57H2fRYOHgnz>EAHSe<93`|} z;C6z6Eq`9(HWILy`n-gbSG&g*otKcUpq*eGzQyMyI=Bf1q+=o<62%c0y}PWBGeT2E zsXX%A;JgGXW*>}9-`hl7$MO>B5H|T(08!vW*}IR4DChCvlD0?$g(DGHA4+@|QSZ^f zye2fK$d}NfIA204%WGnjFY8NsQKI8i=L|4;K7ayBysV>T2p5Q-G7Hi|3B6g)hgYEz zv{2&I#-1$bsdvvydVGn~S(bfkkIVRyK9rCz!O5Y06%YJgo|lk8Ae3}>f_qWYIaTaO zi92Z;ke9HA@rr&g3;2 zgc2WC9z}`vC8So7*}2@8XrFQPRPmrM=|PFpK$gEb%47N9hL#m};eitRZ6l_`5f|-o z;mtnx(kY?jaKzR1B{Ygwg+r%`=-ix2TkL;OzQnILK!|-;n9}&{U2Ak?W>4(|YhPkf z!fLEfTq8YxTVFyJA#z7i@6ZN6@Bm*zixM$uQNsL3qH+7@eJHumml%|=FVWrV>^+FM zcp_m466zCz7N=;Too=CoeThMdQ)Y`25~x*w8rMf$67OQ-m2mfZweuzPn6_^=2w>9} zL|lYL#^rI8{624}kmTNvFVP`gq(Fp{OW#mo)zxAf_bAxH})PSN^MLQ=54gdBOO%q^7k>~Yb1=GZN99}+lU5}|~3 zloEBxUc;C}@ogQ1VXScGP)^MTxeG282gZ;zM>H zaT!+5c-v9?l4E%Znln6I}4oskIWl zDEU+P5`z-AvD89|Q|6X0aS4Yr{(O!jo0?Rq{0_l=rclzb2**t-Gy?bA)LJ*&Pz_C#2E$aOZrjb_R#Uz zk@FIK#C_;(M}#XU&P(EmYY0l-%X!ILpad88_4auQ0B9Bk zzQiq1-+772OWtOWE6m={MG1dJ74C7xc}YnnB7#y4F5;rcj-$k#&_Mk0oAYjt$9>5O zl<@umTl?XsiVRI&5xYE;<)20g=K}rNyM}IOkLxdr5;7o>2WHC!VcWMIy{w~UXhmG^ zJUBFbcCE=|@miRG;z-0r$S^6@L0=N)B~E9dLa#ZZh=QVz+^IMS6?xzp0wbsy zfYL0KpyE4jbJKev`rZ{UA)M}~4?%?cSqKX41ea@IvoCAA#GQ|h8 z=kY1=b^?_zsPN@3l|UITp&TCuqig{Rno;qRvnatQULw4JQsM({RGztjQb7~osxP6$ z*B2$eHLbYb%Q<{f=@O);IqC@E!|!t_QSgoG8=zna)xtQ3QsJ9lsQLwZt58&wzBfK! z;|r>qLB$Zx#!5-b1IqI?zC_8O0^h`(Pgy||Qc#{ZTU&MtaG|gFQ&)D%HlRDauy9qT z=~$&fvne8lB2fz8z;_$K01A&>L55@BNV|?W5LAXLbEDY?LzUOjV7BTMB50hiU3oIo zAnChsG~}3L?fSIUrclxXSyeDa+W<=mQD0SK-(hB|OyQ*_Dk0%#ZW~Y?i7Inc0aH2Z zYyk&U2Z3mOXSTu6q`^3*x)T*@qPck*+hFFF$+5zh^QIJ-tz*?4$rQLUBY`SIwE;_o z$MI&2MByDAN(DU8t6S7SONj;<@MNc9Xu9(=G#Qu*Ml$uTsC-GJ6lUT%%4<+ zY?GsCLby(*0VOw3ndk~czC%+&D@QpPyxSBKnc7pBA+FBMOoJIF3udOEg*cw=&dfCQ&8&tNU}i?n zo*8KxjGVJE%F-ABk%A!d>8__~$jo}e`gj@u7*&tDila<}QJi$_4VHmI*O1r-XGn5S zRMhOGRL3@&25TGZu`Imm*ve%c`(qo7{mI8=z>=<5W~`4_9b3xGOfLjofm%K(Dt2@% zmThn>`Eg2A5hSzJnVW4eb1TPE1yrUcT4y9%EX|B0LFHJo#n3vY(Kc8!7bsnc>UJ;* z)|`3`vJHkN4+pAHlL(t_T{+kWKfBDY5_AM0)&F*?O>#lW+!f0N*xCgpVK^#L0E1N7 z=bND18XOY}wlz!f&yBbiKT`cADr%BZnXYZq2+Tm&Y-^~b?2p^&V%co*%hZF<@bimj HUyl9{LHaKN literal 0 HcmV?d00001