61 changes: 46 additions & 15 deletions Assignment7.Rmd
@@ -1,7 +1,7 @@
---
title: "Assignment 7 - Answers"
author: "Charles Lang"
date: "11/30/2016"
author: "Beibei Cao"
date: "12/09/2019"
output: html_document
---

@@ -11,24 +11,35 @@
In the following assignment you will be looking at data from one level of an

#Upload data
```{r}

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
# file.choose() ignores a path argument and opens an interactive dialog;
# the file lives at ~/Desktop/hudk4050/Assignment 7/online.data.csv
f <- file.choose()
D1 <- read.csv(f, header = TRUE)
```
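
A quick structural check after loading helps confirm the variables came in as expected (a sketch; `str()` and `summary()` are base R):

```{r}
str(D1)
summary(D1)
```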

#Visualization
```{r}
#Start by creating histograms of the distributions for all variables (#HINT: look up "facet" in the ggplot documentation)

# Recode level.up to 1/0 so it can be gathered and plotted with the numeric measures
D1$level.up <- ifelse(D1$level.up == "yes", 1, 0)
# Reshape to long format: one row per (student, measure) pair
D2 <- gather(D1, "measure", "score", 2:7)
p <- ggplot(D2, aes(score)) + facet_wrap(~measure, scales = "free")
p + geom_histogram()
#Then visualize the relationships between variables

pairs(D1)
#Try to capture an intuition about the data and the relationships

```
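
As a numeric complement to the `pairs()` plot, a correlation matrix makes the strongest relationships easier to rank (a sketch; it assumes the first column of D1 is a student id and every other column is numeric after the recoding above):

```{r}
round(cor(D1[, -1]), 2)
```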
#Classification tree
```{r}
#Create a classification tree that predicts whether a student "levels up" in the online course using three variables of your choice (As we did last time, set all controls to their minimums)

library(rpart)
library(rpart.plot)
rp <- rpart(level.up ~ post.test.score + av.assignment.score + forum.posts, method = "class", data = D1, control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.001))
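# minsplit = 1, minbucket = 1 and a very small cp let the tree grow almost
# unrestricted, i.e. "all controls at their minimums" as the prompt asks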
#Plot and generate a CP table for your tree

printcp(rp)
rpart.plot(rp)
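# Optional sketch: if the CP table shows cross-validated error (xerror) bottoming
# out at a smaller tree, rpart's prune() can cut the tree back at that cp value
# rp.pruned <- prune(rp, cp = rp$cptable[which.min(rp$cptable[, "xerror"]), "CP"])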
#Generate a probability value that represents the probability that a student levels up, based on your classification tree

# Last class we used type = "class", which returned the predicted class directly;
# type = "prob" instead returns the probability that each classification is based on.
D1$pred <- predict(rp, type = "prob")[, 2]
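# An illustrative check (not part of the template): predict() returns one
# probability column per class; column 2 is the probability of levelling up
head(predict(rp, type = "prob"))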
@@ -44,24 +44,34 @@
plot(performance(pred.detail, "tpr", "fpr"))
abline(0, 1, lty = 2)

#Calculate the Area Under the Curve
unlist(slot(performance(Pred2,"auc"), "y.values"))#Unlist liberates the AUC value from the "performance" object created by ROCR
unlist(slot(performance(pred.detail,"auc"), "y.values"))#Unlist liberates the AUC value from the "performance" object created by ROCR

#Now repeat this process, but using the variables you did not use for the previous model and compare the plots & results of your two models. Which one do you think was the better model? Why?
rp2 <- rpart(level.up ~ pre.test.score + messages, method = "class", data = D1, control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.001))
printcp(rp2)
rpart.plot(rp2)
D1$pred2 <- predict(rp2, type = "prob")[,2]
pred.detail2 <- prediction(D1$pred2, D1$level.up)
plot(performance(pred.detail2, "tpr", "fpr"))
abline(0, 1, lty = 2)
unlist(slot(performance(pred.detail2,"auc"), "y.values"))
#I think the first model is better, since its AUC is 1, which means it distinguishes the positive and negative classes perfectly (though an AUC of 1 on training data can also signal overfitting).
```
## Part III
#Thresholds
```{r}
#Look at the ROC plot for your first model. Based on this plot choose a probability threshold that balances capturing the most correct predictions against false positives. Then generate a new variable in your data set that classifies each student according to your chosen threshold.

threshold.pred1 <-
# Classify each student using the first model's predicted probabilities,
# coded 1/0 to match the recoded level.up
D1$threshold.pred1 <- ifelse(D1$pred >= 0.707, 1, 0)

#Now generate three diagnostics:

D1$accuracy.model1 <-

D1$precision.model1 <-

D1$recall.model1 <-
accuracy.model1 <- mean(ifelse(D1$level.up == D1$threshold.pred1, 1, 0))
D1$truepos.model1 <- ifelse(D1$level.up == 1 & D1$threshold.pred1 == 1, 1, 0)
D1$falsepos.model1 <- ifelse(D1$level.up == 0 & D1$threshold.pred1 == 1, 1, 0)
D1$falseneg.model1 <- ifelse(D1$level.up == 1 & D1$threshold.pred1 == 0, 1, 0)
precision.model1 <- sum(D1$truepos.model1)/(sum(D1$truepos.model1) + sum(D1$falsepos.model1))
recall.model1 <- sum(D1$truepos.model1)/(sum(D1$truepos.model1) + sum(D1$falseneg.model1))

#Finally, calculate Kappa for your model according to:

@@ -75,7 +75,17 @@
matrix1 <- as.matrix(table1)
kappa(matrix1, exact = TRUE)/kappa(matrix1)
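# Note: base R's kappa() returns a matrix condition number, not Cohen's Kappa.
# A sketch of Cohen's Kappa computed directly from the confusion matrix above:
po <- sum(diag(matrix1)) / sum(matrix1)                         # observed agreement
pe <- sum(rowSums(matrix1) * colSums(matrix1)) / sum(matrix1)^2 # agreement expected by chance
(po - pe) / (1 - pe)                                            # Cohen's Kappa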

#Now choose a different threshold value and repeat these diagnostics. What conclusions can you draw about your two thresholds?

# Same model, different cut-off: classify again at a lower threshold
D1$threshold.pred2 <- ifelse(D1$pred >= 0.606, 1, 0)
accuracy.model2 <- mean(ifelse(D1$level.up == D1$threshold.pred2, 1, 0))
D1$truepos.model2 <- ifelse(D1$level.up == 1 & D1$threshold.pred2 == 1, 1, 0)
D1$falsepos.model2 <- ifelse(D1$level.up == 0 & D1$threshold.pred2 == 1, 1, 0)
D1$falseneg.model2 <- ifelse(D1$level.up == 1 & D1$threshold.pred2 == 0, 1, 0)
precision.model2 <- sum(D1$truepos.model2)/(sum(D1$truepos.model2) + sum(D1$falsepos.model2))
recall.model2 <- sum(D1$truepos.model2)/(sum(D1$truepos.model2) + sum(D1$falseneg.model2))
table2 <- table(D1$level.up, D1$threshold.pred2)
matrix2 <- as.matrix(table2)
kappa(matrix2, exact = TRUE)/kappa(matrix2)
# The Kappa values for the two thresholds are very close.
```
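
A compact way to compare the two thresholds side by side (a sketch that reuses the objects computed above):

```{r}
data.frame(threshold = c(0.707, 0.606),
           accuracy = c(accuracy.model1, accuracy.model2),
           precision = c(precision.model1, precision.model2),
           recall = c(recall.model1, recall.model2))
```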

### To Submit Your Assignment
408 changes: 408 additions & 0 deletions Assignment7.html

Large diffs are not rendered by default.