From e5008e21f9afbfa26474bb9e0f13bbf6470ee5d8 Mon Sep 17 00:00:00 2001 From: wuuyiijiaa Date: Thu, 19 Dec 2019 14:57:00 -0500 Subject: [PATCH 1/2] First attempt --- .gitignore | 4 ++++ assignment7.Rproj | 13 +++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 .gitignore create mode 100644 assignment7.Rproj diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/assignment7.Rproj b/assignment7.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/assignment7.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX From 4acf1aef7cbd9c3c0b8a752b8b6674dd0a4cc158 Mon Sep 17 00:00:00 2001 From: wuuyiijiaa Date: Thu, 19 Dec 2019 17:45:18 -0500 Subject: [PATCH 2/2] assignment7 --- Assignment7.Rmd | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/Assignment7.Rmd b/Assignment7.Rmd index 105cbdf..7de58a2 100644 --- a/Assignment7.Rmd +++ b/Assignment7.Rmd @@ -11,27 +11,38 @@ In the following assignment you will be looking at data from an one level of an #Upload data ```{r} +library("ggplot2") +library(dplyr) +library(tidyr) +library(rpart) +library(rpart.plot) +M1 <- read.csv("online.data.csv", header = TRUE) ``` #Visualization ```{r} #Start by creating histograms of the distributions for all variables (#HINT: look up "facet" in the ggplot documentation) - +M2 <- gather(M1, "measure", "score", -c(id, level.up)) +D1 <- ggplot(M2, aes(x = score, group = level.up)) + facet_grid(level.up~measure, scales = "free") + geom_histogram(stat = "count") +D1 #Then visualize the relationships between variables - +cor(M1) %>% + corrplot::corrplot() #Try to capture an intution about the data and the 
relationships ``` #Classification tree ```{r} #Create a classification tree that predicts whether a student "levels up" in the online course using three variables of your choice (As we did last time, set all controls to their minimums) +D2 <- rpart(level.up ~ pre.test.score + messages + forum.posts, method = "class", data = M1, control = rpart.control(cp=0.01)) #Plot and generate a CP table for your tree - +printcp(D2) +rpart.plot(D2) #Generate a probability value that represents the probability that a student levels up based your classification tree - -D1$pred <- predict(rp, type = "prob")[,2]#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on. +M1$pred <- predict(D2, type = "prob")[,2] +#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classification is based on. ``` ## Part II #Now you can generate the ROC curve for your model. You will need to install the package ROCR to do this. @@ -39,12 +50,13 @@ D1$pred <- predict(rp, type = "prob")[,2]#Last class we used type = "class" whic library(ROCR) #Plot the curve -pred.detail <- prediction(D1$pred, D1$level.up) +pred.detail <- prediction(M1$pred, M1$level.up) plot(performance(pred.detail, "tpr", "fpr")) abline(0, 1, lty = 2) #Calculate the Area Under the Curve -unlist(slot(performance(Pred2,"auc"), "y.values"))#Unlist liberates the AUC value from the "performance" object created by ROCR +unlist(slot(performance(pred.detail,"auc"), "y.values")) +#Unlist liberates the AUC value from the "performance" object created by ROCR #Now repeat this process, but using the variables you did not use for the previous model and compare the plots & results of your two models. Which one do you think was the better model? Why? 
``` @@ -53,20 +65,18 @@ unlist(slot(performance(Pred2,"auc"), "y.values"))#Unlist liberates the AUC valu ```{r} #Look at the ROC plot for your first model. Based on this plot choose a probability threshold that balances capturing the most correct predictions against false positives. Then generate a new variable in your data set that classifies each student according to your chosen threshold. -threshold.pred1 <- +M1$threshold.pred1 <- ifelse(M1$pred >= 0.8, 1, 0) #Now generate three diagnostics: -D1$accuracy.model1 <- - -D1$precision.model1 <- - -D1$recall.model1 <- +M1$truepos.model1 <- ifelse(M1$level.up == "1" & M1$threshold.pred1 == "1", 1, 0) +M1$falsepos.model1 <- ifelse(M1$level.up == "0" & M1$threshold.pred1 == "1", 1,0) +M1$falseneg.model1 <- ifelse(M1$level.up == "1" & M1$threshold.pred1 == "0", 1,0) #Finally, calculate Kappa for your model according to: #First generate the table of comparisons -table1 <- table(M1$level.up, M1$threshold.pred1) #Convert to matrix matrix1 <- as.matrix(table1)