---
title: "ex2 try1"
author: "Amos Zamir"
date: "April 7, 2017"
output: html_document
---
Kaggle Profile Page: https://www.kaggle.com/zamiramos
knitr::opts_chunk$set(echo = TRUE)
setwd('D:/Code/Serve')
Read the train.csv and test.csv files into a data frame. Use the parameter na.strings = "" to recognize empty strings as missing values; otherwise these empty strings might raise errors when creating a model. Bind the train and test sets together before the feature engineering.
df <- read.csv("Titanic/train.csv",na.strings = "")
trainRows<-nrow(df)
test <-read.csv('Titanic/test.csv',na.strings = "")
test$Survived <- NA
df<-rbind(df,test)
Check the datatypes of the attributes using the str method. You should find two numeric features that must be converted into factors. Convert these two features to factors.
str(df)
Convert the two numeric features, Survived and Pclass, to factors:
df$Survived<- as.factor(df$Survived)
df$Pclass<- as.factor(df$Pclass)
It is easier to explore factor and numeric features separately. Here we split the feature columns into numeric and factor groups:
traindf<-head(df[-1], trainRows)
cols<- 1:dim(traindf)[2]
factors <- cols[sapply(traindf,is.factor)]
numerics <- cols[!sapply(traindf,is.factor)]
We now tidy the data twice: once for the categorical features and once for the numeric ones.
#install.packages("tidyr")
library(tidyr)
df_tidy_factors<-gather(traindf[,factors],"feature","value",-1)
df_tidy_numerics<-gather(cbind(Survived=traindf[,1],traindf[,numerics]),"feature","value",-1)
Finally, we can plot. The first plot shows only the categorical features (factors). Notice that the scales parameter is set to "free" to give each facet a suitable scale; otherwise facets that need much smaller scales are hard to read. We use facet_grid, which accepts a scales parameter.
#install.packages("ggplot2")
library(ggplot2)
qplot(x=value,data=df_tidy_factors,fill=Survived) + facet_grid(~feature,scales="free")
It looks like Cabin, Name, and Ticket have many levels and need to be processed to extract more valuable features.
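A quick way to quantify that (a minimal sketch using the traindf and factors objects defined above):

# Number of distinct levels per factor feature; Cabin, Name and Ticket
# should stand out with hundreds of levels each
sapply(traindf[, factors], nlevels)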
One more plot for numeric features:
qplot(x=value,data=df_tidy_numerics,fill=Survived) + facet_grid(~feature,scales="free")
It certainly looks like the chances of survival differ across the levels of almost every feature.
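For example (a quick sketch on the training data), the survival proportions within each level of Sex differ sharply:

# Row-wise proportions: share of non-survivors/survivors within each Sex
prop.table(table(traindf$Sex, traindf$Survived), margin = 1)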
# Build TicketPrefix: split the ticket on whitespace, keep the leading prefix
# when one exists (NA otherwise), strip punctuation and uppercase it
df$TicketPrefix <- mapply(function(x) {strsplit(x, '\\s+')[[1]]}, as.character(df$Ticket))
df$TicketPrefix <- mapply(function(x) {ifelse(length(x)>1, x[1], NA)}, df$TicketPrefix)
df$TicketPrefix <- mapply(function(x) {gsub('[./,]','', x)}, df$TicketPrefix)
df$TicketPrefix <- mapply(function(x) {toupper(x)}, df$TicketPrefix)
df$TicketPrefix <- as.factor(df$TicketPrefix)
table(df$TicketPrefix)
Separate the personal titles from the passenger names and consolidate them:

- Sir - gathers all male honorific titles
- Lady - gathers all female honorific titles
- Mlle - the English equivalent is Miss
- Mme - the English equivalent is Mrs (mapped here to Ms)
# Splitting "Surname, Title. FirstName" on [,.] leaves a leading space before the title
df$PersonalTitles <- mapply(function(x) {strsplit(x, '[,.]')[[1]][2]}, as.character(df$Name))
df$PersonalTitles[df$PersonalTitles %in% c(' Capt',' Col', ' Don', ' Major', ' Sir')] <- ' Sir'
df$PersonalTitles[df$PersonalTitles %in% c(' Jonkheer',' Dona', ' Lady', ' the Countess')] <- ' Lady'
df$PersonalTitles[df$PersonalTitles %in% c(' Mlle')] <- ' Miss'
df$PersonalTitles[df$PersonalTitles %in% c(' Mme')] <- ' Ms'
df$PersonalTitles<-as.factor(df$PersonalTitles)
table(df$PersonalTitles)
Separate the surname from the passenger names:
SurName <- mapply(function(x) {strsplit(x, '[,.]')[[1]][1]}, as.character(df$Name))
- SibSp - number of siblings/spouses aboard the Titanic
- Parch - number of parents/children aboard the Titanic
- FamilySize = SibSp + Parch + 1
df$FamilySize <- df$SibSp + df$Parch + 1  # addition is vectorized, no mapply needed
table(df$FamilySize)
Combine FamilySize and SurName into a new feature:

df$FamilySizeSurName <- paste(df$FamilySize, SurName, sep='')  # paste is vectorized
FamilySizeSurNameCount<-as.data.frame(table(df$FamilySizeSurName))
# Replace each family key with the z-score of its frequency
freq <- FamilySizeSurNameCount$Freq[match(df$FamilySizeSurName, FamilySizeSurNameCount$Var1)]
df$FamilySizeSurName <- (freq - mean(FamilySizeSurNameCount$Freq)) / sd(FamilySizeSurNameCount$Freq)
# It seems that big families were less likely to survive
#plot(df$FamilySizeSurName, df$Survived)
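To back that comment with numbers, here is a minimal sketch using only the labeled training rows (the test rows have NA in Survived):

train_part <- df[1:trainRows, ]
# Share of non-survivors/survivors within each family size
prop.table(table(train_part$FamilySize, train_part$Survived), margin = 1)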
The cabin room number can be meaningful.
# Strip the letters, then keep the first room number listed in the Cabin field
df$CabinMinRoomNumber<-mapply(function(x)(gsub('[a-zA-Z]', '', x)), df$Cabin)
df$CabinMinRoomNumber <- mapply(function(x) {strsplit(x, '\\s+' )[[1]][1]}, as.character(df$CabinMinRoomNumber))
df$CabinMinRoomNumber <- as.integer(df$CabinMinRoomNumber)
table(df$CabinMinRoomNumber)
The cabin level can be meaningful.
df$CabinLevel<-mapply(function(x)(gsub('[^a-zA-Z]', '', x)), df$Cabin)
df$CabinLevel <- mapply(function(x) {substr(x,1,1)}, as.character(df$CabinLevel))
df$CabinLevel <- as.factor(df$CabinLevel)
table(df$CabinLevel)
Let's run summary to check how many NAs we have:
summary(df)
We can see that 'Age' has 263 missing values, 'Fare' has one, and so on. Let's impute these values using the rpart algorithm:

1. Predict the Age column using the 'anova' method, since Age is a continuous value.
library('rpart')
Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + PersonalTitles + FamilySize,
data=df[!is.na(df$Age),],
method="anova")
df$Age[is.na(df$Age)] <- predict(Agefit, df[is.na(df$Age),])
2. Predict the TicketPrefix column.
TicketPrefixfit <- rpart(TicketPrefix ~ Pclass + Age + Sex + SibSp + Parch + Fare + Embarked + PersonalTitles + FamilySize,
data=df[!is.na(df$TicketPrefix),],
method="class")
df$TicketPrefix[is.na(df$TicketPrefix)] <- predict(TicketPrefixfit, df[is.na(df$TicketPrefix),],type = "class")
3. Predict the Fare column.
Farefit <- rpart(Fare ~ Pclass + Sex + SibSp + Parch + Age + Embarked + PersonalTitles + FamilySize,
data=df[!is.na(df$Fare),],
method="anova")
df$Fare[is.na(df$Fare)] <- predict(Farefit, df[is.na(df$Fare),])
4. Predict the Embarked column.
Embarkedfit <- rpart(Embarked ~ Pclass + Age + Sex + SibSp + Parch + Fare + PersonalTitles + FamilySize,
data=df[!is.na(df$Embarked),],
method="class")
df$Embarked[is.na(df$Embarked)] <- predict(Embarkedfit, df[is.na(df$Embarked),],type = "class")
5. Predict the CabinMinRoomNumber column.
CabinMinRoomNumberfit <- rpart(CabinMinRoomNumber ~ Pclass + Sex + SibSp + Parch + Fare + Age + Embarked + PersonalTitles + FamilySize,
data=df[!is.na(df$CabinMinRoomNumber),],
method="anova")
df$CabinMinRoomNumber[is.na(df$CabinMinRoomNumber)] <- predict(CabinMinRoomNumberfit, df[is.na(df$CabinMinRoomNumber),])
6. Predict the CabinLevel column.
CabinLevelfit <- rpart(CabinLevel ~ Pclass + Sex + Fare + Age + PersonalTitles,
data=df[!is.na(df$CabinLevel),],
method="class")
df$CabinLevel[is.na(df$CabinLevel)] <- predict(CabinLevelfit, df[is.na(df$CabinLevel),],type = "class")
# Standardize Fare (z-score); sd() equals sqrt(var()) for a vector
df$Fare <- (df$Fare - mean(df$Fare)) / sd(df$Fare)
dfBackup<-df
# Drop PassengerId, Name, Ticket and Cabin
df <- df[,-c(1,4, 9, 11)]
Split the data back into train and test:
traindf<-head(df, trainRows)
testdf<-tail(df, -trainRows)[-1]
We use the features engineered in the section above.

cforest is an implementation of the random forest and bagging ensemble algorithms, using conditional inference trees as base learners. The function cforest_unbiased returns the settings suggested by Strobl et al. (2007) for constructing unbiased random forests (teststat = "quad", testtype = "Univ", replace = FALSE); these have been the defaults since version 0.9-90.
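As a quick illustration of where the OOB statistics below come from, here is a minimal sketch (fit_demo is a small throwaway model, not the one used for the submission): calling predict on a cforest with OOB = TRUE and no new data returns out-of-bag predictions for the training rows, which caret's cforestStats summarizes into Accuracy and Kappa.

library(party)
fit_demo <- cforest(Survived ~ Pclass + Sex + Age,
                    data = traindf,
                    controls = cforest_unbiased(ntree = 100, mtry = 2))
oob_pred <- predict(fit_demo, OOB = TRUE, type = "response")
mean(oob_pred == traindf$Survived)  # out-of-bag accuracy on the training rows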
library(party)
library(caret)
set.seed(123)
#Feature subsets tried during experimentation:
#  Fare + PersonalTitles
#  TicketPrefix + FamilySize + PersonalTitles + FamilySizeSurName + CabinMinRoomNumber
# Fit the same model with several forest sizes and compare their OOB statistics
cforest_formula <- Survived ~ Pclass + Sex + Age + Embarked + TicketPrefix + FamilySize + PersonalTitles + FamilySizeSurName + CabinMinRoomNumber
ntrees <- c(400, 800, 1200, 2000, 3000)
cforestfits <- lapply(ntrees, function(nt)
    cforest(cforest_formula,
            data = traindf,
            controls = cforest_unbiased(ntree = nt, mtry = 3)))
# cforestStats (from caret) reports OOB Accuracy and Kappa for each fit
cforestStatsArray <- sapply(cforestfits, cforestStats)
cforestStatsArray
It seems that ntree = 2000 is the best.

cforestfit <- cforestfits[[4]]  # the forest with ntree = 2000
cforest_importance <- varimp(cforestfit)
dotchart(cforest_importance[order(cforest_importance)])
Prediction <- predict(cforestfit, testdf, OOB=TRUE, type = "response")
Write the predictions to a file:
# Factor levels 1/2 map back to the original 0/1 labels
res1<-as.numeric(Prediction)
res1[res1==1]<-0
res1[res1==2]<-1
res <- cbind(PassengerId=test$PassengerId,Survived=res1)
write.csv(res,file="Titanic/try1.csv",row.names = F)
We use the features engineered in the section above.

The rpart package in R can be used for classification by decision trees and can also generate regression trees. Recursive partitioning is a fundamental tool in data mining: it helps us explore the structure of a data set while developing easy-to-visualize decision rules for predicting a categorical (classification tree) or continuous (regression tree) outcome. rpart builds classification or regression models of a very general structure using a two-stage procedure, and the resulting models can be represented as binary trees.

The tree is built by the following process: first, the single variable is found that best splits the data into two groups. The data is separated, and the process is then applied recursively to each sub-group until the subgroups either reach a minimum size (5 for this data) or no improvement can be made. The resulting model is almost certainly too complex, and, as with all stepwise procedures, the question of when to stop arises. The second stage of the procedure therefore uses cross-validation to trim back the full tree.
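To make the two-stage procedure concrete, here is a minimal sketch with plain rpart (separate from the caret-tuned model below; the cp values are illustrative):

library(rpart)
# Stage 1: grow a deliberately large tree by setting a very small cp
fit_full <- rpart(Survived ~ ., data = traindf, method = "class",
                  control = rpart.control(cp = 0.0001))
printcp(fit_full)  # cross-validated error (xerror) for each subtree size
# Stage 2: prune back to the subtree with the lowest cross-validated error
best_cp <- fit_full$cptable[which.min(fit_full$cptable[, "xerror"]), "CP"]
fit_pruned <- prune(fit_full, cp = best_cp)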
library(rpart)
set.seed(123)
trcontrol <- trainControl(method="repeatedcv", number=10, repeats=3)  # 'repeats' requires repeatedcv
rpartfit <- train(
Survived~., data=traindf, method="rpart", trControl=trcontrol,
tuneGrid = expand.grid(
cp = c(0.01,0.005,0.0005)
))
plot(rpartfit)
It seems like cp = 0.005 gives the best accuracy.
varImp(rpartfit)
Prediction <- predict(rpartfit, testdf)
Write the predictions to a file:
res1<-as.numeric(Prediction)
res1[res1==1]<-0
res1[res1==2]<-1
res <- cbind(PassengerId=test$PassengerId,Survived=res1)
write.csv(res,file="Titanic/try2.csv",row.names = F)
We use the features engineered in the section above.

Gradient boosting is a machine learning technique for regression and classification problems that produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. It builds the model in a stage-wise fashion, like other boosting methods, and generalizes them by allowing optimization of an arbitrary differentiable loss function.
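To make the stage-wise idea concrete, here is a toy sketch on synthetic data (purely illustrative, not the gbm implementation): each stage fits a one-split tree to the residuals of the current model and adds a shrunken copy of it.

library(rpart)
set.seed(1)
x <- runif(200)
toy <- data.frame(x = x, y = sin(4 * x) + rnorm(200, sd = 0.3))
pred <- rep(mean(toy$y), nrow(toy))    # stage 0: a constant model
shrinkage <- 0.1
for (m in 1:100) {
  toy$resid <- toy$y - pred                        # pseudo-residuals for squared loss
  stump <- rpart(resid ~ x, data = toy,
                 control = rpart.control(maxdepth = 1))
  pred <- pred + shrinkage * predict(stump, toy)   # add a shrunken weak learner
}
mean((toy$y - pred)^2)  # training MSE after 100 stages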
library(gbm)  # train(method = "gbm") uses the gbm package
set.seed(123)
trcontrol <- trainControl(method="repeatedcv", number=10, repeats=3)  # 'repeats' requires repeatedcv
gbmfit <- train(
Survived~., data=traindf, method="gbm", trControl=trcontrol,
tuneGrid = expand.grid(
n.trees = c(1000,2000),
interaction.depth = c(5,10,20),
shrinkage = c(0.001, 0.01, 0.1),
n.minobsinnode = c(10, 20, 30)
),
verbose = FALSE)
plot(gbmfit)
We can see that the best tuning is shrinkage = 0.01, interaction.depth = 10, and n.minobsinnode = 30.

Let's test more n.trees options:
library(gbm)  # train(method = "gbm") uses the gbm package
set.seed(123)
trcontrol <- trainControl(method="repeatedcv", number=10, repeats=3)  # 'repeats' requires repeatedcv
gbmfit <- train(
Survived~Sex + Age + Embarked + TicketPrefix + FamilySize + PersonalTitles + FamilySizeSurName + CabinMinRoomNumber, data=traindf, method="gbm", trControl=trcontrol,
tuneGrid = expand.grid(
n.trees = c(300, 500, 1000, 1500,2000),
interaction.depth = 10,
shrinkage = 0.01,
n.minobsinnode = 30
),
verbose = FALSE)
plot(gbmfit)
Great: n.trees = 500 gives the best score.
new_pred<- predict(gbmfit,newdata = testdf)
new_pred<-sapply(new_pred, function(x){ as.character(x) == "1" })
# cbind coerces the logical vector to 1/0
res <- cbind(PassengerId=test$PassengerId,Survived=new_pred)
write.csv(res,file="Titanic/try3.csv",row.names = F)
We use the features engineered in the section above.

We will build an ensemble of the three models used above (try1, try2, try3):

- cforest - an implementation of the random forest and bagging ensemble algorithms using conditional inference trees as base learners.
- rpart - decision trees.
- gbm - stochastic gradient boosting.

with RandomForest as the stacking algorithm. We already calibrated the parameters of each algorithm separately. Let's check that there is no high correlation between their predictions:
library(caret)
library(caretEnsemble)
library(party)
set.seed(123)
control <- trainControl(method="cv", number=10, savePredictions='final', classProbs=TRUE)
models <- caretList(
make.names(Survived)~.,
data=traindf,
metric='Accuracy',
trControl=control,
tuneList = list(
cforest = caretModelSpec(
method = "cforest",
controls = cforest_unbiased(ntree=2000, mtry=3)
),
gbm = caretModelSpec(
method = "gbm",
tuneGrid = expand.grid(
n.trees = 1000,
interaction.depth = 10,
shrinkage = 0.01,
n.minobsinnode = 30
),
verbose = FALSE
),
rpart = caretModelSpec(
method = "rpart",
tuneGrid = expand.grid(
cp = 0.005
)
)
)
)
results <- resamples(models)
summary(results)
modelCor(results)
We can see that there is no high correlation (< 0.75) between them, so we can continue and build one model from all the models:
set.seed(123)
stackControl <- trainControl(method="cv", number=10, savePredictions='final', classProbs=TRUE)
stack.rf <- caretStack(models, method="rf", metric="Accuracy", trControl=stackControl, tuneGrid =
expand.grid(mtry = c(2, 3, 6, 9)))
print(stack.rf)
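For intuition, here is a rough hand-rolled sketch of what the stack is doing (illustrative only; get_oof is a hypothetical helper, and it assumes caret kept the final cross-validated predictions in each model's pred slot, as savePredictions='final' requests): the base models' out-of-fold class probabilities become the features of a meta-model.

library(randomForest)
# get_oof (hypothetical): each model's out-of-fold P(class X1), aligned by row index
get_oof <- function(m) m$pred$X1[order(m$pred$rowIndex)]
meta_features <- data.frame(
  cforest  = get_oof(models$cforest),
  gbm      = get_oof(models$gbm),
  rpart    = get_oof(models$rpart),
  Survived = make.names(traindf$Survived)
)
meta_fit <- randomForest(as.factor(Survived) ~ ., data = meta_features)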
new_pred<- predict(stack.rf,newdata = testdf)
length(new_pred)
new_pred<-sapply(new_pred, function(x){ as.character(x) == "X1" })  # levels were renamed to X0/X1 by make.names
Write the PassengerId and Survived attributes to a csv file and submit this file to the Kaggle competition.
res <- cbind(PassengerId=test$PassengerId,Survived=new_pred)
write.csv(res,file="Titanic/try4.csv",row.names = F)