#Logistic Regression model on the Titanic dataset
#Please post your comments in the comment box if you think we can optimise the model's predictions
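#The code below assumes a data frame named 'titanic' with the titanic3 columns
#(pclass, survived, ..., boat, body, home_dest) is already loaded.
#A minimal sketch of one way to load it (the file name is hypothetical):
titanic<-read.csv("titanic.csv", stringsAsFactors = TRUE, na.strings = c("", "NA"))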
summary(titanic)
colnames(titanic)
View(titanic)
#Columns to use: not restricting them yet, since the goal is a naive model first
#Imputation
table(titanic$pclass)
library(DMwR)
knnImputation(data=titanic, k = 5)
#Error in knnImputation(titanic, k = 5) : Not sufficient complete cases for computing neighbors.
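#A quick per-column NA count makes it obvious which columns cause the error:
colSums(is.na(titanic))
sum(complete.cases(titanic)) #very few rows are complete while boat and body are kept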
#Dropping boat and body
sum(is.na(titanic$boat)) #823
nrow(titanic) #1309
#Out of 1309 rows, 823 are NA, so knnImputation cannot use this column
sum(is.na(titanic$body)) #1188
#Out of 1309 rows, 1188 are NA, so knnImputation cannot use this column either
titanic$boat<-NULL
titanic$body<-NULL
data<-knnImputation(data=titanic, k = 5)
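#Sanity check: the imputed copy should have no missing values left
sum(is.na(data)) #expect 0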
#Binning and recoding: drop the free-text columns, then convert the remaining numerics to coarse categories (factors)
str(data)
data$name<-NULL #name is unique per passenger, so one-hot encoding would create one dummy level each
data$ticket<-NULL
data$cabin<-NULL
data$home_dest<-NULL
data$age<-ifelse(data$age<30,"young",ifelse(data$age<60,"middle","Aged"))
data$sibsp<-ifelse(data$sibsp<3,"Low",ifelse(data$sibsp<5,"Mid","High"))
table(data$embarked)
data$parch<-ifelse(data$parch<3,"Low",ifelse(data$parch<5,"Mid","High"))
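#ifelse() returns character vectors rather than factors; making the binned columns explicit
#factors keeps the level ordering predictable (glm() would otherwise coerce them alphabetically)
data$age<-factor(data$age, levels = c("young","middle","Aged"))
data$sibsp<-factor(data$sibsp, levels = c("Low","Mid","High"))
data$parch<-factor(data$parch, levels = c("Low","Mid","High"))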
#Scaling
hist((data$fare)^(1/3)) #cube root; without the parentheses, ^1/3 would just divide by 3
install.packages("uroot",dependencies = TRUE)
library(forecast)
BoxCox(data$fare,BoxCox.lambda(data$fare)) #Not good. Go with log
data$fare<-log(data$fare)
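#Caution: a handful of Titanic fares are recorded as 0, so log() turns them into -Inf.
#model1 below drops fare, so this is mostly cosmetic, but log1p() would be a safer
#alternative (my suggestion, not the original choice). Quick check of how many rows are hit:
sum(is.infinite(data$fare))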
#Outlier management: skipping for now, since this is meant to be a naive model
#Model Construct
rows<-1:nrow(data)
set.seed(123)
trainIndex<-sample(rows,round(0.8*length(rows)))
train<-data[trainIndex,]
test<-data[-trainIndex,]
nrow(train)/nrow(data)
nrow(test)/nrow(data)
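#Alternative (a sketch, not used for the results below): a stratified split keeps the
#survived ratio similar in train and test; caret::createDataPartition does this directly
library(caret)
strataIndex<-createDataPartition(as.factor(data$survived), p = 0.8, list = FALSE)
#train<-data[strataIndex,]; test<-data[-strataIndex,]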
str(data)
model1<-glm(formula = survived ~ .-fare, family = binomial(link = "logit"), data = train)
plot(model1)
summary(model1)
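#Logistic coefficients are log-odds; exponentiating them gives easier-to-read odds ratios
exp(coef(model1))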
#Prediction
preds<-predict(model1,test,type = 'response') #predicted survival probabilities
test$preds<-preds
test$preds<-ifelse(test$preds>0.5,1,0) #threshold at 0.5 to get predicted classes
#Construct Confusion matrix
table(test$preds,test$survived,dnn = c('preds','actuals'))
#Precision: of all passengers predicted to survive, how many actually survived (60 out of 22+60)
precision<-60/(22+60)
#Recall: of all actual survivors, how many were correctly predicted (60 out of 34+60)
recall<-60/(34+60)
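#A single summary of the two: the F1 score (harmonic mean of precision and recall)
f1<-2*precision*recall/(precision+recall)
f1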
#The same metrics via caret's built-in functions
library(caret)
precision1 <- posPredValue(as.factor(test$preds), as.factor(test$survived), positive="1")
sensitivity1 <- sensitivity(as.factor(test$preds), as.factor(test$survived), positive="1")
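#caret::confusionMatrix reports the full matrix plus accuracy, sensitivity, specificity, etc. in one call
confusionMatrix(data = as.factor(test$preds), reference = as.factor(test$survived), positive = "1")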
#ROC curve via ROCR
library(ROCR)
rocrPred<-prediction(preds,test$survived) #use the raw probabilities here, not the 0/1 classes
rocrPerf<-performance(rocrPred,'tpr','fpr')
plot(rocrPerf,colorize=TRUE,text.adj=c(-0.2,1.7))
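#Area under the ROC curve (closer to 1 is better)
aucPerf<-performance(rocrPred, measure = "auc")
aucPerf@y.values[[1]]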
#Plot a fitted logistic curve: survival probability against (log) fare
library(ggplot2)
ggplot(test, aes(x = fare, y = survived)) + geom_point() +
  stat_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE)