#Linear Regression Model on AirQuality dataset
# Your comments are valuable to us. Please leave a comment if you think the model needs further optimisation.
# Load the airquality data set and take a first look at it.
data("airquality")
airquality
summary(airquality)
head(airquality, n = 6L)
# Frequency table of every column, then of the Day column on its own.
lapply(airquality, table)
table(airquality[["Day"]])
# Preprocessing steps
# Summarise the data and flag, per column, whether it contains any NA.
summary(airquality)
vapply(airquality, anyNA, logical(1))
# NOTE(review): DMwR appears to have been archived on CRAN; confirm it is
# still installable in the target environment.
library(DMwR)
# Replace the data set with its kNN-imputed version (k = 5).
airquality <- knnImputation(airquality, k = 5)
# Normalize the dataset
# Shapiro-Wilk normality check for every column, reported as one table with
# a row per column of airquality (W statistic and p-value).
t(vapply(
  airquality,
  function(col) {
    res <- shapiro.test(col)
    c(W = unname(res$statistic), p.value = res$p.value)
  },
  FUN.VALUE = c(W = 0, p.value = 0)
))

# Min-max rescaling: maps the smallest value of `x` to 0 and the largest to 1.
#   x: numeric vector (or an all-numeric data frame, in which case the
#      minimum and maximum are taken over all of its cells).
# Returns an object shaped like `x` holding the rescaled values.
minMaxFunc <- function(x) {
  lo <- min(x)
  hi <- max(x)
  (x - lo) / (hi - lo)
}

# Rescale each column with its OWN minimum and maximum. Passing the whole
# data frame to minMaxFunc() would use one global min/max across all columns,
# so the individual columns would not end up spanning [0, 1].
# `airquality[] <-` keeps the object a data frame with its column names.
airquality[] <- lapply(airquality, minMaxFunc)
str(airquality)
# Construct Model
# Reproducible 80/20 train/test split over the row indices.
# seq_len() is used instead of 1:nrow() so an empty data frame yields an
# empty index vector rather than c(1, 0).
rows <- seq_len(nrow(airquality))
set.seed(123)
trainIndex <- sample(rows, round(0.8 * length(rows)))
train <- airquality[trainIndex, ]
test <- airquality[-trainIndex, ]
# Realised share of rows in each partition.
nrow(train) / nrow(airquality)
nrow(test) / nrow(airquality)

# First pass: regress Ozone on every other column.
model1 <- lm(Ozone ~ ., data = train)
summary(model1)

# Month and Day were judged to have no effect on the model (see the summary
# above), so they are removed from both partitions before refitting.
train$Month <- NULL
train$Day <- NULL
test$Month <- NULL
test$Day <- NULL
model1 <- lm(Ozone ~ ., data = train)
# Standard lm diagnostic plots.
plot(model1)
summary(model1)

# Predict on the held-out rows and keep the predictions next to the truth.
preds <- predict(model1, test)
test$preds <- preds

# Predicted vs observed Ozone with the y = x reference line.
# (abline(model1) is not meaningful for a model with several predictors: it
# uses only the first two coefficients and draws on the last diagnostic plot.)
plot(
  test$Ozone, test$preds,
  xlab = "Observed Ozone", ylab = "Predicted Ozone"
)
abline(a = 0, b = 1)

# Calculating RMSE on the test partition
rmse <- sqrt(mean((test$preds - test$Ozone)^2))
rmse
# Hi Satya, why are you taking k = 5 in
#   airquality<-knnImputation(airquality,k=5)
# Could you please elaborate on this? Thanks.
# Hi Shrish,
# Thanks for your comment. As we know, in knnImputation we have to look up the
# neighbour rows in order to fill in a missing value in a specific row. Hence
# "k" here denotes how many neighbour rows we have to look at (up and down) in
# order to fill in the missing value. If it is not able to find any value in
# 5 rows, it throws the error below:
# "Not sufficient complete cases for computing neighbors"
# Thanks Satya. If we increase k, will it cover a wider imputation range, or
# does it only denote the neighbour range in the records?