#Naive Bayes Classifier

# NOTE: the workspace is intentionally NOT wiped here.
# `rm(list = ls())` silently deletes every object in the caller's global
# environment, yet it does not detach packages or reset options, so it never
# provided a truly clean slate. Run this script in a fresh R session instead
# (for example with `Rscript`).

#Import the required libraries
library(naivebayes)
library(dplyr)
library(ggplot2)
library(psych)

# Read 'titanic.csv' (path relative to the working directory) into a data
# frame and print a compact overview of the parsed columns.
data <- read.csv(file = "titanic.csv")
str(object = data)

# Count the NA cells across the whole data frame.
data %>%
  is.na() %>%
  sum()

# Keep only the rows without any NA, then confirm that the cleaned data
# frame has no NA cells left (the printed count should be 0).
data_clean <- data %>% na.omit()
data_clean %>%
  is.na() %>%
  sum()

# Treat the outcome ('Survived') and 'Pclass' as categorical variables
# instead of integers.
data_clean[["Survived"]] <- as.factor(data_clean[["Survived"]])
data_clean[["Pclass"]] <- as.factor(data_clean[["Pclass"]])

# Drop the columns that are not used as predictors, then show the
# structure of what remains.
data_clean <- data_clean %>%
  select(-c(PassengerId, Name, Ticket, Cabin, Embarked))
str(data_clean)

# Pairwise panel plot of all remaining columns, used to eyeball how
# independent the attributes are of one another.
pairs.panels(data_clean)

# Fare per outcome class, drawn as a box plot.
ggplot(
  data = data_clean,
  mapping = aes(x = Survived, y = Fare, fill = Survived)
) +
  geom_boxplot() +
  ggtitle('Survived Box Plot based on Fare')

# Fare per outcome class, drawn as overlaid semi-transparent densities.
ggplot(
  data = data_clean,
  mapping = aes(x = Fare, fill = Survived)
) +
  geom_density(alpha = 0.75, color = 'black') +
  ggtitle('Density Plot based on Fare')

# Split dataset into training and testing data.
# Each row is independently labelled 1 (train, probability 0.8) or
# 2 (test, probability 0.2), so the split is approximately -- not exactly --
# 80/20. The fixed seed makes the assignment reproducible.
set.seed(234)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
smpl <- sample(2, nrow(data_clean), replace = TRUE, prob = c(0.8, 0.2))
train <- data_clean[smpl == 1, ]
test <- data_clean[smpl == 2, ]

# Fit a naive Bayes model predicting 'Survived' from every other column of
# the training set, then print the fitted model and plot it.
mdl <- naive_bayes(Survived ~ ., data = train)
mdl
plot(mdl)

# Predicted class probabilities for the training rows, previewed next to
# the rows they were computed from.
p <- predict(object = mdl, newdata = train, type = "prob")
cbind(p, train) %>% head()

# Evaluate the classifier.
# The number printed after each confusion matrix is the misclassification
# rate (1 - accuracy), not the accuracy itself.

# Training set: an optimistic estimate, because the model was fitted on
# exactly these rows.
p1 <- predict(mdl, train)
(tab1 <- table(Predicted = p1, Actual = train$Survived))
1 - sum(diag(tab1)) / sum(tab1)

# Held-out test set: rows the model has not seen, so this is the figure to
# report as the generalisation error.
p2 <- predict(mdl, test)
(tab2 <- table(Predicted = p2, Actual = test$Survived))
1 - sum(diag(tab2)) / sum(tab2)