#Author: Dr.R.Maheswari,Professor, School of Computing Science & Engineering, VIT Chennai
#Ex3- Real-time environmental monitoring and weather prediction using Multiple Linear Regression, SVM and Naïve Bayes classification
#R version 3.3.2 (2016-10-31)
#RStudio version 1.2.1335

#Program Execution

#1. Install the required packages

# Install only the packages that are not yet available, so re-running the
# script does not reinstall everything. The list covers every package that is
# attached with library() further down, including the ones the install step
# used to omit (data.table, ggplot2, dplyr, plyr, e1071).
required_pkgs <- c(
  "TTR", "caret", "outliers", "caTools", "ROCR",
  "data.table", "ggplot2", "dplyr", "plyr", "e1071"
)
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

#2. Create function to get mode of categorical data and for removing outliers

#Data preprocessing
#Function to get mode of categorical data
#usage of functions
# Return the most frequent value of `v` (its mode).
# When several values are tied, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_vals <- unique(v)
  occurrences <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(occurrences)]
}
#Function for removing outliers
# Blank out the value that outlier() reports for `x`.
# outlier() comes from the 'outliers' package, which must be attached before
# this function is called. Every element equal to the reported value is set
# to NA; elements that are already NA are left as they are.
out.rem <- function(x) {
  extreme_value <- outlier(x)
  hits <- which(x == extreme_value)
  x[hits] <- NA
  x
}

#3. Load the dataset and pre-process the data

# Fix the RNG seed so later steps that draw random numbers are repeatable.
set.seed(1023)

# Read the raw observations; text columns are loaded as factors.
weather_data <- read.csv(
  "/content/weatherAUS.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

# Sanity check: prints FALSE when WindGustDir was read as a factor.
is.numeric(weather_data$WindGustDir)

# Drop the columns that are not used as predictors.
weather_data2 <- subset(weather_data, select = -c(Date, Location, Rainfall, RainToday))
colnames(weather_data2)

# Rows that have no missing value in any of the remaining columns.
weather_data3 <- weather_data2[complete.cases(weather_data2), ]

summary(weather_data2)
# Preview the first rows only; auto-printing the whole data frame floods the
# console without adding information beyond summary().
head(weather_data2)

#4. Use the Simple Moving Average forecasting model and visualize the output

# SMA - Simple Moving Average of the MinTemp column
library("TTR")
library("data.table")

# Read just the MinTemp column from the CSV.
weather_col1 <- fread("/content/weatherAUS.csv", select = "MinTemp")

# Wrap it in a ts object declared as 12 observations per period, starting at
# period 1 of 2015.
# NOTE(review): this labels the rows as monthly data from January 2015 --
# confirm that matches how the CSV was sampled.
weather_col1series <- ts(weather_col1, start = c(2015, 1), frequency = 12)

# Replace NA with the mean of the observed values.
weather_col1series[is.na(weather_col1series)] <- mean(weather_col1series, na.rm = TRUE)

# Smooth over a window of 12 observations and plot the smoothed series.
weather_col1seriesSMA3 <- SMA(weather_col1series, n = 12)
plot.ts(weather_col1seriesSMA3)

#5. Use the Exponential smoothing forecasting model and visualize the output

# Forecast using exponential smoothing.
# HoltWinters() with beta = FALSE and gamma = FALSE fits simple exponential
# smoothing (no trend and no seasonal component).
weather_col1 <- fread("/content/weatherAUS.csv", select = "MinTemp")
weather_col1series <- ts(weather_col1, start = c(2015, 1), frequency = 12)

# Replace NA with the mean of the observed values.
weather_col1series[is.na(weather_col1series)] <- mean(weather_col1series, na.rm = TRUE)

# Let HoltWinters() estimate the smoothing parameter, then print the fit.
weather_col1seriesforecasts <- HoltWinters(weather_col1series, gamma = FALSE, beta = FALSE)
weather_col1seriesforecasts

# Sum of squared errors of the fitted model.
weather_col1seriesforecasts$SSE

# Refit with an explicit starting level of 23.56; this fit is only printed,
# it is not stored.
HoltWinters(weather_col1series, gamma = FALSE, beta = FALSE, l.start = 23.56)

#6. Forecast using ARIMA models (the exploratory box plots and histograms follow in step 7)

# ARIMA models
# Fit three candidate ARIMA specifications to the first 40 values of the
# fourth column of weather_data and forecast 10 steps ahead with each.
# NOTE(review): column 4 is presumably MaxTemp -- confirm against the CSV header.

library("TTR")
v1 <- weather_data[[4]]
weather_datats <- ts(v1)

## partition into train and test
# Indexing a ts with `[` yields a plain vector, so both pieces lose their
# time-series attributes. test_series is kept for reference; nothing below
# compares the forecasts against it.
train_series <- weather_datats[1:40]
test_series <- weather_datats[41:50]

## make arima models (order is given as c(p, d, q))
arimaModel_1 <- arima(train_series, order = c(0, 1, 2))
arimaModel_2 <- arima(train_series, order = c(1, 1, 0))
arimaModel_3 <- arima(train_series, order = c(1, 1, 2))

## look at the parameters
print(arimaModel_1)
print(arimaModel_2)
print(arimaModel_3)

## forecast the next 10 values with each model
forecast1 <- predict(arimaModel_1, n.ahead = 10)
forecast2 <- predict(arimaModel_2, n.ahead = 10)
forecast3 <- predict(arimaModel_3, n.ahead = 10)

forecast1
forecast2
forecast3

#7. Exploratory Data Analysis(EDA) using box plot and histogram

##1. Exploratory Data Analysis(EDA)
#get the boxplots
# One box plot per numeric predictor, split by the RainTomorrow outcome.
library(ggplot2)

# Iterate over column *names*. Calling lapply() on the data frame itself
# hands the plotting function whole column vectors, which can neither be
# used as an axis label nor resolved to a column of the data.
numeric_cols <- names(weather_data3)[vapply(weather_data3, is.numeric, logical(1))]

gp <- lapply(numeric_cols, function(x) {
  # .data[[x]] looks the column up by its name (needs ggplot2 >= 3.0.0) and
  # replaces the eval(parse(text = ...)) construction.
  ggplot(
    data = weather_data3,
    aes(x = RainTomorrow, y = .data[[x]], col = RainTomorrow)
  ) +
    geom_boxplot() +
    xlab("RainTomorrow") +
    ylab(x) +
    ggtitle("") +
    theme(legend.position = "none")
})
names(gp) <- numeric_cols

# Draw every plot; this avoids hard-coding how many numeric columns exist.
invisible(lapply(gp, print))
#histograms
#Check the skewness of data
# One histogram per numeric measurement of the raw data, drawn in this order.
invisible(lapply(
  c(
    "MinTemp", "MaxTemp", "Evaporation", "Sunshine", "WindGustSpeed",
    "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm",
    "Temp9am", "Temp3pm", "Pressure9am", "Pressure3pm"
  ),
  function(col_name) {
    # Spell out the title and axis label that hist() derives from the
    # expression `weather_data$<column>`, so the plots look the same.
    label <- paste0("weather_data$", col_name)
    hist(weather_data[[col_name]], main = paste("Histogram of", label), xlab = label)
  }
))

#8. Feature extraction - use Chi-Square to check whether the variables are dependent on RainTomorrow

#2.Feature Extraction
#Get the categorical variables
#2.1. Chi-Square to check whether the variables are dependent on RainTomorrow
# Names of all factor-typed columns (printed for inspection).
factor_vars1 <- names(weather_data3)[vapply(weather_data3, is.factor, logical(1))]
factor_vars1

# The target itself is not a candidate predictor.
factor_vars1 <- setdiff(factor_vars1, "RainTomorrow")
factor_vars1

# Chi-square test of each categorical predictor against RainTomorrow, with a
# simulated p-value. Results are stored in a list named by predictor.
chisq_test_res <- setNames(
  lapply(factor_vars1, function(x) {
    chisq.test(weather_data3[, x], weather_data3[, "RainTomorrow"], simulate.p.value = TRUE)
  }),
  factor_vars1
)
chisq_test_res
#Based on the chi-square results, the categorical variables WindGustDir, WindDir9am and WindDir3pm are included

#9. Remove Categorical variables from dataset

#Feature extraction for numeric variables
#2.2 Method: Correlation
#Remove categorical variables from the dataset
weather_data4 <- subset(weather_data2, select = -c(WindDir9am, WindDir3pm))
colnames(weather_data4)

# Keep fully observed rows only; cor() below is called without NA handling.
weather_data5 <- weather_data4[complete.cases(weather_data4), ]

# Every column that is neither a categorical predictor nor the target.
numeric_vars <- setdiff(colnames(weather_data5), c(factor_vars1, "RainTomorrow"))

# Pairwise correlation matrix of those columns.
numeric_vars_mat <- as.matrix(weather_data5[, numeric_vars, drop = FALSE])
numeric_vars_cor <- cor(numeric_vars_mat)

#10. Find correlation

library(caret)

# Column indices flagged by findCorrelation() at a cutoff of 0.6 (any value
# can be used as the cutoff), in ascending order.
fndCorrelation <- sort(findCorrelation(numeric_vars_cor, cutoff = 0.6))

# NOTE(review): caret documents findCorrelation() as returning the columns to
# *remove* in order to reduce pairwise correlation, whereas this step keeps
# exactly those columns. Confirm that this is intended before changing it --
# later steps refer to the resulting column names.
reduced_Data <- numeric_vars_mat[, fndCorrelation]
cols <- colnames(reduced_Data)
cols
summary(reduced_Data)

#11. Get the numeric and categorical variables

# Split the predictors into two data frames taken from weather_data2:
#   weather_data7 - the wind-direction factors plus the RainTomorrow target
#   weather_data9 - the numeric columns named in `cols`
library(dplyr)
weather_data7 <- weather_data2[, c("WindGustDir", "WindDir9am", "WindDir3pm", "RainTomorrow"), drop = FALSE]
weather_data9 <- weather_data2[, cols, drop = FALSE]

#12. Remove outliers

#remove outliers
library(outliers)
# Apply out.rem() to every column and store the result. Calling
# apply(weather_data9, 2, out.rem) without assigning it only prints a cleaned
# copy and leaves weather_data9 untouched. Assigning into `[]` keeps
# weather_data9 a data frame with its column names.
weather_data9[] <- lapply(weather_data9, out.rem)
colnames(weather_data9)

#13. merge numeric and factor columns

#merge numeric and factor columns
# Numeric columns first, then the wind-direction factors and the target.
weather_data10 <- cbind(weather_data9, weather_data7)
summary(weather_data10)
# dim() is NULL for a plain column vector, so report the number of values
# with length() instead.
length(weather_data10$Pressure3pm)
length(weather_data10$Cloud3pm)

#14. Data normalisation/Cleaning - Replace NA values with mean,mode

#Data normalisation/Cleaning
#Replace NA values with mean,mode
library(dplyr)
# Numeric columns: NA -> column mean. Factor columns: NA -> most frequent
# level (getmode). across() replaces mutate_if() + funs(), which dplyr has
# deprecated; it needs dplyr >= 1.0.0. mutate is namespace-qualified so the
# step still works if plyr has been attached and masks it.
weather_data10 <- weather_data10 %>%
  dplyr::mutate(across(
    where(is.numeric),
    function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
  )) %>%
  dplyr::mutate(across(
    where(is.factor),
    function(x) replace(x, is.na(x), getmode(na.omit(x)))
  ))
summary(weather_data10)

#15. Plot to verify results of Data Preprocessing

#plot to verify results of Data Preprocessing
# Histograms of the numeric columns after outlier removal and imputation.
# NOTE(review): these eight names assume the column set selected by the
# correlation step -- adjust if that selection changes.
hist(weather_data10$MaxTemp)
hist(weather_data10$Sunshine)
hist(weather_data10$WindGustSpeed)
hist(weather_data10$Humidity9am)
hist(weather_data10$Pressure3pm)
hist(weather_data10$Cloud3pm)
hist(weather_data10$Temp9am)
hist(weather_data10$Temp3pm)

# Integer level codes of the wind-direction factors, used for plotting only.
# Only the first few codes are shown; auto-printing the full vectors floods
# the console.
WindGustDirnum <- as.numeric(weather_data10$WindGustDir)
head(WindGustDirnum)
WindDir9amnum <- as.numeric(weather_data10$WindDir9am)
head(WindDir9amnum)
WindDir3pmnum <- as.numeric(weather_data10$WindDir3pm)
head(WindDir3pmnum)

hist(WindGustDirnum)
hist(WindDir9amnum)
hist(WindDir3pmnum)

#16. Data modeling

# 4.Data Modeling
# Replace each wind-direction factor by its integer level code so that all
# predictors are numeric.
weather_data10[c("WindGustDir", "WindDir9am", "WindDir3pm")] <- lapply(
  weather_data10[c("WindGustDir", "WindDir9am", "WindDir3pm")],
  as.numeric
)

#17. Convert RainTomorrow data to numeric

#Convert RainTomorrow labels from Yes/No to 1/0
library(plyr)
# Preview only; auto-printing the whole data frame floods the console.
head(weather_data10)
# Recode both labels in a single call, with character replacements as
# revalue() expects. The column stays a factor; only its level labels change
# ("No" -> "0", "Yes" -> "1").
weather_data10$RainTomorrow <- revalue(
  weather_data10$RainTomorrow,
  c("Yes" = "1", "No" = "0")
)

#18. Splitting the data into train and test

#Data is split into train and test data in the ratio 75:25
# Preview only; auto-printing the whole data frame floods the console.
head(weather_data10)
library(caTools)
set.seed(123)
split <- sample.split(weather_data10$RainTomorrow, SplitRatio = 0.75)
training_set <- subset(weather_data10, split == TRUE)
test_set <- subset(weather_data10, split == FALSE)
# Class balance of the training labels (instead of printing every label).
table(training_set$RainTomorrow)

# Feature Scaling
# Select the predictors by name rather than by the position of the target,
# so the step does not depend on how many numeric columns were kept.
feature_cols <- setdiff(names(training_set), "RainTomorrow")

# Standardise the training predictors, then apply the *training* centre and
# scale to the test predictors. Scaling the test set with its own mean/sd
# would put the two sets on different scales and use test-set statistics.
train_scaled <- scale(training_set[feature_cols])
test_set[feature_cols] <- scale(
  test_set[feature_cols],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
training_set[feature_cols] <- train_scaled

#19. Multiple Linear Regression (fitted here as a logistic regression: glm with family = binomial)

##Multiple Linear Regression
# The binary RainTomorrow outcome is modelled with a logistic regression
# (glm with family = binomial) fitted to the training set.
classifier <- glm(
  formula = RainTomorrow ~ .,
  family = binomial,
  data = training_set
)
summary(classifier)

# Fitted probabilities for the training rows (used for the ROC curve below).
prob_pred <- predict(classifier, type = 'response')
# Predicted probabilities for the held-out rows, thresholded at 0.5.
prob_prd_glm <- predict(
  classifier,
  type = 'response',
  newdata = test_set[setdiff(names(test_set), "RainTomorrow")]
)
y_pred <- ifelse(prob_prd_glm > 0.5, 1, 0)

# ROC curve on the training data.
library(ROCR)
ROCRpred <- prediction(prob_pred, training_set$RainTomorrow)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))

# caret::confusionMatrix() reads a table as rows = predictions and
# columns = reference, so predictions go first; with the arguments the other
# way round sensitivity and specificity are swapped. Fixing the levels on
# both sides keeps the table 2 x 2 even if only one class is predicted.
cm3 <- table(
  Predicted = factor(y_pred, levels = c(0, 1)),
  Actual = factor(test_set$RainTomorrow, levels = c(0, 1))
)
confusionMatrix(cm3)

summary(training_set)
# as.numeric() on a factor returns its level codes, so the labels "0"/"1"
# become 1/2 here. The SVM and Naive Bayes steps that follow map predicted
# label 1 back to 0 and everything else to 1.
training_set$RainTomorrow <- as.numeric(training_set$RainTomorrow)
is.numeric(training_set$RainTomorrow)

#20. Support Vector Machine

#SVM
library(e1071)
# Linear-kernel support vector classifier fitted to the training set.
# NOTE(review): at this point training_set$RainTomorrow holds the level codes
# 1/2 of the original 0/1 factor (see the as.numeric() conversion above).
svmfit <- svm(
  formula = RainTomorrow ~ .,
  data = training_set,
  type = 'C-classification',
  kernel = 'linear'
)

# Predicting the Test set results
y_pred_svm <- predict(
  svmfit,
  newdata = test_set[setdiff(names(test_set), "RainTomorrow")]
)

# Map the predicted label 1 back to 0 and any other label to 1, matching the
# 0/1 labels still held by the test set.
y_pred_svm1 <- ifelse(y_pred_svm == 1, 0, 1)

# Making the Confusion Matrix
# caret::confusionMatrix() reads a table as rows = predictions and
# columns = reference, so predictions go first; the other order swaps
# sensitivity and specificity. Fixed levels keep the table 2 x 2.
cm_svm <- table(
  Predicted = factor(y_pred_svm1, levels = c(0, 1)),
  Actual = factor(test_set$RainTomorrow, levels = c(0, 1))
)
confusionMatrix(cm_svm)

#21. Naive Bayes

#Naive Bayes

# Loading package
library(e1071)
library(caTools)
library(caret)

set.seed(120)  # Setting Seed
# Naive Bayes classifier fitted to the training set.
# NOTE(review): at this point training_set$RainTomorrow holds the level codes
# 1/2 of the original 0/1 factor (see the as.numeric() conversion above).
classifier_cl <- naiveBayes(RainTomorrow ~ ., data = training_set)
classifier_cl

# Predicting the Test set results
y_pred_nb <- predict(
  classifier_cl,
  newdata = test_set[setdiff(names(test_set), "RainTomorrow")]
)

# Map the predicted label 1 back to 0 and any other label to 1, matching the
# 0/1 labels still held by the test set.
y_pred_nb1 <- ifelse(y_pred_nb == 1, 0, 1)

# Making the Confusion Matrix
# caret::confusionMatrix() reads a table as rows = predictions and
# columns = reference, so predictions go first; the other order swaps
# sensitivity and specificity. Fixed levels keep the table 2 x 2.
cm_nb <- table(
  Predicted = factor(y_pred_nb1, levels = c(0, 1)),
  Actual = factor(test_set$RainTomorrow, levels = c(0, 1))
)

#Model Evaluation
confusionMatrix(cm_nb)


#Conclusion
#Real-time environmental monitoring and weather prediction was successfully implemented and visualized using Multiple Linear Regression, SVM and Naïve Bayes classification.
