Traffic pattern recognition using correlation study, Naïve Bayes and Decision tree classification and Time series forecasting.

The term “traffic patterns recognition” refers to the process of recognising a user’s current traffic pattern, which can be applicable to transportation planning, location-based services, social networks, and a range of other applications. This work will be performing the correlation study, Naïve Bayes, Decision tree and forecasting on the data using various libraries and methods.

Procedure

  1. Import required packages after installing

  2. Load and read the data set

  3. Pre-process the data appropriately

  4. Use summary method to see the characteristics of the data set

  5. Use the Simple Moving Average forecasting model and visualize the output

  6. Use the Exponential smoothing forecasting model and visualize the output

  7. Use the Arima forecasting model and view the output

  8. Get the correlation between the columns

  9. Split the data set into training and testing in the ratio of 70:30

  10. Perform Decision tree classification and view the results in tree format

  11. Perform Naïve Bayes and view the results in confusion matrix

  12. Import required packages after installing

library("e1071")
library("caTools")
library("caret")
## Loading required package: ggplot2
## Loading required package: lattice
library("party")
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("magrittr")
library("TTR")
library("data.table") 
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
  1. Load the data set
data <- read.csv("traffic.csv")
data
  1. Pre-process the data appropriately
data$DateTime = strtrim(data$DateTime,15)
data
  1. Use summary method to see the characteristics of the data set
print("Summary of Dataset")
## [1] "Summary of Dataset"
summary(data)
##    DateTime            Junction        Vehicles            ID           
##  Length:48120       Min.   :1.000   Min.   :  1.00   Min.   :2.015e+10  
##  Class :character   1st Qu.:1.000   1st Qu.:  9.00   1st Qu.:2.016e+10  
##  Mode  :character   Median :2.000   Median : 15.00   Median :2.016e+10  
##                     Mean   :2.181   Mean   : 22.79   Mean   :2.016e+10  
##                     3rd Qu.:3.000   3rd Qu.: 29.00   3rd Qu.:2.017e+10  
##                     Max.   :4.000   Max.   :180.00   Max.   :2.017e+10
  1. Use the Simple Moving Average forecasting model and visualize the output
t_col1 <- fread("traffic.csv",select = c("Vehicles"))
t_col1series <- ts(t_col1,frequency=12, start=c(2015,1))
t_col1series[is.na(t_col1series)]<-mean(t_col1series,na.rm=TRUE) #Replace NA with mean
t_col1seriesSMA3 <- SMA(t_col1series,n=12)
plot.ts(t_col1seriesSMA3)

  1. Use the Exponential smoothing forecasting model and visualize the output
#Forecast using Exponential smoothing
t_col1 <- fread("traffic.csv",select = c("Junction"))
t_col1series <- ts(t_col1,frequency=12, start=c(2015,1))
t_col1series[is.na(t_col1series)]<-mean(t_col1series,na.rm=TRUE) #Replace NA with mean
t_col1seriesforecasts <- HoltWinters(t_col1series, beta=FALSE, gamma=FALSE)
t_col1seriesforecasts
## Holt-Winters exponential smoothing without trend and without seasonal component.
## 
## Call:
## HoltWinters(x = t_col1series, beta = FALSE, gamma = FALSE)
## 
## Smoothing parameters:
##  alpha: 0.9999188
##  beta : FALSE
##  gamma: FALSE
## 
## Coefficients:
##   [,1]
## a    4
t_col1seriesforecasts$SSE
## [1] 3
HoltWinters(t_col1series, beta=FALSE, gamma=FALSE, l.start=23.56)
## Holt-Winters exponential smoothing without trend and without seasonal component.
## 
## Call:
## HoltWinters(x = t_col1series, beta = FALSE, gamma = FALSE, l.start = 23.56)
## 
## Smoothing parameters:
##  alpha: 0.9999188
##  beta : FALSE
##  gamma: FALSE
## 
## Coefficients:
##   [,1]
## a    4

7.Use the Arima forecasting model and view the output

library("TTR")
v1 <- data[[4]]
datats <- ts(v1)
## partition into train and test
train_series=datats[1:40]
test_series=datats[41:50]
## make arima models
arimaModel_1=arima(train_series, order=c(0,1,2))
arimaModel_2=arima(train_series, order=c(1,1,0))
arimaModel_3=arima(train_series, order=c(1,1,2))
## look at the parameters
print(arimaModel_1);print(arimaModel_2);print(arimaModel_3)
## 
## Call:
## arima(x = train_series, order = c(0, 1, 2))
## 
## Coefficients:
##          ma1     ma2
##       0.0005  0.0005
## s.e.  0.0061  0.0061
## 
## sigma^2 estimated as 15294:  log likelihood = -243.23,  aic = 492.45
## 
## Call:
## arima(x = train_series, order = c(1, 1, 0))
## 
## Coefficients:
##          ar1
##       0.0005
## s.e.  0.0061
## 
## sigma^2 estimated as 15297:  log likelihood = -243.23,  aic = 490.46
## 
## Call:
## arima(x = train_series, order = c(1, 1, 2))
## 
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
##         ar1    ma1     ma2
##       4e-04  2e-04  0.0005
## s.e.    NaN    NaN  0.0061
## 
## sigma^2 estimated as 15294:  log likelihood = -243.23,  aic = 494.45
forecast1=predict(arimaModel_1, 10)
forecast2=predict(arimaModel_2, 10)
forecast3=predict(arimaModel_3, 10)
forecast1
## $pred
## Time Series:
## Start = 41 
## End = 50 
## Frequency = 1 
##  [1] 20151102151 20151102151 20151102151 20151102151 20151102151 20151102151
##  [7] 20151102151 20151102151 20151102151 20151102151
## 
## $se
## Time Series:
## Start = 41 
## End = 50 
## Frequency = 1 
##  [1] 123.6684 174.9410 214.3161 247.5044 276.7408 303.1707 327.4745 350.0951
##  [9] 371.3403 391.4341
forecast2
## $pred
## Time Series:
## Start = 41 
## End = 50 
## Frequency = 1 
##  [1] 20151102151 20151102151 20151102151 20151102151 20151102151 20151102151
##  [7] 20151102151 20151102151 20151102151 20151102151
## 
## $se
## Time Series:
## Start = 41 
## End = 50 
## Frequency = 1 
##  [1] 123.6808 174.9585 214.2989 247.4622 276.6787 303.0918 327.3808 349.9882
##  [9] 371.2213 391.3040
forecast3
## $pred
## Time Series:
## Start = 41 
## End = 50 
## Frequency = 1 
##  [1] 20151102151 20151102151 20151102151 20151102151 20151102151 20151102151
##  [7] 20151102151 20151102151 20151102151 20151102151
## 
## $se
## Time Series:
## Start = 41 
## End = 50 
## Frequency = 1 
##  [1] 123.6684 174.9409 214.3159 247.5042 276.7405 303.1705 327.4742 350.0948
##  [9] 371.3399 391.4337
  1. Correlation study
print("Correlation between Traffic and Junction")
## [1] "Correlation between Traffic and Junction"
cor(data$Vehicles,data$Junction,method = "pearson")
## [1] -0.6137872
cor.test(data$Vehicles,data$Junction,method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$Vehicles and data$Junction
## t = -170.54, df = 48118, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6193256 -0.6081877
## sample estimates:
##        cor 
## -0.6137872
  1. Split the data set into training and testing in the ratio of 70:30
split <- sample.split(data, SplitRatio = 0.7)
train_cl <- subset(data, split == "TRUE")
test_cl <- subset(data, split == "FALSE")

10.Perform Decision tree classification and view the results in tree format

model<- ctree(Vehicles ~ Junction, train_cl)
plot(model)

11.Perform Naïve Bayes and view the results in confusion matrix

set.seed(120) # Setting Seed
classifier_cl <- naiveBayes(Junction ~ ., data = train_cl)
# Predicting on test data
y_pred <- predict(classifier_cl, newdata = test_cl)
# Confusion Matrix
cm <- table(test_cl$Junction, y_pred)
cm
##    y_pred
##        1    2    3    4
##   1 5146    9  821 1320
##   2  408  686  361 5841
##   3  397  510  466 5923
##   4    1   36    2 2133
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    y_pred
##        1    2    3    4
##   1 5146    9  821 1320
##   2  408  686  361 5841
##   3  397  510  466 5923
##   4    1   36    2 2133
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3504          
##                  95% CI : (0.3444, 0.3565)
##     No Information Rate : 0.6325          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2187          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity            0.8646  0.55278  0.28242  0.14017
## Specificity            0.8813  0.71033  0.69523  0.99559
## Pos Pred Value         0.7053  0.09402  0.06387  0.98204
## Neg Pred Value         0.9519  0.96689  0.92937  0.40223
## Prevalence             0.2474  0.05158  0.06858  0.63246
## Detection Rate         0.2139  0.02851  0.01937  0.08865
## Detection Prevalence   0.3032  0.30324  0.30324  0.09027
## Balanced Accuracy      0.8729  0.63155  0.48882  0.56788
plot(cm)