##------------Loading libraries------------------
library(readr)
library(sm)
library(forecast)
library(TSclust)
##---------------Loading data--------------------

# Read the monthly WPI table from the working directory
wpi_monthly_data <- read_csv(file = "wpi_monthly_data.csv")

# Drop columns 2-3 (commodity code and weight) and columns 121-147
# (the months after Dec 2020)
wpi_monthly_data <- wpi_monthly_data[, -c(2:3, 121:147)]

# Keep only the rows that have no missing value in any remaining column
non_empty_cases <- complete.cases(wpi_monthly_data)
wpi_monthly_data <- wpi_monthly_data[which(non_empty_cases), ]

# List of redundant rows
# Row positions in wpi_monthly_data, counted after the missing-value filter
# above has been applied. Further down these rows are first printed by name
# and then removed with negative indexing.
# NOTE(review): the positions are hard-coded against one particular input
# file; they presumably have to be re-derived whenever the CSV or the
# filtering above changes - confirm before reusing with new data.
l <- c(1,2,3,4,5,13,21,22,41,58,60,68,
     79,84,85,92,104,113,117,118,
     126,131,134,135,136,138,142,
     144,155,157,158,159,164,167,
     172,184,190,200,202,206,212,
     215,217,221,224,226,229,235,
     236,240,243,246,247,251,252,
     258,266,269,277,280,283,284,
     289,294,295,298,303,309,310,
     314,318,321,324,325,339,344,
     348,349,357,358,386,396,403,
     407,410,412,424,438,443,444,
     468,469,481,492,510,511,517,
     521,523,526,534,540,543,545,
     546,553,555,564,569,571,575,
     578,591,596,598,599,606,610,
     613,616,622,632,633,637,641,
     644,647,649,653,656,659,660,
     676,681,683,694,701,711,715,
     716,719,724,730,734,736,740,
     742,751,757,762,771,777,782,
     789,792,793,799,819,820,823,
     828,832,834,836,837,844,845,
     848,850,856,861)

# Echo the names of the commodities that are about to be dropped, one per line
cat(wpi_monthly_data$COMM_NAME[l], sep = "\n")

# Drop the listed rows and convert the result to a plain data.frame
wpi_monthly_data <- as.data.frame(wpi_monthly_data[-l, ])

# The first column is used as the commodity labels from here on
data.names <- wpi_monthly_data[, 1]

# Monthly date axis: the first day of every month, Apr 2011 through Dec 2020
date <- seq(from = as.Date("2011/4/1"), to = as.Date("2020/12/1"), by = "month")

##-----------------Plotting----------------------

# Plot every commodity's monthly series on one set of axes, with the first
# row highlighted in red.
x <- t(wpi_monthly_data[1, -1]) # Data of paddy (row 1 without the name column)
plot(date, x, type = "l", col = "red", ylim = c(50, 730))

# Adding data of other commodities over the same plot.
# There is one commodity per ROW, so the loop runs over nrow(); length() of a
# data.frame is its number of columns and would stop at the wrong count.
# Row 1 is skipped because it is already on the plot, which also keeps the red
# highlight from being painted over in black.
for (i in seq_len(nrow(wpi_monthly_data))[-1]) {
  lines(date, as.numeric(wpi_monthly_data[i, -1]))
}

##-------------Kernel Smoothing------------------

# Kernel regression of the first row against its time index, using
# method = "cv" and evaluating the fit at every index; the call draws the
# fit on the active graphics device
sm.regression(x = seq_along(x), y = x, method = "cv", eval.points = 1:length(x))

# Kernel-smooth one series and return the fitted values.
#
# x: numeric vector of observations. The regressor is simply the position
#    1..length(x), so the observations are treated as equally spaced.
#
# Returns the `estimate` component of the sm.regression() result, evaluated at
# every position of x. method = "cv" is forwarded to sm.regression and
# display = "none" keeps it from drawing a plot.
operation <- function(x) {
  idx <- seq_along(x) # stays empty for zero-length x, unlike 1:length(x)
  sm.regression(idx, x, method = "cv", eval.points = idx,
                display = "none")$estimate
}
# Smooth every row (commodity) of the table, name column excluded; apply()
# over rows puts each row's smoothed series into one column of smooth_ts
smooth_ts <- apply(X = wpi_monthly_data[, -1], MARGIN = 1, FUN = operation)

##----------------Clustering---------------------

# Computing dissimilarity metric
# smooth_ts holds one series per column; it is transposed so that diss()
# receives one series per row.
dist_ts <- diss(t(smooth_ts), METHOD = "CORT")

# Applying hierarchical clustering over the obtained data and plotting the results
# The fitted tree is stored as `hc_tree` so that the variable does not mask
# stats::hclust(), which is called again further down the script.
hc_tree <- hclust(dist_ts)
plot(hc_tree)
number.of.groups <- 50
rect.hclust(hc_tree, k = number.of.groups, border = seq_len(number.of.groups))

# Cluster label (1..number.of.groups) of every series, then size summaries
ts_cut <- cutree(hc_tree, k = number.of.groups)
matrix(ts_cut)
table(ts_cut)
sort(table(ts_cut))

# List of groups with at least 5 elements.
# cutree() labels clusters 1..k, so a position in the size table is also the
# cluster label.
ts_grp_list <- which(as.vector(table(ts_cut)) >= 5)

# Assigning groups to commodities.
# One entry per retained group, in the order of ts_grp_list; each entry holds
# the positions (indices into data.names and columns of smooth_ts) of the
# group's members. Building the list directly leaves no NULL placeholders to
# strip afterwards and gives an empty list, not an error, when no group
# reaches the size threshold.
grp_final <- lapply(ts_grp_list, function(label) {
  which(unname(ts_cut) == label)
})

# Printing items in the groups (numbered by position in grp_final, not by the
# original cluster label)
for (i in seq_along(grp_final)) {
  cat("Cluster :", i, "\n")
  print(data.names[grp_final[[i]]], justify = "center")
}

##-------------Plotting Clusters-----------------

# Draw one panel per retained cluster into a PNG file.
img.name <- paste0("Initial Clusters", ".png")
png(img.name, width = 1080, height = 720)
# 5 x 5 grid of panels, i.e. room for 25 clusters on one page
par(mfrow = c(5, 5))

# Length of the smoothed series, taken from the data rather than hard-coded
n_obs <- nrow(smooth_ts)

for (k in seq_along(ts_grp_list)) {
  members <- grp_final[[k]]
  y_limits <- range(smooth_ts[, members])
  # Empty frame spanning the time index and this cluster's value range
  plot(c(1, n_obs), y_limits, type = "n", xlab = "Index",
       ylab = "Smoothed value", main = paste("Cluster", k))
  for (j in members) {
    lines(smooth_ts[, j])
  }
}
dev.off()
# The PNG device is closed at this point, so this resets the layout of
# whichever device is current afterwards
par(mfrow = c(1, 1))

##-------------Calculating Noise-----------------

# Observations transposed so that every commodity is a column, labelled with
# its name
data <- t(wpi_monthly_data[, -1])
colnames(data) <- data.names

# Noise = observed minus smoothed values, one matrix per retained group
noise <- lapply(seq_along(ts_grp_list), function(g) {
  members <- grp_final[[g]]
  data[, members] - smooth_ts[, members]
})

# Noise associated with 1st commodity from group 20
noise[[20]][, 1]

# Plot of noise and its ACF & PACF
plot(noise[[20]][, 1], type = "l")
acf(noise[[20]][, 1])
pacf(noise[[20]][, 1])

##----------------Sub-clusters-------------------

# ARIMA model sub-clusters
# Visual check of the ARIMA fit on one series (first commodity of group 1):
# the noise itself as a solid line, the fitted values dashed in colour 2
fit <- auto.arima(noise[[1]][, 1])
plot(unlist(noise[[1]][, 1]), type = "l")
lines(fit$fitted, col = 2, lty = 2)

# Making sub-clusters for the largest group, i.e., group 19
k <- 19
df <- as.data.frame(noise[[k]])
commodity_names <- colnames(df)

# Fitted ARIMA values of every commodity's noise series, one column each
arima_df <- apply(df, MARGIN = 2, FUN = function(y) {
  auto.arima(y)$fitted
})

# Part of the noise the ARIMA fit does not reproduce, summarised per
# commodity as a sum of squares
arima_noise <- noise[[k]] - arima_df
sq.sum.diff <- colSums(arima_noise^2)

# Hierarchical clustering of the commodities on that single number
number.of.groups <- 5
d <- dist(sq.sum.diff)
h <- hclust(d)
plot(h)
rect.hclust(h, k = number.of.groups, border = seq(1, number.of.groups))

# Sub-cluster label of every commodity, then size summaries
ts_cut_sub <- cutree(h, k = number.of.groups)
matrix(ts_cut_sub)
table(ts_cut_sub)
sort(table(ts_cut_sub))
# Sub-clusters with at least 2 members. cutree() labels sub-clusters 1..k, so
# a position in the size table is also the sub-cluster label.
ts_sub_grp_list <- which(as.vector(table(ts_cut_sub)) >= 2)

# One entry per retained sub-cluster, in the order of ts_sub_grp_list; each
# entry holds the positions of its members within ts_cut_sub (and therefore
# within commodity_names). Building the list directly leaves no NULL
# placeholders to strip afterwards and gives an empty list, not an error,
# when no sub-cluster reaches the size threshold.
sub_grp_final <- lapply(ts_sub_grp_list, function(label) {
  which(unname(ts_cut_sub) == label)
})

# Printing items in the sub-clusters (numbered by position in sub_grp_final)
for (i in seq_along(sub_grp_final)) {
  cat("Sub Cluster :", i, "\n")
  print(commodity_names[sub_grp_final[[i]]], justify = "center")
}

# Plot the smoothed series of every retained sub-cluster of group k, three
# panels per page. The previous graphics settings are saved and put back
# afterwards.
old_par <- par(mfrow = c(3, 1))

# Length of the smoothed series, taken from the data rather than hard-coded
n_obs <- nrow(smooth_ts)

for (p in seq_along(ts_sub_grp_list)) {
  # sub_grp_final holds positions within group k; map them back to the
  # corresponding columns of smooth_ts
  s <- grp_final[[k]][sub_grp_final[[p]]]
  y_limits <- range(smooth_ts[, s])
  # Empty frame spanning the time index and this sub-cluster's value range
  plot(c(1, n_obs), y_limits, type = "n", xlab = "Index",
       ylab = "Smoothed value", main = paste("Cluster :", k),
       sub = paste("Sub Cluster", p))
  for (j in s) {
    lines(smooth_ts[, j])
  }
}
par(old_par)