#Sports Data Analysis

#Name: Remita Austin
#Faculty Mentor: Dr. Parvathi R

#Import the required libraries
library(MASS)
library("readxl")
library(dplyr)
library(tidyverse)
library(caret)

# Load the dataset ----
# Read the workbook "cricket_data.xlsx" (resolved relative to the current
# working directory) into `cricket`, then preview its first rows.
cricket <- readxl::read_excel(path = "cricket_data.xlsx")
head(cricket)

# Data Cleaning ----
# Report whether any cell in the dataset is missing (prints TRUE or FALSE).
any(is.na(cricket))

# Inspect the raw opposition labels before cleaning.
unique(cricket$opposition)
# Strip the leading "v " marker from each opposition label.
# The pattern is anchored with "^" and applied once via sub(), so only the
# prefix is removed. The previous unanchored gsub("v ", ...) would also have
# deleted any "v " occurring later inside a label, corrupting that name.
# NOTE(review): presumably the labels look like "v <team>" - confirm against
# the printed unique() values above.
cricket <- cricket %>%
  mutate(opposition = sub("^v ", "", opposition))
# Inspect the labels again to verify the prefix is gone.
unique(cricket$opposition)

# Drop the odi_number feature because it adds no value to the analysis.
cricket <- subset(cricket, select = -odi_number)

# Data Manipulation ----
# Top five innings against Bangladesh, ordered by runs scored (highest first).
head(
  arrange(
    filter(cricket, opposition == 'Bangladesh'),
    desc(runs_scored)
  ),
  5
)

# First ten records (in dataset order) whose strike rate exceeds 100.
head(filter(cricket, strike_rate > 100), 10)

# Five randomly drawn rows, restricted to the score, runs_scored and
# balls_faced columns. dplyr::select is spelled out because MASS, which is
# also attached by this script, exports its own select().
slice_sample(
  dplyr::select(cricket, score, runs_scored, balls_faced),
  n = 5
)

# First match date: sort ascending by date and keep the top row.
minc <- head(arrange(cricket, date), 1)
minc['date']

# Last match date: sort descending by date and keep the top row.
maxc <- head(arrange(cricket, desc(date)), 1)
maxc['date']

# Total number of balls faced across all records in the dataset.
sum(cricket$balls_faced)

# Highest number of runs scored in a single record.
max(cricket$runs_scored)

# Statistical Analysis ----
# Count how many records fall on each ground.
ground_freq <- table(cricket$ground)
ground_freq

# Relative frequency per ground: each count divided by the total number of
# rows, rounded and displayed with two decimal places.
n <- nrow(cricket)
ground_relfreq <- ground_freq / n
format(round(ground_relfreq, 2), nsmall = 2)

# Smallest and largest strike rate in the dataset.
s <- cricket$strike_rate
range(s)

# Mean, variance and standard deviation of the strike rate.
mean(s)
var(s)
sd(s)

# Basic Visualization ----
# Bar chart: number of records per opposition, one fill colour per team.
ggplot(cricket, aes(x = opposition, fill = opposition)) +
  geom_bar() +
  ggtitle("Number of matches against different oppositions")

# Histogram of the strike rate across all records (base graphics).
hist(
  cricket$strike_rate,
  main = "Strike Rate of the Matches",
  xlab = "Strike Rate",
  ylab = "frequency"
)

# Scatter plot of sixes hit against runs scored.
ggplot(data = cricket, mapping = aes(x = runs_scored, y = sixes)) +
  geom_point()

# Correlation and Regression Analysis ----
# Correlation (cor() default method) of strike rate with each batting
# measure. Columns are looked up inside `cricket` via with().

# Strike rate vs. runs scored
with(cricket, cor(strike_rate, runs_scored))

# Strike rate vs. balls faced
with(cricket, cor(strike_rate, balls_faced))

# Strike rate vs. fours
with(cricket, cor(strike_rate, fours))

# Strike rate vs. sixes
with(cricket, cor(strike_rate, sixes))

# Find the variables most significant with respect to strike rate.
# Split the data into training and testing sets; the seed is fixed so the
# partition is reproducible.
set.seed(123)
# 80% of the rows go to training, the remaining 20% to testing.
train_samples <- createDataPartition(
  cricket$strike_rate,
  p = 0.8,
  list = FALSE
)
head(train_samples)
train <- cricket[train_samples, ]
test <- cricket[-train_samples, ]

# Fit a linear regression of strike_rate on every other column of `train`.
# NOTE(review): the "." pulls in all remaining columns, including any
# non-numeric ones (e.g. date, opposition, ground) - confirm this is intended.
# NOTE(review): `test` is not used within this section; presumably the model
# is evaluated on it later - verify.
model <- lm(strike_rate ~ ., data = train)
summary(model)