# Statistics for Management and Economics by Gerald Keller
# Chapter 17: MULTIPLE REGRESSION
# Example 17.1 on Pg 719
# Christmas Week Ski Lift Sales 


# Load the Example 17.1 data set. file.choose() asks the user to pick the
# file interactively.
data1 <- read.csv(file.choose()) # choose Xm17-01.csv

# Fail early with a clear message if the wrong file was picked. Without this
# check a missing column is extracted as NULL and lm() only fails later with
# an error that does not mention the real cause.
required_cols <- c("Tickets", "Snowfall", "Temperature")
missing_cols <- setdiff(required_cols, names(data1))
if (length(missing_cols) > 0) {
  stop(
    "Selected file is missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# [[ ]] matches column names exactly, whereas $ allows partial matching.
skilift <- data1[["Tickets"]] # number of lift tickets sold during Christmas week - Response variable

snow <- data1[["Snowfall"]] # total snowfall in inches - Explanatory variable
temp <- data1[["Temperature"]] # the average temperature in degrees Fahrenheit - Explanatory variable

# Build the first multiple regression model (tickets on snowfall and
# temperature) and check whether any required conditions are violated.

regression_line <- lm(skilift ~ snow + temp) # fitted regression model
s <- summary(regression_line) # gives the Residuals, Std Error etc
# print() so the ANOVA table is also shown when the script is run via source().
print(anova(regression_line))

# Report the fitted equation. Each slope is written as an explicit sign
# followed by its magnitude, so the equation reads correctly whatever the
# sign of the estimate (previously no operator was printed before the X2 term).
b <- unname(coef(regression_line))
slope_sign <- ifelse(b[-1] < 0, "-", "+")
cat(
  "The multiple regression model is given by Y =",
  b[1], slope_sign[1], abs(b[2]), "X1",
  slope_sign[2], abs(b[3]), "X2",
  "where Y is number of skilift tickets sold,\n    X1 is snowfall and X2 is temperature\n"
)

# The multiple regression model is Y = 8308 + 75X1 -9X2

# Trailing newline keeps this message from running into later console output.
cat(
  "The very low R squared value of", s$r.squared,
  "shows that this model is not upto the mark for further consideration.\n"
)

# Residual diagnostics for the first model.
res <- residuals(regression_line) # residuals of the fit
pred <- predict(object = regression_line) # predicted values for the fitted data

# Predicted values (x axis) against residuals (y axis).
# Author's reading of the plot: no sign of heteroscedasticity.
plot(x = pred, y = res)

# Histogram of the residuals.
hist(x = regression_line$residuals)

#############

# Time-order diagnostics for the first model.

# Stop with a clear message if the chosen file has no Time column; otherwise
# the extraction below would yield NULL and the plot would fail obscurely.
if (!"Time" %in% names(data1)) {
  stop("Selected file is missing required column: Time", call. = FALSE)
}
time <- data1[["Time"]] # Time - Explanatory variable

plot(time, res) # residuals in time order; author's reading: errors are not independent.

# Durbin Watson test used below for further validation that residuals are not independent.
# Install lmtest only when it is absent, instead of reinstalling on every run.
if (!requireNamespace("lmtest", quietly = TRUE)) {
  install.packages("lmtest")
}
library(lmtest)

# Keep the test result so it can be printed (also visible under source())
# and so the printed conclusion follows the actual p-value.
dw_test <- dwtest(regression_line) # test for autocorrelation - Durbin Watson Test
print(dw_test)
if (isTRUE(dw_test$p.value < 0.05)) {
  cat("Autocorrelation exists in the data; just as was observed in the time-residual plot\n")
} else {
  cat("The Durbin-Watson test does not detect autocorrelation at the 5% level\n")
}

############################################################################################

# Build the regression model again, this time including the Time variable.

regression_line2 <- lm(skilift ~ snow + temp + time) # fitted regression model
s2 <- summary(regression_line2) # gives the Residuals, Std Error etc
# print() so the ANOVA table is also shown when the script is run via source().
print(anova(regression_line2))

# Report the fitted equation. Each slope is written as an explicit sign
# followed by its magnitude, so the equation reads correctly whatever the
# sign of the estimate (previously no operator was printed before the X2 term).
b2 <- unname(coef(regression_line2))
slope_sign2 <- ifelse(b2[-1] < 0, "-", "+")
cat(
  "The multiple regression model is given by Y =",
  b2[1], slope_sign2[1], abs(b2[2]), "X1",
  slope_sign2[2], abs(b2[3]), "X2",
  slope_sign2[3], abs(b2[4]), "X3",
  "where Y is number of skilift tickets sold,\n    X1 is snowfall, X2 is temperature and X3 is time.\n"
)

# The new multiple regression model is Y = 5966 + 7X1 -9X2 + 230X3

# Trailing newline keeps this message from running into later console output.
cat(
  "The R squared value of", s2$r.squared,
  "shows that this model has drastically improved on introducing the Time variable.\n"
)

# Residual diagnostics for the second model (with Time included).
pred2 <- predict(regression_line2) # Predicted Values
res2 <- resid(regression_line2) # Residuals
plot(pred2, res2) # Predicted Values versus Residuals. Author's reading: error variance appears constant.
hist(regression_line2$residuals) # histogram of Residuals
plot(time, res2) # residuals in time order; author's reading: no autocorrelation

# Keep the test result so it can be printed (also visible under source())
# and so the printed conclusion follows the actual p-value.
# dwtest() comes from the lmtest package, which must already be attached.
dw_test2 <- dwtest(regression_line2) # test for autocorrelation - Durbin Watson Test
print(dw_test2)
if (isTRUE(dw_test2$p.value < 0.05)) {
  cat("The Durbin-Watson test still detects autocorrelation at the 5% level\n")
} else {
  cat("Autocorrelation does Not exist in the data; just as was observed in the time-residual plot\n")
}

# Report the overall F statistic and the t-test p-values for snowfall and time.
# The coefficient table is read through its full component name (no reliance
# on partial matching of `$coef`) and its rows are selected by term name, so
# the reported p-values cannot silently refer to the wrong terms if the row
# order or count ever differs from the expected four rows.
coef_table <- s2$coefficients
cat(
  "The F-statistic", s2$fstatistic["value"], "and the t-tests (p-values:",
  coef_table["snow", "Pr(>|t|)"],
  coef_table["time", "Pr(>|t|)"],
  ") respectively show that snowfall and time significantly \n    affect the number of skilift tickets sold.\n"
)

#End
