Using the Kaggle Dataset named “Household Electric Power Consumption” present in the link “https://www.kaggle.com/datasets/uciml/electric-power-consumption-data-set”, extract date, year and month from the data. Plot the power usage across a duration and visualize it. Perform time series forecasting on the data by analyzing it and plot the obtained forecast. Use the table, dplyr, lubridate, plotly and forecast packages to perform the tasks.
Prodedure:
library(data.table)
library(dplyr)
library(lubridate)
library(plotly)
library(forecast)
data <- fread("household_power_consumption.txt")
head(data)
Date | Time | Global_active_power | Global_reactive_power | Voltage | Global_intensity | Sub_metering_1 | Sub_metering_2 | Sub_metering_3 |
---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <dbl> |
16/12/2006 | 17:24:00 | 4.216 | 0.418 | 234.840 | 18.400 | 0.000 | 1.000 | 17 |
16/12/2006 | 17:25:00 | 5.360 | 0.436 | 233.630 | 23.000 | 0.000 | 1.000 | 16 |
16/12/2006 | 17:26:00 | 5.374 | 0.498 | 233.290 | 23.000 | 0.000 | 2.000 | 17 |
16/12/2006 | 17:27:00 | 5.388 | 0.502 | 233.740 | 23.000 | 0.000 | 1.000 | 17 |
16/12/2006 | 17:28:00 | 3.666 | 0.528 | 235.680 | 15.800 | 0.000 | 1.000 | 17 |
16/12/2006 | 17:29:00 | 3.520 | 0.522 | 235.020 | 15.000 | 0.000 | 2.000 | 17 |
glimpse(data)
Rows: 2,075,259 Columns: 9 $ Date <chr> "16/12/2006", "16/12/2006", "16/12/2006", "16/12… $ Time <chr> "17:24:00", "17:25:00", "17:26:00", "17:27:00", … $ Global_active_power <chr> "4.216", "5.360", "5.374", "5.388", "3.666", "3.… $ Global_reactive_power <chr> "0.418", "0.436", "0.498", "0.502", "0.528", "0.… $ Voltage <chr> "234.840", "233.630", "233.290", "233.740", "235… $ Global_intensity <chr> "18.400", "23.000", "23.000", "23.000", "15.800"… $ Sub_metering_1 <chr> "0.000", "0.000", "0.000", "0.000", "0.000", "0.… $ Sub_metering_2 <chr> "1.000", "1.000", "2.000", "1.000", "1.000", "2.… $ Sub_metering_3 <dbl> 17, 16, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, …
summary(data)
Date Time Global_active_power Length:2075259 Length:2075259 Length:2075259 Class :character Class :character Class :character Mode :character Mode :character Mode :character Global_reactive_power Voltage Global_intensity Sub_metering_1 Length:2075259 Length:2075259 Length:2075259 Length:2075259 Class :character Class :character Class :character Class :character Mode :character Mode :character Mode :character Mode :character Sub_metering_2 Sub_metering_3 Length:2075259 Min. : 0.000 Class :character 1st Qu.: 0.000 Mode :character Median : 1.000 Mean : 6.458 3rd Qu.:17.000 Max. :31.000 NA's :25979
data <- data[complete.cases(data)]
sum(is.na(data))
data$datetime <- paste(data$Date,data$Time)
data$datetime <- as.POSIXct(data$datetime, format="%d/%m/%Y %H:%M:%S")
attr(data$datetime, "tzone") <- "Europe/Paris"
str(data)
Classes ‘data.table’ and 'data.frame': 2049280 obs. of 10 variables: $ Date : chr "16/12/2006" "16/12/2006" "16/12/2006" "16/12/2006" ... $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ... $ Global_active_power : chr "4.216" "5.360" "5.374" "5.388" ... $ Global_reactive_power: chr "0.418" "0.436" "0.498" "0.502" ... $ Voltage : chr "234.840" "233.630" "233.290" "233.740" ... $ Global_intensity : chr "18.400" "23.000" "23.000" "23.000" ... $ Sub_metering_1 : chr "0.000" "0.000" "0.000" "0.000" ... $ Sub_metering_2 : chr "1.000" "1.000" "2.000" "1.000" ... $ Sub_metering_3 : num 17 16 17 17 17 17 17 17 17 16 ... $ datetime : POSIXct, format: "2006-12-16 12:54:00" "2006-12-16 12:55:00" ... - attr(*, ".internal.selfref")=<externalptr>
data$year <- year(data$datetime)
data$week <- week(data$datetime)
data$day <- day(data$datetime)
data$month <- month(data$datetime)
data$minute <- minute(data$datetime)
plot(data$Sub_metering_1)
# Sub_metering_1
ann <- filter(data, year == 2006)
plot(ann$Sub_metering_1)
# Sub_metering_2
plot(ann$Sub_metering_2)
# Sub_metering_3
plot(ann$Sub_metering_3)
houseDay <- filter(data, year == 2008 & day == 10 & month==1)
plot_ly(houseDay, x = ~houseDay$datetime, y = ~houseDay$Sub_metering_1, type = 'scatter', mode = 'lines')
dtDay <- filter(data, year == 2009 & day == 2 & month==2)
plot_ly(dtDay, x = ~dtDay$datetime, y = ~dtDay$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~dtDay$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~dtDay$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption Feb 2th, 2009",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Warning message: “Can't display both discrete & non-discrete data on same axis” Warning message: “Can't display both discrete & non-discrete data on same axis”
houseDay10 <- filter(data, year == 2009 & month == 2 & day == 2 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))
plot_ly(houseDay10, x = ~houseDay10$datetime, y = ~houseDay10$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption Feb 2th, 2009",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Warning message: “Can't display both discrete & non-discrete data on same axis” Warning message: “Can't display both discrete & non-discrete data on same axis”
data$minute <- minute(data$datetime)
houseDay10 <- filter(data, year == 2008 & month == 5 & day == 10 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))
plot_ly(houseDay10, x = ~houseDay10$datetime, y = ~houseDay10$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption May 10th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Warning message: “Can't display both discrete & non-discrete data on same axis” Warning message: “Can't display both discrete & non-discrete data on same axis”
data$hour <- hour(data$datetime)
houseweekly <- filter(data, week == 2 & hour == 20 & minute == 1)
tsSM3_weekly <- ts(houseweekly$Sub_metering_3, frequency=52, start=c(2007,1))
plot(tsSM3_weekly, xlab = "Time", ylab = "Watt Hours", main = "Sub-meter 3")
tsSM3_weekly <- ts(houseweekly$Sub_metering_1, frequency=52, start=c(2007,1))
plot(tsSM3_weekly, xlab = "Time", ylab = "Watt Hours", main = "Sub-meter 1")
tsSM3_weekly <- ts(houseweekly$Sub_metering_2, frequency=52, start=c(2007,1))
plot(tsSM3_weekly, xlab = "Time", ylab = "Watt Hours", main = "Sub-meter 2")
house070809weekly <- filter(data, year==2008, hour == 20 & minute == 1)
tsSM3_070809weekly <- ts(house070809weekly$Sub_metering_3, frequency=52, start=c(2008,3))
tsSM2_070809weekly <- ts(house070809weekly$Sub_metering_2, frequency=52, start=c(2008,3))
tsSM1_070809weekly <- ts(house070809weekly$Sub_metering_1, frequency=52, start=c(2008,3))
fit3 <- tslm(tsSM3_070809weekly ~ trend + season)
fit2 <- tslm(tsSM2_070809weekly ~ trend + season)
fit1 <- tslm(tsSM1_070809weekly ~ trend + season)
summary(fit3)
Call: tslm(formula = tsSM3_070809weekly ~ trend + season) Residuals: Min 1Q Median 3Q Max -8.0997 -3.6021 -2.0930 0.3787 25.2358 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 5.2300695 2.6910798 1.943 0.0529 . trend 0.0009589 0.0034285 0.280 0.7799 season2 -5.0009589 3.6710347 -1.362 0.1741 season3 -3.0305567 3.5554148 -0.852 0.3947 season4 0.9684843 3.5553371 0.272 0.7855 season5 -2.2396848 3.6747199 -0.609 0.5426 season6 -0.6692152 3.6745680 -0.182 0.8556 season7 -0.8130313 3.6744192 -0.221 0.8250 season8 -1.3854188 3.6742737 -0.377 0.7064 season9 -2.3863778 3.6741313 -0.650 0.5165 season10 -4.8159081 3.6739922 -1.311 0.1909 season11 -3.6740100 3.6738562 -1.000 0.3181 season12 -1.6749689 3.6737234 -0.456 0.6488 season13 -1.2473564 3.6735938 -0.340 0.7344 season14 -2.5340296 3.6734674 -0.690 0.4908 season15 -3.2492743 3.6733442 -0.885 0.3771 season16 -2.5359475 3.6732242 -0.690 0.4905 season17 -4.9654779 3.6731074 -1.352 0.1774 season18 0.8907060 3.6729938 0.243 0.8086 season19 -4.9673958 3.6728834 -1.352 0.1772 season20 -4.9683548 3.6727762 -1.353 0.1771 season21 -2.3978851 3.6726722 -0.653 0.5143 season22 -2.5417012 3.6725714 -0.692 0.4894 season23 -5.2569459 3.6724737 -1.431 0.1533 season24 -2.5436191 3.6723793 -0.693 0.4891 season25 -5.1160066 3.6722881 -1.393 0.1646 season26 -2.2598227 3.6722001 -0.615 0.5387 season27 1.4535040 3.6721152 0.396 0.6925 season28 -1.2617406 3.6720336 -0.344 0.7314 season29 -2.5484139 3.6719552 -0.694 0.4882 season30 -2.6922299 3.6718800 -0.733 0.4640 season31 1.4496683 3.6718079 0.395 0.6933 season32 1.5915665 3.6717391 0.433 0.6650 season33 -2.5522496 3.6716735 -0.695 0.4875 season34 -2.6960657 3.6716110 -0.734 0.4633 season35 -0.6970247 3.6715518 -0.190 0.8496 season36 -2.8408408 3.6714958 -0.774 0.4397 season37 -5.2703711 3.6714430 -1.436 0.1521 season38 -0.6999015 3.6713933 -0.191 0.8489 season39 -0.7008605 3.6713469 -0.191 0.8487 season40 -1.1303908 3.6713037 -0.308 0.7584 season41 2.5829359 3.6712637 0.704 0.4822 season42 -5.2751659 3.6712269 -1.437 0.1517 season43 -0.9904105 3.6711932 -0.270 0.7875 season44 -2.5627981 3.6711628 -0.698 0.4856 season45 1.5791001 3.6711356 0.430 0.6674 season46 -4.8504302 3.6711116 -1.321 0.1874 season47 -2.4228177 3.6710908 -0.660 0.5098 season48 -4.8523481 3.6710732 -1.322 0.1872 season49 -2.7104499 3.6710588 -0.738 0.4609 season50 -2.7114089 3.6710475 -0.739 0.4607 season51 0.1447750 3.6710395 0.039 0.9686 season52 -4.9990411 3.6710347 -1.362 0.1743 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.868 on 313 degrees of freedom Multiple R-squared: 0.09806, Adjusted R-squared: -0.05179 F-statistic: 0.6544 on 52 and 313 DF, p-value: 0.9678
forecastfitSM3c <- forecast(fit3, h=20, level=c(80,90))
forecastfitSM2c <- forecast(fit2, h=20, level=c(80,90))
forecastfitSM1c <- forecast(fit1, h=20, level=c(80,90))
plot(forecastfitSM3c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time")
plot(forecastfitSM2c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time")
plot(forecastfitSM1c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time")