Using the Kaggle Dataset named “Household Electric Power Consumption” present in the link “https://www.kaggle.com/datasets/uciml/electric-power-consumption-data-set”, extract date, year and month from the data. Plot the power usage across a duration and visualize it. Perform time series forecasting on the data by analyzing it and plot the obtained forecast. Use the table, dplyr, lubridate, plotly and forecast packages to perform the tasks.
Prodedure:
library(data.table)
library(dplyr)
library(lubridate)
library(plotly)
library(forecast)
data <- fread("household_power_consumption.txt")
head(data)
| Date | Time | Global_active_power | Global_reactive_power | Voltage | Global_intensity | Sub_metering_1 | Sub_metering_2 | Sub_metering_3 |
|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <dbl> |
| 16/12/2006 | 17:24:00 | 4.216 | 0.418 | 234.840 | 18.400 | 0.000 | 1.000 | 17 |
| 16/12/2006 | 17:25:00 | 5.360 | 0.436 | 233.630 | 23.000 | 0.000 | 1.000 | 16 |
| 16/12/2006 | 17:26:00 | 5.374 | 0.498 | 233.290 | 23.000 | 0.000 | 2.000 | 17 |
| 16/12/2006 | 17:27:00 | 5.388 | 0.502 | 233.740 | 23.000 | 0.000 | 1.000 | 17 |
| 16/12/2006 | 17:28:00 | 3.666 | 0.528 | 235.680 | 15.800 | 0.000 | 1.000 | 17 |
| 16/12/2006 | 17:29:00 | 3.520 | 0.522 | 235.020 | 15.000 | 0.000 | 2.000 | 17 |
glimpse(data)
Rows: 2,075,259 Columns: 9 $ Date <chr> "16/12/2006", "16/12/2006", "16/12/2006", "16/12… $ Time <chr> "17:24:00", "17:25:00", "17:26:00", "17:27:00", … $ Global_active_power <chr> "4.216", "5.360", "5.374", "5.388", "3.666", "3.… $ Global_reactive_power <chr> "0.418", "0.436", "0.498", "0.502", "0.528", "0.… $ Voltage <chr> "234.840", "233.630", "233.290", "233.740", "235… $ Global_intensity <chr> "18.400", "23.000", "23.000", "23.000", "15.800"… $ Sub_metering_1 <chr> "0.000", "0.000", "0.000", "0.000", "0.000", "0.… $ Sub_metering_2 <chr> "1.000", "1.000", "2.000", "1.000", "1.000", "2.… $ Sub_metering_3 <dbl> 17, 16, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, …
summary(data)
Date Time Global_active_power
Length:2075259 Length:2075259 Length:2075259
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Global_reactive_power Voltage Global_intensity Sub_metering_1
Length:2075259 Length:2075259 Length:2075259 Length:2075259
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
Sub_metering_2 Sub_metering_3
Length:2075259 Min. : 0.000
Class :character 1st Qu.: 0.000
Mode :character Median : 1.000
Mean : 6.458
3rd Qu.:17.000
Max. :31.000
NA's :25979
data <- data[complete.cases(data)]
sum(is.na(data))
data$datetime <- paste(data$Date,data$Time)
data$datetime <- as.POSIXct(data$datetime, format="%d/%m/%Y %H:%M:%S")
attr(data$datetime, "tzone") <- "Europe/Paris"
str(data)
Classes ‘data.table’ and 'data.frame': 2049280 obs. of 10 variables: $ Date : chr "16/12/2006" "16/12/2006" "16/12/2006" "16/12/2006" ... $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ... $ Global_active_power : chr "4.216" "5.360" "5.374" "5.388" ... $ Global_reactive_power: chr "0.418" "0.436" "0.498" "0.502" ... $ Voltage : chr "234.840" "233.630" "233.290" "233.740" ... $ Global_intensity : chr "18.400" "23.000" "23.000" "23.000" ... $ Sub_metering_1 : chr "0.000" "0.000" "0.000" "0.000" ... $ Sub_metering_2 : chr "1.000" "1.000" "2.000" "1.000" ... $ Sub_metering_3 : num 17 16 17 17 17 17 17 17 17 16 ... $ datetime : POSIXct, format: "2006-12-16 12:54:00" "2006-12-16 12:55:00" ... - attr(*, ".internal.selfref")=<externalptr>
data$year <- year(data$datetime)
data$week <- week(data$datetime)
data$day <- day(data$datetime)
data$month <- month(data$datetime)
data$minute <- minute(data$datetime)
plot(data$Sub_metering_1)
# Sub_metering_1
ann <- filter(data, year == 2006)
plot(ann$Sub_metering_1)
# Sub_metering_2
plot(ann$Sub_metering_2)
# Sub_metering_3
plot(ann$Sub_metering_3)
houseDay <- filter(data, year == 2008 & day == 10 & month==1)
plot_ly(houseDay, x = ~houseDay$datetime, y = ~houseDay$Sub_metering_1, type = 'scatter', mode = 'lines')
dtDay <- filter(data, year == 2009 & day == 2 & month==2)
plot_ly(dtDay, x = ~dtDay$datetime, y = ~dtDay$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~dtDay$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~dtDay$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption Feb 2th, 2009",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Warning message: “Can't display both discrete & non-discrete data on same axis” Warning message: “Can't display both discrete & non-discrete data on same axis”
houseDay10 <- filter(data, year == 2009 & month == 2 & day == 2 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))
plot_ly(houseDay10, x = ~houseDay10$datetime, y = ~houseDay10$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption Feb 2th, 2009",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Warning message: “Can't display both discrete & non-discrete data on same axis” Warning message: “Can't display both discrete & non-discrete data on same axis”
data$minute <- minute(data$datetime)
houseDay10 <- filter(data, year == 2008 & month == 5 & day == 10 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))
plot_ly(houseDay10, x = ~houseDay10$datetime, y = ~houseDay10$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption May 10th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Warning message: “Can't display both discrete & non-discrete data on same axis” Warning message: “Can't display both discrete & non-discrete data on same axis”
data$hour <- hour(data$datetime)
houseweekly <- filter(data, week == 2 & hour == 20 & minute == 1)
tsSM3_weekly <- ts(houseweekly$Sub_metering_3, frequency=52, start=c(2007,1))
plot(tsSM3_weekly, xlab = "Time", ylab = "Watt Hours", main = "Sub-meter 3")
tsSM3_weekly <- ts(houseweekly$Sub_metering_1, frequency=52, start=c(2007,1))
plot(tsSM3_weekly, xlab = "Time", ylab = "Watt Hours", main = "Sub-meter 1")
tsSM3_weekly <- ts(houseweekly$Sub_metering_2, frequency=52, start=c(2007,1))
plot(tsSM3_weekly, xlab = "Time", ylab = "Watt Hours", main = "Sub-meter 2")
house070809weekly <- filter(data, year==2008, hour == 20 & minute == 1)
tsSM3_070809weekly <- ts(house070809weekly$Sub_metering_3, frequency=52, start=c(2008,3))
tsSM2_070809weekly <- ts(house070809weekly$Sub_metering_2, frequency=52, start=c(2008,3))
tsSM1_070809weekly <- ts(house070809weekly$Sub_metering_1, frequency=52, start=c(2008,3))
fit3 <- tslm(tsSM3_070809weekly ~ trend + season)
fit2 <- tslm(tsSM2_070809weekly ~ trend + season)
fit1 <- tslm(tsSM1_070809weekly ~ trend + season)
summary(fit3)
Call:
tslm(formula = tsSM3_070809weekly ~ trend + season)
Residuals:
Min 1Q Median 3Q Max
-8.0997 -3.6021 -2.0930 0.3787 25.2358
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.2300695 2.6910798 1.943 0.0529 .
trend 0.0009589 0.0034285 0.280 0.7799
season2 -5.0009589 3.6710347 -1.362 0.1741
season3 -3.0305567 3.5554148 -0.852 0.3947
season4 0.9684843 3.5553371 0.272 0.7855
season5 -2.2396848 3.6747199 -0.609 0.5426
season6 -0.6692152 3.6745680 -0.182 0.8556
season7 -0.8130313 3.6744192 -0.221 0.8250
season8 -1.3854188 3.6742737 -0.377 0.7064
season9 -2.3863778 3.6741313 -0.650 0.5165
season10 -4.8159081 3.6739922 -1.311 0.1909
season11 -3.6740100 3.6738562 -1.000 0.3181
season12 -1.6749689 3.6737234 -0.456 0.6488
season13 -1.2473564 3.6735938 -0.340 0.7344
season14 -2.5340296 3.6734674 -0.690 0.4908
season15 -3.2492743 3.6733442 -0.885 0.3771
season16 -2.5359475 3.6732242 -0.690 0.4905
season17 -4.9654779 3.6731074 -1.352 0.1774
season18 0.8907060 3.6729938 0.243 0.8086
season19 -4.9673958 3.6728834 -1.352 0.1772
season20 -4.9683548 3.6727762 -1.353 0.1771
season21 -2.3978851 3.6726722 -0.653 0.5143
season22 -2.5417012 3.6725714 -0.692 0.4894
season23 -5.2569459 3.6724737 -1.431 0.1533
season24 -2.5436191 3.6723793 -0.693 0.4891
season25 -5.1160066 3.6722881 -1.393 0.1646
season26 -2.2598227 3.6722001 -0.615 0.5387
season27 1.4535040 3.6721152 0.396 0.6925
season28 -1.2617406 3.6720336 -0.344 0.7314
season29 -2.5484139 3.6719552 -0.694 0.4882
season30 -2.6922299 3.6718800 -0.733 0.4640
season31 1.4496683 3.6718079 0.395 0.6933
season32 1.5915665 3.6717391 0.433 0.6650
season33 -2.5522496 3.6716735 -0.695 0.4875
season34 -2.6960657 3.6716110 -0.734 0.4633
season35 -0.6970247 3.6715518 -0.190 0.8496
season36 -2.8408408 3.6714958 -0.774 0.4397
season37 -5.2703711 3.6714430 -1.436 0.1521
season38 -0.6999015 3.6713933 -0.191 0.8489
season39 -0.7008605 3.6713469 -0.191 0.8487
season40 -1.1303908 3.6713037 -0.308 0.7584
season41 2.5829359 3.6712637 0.704 0.4822
season42 -5.2751659 3.6712269 -1.437 0.1517
season43 -0.9904105 3.6711932 -0.270 0.7875
season44 -2.5627981 3.6711628 -0.698 0.4856
season45 1.5791001 3.6711356 0.430 0.6674
season46 -4.8504302 3.6711116 -1.321 0.1874
season47 -2.4228177 3.6710908 -0.660 0.5098
season48 -4.8523481 3.6710732 -1.322 0.1872
season49 -2.7104499 3.6710588 -0.738 0.4609
season50 -2.7114089 3.6710475 -0.739 0.4607
season51 0.1447750 3.6710395 0.039 0.9686
season52 -4.9990411 3.6710347 -1.362 0.1743
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.868 on 313 degrees of freedom
Multiple R-squared: 0.09806, Adjusted R-squared: -0.05179
F-statistic: 0.6544 on 52 and 313 DF, p-value: 0.9678
forecastfitSM3c <- forecast(fit3, h=20, level=c(80,90))
forecastfitSM2c <- forecast(fit2, h=20, level=c(80,90))
forecastfitSM1c <- forecast(fit1, h=20, level=c(80,90))
plot(forecastfitSM3c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time")
plot(forecastfitSM2c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time")
plot(forecastfitSM1c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time")