Skip to content

Imagine living in a house where every single watt of electricity you use is meticulously recorded, with each measurement contributing to a vast pool of data. By analyzing this detailed household power consumption data recorded over nearly 4 years, an energy company can help customers achieve sustainable energy usage while balancing their energy generation. With predictive models, the company can optimize energy usage, forecast future consumption, and provide tailored recommendations. Your task is to use this dataset to build a model that predicts power consumption, benefiting both the energy provider and its customers.

The Data

Available in df_train.csv and df_test.csv:

| Column | Type | Description |
|---|---|---|
| `date` | chr | Date of the measurement |
| `power_consumption` | dbl | Daily power consumption (in kilowatts) |
| `year` | int | Year of the measurement |
| `semester` | int | Semester of the measurement (1 for Jan-Jun, 2 for Jul-Dec) |
| `quarter` | int | Quarter of the measurement (1 for Q1, 2 for Q2, 3 for Q3, 4 for Q4) |
| `day_in_week` | chr | Day of the week of the measurement (e.g., Monday, Tuesday) |
| `week_in_year` | int | Week number in the year of the measurement |
| `day_in_year` | int | Day number in the year of the measurement |
| `month` | int | Month of the year of the measurement |

This dataset was donated to the UCI Machine Learning Repository. For detailed information about the dataset and the preprocessing steps, please refer to the License and Data Preprocessing Details (Invalid URL) notebook.

# Load necessary libraries
suppressPackageStartupMessages(library(dplyr))
library(lubridate) 
library(ranger)    
library(xgboost)   
library(ggplot2)   

# Read the training and testing splits from the working directory
df_train <- read.csv(file = "df_train.csv")
df_test <- read.csv(file = "df_test.csv")

## Print a column-by-column overview of the training data
df_train %>%
  glimpse()

# Step 1: Data Cleaning

## 1. Convert `date` column from character to Date format ("YYYY-MM-DD")
df_train <- df_train %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d"))
df_test <- df_test %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d"))

## 2. Convert `day_in_week` column from character to factor
# Both splits must share ONE set of levels. Calling factor() on each split
# separately yields different levels (and later different dummy columns)
# whenever a weekday is absent from one of the files. Sorting reproduces
# factor()'s default level order, so nothing changes when both splits
# already contain every day.
day_levels <- sort(union(
  unique(as.character(df_train$day_in_week)),
  unique(as.character(df_test$day_in_week))
))
df_train <- df_train %>%
  mutate(day_in_week = factor(day_in_week, levels = day_levels))
df_test <- df_test %>%
  mutate(day_in_week = factor(day_in_week, levels = day_levels))

## 3. Create indicator variables (one-hot encoding) for factor columns
# `~ day_in_week - 1` drops the intercept so every level gets its own 0/1
# column; with shared levels the train and test columns line up exactly.

### One-hot encoding for `day_in_week`
# For training data
day_in_week_train <- as.data.frame(
  model.matrix(~ day_in_week - 1, data = df_train)
)
df_train <- df_train %>%
  bind_cols(day_in_week_train) %>%  # append the indicator columns
  select(-day_in_week)              # drop the original factor column

# For testing data
day_in_week_test <- as.data.frame(
  model.matrix(~ day_in_week - 1, data = df_test)
)
df_test <- df_test %>%
  bind_cols(day_in_week_test) %>%
  select(-day_in_week)

# Inspect cleaned data
glimpse(df_train)
glimpse(df_test)
# Step 2: Prepare Training and Testing Data for Modeling

## Training split
# Features: every column except the target and the raw date
train_X <- df_train %>%
  select(-power_consumption, -date)
# Target: observed power consumption
train_y <- df_train[["power_consumption"]]

## Testing split (same layout as the training split)
test_X <- df_test %>%
  select(-power_consumption, -date)
test_y <- df_test[["power_consumption"]]

# Inspect the separated data
train_X %>% glimpse()  # Features for training
train_y %>% glimpse()  # Target variable for training
test_X %>% glimpse()   # Features for testing
test_y %>% glimpse()   # Target variable for testing
# Step 3: Train Models and Predict on Test Data

# Fixed seed so the random forest (and therefore the RMSE comparison and the
# selected model) is reproducible from run to run.
model_seed <- 42

# The test features must contain every training feature; fail early with a
# readable message instead of an obscure error inside predict().
missing_features <- setdiff(names(train_X), names(test_X))
if (length(missing_features) > 0) {
  stop(
    "Test data is missing feature column(s): ",
    paste(missing_features, collapse = ", "),
    call. = FALSE
  )
}

# Training frame shared by the formula-based learners (built once)
train_frame <- cbind(train_X, power_consumption = train_y)

## 1. Linear Regression
# Note: the full set of weekday indicator columns is collinear with lm()'s
# intercept, so lm() reports one coefficient as NA and predict() may warn
# about a rank-deficient fit; the fitted values are unaffected.
linear_model <- lm(power_consumption ~ ., data = train_frame)

# Predict on the test data
linear_predictions <- predict(linear_model, newdata = test_X)

## 2. Random Forest
# Train the model with ranger
rf_model <- ranger(
  power_consumption ~ .,
  data = train_frame,
  num.trees = 500,
  seed = model_seed
)

# Predict on the test data
rf_predictions <- predict(rf_model, data = test_X)$predictions

## 3. XGBoost
# Prepare data for xgboost. Matrix columns are matched by position, so the
# test columns are put in the training column order explicitly.
dtrain <- xgb.DMatrix(data = as.matrix(train_X), label = train_y)
dtest <- xgb.DMatrix(
  data = as.matrix(test_X[, names(train_X), drop = FALSE])
)

# Train the model. xgb.train() is used rather than the xgboost() wrapper
# because it accepts a DMatrix consistently across package releases.
xgb_model <- xgb.train(
  params = list(
    max_depth = 6,
    eta = 0.1,
    objective = "reg:squarederror"
  ),
  data = dtrain,
  nrounds = 100,
  verbose = 0
)

# Predict on the test data
xgb_predictions <- predict(xgb_model, newdata = dtest)

# Inspect Predictions
head(linear_predictions)
head(rf_predictions)
head(xgb_predictions)
# Step 4: Calculate RMSE for all models

# Root-mean-squared error between observed and predicted values.
#
# Args:
#   actual:    numeric vector of observed values.
#   predicted: numeric vector of predictions with the same length as
#              `actual` (a single value is also accepted and compared
#              against every element).
#   na.rm:     if TRUE, pairs containing a missing value are ignored. The
#              default FALSE keeps the previous behaviour (any NA gives NA).
#
# Returns: a single numeric value, in the same units as `actual`.
calculate_rmse <- function(actual, predicted, na.rm = FALSE) {
  n_actual <- length(actual)
  n_predicted <- length(predicted)

  # R silently recycles the shorter vector when the lengths differ, which
  # produces a plausible-looking but wrong RMSE; fail loudly instead.
  if (n_actual != n_predicted && n_actual != 1L && n_predicted != 1L) {
    stop(
      "`actual` and `predicted` must have the same length (got ",
      n_actual, " and ", n_predicted, ").",
      call. = FALSE
    )
  }

  sqrt(mean((actual - predicted)^2, na.rm = na.rm))
}

# Score each model's predictions against the held-out targets
linear_rmse <- calculate_rmse(actual = test_y, predicted = linear_predictions)
rf_rmse <- calculate_rmse(actual = test_y, predicted = rf_predictions)
xgb_rmse <- calculate_rmse(actual = test_y, predicted = xgb_predictions)

# Report the three scores
cat("Linear Regression RMSE:", linear_rmse, "\n")
cat("Random Forest RMSE:", rf_rmse, "\n")
cat("XGBoost RMSE:", xgb_rmse, "\n")

# Keep the smallest of the three errors
selected_rmse <- min(c(linear_rmse, rf_rmse, xgb_rmse))

# Flag a result that misses the 450 kW target
if (selected_rmse > 450) {
  warning("The RMSE exceeds the 450 kW threshold. Consider boosting model performance.")
}

# Report the winning score
cat("Selected RMSE (lowest):", selected_rmse, "\n")
# Step 5: Compare actual and predicted consumption over time

# Pick the predictions of the model with the lowest test RMSE.
model_rmse <- c(
  linear = linear_rmse,
  random_forest = rf_rmse,
  xgboost = xgb_rmse
)
model_predictions <- list(
  linear = linear_predictions,
  random_forest = rf_predictions,
  xgboost = xgb_predictions
)
best_model <- names(model_rmse)[which.min(model_rmse)]

# Build the plotting frame: the test rows plus the selected predictions.
# (This frame was previously used without ever being created.)
# `date` is rebuilt from `year` and `day_in_year` so it is a proper Date
# object; day 1 maps to January 1st of that year.
test_with_predictions <- df_test %>%
  mutate(
    predicted_power_consumption = unname(model_predictions[[best_model]]),
    date = as.Date(day_in_year - 1, origin = paste0(year, "-01-01"))
  )

# Create a line plot for actual vs predicted values
trend_plot <- ggplot(data = test_with_predictions, aes(x = date)) +
  geom_line(aes(y = power_consumption, color = "Actual"), linewidth = 1) +
  geom_line(
    aes(y = predicted_power_consumption, color = "Predicted"),
    linewidth = 1,
    linetype = "dashed"
  ) +
  labs(
    title = "Actual vs Predicted Power Consumption",
    x = "Date",
    y = "Power Consumption (kW)",
    color = "Legend"
  ) +
  theme_minimal()

# Print the plot
print(trend_plot)

# Assess trend similarity visually and save the result.
# This value is set by hand after looking at the plot; it is not computed.
trend_similarity <- "Yes"  # Adjust based on visual inspection
cat("Trend Similarity:", trend_similarity, "\n")