Skip to content
Course Notes: Modeling with tidymodels in R
Requirements
# Install pacman only when it is not already available, so that re-running
# these notes does not trigger a fresh install every time.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Load the packages used in these notes
pacman::p_load(tidyverse, tidymodels, ggplot2)

# Load the mpg data set used in the regression examples
data(mpg)

# Read the course data sets (.rds files) straight from their URLs
telecom_df <- readRDS(url("https://assets.datacamp.com/production/repositories/5840/datasets/d69598f0737bd05ba8dbf70766b817418c4c465f/telecom_df.rds"))
loans_df <- readRDS(url("https://assets.datacamp.com/production/repositories/5840/datasets/e504d22ea01bac6a0b509b3641ee41dfc3a4fe66/loan_df.rds"))

### Regression models ###
The 'strata' parameter stratifies the split by the outcome variable.
Split
# Load the tidymodels library
library(tidymodels)

# Create a data split object from mpg, with prop = 0.7 and stratified on hwy
# NOTE(review): no set.seed() call is visible before this split, so the
# partition may differ between runs; confirm whether reproducibility matters.
mpg_split <- initial_split(
  mpg,
  prop = 0.7,
  strata = hwy
)

# Create the training data
mpg_training <- mpg_split %>%
  training()

# Create the test data
mpg_test <- mpg_split %>%
  testing()

### Basic characteristics ###
# Distribution of hwy in the training data (min, max, mean, standard deviation)
mpg_training %>%
  summarize(
    min_hwy = min(hwy),
    max_hwy = max(hwy),
    mean_hwy = mean(hwy),
    sd_hwy = sd(hwy)
  )

# Distribution of hwy in the test data, using the same four statistics so the
# two partitions can be compared side by side
mpg_test %>%
  summarize(
    min_hwy = min(hwy),
    max_hwy = max(hwy),
    mean_hwy = mean(hwy),
    sd_hwy = sd(hwy)
  )

### Fitting and predicting model ###
To specify a model in tidymodels, we use the parsnip package.
# Model specification: a linear regression using the "lm" engine in
# "regression" mode
linear_model <- set_mode(
  set_engine(linear_reg(), "lm"),
  "regression"
)

# Fit the specification on the training data: hwy modelled from cty and displ
lm_fit <- fit(
  linear_model,
  hwy ~ cty + displ,
  data = mpg_training
)

# Predict hwy for the rows of the test data
mpg_predictions <- lm_fit %>%
  predict(new_data = mpg_test)

# Keep the outcome and the two predictors from the test data and attach the
# predictions as extra columns
mpg_test_results <- bind_cols(
  select(mpg_test, hwy, cty, displ),
  mpg_predictions
)
Model performance
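To evaluate the model, we can either compute the metrics step by step (the long version) or let last_fit() do the fitting and evaluation in one call (the short version).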
# Show the test rows together with their predictions
mpg_test_results

# RMSE of the predictions (.pred) against the observed hwy
rmse(mpg_test_results, truth = hwy, estimate = .pred)

# R squared of the predictions (.pred) against the observed hwy
rsq(mpg_test_results, truth = hwy, estimate = .pred)

# Predicted vs. actual hwy: semi-transparent points plus a blue reference
# line (linetype 2) from geom_abline()
mpg_test_results %>%
  ggplot(aes(x = hwy, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = "blue", linetype = 2) +
  coord_obs_pred() +
  labs(x = "Actual HWY", y = "Predicted HWY")

### last_fit ###
# Define a linear regression model ("lm" engine, "regression" mode)
linear_model <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

# Train linear_model with last_fit() on the mpg_split object
# NOTE(review): `hwy ~ .` uses every other column as a predictor, unlike the
# `hwy ~ cty + displ` model fitted earlier. If mpg contains categorical
# columns, a level that appears only in the test rows could break prediction;
# confirm that this formula is intended.
linear_fit <- linear_model %>%
  last_fit(hwy ~ ., split = mpg_split)

# Collect the predictions and the metrics from the fitted result
predictions_df <- linear_fit %>%
  collect_predictions()
metrics_df <- linear_fit %>%
  collect_metrics()

# Make an R squared plot using predictions_df
ggplot(predictions_df, aes(x = hwy, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = "blue", linetype = 2) +
  coord_obs_pred() +
  labs(x = "Actual HWY", y = "Predicted HWY")

### Classification models ###
Split
# Split telecom_df with prop = 0.75, stratified on canceled_service
telecom_split <- telecom_df %>%
  initial_split(prop = 0.75, strata = canceled_service)

# Training portion of the split
telecom_training <- training(telecom_split)

# Test portion of the split
telecom_test <- testing(telecom_split)