Skip to content

Requirements

# Install pacman only when it is missing, so re-running this script does not
# reinstall it (and hit the network) on every run.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load() attaches the listed packages, installing any that are missing.
pacman::p_load(tidyverse, tidymodels, ggplot2)
# Load the mpg example dataset into the workspace.
data(mpg)
# Download the telecom and loan datasets (.rds files) used in later sections.
telecom_df <- readRDS(url('https://assets.datacamp.com/production/repositories/5840/datasets/d69598f0737bd05ba8dbf70766b817418c4c465f/telecom_df.rds'))
loans_df <- readRDS(url('https://assets.datacamp.com/production/repositories/5840/datasets/e504d22ea01bac6a0b509b3641ee41dfc3a4fe66/loan_df.rds'))

Regression models

The 'strata' parameter stratifies the split by the outcome variable.

Split

# Attach tidymodels, which provides initial_split(), training() and testing()
library(tidymodels)

# Split mpg with 70% of the rows going to training, stratified on hwy
mpg_split <- initial_split(mpg, prop = 0.7, strata = hwy)

# Pull the training partition out of the split object
mpg_training <- training(mpg_split)

# Pull the test partition out of the split object
mpg_test <- testing(mpg_split)

Basic characteristics

# Summary statistics of hwy in the training data
summarize(
  mpg_training,
  min_hwy = min(hwy),
  max_hwy = max(hwy),
  mean_hwy = mean(hwy),
  sd_hwy = sd(hwy)
)
# Summary statistics of hwy in the test data
summarize(
  mpg_test,
  min_hwy = min(hwy),
  max_hwy = max(hwy),
  mean_hwy = mean(hwy),
  sd_hwy = sd(hwy)
)

Fitting and predicting model

To create a tidymodels model, we use the parsnip package.

# Model specification: linear regression using the lm engine
linear_model <- linear_reg() %>%
  set_engine('lm') %>%
  set_mode('regression')

# Train on the training partition: hwy explained by cty and displ
lm_fit <- fit(linear_model, hwy ~ cty + displ, data = mpg_training)

# Predict hwy for the rows of the test partition
mpg_predictions <- lm_fit %>%
  predict(new_data = mpg_test)

# Put the outcome and predictor columns side by side with the predictions
mpg_test_results <- bind_cols(
  select(mpg_test, hwy, cty, displ),
  mpg_predictions
)

Model performance

To evaluate the model, we can run either the long, step-by-step script or the short one that uses last_fit().

# Show the test rows together with their predictions
mpg_test_results

# Root mean squared error of the predictions against the observed hwy
rmse(mpg_test_results, truth = hwy, estimate = .pred)

# R squared of the predictions against the observed hwy
rsq(mpg_test_results, truth = hwy, estimate = .pred)

# Predicted vs. actual plot; the dashed blue line marks perfect agreement
mpg_test_results %>%
  ggplot(aes(x = hwy, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(x = 'Actual HWY', y = 'Predicted HWY')
### last_fit ###
# Define a linear regression model
linear_model <- linear_reg() %>% 
  set_engine('lm') %>% 
  set_mode('regression')

# Train linear_model with last_fit(): fit on the training partition of
# mpg_split and evaluate on its test partition in a single call.
# The formula matches the step-by-step fit (hwy ~ cty + displ). Using `hwy ~ .`
# would also pull in mpg's character columns (manufacturer, model, trans, ...);
# model determines manufacturer, so the dummy variables are collinear and lm
# gives a rank-deficient fit, and rare levels may be absent from the training
# partition.
linear_fit <- linear_model %>% 
  last_fit(hwy ~ cty + displ, split = mpg_split)

# Collect the test-set predictions and the test-set metrics
predictions_df <- linear_fit %>% collect_predictions()
metrics_df <- linear_fit %>% collect_metrics()

# Make an R squared plot using predictions_df
ggplot(predictions_df, aes(x = hwy, y = .pred)) + 
  geom_point(alpha = 0.5) + 
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(x = 'Actual HWY', y = 'Predicted HWY')

Classification models

Split

# Split telecom_df with 75% of the rows going to training,
# stratified on canceled_service
telecom_split <- initial_split(
  telecom_df,
  prop = 0.75,
  strata = canceled_service
)

# Pull the training partition out of the split object
telecom_training <- training(telecom_split)

# Pull the test partition out of the split object
telecom_test <- testing(telecom_split)