Course Notes: Supervised Learning in R: Regression
Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! The datasets used in this course are available in the datasets folder.
# Import any packages you want to use here
library(dplyr)     # mutate(), group_by(), summarize(), select(), filter(), %>%
library(tidyr)     # gather()
library(magrittr)  # use_series()
# WVPlots, vtreat, ranger, and xgboost are loaded below where they are first used.
Take Notes
Add notes here about the concepts you've learned and code cells with code you want to keep.
# Add your code snippets here
# unemployment (with predictions) is available
summary(unemployment)
# unemployment_model is available
summary(unemployment_model)
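# Not from the original notes: a sketch of the usual error summaries, assuming
# unemployment$predictions holds the model predictions (the column name used in
# the gain curve call below).
res <- unemployment$predictions - unemployment$female_unemployment
sqrt(mean(res^2))   # RMSE
1 - sum(res^2) / sum((unemployment$female_unemployment - mean(unemployment$female_unemployment))^2)   # R-squared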
# Load the package WVPlots
library(WVPlots)
# Plot the Gain Curve
GainCurvePlot(unemployment, "predictions", "female_unemployment", "Unemployment model")

# mpg is available
summary(mpg)
# splitPlan is available
str(splitPlan)
# Run the 3-fold cross validation plan from splitPlan.
# Each element of splitPlan is a list with $train (rows to fit on) and $app (rows to predict on).
k <- 3 # Number of folds
mpg$pred.cv <- 0
for(i in 1:k) {
  split <- splitPlan[[i]]
  model <- lm(cty ~ hwy, data = mpg[split$train, ])
  mpg$pred.cv[split$app] <- predict(model, newdata = mpg[split$app, ])
}
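# Note (not from the course notes): rmse() is not part of base R. If it is not
# already defined in the workspace, a minimal version matching the call pattern
# rmse(prediction, truth) used below is:
rmse <- function(pred, truth) sqrt(mean((pred - truth)^2))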
# Predict from a full model
mpg$pred <- predict(lm(cty ~ hwy, data = mpg))
# Get the rmse of the full model's predictions
rmse(mpg$pred, mpg$cty)
# Get the rmse of the cross-validation predictions
rmse(mpg$pred.cv, mpg$cty)

# fmla.abs is available
fmla.abs
# model.abs is available
summary(model.abs)
# Add predictions to the test set
income_test <- income_test %>%
  mutate(pred.absmodel = predict(model.abs, income_test),        # predictions from model.abs
         pred.logmodel = exp(predict(model.log, income_test)))   # predictions from model.log
# Gather the predictions and calculate residuals and relative error
income_long <- income_test %>%
  gather(key = modeltype, value = pred, pred.absmodel, pred.logmodel) %>%
  mutate(residual = pred - Income2005,      # residuals
         relerr   = residual / Income2005)  # relative error
# Calculate RMSE and relative RMSE and compare
income_long %>%
  group_by(modeltype) %>%                         # group by modeltype
  summarize(rmse     = sqrt(mean(residual^2)),    # RMSE
            rmse.rel = sqrt(mean(relerr^2)))      # root mean squared relative error

# houseprice is available
summary(houseprice)
# fmla_sqr is available
fmla_sqr
# Create a splitting plan for 3-fold cross validation
library(vtreat)     # provides kWayCrossValidation()
set.seed(34245)     # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(houseprice), 3, NULL, NULL)
# Sample code: get cross-val predictions for price ~ size
houseprice$pred_lin <- 0 # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_lin <- lm(price ~ size, data = houseprice[split$train, ])
  houseprice$pred_lin[split$app] <- predict(model_lin, newdata = houseprice[split$app, ])
}
# Get cross-val predictions for price as a function of size^2 (use fmla_sqr)
houseprice$pred_sqr <- 0 # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_sqr <- lm(fmla_sqr, data = houseprice[split$train, ])
  houseprice$pred_sqr[split$app] <- predict(model_sqr, newdata = houseprice[split$app, ])
}
# Gather the predictions and calculate the residuals
houseprice_long <- houseprice %>%
  gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>%
  mutate(residuals = pred - price)
# Compare the cross-validated RMSE for the two models
houseprice_long %>%
  group_by(modeltype) %>%
  summarize(rmse = sqrt(mean(residuals^2)))

# bikesJuly is available
str(bikesJuly)
# Random seed to reproduce results
seed
# the outcome column
(outcome <- "cnt")
# The input variables
(vars <- c("hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed"))
# Create the formula string for bikes rented as a function of the inputs
(fmla <- paste(outcome, "~", paste(vars, collapse = " + ")))
# Load the package ranger
library(ranger)
# Fit and print the random forest model.
(bike_model_rf <- ranger(fmla,
                         data = bikesJuly,
                         num.trees = 500,
                         respect.unordered.factors = "order",
                         seed = seed))
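# Not in the original notes: a sketch of scoring with the fitted forest.
# predict() on a ranger model returns an object whose $predictions element
# holds the predicted values (the training month is scored here only to show
# the mechanics).
bikesJuly$pred_rf <- predict(bike_model_rf, bikesJuly)$predictions
sqrt(mean((bikesJuly$pred_rf - bikesJuly$cnt)^2))   # in-sample RMSE

# Print dframe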
dframe
# Create a vector of variable names
(vars <- c("color", "size"))
# Load the package vtreat
library(vtreat)
# Create the treatment plan
treatplan <- designTreatmentsZ(dframe, vars)
# Examine the scoreFrame
(scoreFrame <- treatplan %>%
   use_series(scoreFrame) %>%
   select(varName, origName, code))
# We only want the rows with codes "clean" (cleaned numeric variables) or "lev" (level indicator variables)
(newvars <- scoreFrame %>%
   filter(code %in% c("clean", "lev")) %>%
   use_series(varName))
# Create the treated training data
(dframe.treat <- prepare(treatplan, dframe, varRestriction = newvars))
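# Not in the original notes: the saved treatment plan can encode new data the
# same way, so test data gets exactly the same indicator columns. dframe_test
# below is hypothetical (any data frame with the same color/size columns works).
dframe_test <- head(dframe, 3)
prepare(treatplan, dframe_test, varRestriction = newvars)

set.seed(1234)   # make the fold assignment of the cross validation below reproducible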
# Load the package xgboost
library(xgboost)
# Run xgb.cv
cv <- xgb.cv(data = as.matrix(bikesJuly.treat),
             label = bikesJuly$cnt,
             nrounds = 50,
             nfold = 5,
             objective = "reg:squarederror",
             eta = 0.75,
             max_depth = 5,
             early_stopping_rounds = 5,
             verbose = FALSE)   # silent
# Get the evaluation log
elog <- cv$evaluation_log
# Determine and print how many trees minimize training and test error
elog %>%
  summarize(ntrees.train = which.min(train_rmse_mean),   # find the index of min(train_rmse_mean)
            ntrees.test  = which.min(test_rmse_mean))    # find the index of min(test_rmse_mean)
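# Not in the original notes: a sketch of the natural next step. Refit with the
# tree count chosen by cross validation (ntrees.test above) and score the
# training data; bikesJuly.treat is the treated input matrix already used in xgb.cv.
ntrees <- elog %>%
  summarize(n = which.min(test_rmse_mean)) %>%
  pull(n)
bike_model_xgb <- xgboost(data = as.matrix(bikesJuly.treat),
                          label = bikesJuly$cnt,
                          nrounds = ntrees,
                          objective = "reg:squarederror",
                          eta = 0.75,
                          max_depth = 5,
                          verbose = FALSE)
bikesJuly$pred_xgb <- predict(bike_model_xgb, as.matrix(bikesJuly.treat))
sqrt(mean((bikesJuly$pred_xgb - bikesJuly$cnt)^2))   # in-sample RMSE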