Regressions
Part 1: Simple Linear Regression
# Load necessary libraries
library(readxl) # for reading Excel files
library(tidyverse) # for data manipulation and visualization
# Read datasets from files
fish <- read.csv('fish.csv') # Read fish data from CSV
customer_churn <- read.csv('customer_churn.csv') # Read customer churn data from CSV
College_Town <- read_excel('College_Town.xlsx') # Read college town data from Excel
# Filter the dataset for Bream species
bream <- fish %>%
  filter(Species == "Bream")
# Perform a quick exploratory data analysis with a scatter plot
ggplot(bream, aes(Length, Weight)) +
  geom_point()
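# Optional quick check (not part of the original flow): the strength of the
# linear association can be quantified with cor()
cor(bream$Length, bream$Weight)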
# Build a linear model object for Weight as a function of Length
model1 <- lm(Weight ~ Length, bream)
# Display a summary of the linear model
summary(model1)
# Load broom library for tidying model outputs
library(broom)
# Extract coefficients of the linear model
coefficients(model1)
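# Illustrative sketch: a fitted value can be reproduced by hand from the
# coefficients (intercept + slope * Length); 30 is an arbitrary example length
coefs <- coefficients(model1)
unname(coefs[1] + coefs[2] * 30) # predicted Weight for a 30 cm bream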
# Augment data with information from the linear model
augment(model1)
# Get fitted values from the model
fitted(model1)
# Get a glance of the model's statistics
glance(model1)
# Extract R-squared value from the model's summary
model1 %>%
  glance() %>%
  pull(r.squared)
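# Equivalent by-hand check: R-squared = 1 - SS_residual / SS_total
1 - sum(residuals(model1)^2) / sum((bream$Weight - mean(bream$Weight))^2)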
# Plot the linear model on top of the scatter plot
ggplot(bream, aes(Length, Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# Step 1: Create a data frame of new Length values to predict for
explanatory_data <- tibble(Length = 20:40)
# Step 2: Predict Weight for the new Length values
predict(model1, explanatory_data)
# Step 3: Build a new prediction dataset for plotting
prediction_data <- explanatory_data %>%
  mutate(Weight = predict(model1, explanatory_data))
# Step 4: Plot the predicted points over the existing plot
ggplot(bream, aes(Length, Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(
    data = prediction_data,
    color = "red"
  )
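# Caution (illustrative): predict() happily extrapolates outside the observed
# Length range, which can produce implausible or even negative weights
predict(model1, tibble(Length = c(5, 10)))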
# Install (once, if needed) and load ggfortify for model diagnostic plots
# install.packages("ggfortify")
library(ggfortify)
# Plot residuals and diagnostics for the linear model
autoplot(model1, which = 1:3, nrow = 3, ncol = 1) # residuals vs fitted, Q-Q, scale-location
autoplot(model1, which = 4:6, nrow = 3, ncol = 1) # Cook's distance and leverage (influential points)
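# The same diagnostic quantities are available as columns from broom::augment(),
# e.g. .hat (leverage) and .cooksd (Cook's distance); a quick sketch:
augment(model1) %>%
  arrange(desc(.cooksd)) %>%
  head(3) # the three most influential observations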
Part 2: Multiple Linear Regression
# Display the structure and summary statistics of the College_Town dataframe
str(College_Town)
summary(College_Town)
# Candidate regression specifications:
#1 linear-linear: y = b0 + b1*x1 + b2*x2 + b3*x3
#2 linear-log:    y = b0 + b1*x1 + b2*x2 + b3*log(x3)
#3 log-linear:    log(y) = b0 + b1*x1 + b2*x2 + b3*x3
#4 log-log:       log(y) = b0 + b1*x1 + b2*x2 + b3*log(x3)
# Perform exploratory data analysis
# Visualize the relationship between Sqft and Rent
ggplot(College_Town, aes(Sqft, Rent)) +
  geom_point()
# Visualize the relationship between Beds and Rent
ggplot(College_Town, aes(Beds, Rent)) +
  geom_point()
# Visualize the relationship between Baths and Rent
ggplot(College_Town, aes(Baths, Rent)) +
  geom_point()
# Display the arguments of the lm function
args(lm)
# Fit four different linear models with various transformations
lmodel1 <- lm(Rent ~ Beds + Baths + Sqft, College_Town)
lmodel2 <- lm(Rent ~ Beds + Baths + log(Sqft), College_Town)
lmodel3 <- lm(log(Rent) ~ Beds + Baths + Sqft, College_Town)
lmodel4 <- lm(log(Rent) ~ Beds + Baths + log(Sqft), College_Town)
# Display the summary of each model
summary(lmodel1)
summary(lmodel2)
summary(lmodel3)
summary(lmodel4) # identified as the best model (caution: R-squared is only directly comparable between models with the same response, Rent vs log(Rent))
#############################################################################################
# Introduction to FOR loops
for (x in 1:5) {
  print(x)
}
for (x in 1:5) {
  print(x^2)
}
cities <- c("Copenhagen", "Odense", "Aalborg", "Aarhus", "Roskilde")
for (city in cities) {
  print(city)
}
# Break out of the loop when the city name is 7 characters long
for (city in cities) {
  if (nchar(city) == 7) {
    break
  }
  print(city)
}
# Skip printing the city name when it is 7 characters long
for (city in cities) {
  if (nchar(city) == 7) {
    next
  } else {
    print(city)
  }
}
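# A common variant (illustrative, not from the original notes): loop over
# indices with seq_along() when both the position and the value are needed
for (i in seq_along(cities)) {
  print(paste(i, cities[i]))
}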
#############################################################################################
# Print the R-squared value for each model
models <- list(lmodel1, lmodel2, lmodel3, lmodel4)
for (model in models) {
  print(glance(model))
}
for (model in models) {
  print(model %>% glance() %>% pull(r.squared))
}
# Step 1: Generate a grid of new observations for prediction
explanatory_data_new <- expand_grid(
  Beds = 1:5,
  Baths = 1:4,
  Sqft = unique(College_Town$Sqft)
)
# Note: predict() applies log(Sqft) via the model formula itself,
# so this derived column is kept for reference only
explanatory_data_new <- explanatory_data_new %>%
  mutate(log_Sqft = log(Sqft))
# Step 2: Predict with lmodel4 (predictions come out on the log(Rent) scale)
predict(lmodel4, explanatory_data_new)
# Step 3: Create a new dataframe with predictions back-transformed to Rent
prediction_data_new <- explanatory_data_new %>%
  mutate(log_Rent_pred = predict(lmodel4, explanatory_data_new),
         Rent = exp(log_Rent_pred)) # exp() undoes the log transform of the response
# Step 4: Plot the original data and the predictions from the log-log model
ggplot(College_Town, aes(Sqft, Rent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(
    data = prediction_data_new,
    color = "red",
    alpha = 0.4
  )
Part 3: Multiple Logistic Regression
# Load necessary library for data manipulation
library(dplyr)
# Display a summary of the customer churn dataset
summary(customer_churn)
# Create a subset of the first 300 rows from the customer churn dataset
test <- customer_churn %>%
  slice(1:300) # slice() selects rows by position
# Fit a multiple logistic regression model using selected predictors
modelGLM <- glm(Churn ~ Age + Years + Total_Purchase + Account_Manager, family = binomial, data = test)
# Display a summary of the model to check coefficients and model statistics
summary(modelGLM)
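# Interpretation sketch: exponentiating logistic regression coefficients
# gives odds ratios (values above 1 raise the odds of churn)
exp(coefficients(modelGLM))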
# Extract the actual churn values from the test dataset
actual_churn <- test$Churn
# Predict churn using the model and round the predictions to get binary outcomes
modeled_churn <- round(fitted(modelGLM)) # Use fitted values to classify as 0 or 1 based on a 0.5 threshold
# Predict probabilities of churn for the test dataset
predicted_probabilities <- predict(modelGLM, type = "response") # predicted probability of churn for each customer
# Calculate the accuracy of the model
accuracy_model <- mean(actual_churn == modeled_churn)
# Load necessary library for creating confusion matrices
library(yardstick)
# Create a confusion matrix from the model predictions and actual values
outcomes <- table(modeled_churn, actual_churn)
outcomes
# Convert the table to a confusion matrix and plot it
confusion <- conf_mat(outcomes)
autoplot(confusion)
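# yardstick can also derive performance metrics (accuracy, sensitivity,
# specificity, ...) directly from the confusion matrix
summary(confusion)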
# Fit the model again, this time using the ENTIRE customer churn dataset
modelGLM <- glm(Churn ~ Age + Years + Total_Purchase + Account_Manager, family = binomial, data = customer_churn)
# Display a summary of the model to check coefficients and model statistics
summary(modelGLM)
# Extract the actual churn values from the entire dataset
actual_churn <- customer_churn$Churn
# Predict churn using the model and round the predictions to get binary outcomes
modeled_churn <- round(fitted(modelGLM)) # Use fitted values to classify as 0 or 1 based on a 0.5 threshold
# Predict probabilities of churn for the entire dataset
predicted_probabilities <- predict(modelGLM, type = "response") # predicted probability of churn for each customer
# Calculate the accuracy of the model for the entire dataset
accuracy_model <- mean(actual_churn == modeled_churn)
# Create a confusion matrix from the model predictions and actual values for the entire dataset
outcomes <- table(modeled_churn, actual_churn)
outcomes
# Convert the table to a confusion matrix and plot it
confusion <- conf_mat(outcomes)
autoplot(confusion)
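# Illustrative sketch: the 0.5 cutoff used above is a modeling choice; a lower
# threshold (0.3 here, chosen arbitrarily) flags more customers as churners
modeled_churn_low <- as.numeric(predicted_probabilities > 0.3)
mean(actual_churn == modeled_churn_low) # accuracy at the 0.3 threshold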