Skip to content
Course Notes: Machine Learning in the Tidyverse
  • AI Chat
  • Code
  • Report
  • Course Notes

    Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! For courses that use data, the datasets will be available in the datasets folder.

    # Install the gapminder package
    install.packages("gapminder")
    
    # Import any packages you want to use 
    library(tidyverse)
    library(gapminder)

    Take Notes

    Add notes here about the concepts you've learned and code cells with code you want to keep.

    Add your notes here

    # Add your code snippets here
    # Preview the first rows of the gapminder data
    head(gapminder)
    
    # Group the rows by country, then collapse each group into one row whose
    # `data` list-column holds that country's observations
    library(tidyverse)
    gap_nested <- nest(group_by(gapminder, country))
    
    # Preview the nested result
    head(gap_nested)
    # Create the unnested data frame called gap_unnested
    gap_unnested <- gap_nested %>% 
      # Name the list-column explicitly: unnest() without `cols` is deprecated
      unnest(cols = data) %>% 
      # gap_nested is still grouped by country; drop that grouping so the
      # comparison below looks at the data only, not the grouping attribute
      ungroup()
      
    # Confirm that your data was not modified  
    identical(gapminder, gap_unnested)
    # Extract the data of Algeria.
    # Look the row up by country name instead of assuming Algeria is row 1,
    # and fail loudly if it is missing or duplicated.
    algeria_row <- which(gap_nested$country == "Algeria")
    stopifnot(length(algeria_row) == 1L)
    algeria_df <- gap_nested$data[[algeria_row]]
    
    # NOTE(review): `population` is the course dataset's column name; the CRAN
    # gapminder package calls this column `pop` -- confirm which `gapminder`
    # object is in scope before relying on these results.
    
    # Calculate the minimum of the population vector
    min(algeria_df$population)
    
    # Calculate the maximum of the population vector
    max(algeria_df$population)
    
    # Calculate the mean of the population vector
    mean(algeria_df$population)
    # Build a linear model of life expectancy on year for each country,
    # stored in a new list-column `model`
    # NOTE(review): `life_expectancy` is the course dataset's column name; the
    # CRAN gapminder package calls it `lifeExp` -- confirm the data in scope.
    gap_models <- gap_nested %>%
      mutate(model = map(data, ~lm(formula = life_expectancy ~ year, data = .x)))
        
    # Extract the model for Algeria by country name rather than by row
    # position, and fail loudly if it is missing or duplicated
    algeria_idx <- which(gap_models$country == "Algeria")
    stopifnot(length(algeria_idx) == 1L)
    algeria_model <- gap_models$model[[algeria_idx]]
    
    # View the summary for the Algeria model
    summary(algeria_model)
    # augment() comes from broom, which library(tidyverse) does not attach
    library(broom)
    
    # Build the augmented data frame: the model's observations plus a
    # `.fitted` column holding the predicted values
    algeria_fitted <- augment(algeria_model)
    
    # Compare the predicted values (red line) with the actual values of
    # life expectancy (points)
    algeria_fitted %>% 
      ggplot(aes(x = year)) +
      geom_point(aes(y = life_expectancy)) + 
      geom_line(aes(y = .fitted), color = "red")
    # glance() comes from broom, which library(tidyverse) does not attach
    library(broom)
    
    # Build a linear model for each country using all features
    gap_fullmodel <- gap_nested %>% 
      mutate(model = map(data, ~lm(formula = life_expectancy ~ ., data = .x)))
    
    fullmodel_perf <- gap_fullmodel %>% 
      # Extract the fit statistics of each model into data frames
      mutate(fit = map(model, ~glance(.x))) %>% 
      # Simplify the fit data frames for each model
      unnest(cols = fit)
      
    # View the adjusted R-squared of the full models for the four countries
    # whose simple models fit worst
    # NOTE(review): `worst_fit` is not defined anywhere in these notes; it must
    # already exist with a `country` column -- confirm where it is built.
    fullmodel_perf %>% 
      filter(country %in% worst_fit$country) %>% 
      select(country, adj.r.squared)
    # initial_split(), training() and testing() come from rsample, which
    # library(tidyverse) does not attach
    library(rsample)
    
    # Fix the RNG seed so the random split is reproducible
    set.seed(42)
    
    # Prepare the initial split object (prop = 0.75 goes to training)
    gap_split <- initial_split(gapminder, prop = 0.75)
    
    # Extract the training data frame
    training_data <- training(gap_split)
    
    # Extract the testing data frame
    testing_data <- testing(gap_split)
    
    # Calculate the dimensions of both training_data and testing_data
    dim(training_data)
    dim(testing_data)
    # vfold_cv(), training() and testing() come from rsample, which
    # library(tidyverse) does not attach
    library(rsample)
    
    # Fix the RNG seed so the fold assignment is reproducible
    set.seed(42)
    
    # Prepare the data frame containing the cross validation partitions
    # (v = 5 folds drawn from the training data)
    cv_split <- vfold_cv(training_data, v = 5)
    
    cv_data <- cv_split %>% 
      mutate(
        # Extract the train data frame for each split
        train = map(splits, ~training(.x)), 
        # Extract the validate data frame for each split
        validate = map(splits, ~testing(.x))
      )
    
    # Use head() to preview cv_data
    head(cv_data)
    # For every cross validation fold, fit a linear model of life expectancy
    # on all other columns of that fold's train data
    cv_models_lm <- mutate(
      cv_data,
      model = map(train, function(.x) lm(formula = life_expectancy ~ ., data = .x))
    )
    # Pair each fold's recorded life expectancy with its model's predictions
    cv_prep_lm <- mutate(
      cv_models_lm,
      # Recorded life expectancy of the rows in each validate data frame
      validate_actual = map(validate, function(holdout) holdout$life_expectancy),
      # Predictions for each validate data frame from the model of the same fold
      validate_predicted = map2(
        model, validate,
        function(fit, holdout) predict(fit, holdout)
      )
    )
    library(Metrics)
    # Score every validate fold with the mean absolute error between its
    # recorded and predicted values
    cv_eval_lm <- mutate(
      cv_prep_lm,
      validate_mae = map2_dbl(validate_actual, validate_predicted, mae)
    )
    
    # Show the per-fold errors
    cv_eval_lm$validate_mae
    
    # Average the per-fold errors into a single cross validation estimate
    mean(cv_eval_lm$validate_mae)