    Introduction to Statistics in R

    Run the hidden code cell below to import the data used in this course.
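    The snippets below also assume ggplot2 and dplyr are attached; if the hidden cell doesn't load them, a minimal setup sketch:

    # Minimal setup (assumption: the hidden cell above imports the course
    # data frames such as world_happiness, amir_deals, and food_consumption)
    library(ggplot2)  # plotting
    library(dplyr)    # data manipulation pipelines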

    # Scatterplot of grams_sugar_per_day and happiness_score
    ggplot(world_happiness, aes(grams_sugar_per_day, happiness_score)) +
      geom_point()

    # Correlation between grams_sugar_per_day and happiness_score
    cor(world_happiness$happiness_score, world_happiness$grams_sugar_per_day)
    [1] 0.69391
    
    
    
    # Create log_gdp_per_cap column
    world_happiness <- world_happiness %>%
      mutate(log_gdp_per_cap = log(gdp_per_cap))
    
    # Scatterplot of happiness_score vs. log_gdp_per_cap
    ggplot(world_happiness, aes(log_gdp_per_cap, happiness_score)) +
      geom_point()
    
    # Calculate correlation
    cor(world_happiness$log_gdp_per_cap, world_happiness$happiness_score)
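    A quick way to see why the log transform helps here, assuming the same world_happiness data: gdp_per_cap is typically strongly right-skewed, and cor() only measures linear association, so compressing the long tail straightens the relationship.

    # Compare the raw and log-transformed distributions
    ggplot(world_happiness, aes(gdp_per_cap)) +
      geom_histogram(bins = 30)
    ggplot(world_happiness, aes(log_gdp_per_cap)) +
      geom_histogram(bins = 30)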
    
    
    
    # Scatterplot of gdp_per_cap and life_exp
    ggplot(world_happiness, aes(gdp_per_cap, life_exp)) +
      geom_point()
    
    # Correlation between gdp_per_cap and life_exp
    cor(world_happiness$gdp_per_cap, world_happiness$life_exp)
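    Note that cor() defaults to Pearson correlation, which only captures linear relationships; if this scatterplot curves, a rank-based coefficient is a useful cross-check (a sketch):

    # Spearman correlation uses ranks, so it tolerates nonlinearity
    cor(world_happiness$gdp_per_cap, world_happiness$life_exp,
        method = "spearman")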
    
    
    
    
    
    
    # Add a linear trendline to scatterplot
    ggplot(world_happiness, aes(life_exp, happiness_score)) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE)
    
    # Correlation between life_exp and happiness_score
    cor(world_happiness$life_exp, world_happiness$happiness_score)
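    geom_smooth(method = "lm") draws the same linear fit that cor() summarizes; for the actual intercept and slope, lm() can be used (a sketch):

    # Fit the underlying linear model for reference
    lm(happiness_score ~ life_exp, data = world_happiness)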
    
    
    
    
    # Responses follow an exponential distribution with a mean response
    # time of 2.5 hours, so rate = 1 / 2.5 = 0.4. Note that rate must be
    # passed as a named argument (rate = 0.4); a comparison like
    # pexp(1, rate > 4) fails with "object 'rate' not found".

    # Probability response takes < 1 hour
    pexp(1, rate = 0.4)
    [1] 0.32968

    # Probability response takes > 4 hours
    pexp(4, rate = 0.4, lower.tail = FALSE)
    [1] 0.2018965

    # Probability response takes 3-4 hours (subtract the CDF at 3 from the
    # CDF at 4; subtracting the upper tails in that order flips the sign)
    pexp(4, rate = 0.4) - pexp(3, rate = 0.4)
    [1] 0.09929769
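    As a sanity check on the rate = 1/mean relationship, simulation agrees (a sketch; the seed is illustrative, not from the course):

    # Simulated exponential waits with rate 0.4 should average ~2.5 hours
    set.seed(1)
    mean(rexp(100000, rate = 0.4))      # ~2.5
    mean(rexp(100000, rate = 0.4) < 1)  # ~0.33, matching pexp(1, rate = 0.4)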
    
    
    
    # Responses average 4 per day (lambda = 4); the coworker averages 5.5

    # Probability of 5 responses
    dpois(5, lambda = 4)
    [1] 0.1562935

    # Probability of 5 responses from coworker
    dpois(5, lambda = 5.5)
    [1] 0.1714007

    # Probability of 2 or fewer responses
    ppois(2, lambda = 4)
    [1] 0.2381033

    # Probability of > 10 responses
    ppois(10, lambda = 4, lower.tail = FALSE)
    [1] 0.002839766
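    lower.tail = FALSE is just the complement of the cumulative probability, computed more accurately in one step (a quick check, nothing course-specific assumed):

    # P(X > 10) equals 1 - P(X <= 10) up to floating-point error
    all.equal(ppois(10, lambda = 4, lower.tail = FALSE),
              1 - ppois(10, lambda = 4))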
    
    
    
    
    
    
    # Set seed to 104
    set.seed(104)
    
    # Sample 20 num_users from amir_deals and take mean
    sample(amir_deals$num_users, size = 20, replace = TRUE) %>%
      mean()
    
    # Repeat the above 100 times
    sample_means <- replicate(100, sample(amir_deals$num_users, size = 20, replace = TRUE) %>% mean())
    
    # Create data frame for plotting
    samples <- data.frame(mean = sample_means)
    
    # Histogram of sample means
    ggplot(samples, aes(mean)) + geom_histogram(bins = 10)
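    By the central limit theorem, these sample means should center on the population mean with spread near the population sd divided by sqrt(20); a quick comparison (a sketch, assuming the same amir_deals data):

    # Compare the sampling distribution to the CLT prediction
    mean(sample_means)                    # ~ population mean
    mean(amir_deals$num_users)
    sd(sample_means)                      # ~ population sd / sqrt(n)
    sd(amir_deals$num_users) / sqrt(20)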
    
    
    
    
    
    
    
    # Deal amounts are modeled as normal with mean 5000 and sd 2000

    # Calculate amount that 75% of deals will be more than
    qnorm(0.75, mean = 5000, sd = 2000, lower.tail = FALSE)

    # Probability of deal < 7500
    pnorm(7500, mean = 5000, sd = 2000)

    # Probability of closing > 1 deal out of 3, each won with probability 0.3
    pbinom(1, size = 3, prob = 0.3, lower.tail = FALSE)
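    pnorm() and qnorm() are inverses, which makes for an easy sanity check (a sketch with the same illustrative parameters):

    # Value -> cumulative probability -> value recovers the original amount
    qnorm(pnorm(7500, mean = 5000, sd = 2000), mean = 5000, sd = 2000)  # 7500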
    
    # Set random seed to 334
    set.seed(334)

    # Data frame with one row per simulation (the column is just a placeholder)
    wait_times <- data.frame(simulation_nr = 1:1000)

    # Generate 1000 wait times between 0 and 30 mins, save in time column
    wait_times %>%
      mutate(time = runif(1000, min = 0, max = 30)) %>%
      # Create a histogram of simulated times
      ggplot(aes(time)) +
      geom_histogram(bins = 30)
    
    
    # Min and max wait times for back-up that happens every 30 min
    min <- 0
    max <- 30

    # Calculate probability of waiting 10-20 mins
    prob_between_10_and_20 <- punif(20, min, max) - punif(10, min, max)
    prob_between_10_and_20

    # Calculate probability of waiting less than 5 mins
    prob_less_than_5 <- punif(5, min, max)
    prob_less_than_5
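    The uniform probabilities can be sanity-checked by simulation (a sketch, independent of the course data):

    # With a Unif(0, 30) wait, P(wait < 5) should be 5/30, about 0.167
    mean(runif(100000, min = 0, max = 30) < 5)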
    
    
    # Create probability distribution
    size_distribution <- restaurant_groups %>%
      # Count number of each group size
      count(group_size) %>%
      # Calculate probability
      mutate(probability = n / sum(n))

    size_distribution

    # Calculate probability of picking group of 4 or more
    size_distribution %>%
      # Filter for groups of 4 or larger
      filter(group_size >= 4) %>%
      # Calculate prob_4_or_more by taking sum of probabilities
      summarise(prob_4_or_more = sum(probability))

    # Create a histogram of group_size
    ggplot(restaurant_groups, aes(x = group_size)) +
      geom_histogram(bins = 5)
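    A natural follow-up for a discrete distribution like this is its expected value (a sketch, reusing the size_distribution built above):

    # Expected group size: the probability-weighted mean
    size_distribution %>%
      summarise(expected_size = sum(group_size * probability))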
    
    
    
    ## WHAT ARE THE CHANCES ##
    
    # Set random seed to 31
    set.seed(31)
    
    # Sample 5 deals without replacement
    amir_deals %>%
      sample_n(5)
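    For contrast, sampling with replacement only needs the replace flag (a sketch on the same data):

    # Sample 5 deals with replacement (the same deal can be picked twice)
    amir_deals %>%
      sample_n(5, replace = TRUE)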
    
    ##
    
    # Calculate total co2_emission per country: emissions_by_country
    emissions_by_country <- food_consumption %>%
      group_by(country) %>%
      summarize(total_emission = sum(co2_emission))
    
    # Compute the first and third quartiles and IQR of total_emission
    q1 <- quantile(emissions_by_country$total_emission, 0.25)
    q3 <- quantile(emissions_by_country$total_emission, 0.75)
    iqr <- q3 - q1
    
    # Calculate the lower and upper cutoffs for outliers
    lower <- q1 - 1.5 * iqr
    upper <- q3 + 1.5 * iqr
    
    # Filter emissions_by_country to find outliers
    emissions_by_country %>%
      filter(total_emission < lower | total_emission > upper)
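    The same 1.5 * IQR rule defines the whiskers of a boxplot, so the outliers can also be spotted visually (a sketch):

    # Countries beyond the whiskers are the IQR-rule outliers
    ggplot(emissions_by_country, aes(y = total_emission)) +
      geom_boxplot()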
    
    ##
    # Calculate variance and sd of co2_emission for each food_category
    food_consumption %>%
      group_by(food_category) %>%
      summarise(var_co2 = var(co2_emission),
                sd_co2 = sd(co2_emission))
    
    # Plot food_consumption with co2_emission on x-axis
    ggplot(food_consumption, aes(co2_emission)) +
      # Create a histogram
      geom_histogram() +
      # Create a separate sub-graph for each food_category
      facet_wrap(~ food_category)
    ##
    # Calculate the deciles of co2_emission
    quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.1))

    # Calculate the quintiles of co2_emission
    quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.2))

    # Calculate the quartiles of co2_emission (quantile()'s default)
    quantile(food_consumption$co2_emission)
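    The no-argument default is just the quartiles spelled out (a one-line check):

    # quantile() defaults to probs = seq(0, 1, 0.25)
    identical(quantile(food_consumption$co2_emission),
              quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.25)))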
    
    
    # Filter for Belgium
    belgium_consumption <- food_consumption %>%
      filter(country == "Belgium")
    
    # Filter for USA
    usa_consumption <- food_consumption %>%
      filter(country == "USA")
    
    # Calculate mean and median consumption in Belgium
    mean(belgium_consumption$consumption)
    median(belgium_consumption$consumption)
    
    # Calculate mean and median consumption in USA
    mean(usa_consumption$consumption)
    median(usa_consumption$consumption)
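    The same comparison fits in one grouped pipeline instead of two filtered data frames (a sketch on the same food_consumption data):

    # Mean and median consumption per country in a single summarize
    food_consumption %>%
      filter(country %in% c("Belgium", "USA")) %>%
      group_by(country) %>%
      summarize(mean_consumption = mean(consumption),
                median_consumption = median(consumption))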