Skip to content
Course Notes: Foundations of Inference in R
  • AI Chat
  • Code
  • Report
  • Spinner

    Inference Statistics

    Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! For courses that use data, the datasets will be available in the datasets folder.

    # import package 
    library(infer)

    Randomized distribution

    Null distribution: generating a distribution of the statistic from the null population shows whether the observed data are inconsistent with the null hypothesis.

    # Difference in the proportion of cola drinkers between the location groups
    # (proportion per location first, then the difference between the two)
    soda %>%
      group_by(location) %>%
      summarise(prop_cola = mean(drink == "cola")) %>%
      summarise(diff(prop_cola))
    
    # Perform a single permutation and compute its difference in proportions
    soda %>%
      specify(drink ~ location, success = "cola") %>%
      hypothesize(null = "independence") %>%
      generate(reps = 1, type = "permute") %>%
      calculate(stat = "diff in props", order = c("west", "east"))
    
    # The 4 steps of an `infer` pipeline
    # `specify`: specify the response and explanatory var
    # `hypothesize`: declare the null hypothesis
    # `generate`: generate resamples, permutation, or simulation 
    # `calculate`: calculate summary statistics.
    # Bar plot of home ownership within each gender
    ggplot(data = NHANES, mapping = aes(x = Gender, fill = HomeOwn)) +
      # position = "fill" stacks the bars to a common height
      geom_bar(position = "fill") +
      ylab("Relative frequencies")
    
    # Density of SleepHrsNight, one colored curve per SleepTrouble value
    ggplot(data = NHANES, mapping = aes(x = SleepHrsNight, color = SleepTrouble)) +
      # adjust = 2 doubles the default smoothing bandwidth
      geom_density(adjust = 2) +
      # One panel per HealthGen value
      facet_wrap(~ HealthGen)
    
    # Randomized data under null model of independence

    # Keep only the Gender and HomeOwn columns and the two ownership categories
    # being compared. This must run before `homes` is used below.
    homes <- NHANES %>%
      select(Gender, HomeOwn) %>%
      filter(HomeOwn %in% c("Own", "Rent"))

    # Specify the variables, declare the null, and shuffle 10 times.
    # Every step must be joined with %>%; otherwise generate() is evaluated as a
    # separate call with no data and fails.
    homeown_perm <- homes %>%
      specify(HomeOwn ~ Gender, success = "Own") %>%
      hypothesize(null = "independence") %>%
      generate(reps = 10, type = "permute")
    homeown_perm

    # Calculating statistic of interest
    diff_orig <- homes %>%
      # Group by gender
      group_by(Gender) %>%
      # Summarize proportion of homeowners
      summarise(prop_own = mean(HomeOwn == "Own")) %>%
      # Summarize difference in proportion of homeowners
      summarise(obs_diff_prop = diff(prop_own)) # male - female

    diff_orig
    # Randomized data under null model of independence
    # Specify the variables, declare the null, shuffle 10 times, then compute the
    # male - female difference in proportions for each shuffle.
    # Each step is chained with %>% so the whole pipeline is one expression.
    homeown_perm <- homes %>%
      specify(HomeOwn ~ Gender, success = "Own") %>%
      hypothesize(null = "independence") %>%
      generate(reps = 10, type = "permute") %>%
      calculate(stat = "diff in props", order = c("male", "female"))
    
    # Dot plot of the permuted differences in proportions
    # (`stat` is the column produced by calculate())
    ggplot(homeown_perm, aes(x = stat)) +
      geom_dotplot(binwidth = 0.001)
    	   
    	   
    # Perform 1000 permutations and compute male - female for each one
    homeown_perm <- homes %>%
      specify(formula = HomeOwn ~ Gender, success = "Own") %>%
      hypothesize(null = "independence") %>%
      generate(reps = 1000, type = "permute") %>%
      calculate(stat = "diff in props", order = c("male", "female"))
    
    # Density of the 1000 permuted differences in proportions
    ggplot(data = homeown_perm, mapping = aes(x = stat)) +
      geom_density()
    

    Using the randomization distribution

    We want our observed data to differ from the null distribution so that we can claim the alternative hypothesis (Ha) is true.

    # Plot the permuted differences; calculate() stores them in the `stat` column
    ggplot(homeown_perm, aes(x = stat)) +
      # Add a density layer
      geom_density() +
      # Mark the observed difference. diff_orig is a one-row tibble, so pull the
      # scalar out of its obs_diff_prop column.
      geom_vline(xintercept = diff_orig$obs_diff_prop, color = "red")
    
    # Count the permuted differences that are <= the observed difference.
    # The permuted values live in `stat`; diff_orig is a one-row tibble, so the
    # observed value is taken from its obs_diff_prop column.
    homeown_perm %>%
      summarize(n_perm_le_obs = sum(stat <= diff_orig$obs_diff_prop))
    

    CONCLUSION:

    • In the NHANES data, the observed statistic was consistent with the null distribution.
    • Inference ONLY allows us to reject the null hypothesis; failing to reject it does not show that the null is true.

    Complete a Randomization test: Gender Discrimination

    1. Dataset: 14 out of 24 female files were selected for promotion and 21 out of 24 male files were selected for promotion.
    2. Question: Is it plausible to observe such a difference in proportions in a scenario where men and women are equally likely to be promoted?
    3. SHUFFLE --> IS IT BY CHANCE?
    # Promotion data: 48 files, 35 promoted (21 male, 14 female) and
    # 13 not promoted (3 male, 10 female). Both columns are built in the same
    # row order so each promote value lines up with the right sex.
    disc <- data.frame(
      promote = rep(c("promoted", "not_promoted"), times = c(35, 13)),
      sex = rep(c("male", "female", "male", "female"), times = c(21, 14, 3, 10))
    )
    
    # Proportion of each sex that was promoted
    disc %>%
      group_by(sex) %>%
      summarise(promoted_prop = mean(promote == "promoted"))
    # Contingency table: number of files for each sex / promotion outcome
    disc %>%
      count(sex, promote)

    # Proportion of each sex that was promoted (same summary as above)
    disc %>%
      group_by(sex) %>%
      summarise(promoted_prop = mean(promote == "promoted"))
    
    # Build 5 permuted copies of the data under the independence null
    disc_perm <- disc %>%
      specify(formula = promote ~ sex, success = "promoted") %>%
      hypothesize(null = "independence") %>%
      generate(reps = 5, type = "permute")
    
    # Contingency table of sex by promotion outcome within each replicate
    disc_perm %>%
      group_by(replicate) %>%
      count(sex, promote)
    
    # Difference in promotion proportions (male - female) for each replicate
    disc_perm %>%
      calculate(stat = "diff in props", order = c("male", "female"))