Course Notes: Foundations of Inference in R

Inference Statistics

Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! For courses that use data, the datasets will be available in the datasets folder.

# import package 
library(infer)

Randomized distribution

Null distribution: Generating a distribution of the statistic from the null population gives information about whether the observed data are inconsistent with the null hypothesis.

# Difference in the proportion of cola drinkers between locations.
# Nothing is shuffled here: the summary is computed on `soda` as recorded.
soda %>%
  group_by(location) %>%
  # Share of rows in each location whose drink is "cola"
  summarise(prop_cola = mean(drink == "cola")) %>%
  # Successive differences of the per-location proportions, in group order
  summarise(diff(prop_cola))

# Perform a single permutation with the infer pipeline
soda %>%
  # Response: drink; explanatory: location; "cola" counts as a success
  specify(drink ~ location, success = "cola") %>%
  # Null hypothesis: drink and location are independent
  hypothesize(null = "independence") %>%
  # One permuted replicate
  generate(reps = 1, type = "permute") %>%
  # Difference in proportions, ordered west then east
  calculate(stat = "diff in props", order = c("west", "east"))

# The 4 steps of the `infer` pipeline
# `specify`: specify the response and explanatory variables
# `hypothesize`: declare the null hypothesis
# `generate`: generate resamples, permutations, or simulations
# `calculate`: calculate summary statistics

# Bar plot of HomeOwn categories within each Gender
ggplot(data = NHANES, mapping = aes(x = Gender, fill = HomeOwn)) +
  # position = "fill" stacks the bars and scales each one to the same height
  geom_bar(position = "fill") +
  labs(y = "Relative frequencies")

# Densities of SleepHrsNight: one curve per SleepTrouble level,
# one panel per HealthGen level
ggplot(
  data = NHANES,
  mapping = aes(x = SleepHrsNight, color = SleepTrouble)
) +
  # adjust = 2 multiplies the default bandwidth by 2 (smoother curves)
  geom_density(adjust = 2) +
  facet_wrap(~ HealthGen)

# Randomized data under null model of independence

# Generate 10 permuted replicates of `homes` under the independence null.
# NOTE: `homes` must already exist when this runs; in these notes it is
# created in the "Calculating statistic of Interest" snippet.
# Every step must end in `%>%`: without it, `generate()` is called on its own
# with no data and fails.
homeown_perm <- homes %>%
  specify(HomeOwn ~ Gender, success = "Own") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 10, type = "permute")
homeown_perm

# Calculating statistic of Interest
# Keep the rows whose HomeOwn is "Own" or "Rent", then retain only the
# two columns used in the analysis
homes <- NHANES %>%
  filter(HomeOwn %in% c("Own", "Rent")) %>%
  select(Gender, HomeOwn)

# Observed difference in the proportion of home owners between genders
diff_orig <- homes %>%
  group_by(Gender) %>%
  # Proportion of rows with HomeOwn == "Own" within each gender
  summarize(prop_own = mean(HomeOwn == "Own")) %>%
  # Collapse the per-gender proportions into their difference (male - female)
  summarize(obs_diff_prop = diff(prop_own))

diff_orig

# Randomized data under null model of independence
# Generate 10 permuted replicates and compute the difference in proportions
# (ordered male then female) for each one.
# Each step is chained with `%>%`; without the pipes, the later calls run on
# their own with no data and fail.
homeown_perm <- homes %>%
  specify(HomeOwn ~ Gender, success = "Own") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 10, type = "permute") %>%
  calculate(stat = "diff in props", order = c("male", "female"))

# Dot plot of the permuted diff in props (one dot per row of homeown_perm)
# `aes()` must be closed before the `+`, otherwise the call does not parse.
ggplot(homeown_perm, aes(x = stat)) +
  geom_dotplot(binwidth = 0.001)
	   
	   
# Perform 1000 permutations and keep one statistic per replicate
homeown_perm <- homes %>%
  # Response: HomeOwn; explanatory: Gender; "Own" counts as a success
  specify(HomeOwn ~ Gender, success = "Own") %>%
  # Null hypothesis: HomeOwn and Gender are independent
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  # Difference in proportions, ordered male then female
  calculate(stat = "diff in props", order = c("male", "female"))

# Density of the permuted differences in proportions (the `stat` column)
ggplot(data = homeown_perm, mapping = aes(x = stat)) +
  geom_density()

Using the randomization distribution

We want the observed data to differ from the null distribution so that we can reject the null hypothesis in favor of the alternative hypothesis (Ha).

# Plot the permuted differences together with the observed difference.
# `homeown_perm` is built with `calculate()` in these notes, and the plots of
# it above read the per-replicate statistic from its `stat` column, so that
# is the column used here. `diff_orig` is a one-row summary table, so its
# `obs_diff_prop` value is extracted for the reference line.
ggplot(homeown_perm, aes(x = stat)) +
  # Add a density layer
  geom_density() +
  # Add a vertical line at the observed difference
  geom_vline(xintercept = diff_orig$obs_diff_prop, color = "red")

# Count the permuted differences that are <= the observed difference.
# The permuted statistics are read from the `stat` column of `homeown_perm`
# (the same column the plots of it above use); the observed value is the
# `obs_diff_prop` entry of the one-row table `diff_orig`.
homeown_perm %>%
  summarize(n_perm_le_obs = sum(stat <= diff_orig$obs_diff_prop))

CONCLUSION:

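In the NHANES data, the observed statistic was consistent with the null distribution.
Inference ONLY allows us to reject (or fail to reject) the null hypothesis; failing to reject it does not show that the null hypothesis is true.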

Complete a Randomization test: Gender Discrimination

Dataset: 14 out of 24 female files were selected for promotion and 21 out of 24 male files were selected for promotion.
Question: Is it plausible to observe such a difference in proportions in a scenario where men and women are equally likely to be promoted?
SHUFFLE --> IS IT BY CHANCE?

# Promotion data: 48 files, 24 male and 24 female.
# Rows 1-35 are promoted (21 male, then 14 female);
# rows 36-48 are not promoted (3 male, then 10 female).
# All four `rep()` calls belong inside the single `c()` that builds `sex`,
# so that both columns have 48 entries.
disc <- data.frame(
  promote = c(rep("promoted", 35), rep("not_promoted", 13)),
  sex = c(
    rep("male", 21), rep("female", 14),
    rep("male", 3), rep("female", 10)
  )
)

# Promotion rate within each sex
disc %>%
  group_by(sex) %>%
  summarise(promoted_prop = mean(promote == "promoted"))

# Contingency table: number of rows for every sex / promote combination
count(disc, sex, promote)

# Proportion promoted, computed separately for each sex
disc %>%
  group_by(sex) %>%
  # mean() of a logical vector is the share of TRUE values
  summarise(promoted_prop = mean(promote == "promoted"))

# Build 5 permuted replicates of the data under the independence null
disc_perm <- disc %>%
  # Response: promote; explanatory: sex; "promoted" counts as a success
  specify(promote ~ sex, success = "promoted") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 5, type = "permute")

# Contingency table of sex by promote within each permuted replicate
disc_perm %>%
  group_by(replicate) %>%
  count(sex, promote)

# Difference in proportion promoted for each replicate, male then female
disc_perm %>%
  calculate(stat = "diff in props", order = c("male", "female"))

‌
‌
‌

Course Notes: Foundations of Inference in R

Inference Statistics

Randomized distribution

Using the randomization distribution

Complete a Randomization test: Gender Discrimination

Inference Statistics