Navigation
Introduction to Statistics in R
• AI Chat
• Code
• Report
Introduction to Statistics in R

Run the hidden code cell below to import the data used in this course.

Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.

```r
# Add your code snippets here

# Scatterplot of grams_sugar_per_day vs. happiness_score
ggplot(world_happiness, aes(grams_sugar_per_day, happiness_score)) +
  geom_point()

# Correlation between grams_sugar_per_day and happiness_score
# (cor() is symmetric, so argument order does not matter)
cor(world_happiness$happiness_score, world_happiness$grams_sugar_per_day)
# [1] 0.69391

# Create log_gdp_per_cap column (GDP per capita is heavily right-skewed,
# so the log transform makes its relationship with happiness more linear)
world_happiness <- world_happiness %>%
  mutate(log_gdp_per_cap = log(gdp_per_cap))

# Scatterplot of happiness_score vs. log_gdp_per_cap
ggplot(world_happiness, aes(log_gdp_per_cap, happiness_score)) +
  geom_point()

# Calculate correlation
cor(world_happiness$log_gdp_per_cap, world_happiness$happiness_score)

# Scatterplot of gdp_per_cap and life_exp
ggplot(world_happiness, aes(gdp_per_cap, life_exp)) +
  geom_point()

# Correlation between gdp_per_cap and life_exp
cor(world_happiness$gdp_per_cap, world_happiness$life_exp)

# Scatterplot of life_exp vs. happiness_score with a linear trendline
# (se = FALSE suppresses the confidence band around the fit)
ggplot(world_happiness, aes(life_exp, happiness_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Correlation between life_exp and happiness_score
cor(world_happiness$life_exp, world_happiness$happiness_score)

# Response times are modeled as Exponential with a mean wait of 2.5 hours,
# so rate = 1 / 2.5 = 0.4 responses per hour. (The original notes kept
# several trial-and-error attempts with wrong rates and two invalid calls
# such as `pexp(1, rate > 4)`; only the correct versions are kept here.)

# Probability a response takes < 1 hour
pexp(1, rate = 0.4)
# [1] 0.32968

# Probability a response takes > 4 hours (upper tail)
pexp(4, rate = 0.4, lower.tail = FALSE)
# [1] 0.2018965

# Probability a response takes 3-4 hours:
# P(X < 4) - P(X < 3); equivalently P(X > 3) - P(X > 4)
pexp(4, rate = 0.4) - pexp(3, rate = 0.4)
# [1] 0.09929769

# Responses per day modeled as Poisson; Amir averages 4, his coworker 5.5.

# Probability of exactly 5 responses
dpois(5, lambda = 4)
# [1] 0.1562935

# Probability of exactly 5 responses from coworker
dpois(5, lambda = 5.5)
# [1] 0.1714007

# Probability of 2 or fewer responses (cumulative)
ppois(2, lambda = 4)
# [1] 0.2381033

# Probability of > 10 responses (upper tail)
ppois(10, lambda = 4, lower.tail = FALSE)
# [1] 0.002839766

# Set seed to 104 so the random draws are reproducible
set.seed(104)

# Sample 20 num_users from amir_deals (with replacement) and take the mean
sample(amir_deals$num_users, size = 20, replace = TRUE) %>%
  mean()

# Repeat the above 100 times to build a sampling distribution of the mean
sample_means <- replicate(
  100,
  sample(amir_deals$num_users, size = 20, replace = TRUE) %>% mean()
)

# Create data frame for plotting
samples <- data.frame(mean = sample_means)

# Histogram of sample means (roughly bell-shaped per the CLT)
ggplot(samples, aes(mean)) +
  geom_histogram(bins = 10)

# Deal amounts ~ Normal(mean = 5000, sd = 2000); each deal closes with p = 0.3.

# Amount that 75% of deals will be worth more than
# (the 25th percentile, taken from the upper tail)
qnorm(0.75, mean = 5000, sd = 2000, lower.tail = FALSE)

# Probability a deal is worth less than 7500
pnorm(7500, mean = 5000, sd = 2000)

# Probability of closing more than 1 of 3 deals
pbinom(1, size = 3, prob = 0.3, lower.tail = FALSE)

# Set random seed to 334 BEFORE simulating so the draws are reproducible.
# (The original notes called runif() on wait_times before wait_times existed
# and before the seed was set; the order is fixed here.)
set.seed(334)

# Data frame with 1000 rows to hold the simulated wait times
wait_times <- data.frame(sim = 1:1000)

# Generate 1000 wait times uniformly between 0 and 30 mins, save in time column
wait_times <- wait_times %>%
  mutate(time = runif(1000, min = 0, max = 30))
head(wait_times)

# Histogram of simulated times (should look roughly flat — uniform)
ggplot(wait_times, aes(time)) +
  geom_histogram(bins = 30)

# Back-ups happen every 30 min, so wait time ~ Uniform(0, 30).
# Descriptive names avoid shadowing base R's min() and max() functions,
# which the original `min <- 0; max <- 30` did.
min_wait <- 0
max_wait <- 30

# Probability of waiting 10-20 mins: P(X < 20) - P(X < 10)
prob_between_10_and_20 <- punif(20, min_wait, max_wait) - punif(10, min_wait, max_wait)
prob_between_10_and_20

# Probability of waiting less than 5 mins
# (original had a duplicated `prob_less_than_5 <- prob_less_than_5 <- ...`)
prob_less_than_5 <- punif(5, min_wait, max_wait)
prob_less_than_5

# Create a histogram of group_size
ggplot(restaurant_groups, aes(x = group_size)) +
  geom_histogram(bins = 5)

# Create probability distribution of group sizes.
# (The original notes defined this twice and left a dangling, unbalanced
# `mutate(probability = count / nrow(restaurant_groups)` line, which would
# make the script unparseable; one correct definition is kept.)
size_distribution <- restaurant_groups %>%
  # Count number of each group size
  count(group_size) %>%
  # Relative frequency of each size is its probability
  mutate(probability = n / sum(n))

size_distribution

# Calculate probability of picking a group of 4 or more
size_distribution %>%
  # Filter for groups of 4 or larger
  filter(group_size >= 4) %>%
  # Sum their probabilities
  summarise(prob_4_or_more = sum(probability))

## WHAT ARE THE CHANCES ##

# Set random seed to 31 so the draw is reproducible
set.seed(31)

# Draw 5 deals at random, without replacement
sample_n(amir_deals, 5)

##

# Calculate total co2_emission per country: emissions_by_country
emissions_by_country <- food_consumption %>%
  group_by(country) %>%
  summarize(total_emission = sum(co2_emission))

# First and third quartiles and IQR of total_emission
q1 <- quantile(emissions_by_country$total_emission, 0.25)
q3 <- quantile(emissions_by_country$total_emission, 0.75)
iqr <- q3 - q1

# Lower and upper cutoffs for outliers (1.5 * IQR rule)
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr

# Filter emissions_by_country to find outliers
emissions_by_country %>%
  filter(total_emission < lower | total_emission > upper)

##

# Total co2_emission per country (this repeats the computation kept
# elsewhere in the original notes)
emissions_by_country <- food_consumption %>%
  group_by(country) %>%
  summarize(total_emission = sum(co2_emission))

##
# Variance and standard deviation of co2_emission for each food_category
food_consumption %>%
  group_by(food_category) %>%
  summarise(
    var_co2 = var(co2_emission),
    sd_co2 = sd(co2_emission)
  )

# Distribution of co2_emission, one histogram panel per food category
ggplot(food_consumption, aes(co2_emission)) +
  geom_histogram() +
  facet_wrap(~ food_category)
##

# Deciles of co2_emission (11 cut points: 0%, 10%, ..., 100%)
quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.1))

# Quintiles of co2_emission
quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.2))

# Quartiles of co2_emission (quantile()'s default probs)
quantile(food_consumption$co2_emission)

# Filter for Belgium
belgium_consumption <- food_consumption %>%
  filter(country == "Belgium")

# Filter for USA
usa_consumption <- food_consumption %>%
  filter(country == "USA")

# Mean and median consumption in Belgium
mean(belgium_consumption$consumption)
median(belgium_consumption$consumption)

# Mean and median consumption in USA
mean(usa_consumption$consumption)
median(usa_consumption$consumption)

```