Skip to content

## Introduction to Statistics in R

Run the hidden code cell below to import the data used in this course.

### Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.

*Add your notes here*

```
# Add your code snippets here

## Sugar consumption vs. happiness ------------------------------------------
# (This snippet was accidentally pasted twice; deduplicated. Console output
# is kept as `#>` comments so the chunk parses as valid R.)

# Scatterplot of grams_sugar_per_day and happiness_score
ggplot(world_happiness, aes(grams_sugar_per_day, happiness_score)) +
  geom_point()

# Correlation between grams_sugar_per_day and happiness_score
cor(world_happiness$happiness_score, world_happiness$grams_sugar_per_day)
#> [1] 0.69391
## GDP, life expectancy, and happiness --------------------------------------

# Add a log-transformed GDP-per-capita column
world_happiness <- world_happiness %>%
  mutate(log_gdp_per_cap = log(gdp_per_cap))

# Scatterplot of happiness_score vs. log_gdp_per_cap
ggplot(world_happiness, aes(x = log_gdp_per_cap, y = happiness_score)) +
  geom_point()

# Correlation of log GDP per capita with happiness
cor(world_happiness$log_gdp_per_cap, world_happiness$happiness_score)

# Scatterplot of life_exp vs. gdp_per_cap
ggplot(world_happiness, aes(x = gdp_per_cap, y = life_exp)) +
  geom_point()

# Correlation of GDP per capita with life expectancy
cor(world_happiness$gdp_per_cap, world_happiness$life_exp)

# Scatterplot of happiness vs. life expectancy, with a linear trendline
ggplot(world_happiness, aes(x = life_exp, y = happiness_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Correlation of life expectancy with happiness
cor(world_happiness$life_exp, world_happiness$happiness_score)
## Exponential distribution: response wait times ----------------------------
# pexp(q, rate) gives P(waiting time < q) for an exponential distribution
# with the given rate (responses per hour). Duplicated pastes removed;
# console output kept as `#>` comments so the chunk parses as valid R.

# Failed attempts, kept for reference: `rate > 4` / `rate >= 4` are
# comparisons, not named arguments, so R looks for an object named `rate`:
# pexp(1, rate > 4)   # Error: object 'rate' not found
# pexp(1, rate >= 4)  # Error: object 'rate' not found

# Probability response takes < 1 hour, at various response rates
pexp(1, rate = 5.5)
#> [1] 0.9959132
pexp(1, rate = 4)
#> [1] 0.9816844
pexp(1, rate = 2.5)
#> [1] 0.917915
pexp(1, rate = 0.4)
#> [1] 0.32968

# Probability response takes > 4 hours: needs the upper tail
pexp(4, rate = 0.4)  # wrong first attempt: this is P(X < 4)
#> [1] 0.7981035
pexp(4, rate = 0.4, lower.tail = FALSE)
#> [1] 0.2018965

# Probability response takes 3-4 hours: P(X < 4) - P(X < 3).
# Note a rate of 0.4/hour is the same as a 2.5-hour mean, i.e. rate = 1/2.5.
# Subtracting the UPPER tails in this order negates the answer:
#   pexp(4, rate = 1/2.5, lower.tail = FALSE) -
#     pexp(3, rate = 1/2.5, lower.tail = FALSE)   #> [1] -0.09929769
pexp(4, rate = 1/2.5) - pexp(3, rate = 1/2.5)
#> [1] 0.09929769
## Poisson distribution: responses per day ----------------------------------
# dpois(x, lambda) = P(exactly x events); ppois(q, lambda) = P(<= q events).
# Console output kept as `#>` comments so the chunk parses as valid R.

# Probability of exactly 5 responses (own average: 4 per day)
dpois(5, lambda = 4)
#> [1] 0.1562935

# Probability of exactly 5 responses from coworker (average: 5.5 per day)
dpois(5, lambda = 5.5)
#> [1] 0.1714007

# Probability of 2 or fewer responses
ppois(2, lambda = 4)
#> [1] 0.2381033

# Probability of more than 10 responses (upper tail)
ppois(10, lambda = 4, lower.tail = FALSE)
#> [1] 0.002839766
## Sampling distributions and normal/binomial probabilities -----------------

# Reproducible sampling
set.seed(104)

# Mean num_users from one sample of 20 deals (with replacement)
mean(sample(amir_deals$num_users, size = 20, replace = TRUE))

# Repeat 100 times to build a sampling distribution of the mean
sample_means <- replicate(
  100,
  mean(sample(amir_deals$num_users, size = 20, replace = TRUE))
)

# Wrap the sample means in a data frame for plotting
samples <- data.frame(mean = sample_means)

# Histogram of the sampling distribution
ggplot(samples, aes(x = mean)) +
  geom_histogram(bins = 10)

# Amount that 75% of deals will be worth MORE than (upper tail of N(5000, 2000))
qnorm(0.75, 5000, 2000, lower.tail = FALSE)

# Probability a deal is worth less than 7500
pnorm(7500, 5000, 2000)

# Probability of closing more than 1 of 3 deals at a 30% win rate
pbinom(1, 3, 0.3, lower.tail = FALSE)
## Uniform distribution: back-up wait times ---------------------------------
# Fixed: the data frame was used before it was created (reordered below),
# and `prob_less_than_5` had a duplicated assignment.

# Set random seed to 334 for a reproducible simulation
set.seed(334)

# Build a 1000-row frame, then generate wait times between 0 and 30 mins
wait_times <- data.frame(c(1:1000))
head(wait_times %>% mutate(time = runif(1000, 0, 30)))

# Simulate again and histogram the wait times
wait_times %>%
  mutate(time = runif(1000, min = 0, max = 30)) %>%
  ggplot(aes(time)) +
  geom_histogram(bins = 30)

# Min and max wait times for a back-up that happens every 30 min
# NOTE(review): these names shadow base::min/base::max within this script.
min <- 0
max <- 30

# Probability of waiting 10-20 mins: P(X < 20) - P(X < 10)
prob_between_10_and_20 <- punif(20, min, max) - punif(10, min, max)
prob_between_10_and_20

# Probability of waiting less than 5 mins
prob_less_than_5 <- punif(5, min, max)
prob_less_than_5
## Discrete distribution: restaurant group sizes ----------------------------
# Fixed: removed a stray, unbalanced abandoned line
#   mutate(probability = count / nrow(restaurant_groups)
# (a syntax error — `count` is a function here, not a column), and the
# duplicate out-of-order definition of size_distribution.

# Histogram of group_size
ggplot(restaurant_groups, aes(x = group_size)) +
  geom_histogram(bins = 5)

# Create probability distribution of group sizes
size_distribution <- restaurant_groups %>%
  # Count number of each group size
  count(group_size) %>%
  # Probability = count of that size / total number of groups
  mutate(probability = n / sum(n))
size_distribution

# Probability of picking a group of 4 or more
size_distribution %>%
  # Filter for groups of 4 or larger
  filter(group_size >= 4) %>%
  # Sum their probabilities
  summarise(prob_4_or_more = sum(probability))
## What are the chances? ----------------------------------------------------

# Reproducible sampling
set.seed(31)

# Draw 5 deals without replacement
sample_n(amir_deals, 5)
## Summary statistics on food_consumption -----------------------------------
# Fixed: emissions_by_country was computed twice verbatim; deduplicated.

# Calculate total co2_emission per country: emissions_by_country
emissions_by_country <- food_consumption %>%
  group_by(country) %>%
  summarize(total_emission = sum(co2_emission))

# First and third quartiles and IQR of total_emission
q1 <- quantile(emissions_by_country$total_emission, 0.25)
q3 <- quantile(emissions_by_country$total_emission, 0.75)
iqr <- q3 - q1

# Lower and upper cutoffs for outliers (1.5 * IQR rule)
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr

# Countries whose total emissions fall outside the cutoffs
emissions_by_country %>%
  filter(total_emission < lower | total_emission > upper)

# Variance and sd of co2_emission for each food_category
food_consumption %>%
  group_by(food_category) %>%
  summarise(var_co2 = var(co2_emission),
            sd_co2 = sd(co2_emission))

# Histogram of co2_emission, one panel per food_category
ggplot(food_consumption, aes(co2_emission)) +
  geom_histogram() +
  facet_wrap(~ food_category)

# Deciles, quintiles, and quartiles of co2_emission
quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.1))
quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.2))
quantile(food_consumption$co2_emission)  # default probs = quartiles

# Mean and median consumption in Belgium
belgium_consumption <- food_consumption %>%
  filter(country == "Belgium")
mean(belgium_consumption$consumption)
median(belgium_consumption$consumption)

# Mean and median consumption in USA
usa_consumption <- food_consumption %>%
  filter(country == "USA")
mean(usa_consumption$consumption)
median(usa_consumption$consumption)
```