
Introduction to Statistics in R

Run the hidden code cell below to import the data used in this course.


1 hidden cell

Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.



# Scatterplot of grams_sugar_per_day and happiness_score
ggplot(world_happiness, aes(grams_sugar_per_day, happiness_score)) + geom_point()
# Correlation between grams_sugar_per_day and happiness_score
cor(world_happiness$happiness_score, world_happiness$grams_sugar_per_day)
[1] 0.69391
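A side note for later: cor() measures linear association by default; a rank-based alternative is one argument away (standard R, not part of the exercise):

# Spearman correlation, robust to monotonic but nonlinear relationships
cor(world_happiness$happiness_score,
    world_happiness$grams_sugar_per_day,
    method = "spearman")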



# Create log_gdp_per_cap column
world_happiness <- world_happiness %>%
  mutate(log_gdp_per_cap = log(gdp_per_cap))

# Scatterplot of happiness_score vs. log_gdp_per_cap
ggplot(world_happiness, aes(log_gdp_per_cap, happiness_score)) +
  geom_point()

# Calculate correlation
cor(world_happiness$log_gdp_per_cap, world_happiness$happiness_score)



# Scatterplot of gdp_per_cap and life_exp
ggplot(world_happiness, aes(gdp_per_cap, life_exp)) +
  geom_point()

# Correlation between gdp_per_cap and life_exp
cor(world_happiness$gdp_per_cap, world_happiness$life_exp)
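The gdp_per_cap vs. life_exp scatterplot is clearly nonlinear, so the Pearson correlation understates the strength of the relationship; the log transform from above is one way to check (my own sketch, reusing log_gdp_per_cap):

# Correlation of life_exp with log GDP, for comparison
cor(world_happiness$log_gdp_per_cap, world_happiness$life_exp)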






# Add a linear trendline to scatterplot
ggplot(world_happiness, aes(life_exp, happiness_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Correlation between life_exp and happiness_score
cor(world_happiness$life_exp, world_happiness$happiness_score)
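If the slope of that trendline is of interest, lm() fits the same line and reports its coefficients (standard R, not part of the exercise):

# Fit the linear trend drawn by geom_smooth(method = "lm")
lm(happiness_score ~ life_exp, data = world_happiness)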




# Mean response time is 2.5 hours, so the exponential rate is 1/2.5 = 0.4

# Probability response takes < 1 hour
pexp(1, rate = 0.4)
[1] 0.32968

# Probability response takes > 4 hours
pexp(4, rate = 0.4, lower.tail = FALSE)
[1] 0.2018965

# Probability response takes 3-4 hours (difference of the CDF at 4 and at 3;
# subtracting the two upper tails instead gives the negative of this)
pexp(4, rate = 1/2.5) - pexp(3, rate = 1/2.5)
[1] 0.09929769
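Sanity check: the exponential CDF is 1 - exp(-rate * x), so the pexp() results can be verified by hand (my own check):

# Closed-form check of pexp(1, rate = 0.4)
1 - exp(-0.4 * 1)
[1] 0.32968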



# Probability of 5 responses
dpois(5, 4)
[1] 0.1562935
# Probability of 5 responses from coworker
dpois(5, 5.5)
[1] 0.1714007
# Probability of 2 or fewer responses
ppois(2, 4)
[1] 0.2381033
# Probability of > 10 responses
ppois(10, 4, lower.tail = FALSE)
[1] 0.002839766
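The Poisson pmf is lambda^k * exp(-lambda) / k!, so dpois() can be checked the same way (my own check):

# Closed-form check of dpois(5, 4)
4^5 * exp(-4) / factorial(5)
[1] 0.1562935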






# Set seed to 104
set.seed(104)

# Sample 20 num_users from amir_deals and take mean
sample(amir_deals$num_users, size = 20, replace = TRUE) %>%
  mean()

# Repeat the above 100 times
sample_means <- replicate(100, sample(amir_deals$num_users, size = 20, replace = TRUE) %>% mean())

# Create data frame for plotting
samples <- data.frame(mean = sample_means)

# Histogram of sample means
ggplot(samples, aes(mean)) + geom_histogram(bins = 10)
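By the central limit theorem, the spread of these sample means should be roughly the population sd divided by sqrt(20); a quick comparison (my own check):

# CLT prediction vs. observed spread of the sample means
sd(amir_deals$num_users) / sqrt(20)
sd(sample_means)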







# Calculate amount that 75% of deals will be more than
qnorm(0.75, 5000, 2000, lower.tail = FALSE)
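Plugging the qnorm() result back into pnorm() should recover 0.75 (quick inverse check, not from the course):

# pnorm() of the qnorm() result returns the original probability
pnorm(qnorm(0.75, 5000, 2000, lower.tail = FALSE),
      5000, 2000, lower.tail = FALSE)
[1] 0.75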


# Probability of deal < 7500
pnorm(7500, 5000, 2000)

# Probability of closing > 1 deal out of 3 deals
pbinom(1, 3, 0.3, lower.tail = FALSE)
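P(> 1 deal) out of 3 means closing exactly 2 or exactly 3, so the same number falls out of dbinom() (quick check):

# P(2 deals) + P(3 deals) out of 3 attempts
dbinom(2, 3, 0.3) + dbinom(3, 3, 0.3)
[1] 0.216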

# Set random seed to 334
set.seed(334)

# Data frame with one row per simulated wait time
wait_times <- data.frame(id = 1:1000)

# Generate 1000 wait times between 0 and 30 mins, save in time column
wait_times %>%
  mutate(time = runif(1000, min = 0, max = 30)) %>%
  # Create a histogram of simulated times
  ggplot(aes(time)) +
  geom_histogram(bins = 30)
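Since a uniform distribution on [0, 30] has mean (0 + 30) / 2 = 15, the simulated times should average close to that (my own check):

# Mean of the simulated wait times should be near 15
mean(runif(1000, min = 0, max = 30))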


# Min and max wait times for back-up that happens every 30 min
min <- 0
max <- 30

# Calculate probability of waiting 10-20 mins
prob_between_10_and_20 <- punif(20, min, max) - punif(10, min, max)
prob_between_10_and_20

# Calculate probability of waiting less than 5 mins
prob_less_than_5 <- punif(5, min, max)
prob_less_than_5
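Uniform probabilities are just interval widths over the total range, so both results can be checked by hand (my own check):

# Interval width divided by total range
(20 - 10) / (max - min)  # 1/3, matches prob_between_10_and_20
(5 - 0) / (max - min)    # 1/6, matches prob_less_than_5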


# Create probability distribution
size_distribution <- restaurant_groups %>%
  # Count number of each group size
  count(group_size) %>%
  # Calculate probability
  mutate(probability = n / sum(n))

size_distribution

# Calculate probability of picking group of 4 or more
size_distribution %>%
  # Filter for groups of 4 or larger
  filter(group_size >= 4) %>%
  # Calculate prob_4_or_more by taking sum of probabilities
  summarise(prob_4_or_more = sum(probability))
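The same distribution table also gives the expected group size as a probability-weighted sum (my own sketch):

# Expected value of group_size under the distribution
size_distribution %>%
  summarise(expected_size = sum(group_size * probability))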


# Create a histogram of group_size
ggplot(restaurant_groups, aes(x = group_size)) +
  geom_histogram(bins = 5)



## WHAT ARE THE CHANCES ##

# Set random seed to 31
set.seed(31)

# Sample 5 deals without replacement
amir_deals %>%
  sample_n(5)
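sample_n() samples without replacement by default; sampling with replacement is one argument away (standard dplyr):

# Sample 5 deals with replacement
amir_deals %>%
  sample_n(5, replace = TRUE)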

##

# Calculate total co2_emission per country: emissions_by_country
emissions_by_country <- food_consumption %>%
  group_by(country) %>%
  summarize(total_emission = sum(co2_emission))

# Compute the first and third quartiles and IQR of total_emission
q1 <- quantile(emissions_by_country$total_emission, 0.25)
q3 <- quantile(emissions_by_country$total_emission, 0.75)
iqr <- q3 - q1

# Calculate the lower and upper cutoffs for outliers
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr

# Filter emissions_by_country to find outliers
emissions_by_country %>%
  filter(total_emission < lower | total_emission > upper)
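These 1.5 * IQR cutoffs are exactly what geom_boxplot() uses for its whiskers, so a boxplot is a quick visual cross-check (my own sketch):

# Outliers appear as points beyond the whiskers
ggplot(emissions_by_country, aes(y = total_emission)) +
  geom_boxplot()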

##
# Calculate variance and sd of co2_emission for each food_category
food_consumption %>% 
  group_by(food_category) %>% 
  summarise(var_co2 = var(co2_emission),
            sd_co2 = sd(co2_emission))

# Plot food_consumption with co2_emission on x-axis
ggplot(food_consumption, aes(co2_emission)) +
  # Create a histogram
  geom_histogram() +
  # Create a separate sub-graph for each food_category
  facet_wrap(~ food_category)
##
# Calculate the deciles of co2_emission
quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.1))

# Calculate the quintiles of co2_emission
quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.2))

# Calculate the quartiles of co2_emission
quantile(food_consumption$co2_emission)
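With no probs argument, quantile() defaults to probs = seq(0, 1, 0.25), so the call above is shorthand for (quick note):

# Equivalent explicit form of the quartiles call
quantile(food_consumption$co2_emission, probs = seq(0, 1, 0.25))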


# Filter for Belgium
belgium_consumption <- food_consumption %>%
  filter(country == "Belgium")

# Filter for USA
usa_consumption <- food_consumption %>%
  filter(country == "USA")

# Calculate mean and median consumption in Belgium
mean(belgium_consumption$consumption)
median(belgium_consumption$consumption)

# Calculate mean and median consumption in USA
mean(usa_consumption$consumption)
median(usa_consumption$consumption)
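The two-filter approach works, but group_by() collapses it into a single pipeline (my own refactor, same numbers):

# Mean and median consumption for both countries at once
food_consumption %>%
  filter(country %in% c("Belgium", "USA")) %>%
  group_by(country) %>%
  summarise(mean_consumption = mean(consumption),
            median_consumption = median(consumption))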