Skip to content

Introduction to Statistics in R

Run the hidden code cell below to import the data used in this course.


1 hidden cell

Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.

Add your notes here

# Finding outliers ----
library(ggplot2)
library(tidyverse)

# Tukey's fences: a body weight counts as an outlier when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
# The quartiles are computed once; names = FALSE keeps the derived thresholds
# from carrying quantile()'s "25%" / "75%" labels, which would be misleading
# on values that are no longer percentiles.
bodywt_quartiles <- quantile(
  msleep$bodywt,
  probs = c(0.25, 0.75),
  names = FALSE
)
iqr <- IQR(msleep$bodywt) # Q3 - Q1, same default quantile type as above
lower_threshold <- bodywt_quartiles[1] - 1.5 * iqr
upper_threshold <- bodywt_quartiles[2] + 1.5 * iqr

# Rows whose body weight falls outside either fence
msleep %>%
  filter(bodywt < lower_threshold | bodywt > upper_threshold) %>%
  select(name, vore, sleep_total, bodywt)
library(dplyr)
library(tidyverse)
# Quantiles of body weight (msleep$bodywt)
# Default probs are 0, 0.25, 0.5, 0.75, 1: minimum, the three quartiles, maximum
quantile(msleep$bodywt)
# 6 cut points (0%, 20%, ..., 100%) splitting the data into 5 quintile groups
quantile(msleep$bodywt, probs = seq(0, 1, by = 0.2))
# 11 cut points (0%, 10%, ..., 100%) splitting the data into 10 decile groups
quantile(msleep$bodywt, probs = seq(0, 1, by = 0.1))

# Variance ----

# Deviation of every sleep_total value from the mean of sleep_total
dists <- with(msleep, sleep_total - mean(sleep_total))
squared_dists <- dists^2
sum_sq_dists <- sum(squared_dists)
sum_sq_dists

# Sample variance by hand: sum of squared deviations divided by (n - 1)
nrow(msleep) # number of data points (one sleep_total value per row)
sum_sq_dists / (nrow(msleep) - 1) # larger variance = more spread-out data

# The same quantity in a single call
var(msleep[["sleep_total"]])

# Standard deviation ----
# Square root of the variance, first by hand and then with the built-in
sqrt(var(msleep[["sleep_total"]]))
sd(msleep[["sleep_total"]])

# Mean absolute deviation ----
# Average absolute distance from the mean, reusing `dists` from above.
# Note: base R's mad() is the *median* absolute deviation, a different measure.
mean(abs(dists))

# Per-column overview of the whole data set
summary(msleep)

# Histograms of sleep_total ----
library(ggplot2)

# Base histogram (bin width 1), built once and reused for every variant below
# instead of repeating the same ggplot() call three times.
sleep_hist <- ggplot(data = msleep, aes(x = sleep_total)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of sleep total",
    x = "Sleep total",
    y = "Frequency"
  )

# One panel per genus
sleep_hist + facet_wrap(~genus)

# All rows pooled into a single panel
sleep_hist

# One panel per name
# NOTE(review): if `name` is unique per row, every panel holds a single
# observation -- confirm that faceting by name is intended.
sleep_hist + facet_wrap(~name)
# Toy table: one row per salesperson with their number of sales
df <- data.frame(
  name = c("Amir", "Brian", "Claire", "Damian"),
  n_sales = c(178, 126, 75, 69)
)
df
library(dplyr)

# The same table again, under a more descriptive name
sales_counts <- data.frame(
  name = c("Amir", "Brian", "Claire", "Damian"),
  n_sales = c(178, 126, 75, 69)
)

# Re-seeding the random number generator before each draw makes every
# sample below reproducible on its own.
set.seed(5)
sample_n(sales_counts, 1)

# Sampling is without replacement by default: no row is drawn twice
set.seed(5)
sample_n(sales_counts, 2)

# With replacement the same person can be chosen more than once, which also
# permits drawing more rows (5) than the table contains (4).
set.seed(5)
sample_n(sales_counts, 2, replace = TRUE)
set.seed(5)
sample_n(sales_counts, 5, replace = TRUE)