Skip to content
1 hidden cell
Introduction to Statistics in R
Introduction to Statistics in R
Run the hidden code cell below to import the data used in this course.
1 hidden cell
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Finding outliers ----
# Flag animals whose body weight lies more than 1.5 * IQR below the first
# quartile or above the third quartile.
library(ggplot2)
library(tidyverse)

# Both quartiles come from one quantile() call; unname() drops the
# "25%" / "75%" labels so the thresholds are plain numbers.
bodywt_quartiles <- unname(quantile(msleep$bodywt, probs = c(0.25, 0.75)))

# IQR() is the built-in equivalent of Q3 - Q1 (same default quantile type).
iqr <- IQR(msleep$bodywt)
lower_threshold <- bodywt_quartiles[1] - 1.5 * iqr
upper_threshold <- bodywt_quartiles[2] + 1.5 * iqr

# Keep only the rows outside the thresholds and show the columns of interest.
msleep %>%
  filter(bodywt < lower_threshold | bodywt > upper_threshold) %>%
  select(name, vore, sleep_total, bodywt)
library(dplyr)
library(tidyverse)

# Quantiles of body weight (bodywt) ----
# Default probs are 0, 0.25, 0.5, 0.75, 1: minimum, the three quartiles, maximum
quantile(msleep[["bodywt"]])

# Six cut points (0, 0.2, ..., 1) splitting the data into five quintiles
quantile(msleep[["bodywt"]], probs = seq(0, 1, by = 0.2))

# Eleven cut points (0, 0.1, ..., 1) splitting the data into ten deciles
quantile(msleep[["bodywt"]], probs = seq(0, 1, by = 0.1))
# Variance ----
# Step 1: deviation of every observation from the mean of sleep_total
dists <- with(msleep, sleep_total - mean(sleep_total))

# Step 2: square the deviations and add them up
squared_dists <- dists^2
sum_sq_dists <- sum(squared_dists)
sum_sq_dists

# Step 3: divide the sum of squares by (number of data points - 1)
length(msleep$sleep_total) # number of data points
# The larger the variance, the more spread out the data
sum_sq_dists / (length(dists) - 1)

# var() performs steps 1-3 in a single call
var(msleep$sleep_total)

# Standard deviation ----
# Square root of the variance, by hand and then with the built-in
sqrt(var(msleep$sleep_total))
sd(msleep$sleep_total)

# Mean absolute deviation ----
# Average of the absolute deviations computed in step 1
mean(abs(dists))

# Per-column summary of the whole dataset
summary(msleep)
# Plot a histogram of sleep_total (1-unit bins), one panel per genus
# NOTE(review): genus presumably has many distinct values in msleep, which
# would give many sparsely filled panels - confirm this facetting is intended.
ggplot(msleep, aes(sleep_total)) +
  geom_histogram(binwidth = 1, colour = "white", fill = "steelblue") +
  facet_wrap(~ genus) +
  labs(
    title = "Distribution of sleep total",
    x = "Sleep total",
    y = "Frequency"
  )
library(ggplot2)

# Histogram of sleep_total across all animals, using 1-unit bins
ggplot(msleep, aes(sleep_total)) +
  geom_histogram(binwidth = 1, colour = "white", fill = "steelblue") +
  labs(
    title = "Distribution of sleep total",
    x = "Sleep total",
    y = "Frequency"
  )
library(ggplot2)

# Histogram of sleep_total (1-unit bins), one panel per value of name
# NOTE(review): name looks like a per-animal identifier, so each panel would
# hold a single observation - confirm this facetting is intended.
ggplot(msleep, aes(sleep_total)) +
  geom_histogram(binwidth = 1, colour = "white", fill = "steelblue") +
  facet_wrap(~ name) +
  labs(
    title = "Distribution of sleep total",
    x = "Sleep total",
    y = "Frequency"
  )
# Small example table: one row per salesperson with their number of sales
df <- data.frame(
  name = c("Amir", "Brian", "Claire", "Damian"),
  n_sales = c(178, 126, 75, 69)
)
df
library(dplyr)

# Sales table to draw random rows from
sales_counts <- data.frame(
  name = c("Amir", "Brian", "Claire", "Damian"),
  n_sales = c(178, 126, 75, 69)
)

# Re-seeding the random number generator before each draw makes every
# sample below reproducible from run to run.
# NOTE(review): dplyr documents sample_n() as superseded by slice_sample();
# it is kept here so the seeded draws do not change.

# One row
set.seed(5)
sample_n(sales_counts, size = 1)

# Two rows; sampling is without replacement by default
set.seed(5)
sample_n(sales_counts, size = 2)

# With replacement, i.e. the same person can be chosen more than once
set.seed(5)
sample_n(sales_counts, size = 2, replace = TRUE)

# Drawing five rows from a four-row table only works with replacement
set.seed(5)
sample_n(sales_counts, size = 5, replace = TRUE)