Skip to content
Introduction to Statistics in R
Introduction to Statistics in R
Run the hidden code cell below to import the data used in this course.
# Attach the tidyverse packages used throughout these notes.
library(tidyverse)

# Import the three course datasets (paths are relative to the working
# directory); read.csv() returns each one as a base data frame.
deals <- read.csv(file = "datasets/amir_deals.csv")
happiness <- read.csv(file = "datasets/world_happiness.csv")
food <- read.csv(file = "datasets/food_consumption.csv")
#mean and median
#MEAN
#mean(belgium_consumption$consumption)
#MEDIAN
#median(belgium_consumption$consumption)
#quantiles, quartiles and quintiles
#QUANTILES
#Quantiles are a great way of summarizing numerical data since they can be used to measure center and spread, as well as to get a sense of where a data point stands in relation to the rest of the dataset. For example, you might want to give a discount to the 10% most active users on a website.
#quantile(base de datos$columna)
#quantile(food_consumption$co2_emission)
#quantile(food_consumption$co2_emission, probs = seq(0,1,0.2)) <- for quintiles
#quantile(food_consumption$co2_emission, probs = seq(0,1,0.1)) <- for deciles
## Compute the first and third quartiles and the IQR (interquartile range) of total_emission
#q1 <- quantile(emissions_by_country$total_emission, 0.25)
#q3 <- quantile(emissions_by_country$total_emission, 0.75)
#iqr <- q3 - q1
#measuring chances (probability)
#sample_n(1) <- para elegir un numero al azar (2,3,etc para elegir varios numeros)
#sample_n(1, replace=TRUE) <- para que el segundo numero al azar pueda repetirse
#set.seed(1) <- para elegir el mismo numero al azar siempre
#sample(base de datos, num de variable, replace=TRUE)
#probabilities
#The punif() function calculates the probability of an outcome less than its first argument
#Calculate probability of waiting 10-20 mins
#prob_between_10_and_20 <- punif(20, min=min, max=max) - punif(10, min=min, max=max)
#The runif() function takes in the number of wait times you want to generate, as well as the min and max of the distribution you're working with.
# Generate 1000 wait times between 0 and 30 mins, save in time column
#mutate(time = runif(1000, min=0, max=30))
#rbinom(# of trials, # of coins per trial (size), probability of heads/success (prob))
#rbinom(1,1,0.5) <- 1 trial, 1 coin, 50% probability
#dbinom(number of successes, number of trials, probability of success) <- probability of exactly that many successes
#dbinom(7,10,0.5) <- exactly 7 successes in 10 trials, 50% probability each
#pbinom(7,10,0.5, lower.tail=FALSE) <- P(more than 7 successes) = 1 - pbinom(7,10,0.5); lower.tail is an argument of pbinom(), not dbinom()
#desviación estandar (normal distribution)
#pnorm(valor, mean=#, sd=#) <- mean: promedio sd=standar deviation
#para sacar el valor restante, (1-pnorm) se utiliza al final de la fórmula un "lower.tail=FALSE"
#pnorm(valor, mean=#, sd=#, lower.tail=FALSE)
#para sacar el valor entre dos valores, se hace una resta
#pnorm(upper_value, mean=#, sd=#) - pnorm(lower_value, mean=#, sd=#) <- same function, same mean and sd in both calls
#para sacar porcentajes
#qnorm(percentage, mean=#, sd=#)
#ej: qnorm(.9, mean=161, sd=7), así como arriba en pnorm, podemos agregar el lower.tail=FALSE y la resta en caso de que el dato buscado sea este.
#replicate(#, formula), sirve para hacer lo que tienes dentro del paréntesis las veces que quieras
#SAMPLE
#sample(amir_deals$num_users, size=20, replace=TRUE)
#sample(dataframe$columna, size=#, replace=TRUE)
#POISSON
#lambda= average number of events per time interval
#dpois(# de eventos exactos en el tiempo, lambda=#)
#ej. dpois(5, lambda=8) - (# de adopciones = 5)
#ppois(# de eventos o menos en el tiempo, lambda=#)
#ej. ppois(5, lambda=8) - (# de adopciones <= 5)
#ppois(x, lambda=#, lower.tail=FALSE) <- probability of MORE than x events (strictly greater); for "x or more" use ppois(x - 1, lambda=#, lower.tail=FALSE)
#ej. ppois(5, lambda=8, lower.tail=FALSE) - (# de adopciones > 5)
#rpois(n, lambda=#) - generates n random event counts drawn from a Poisson distribution with that lambda
#pexp(value, rate=#) <- probability of waiting less than value (rate = 1 / mean time between events)
#ej. pexp(1, rate=0.5) - (variable menor a 1)
#pexp(evento, rate=#, lower.tail=FALSE)
#e.g. pexp(4, rate=0.5, lower.tail=FALSE) - (variable greater than 4)
#to get the probability that something happens between two values, subtract the two cumulative probabilities
#pexp(4, rate=0.5) - pexp(1, rate=0.5)
#Correlation
#x= independiente y=dependiente
#magnitude = strength of the relationship (the closer the absolute value is to 1, the stronger the correlation); if positive (+) the line slopes upward to the right, if negative (-) it slopes downward to the right (geom_point() + geom_smooth(method="lm", se=FALSE) for the ggplot)
#computing correlation
#cor(variable$x, variable$y); to ignore data points where one or both values are missing, set the use argument of cor to "pairwise.complete.obs".
#cor(variable$x, variable$y, use = "pairwise.complete.obs")
#cor(world_happiness$life_exp,world_happiness$happiness_score) <- cor(base de datos$variable , base de datos$variable 2)
#vocabulary
#treatment: explanatory / independent variable
#response: response / dependent variable
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets here