Skip to content

Introduction to Statistics in R

Run the hidden code cell below to import the data used in this course.

# Setup cell: attach the tidyverse and load the three course datasets.
# Attach the tidyverse packages for the rest of the notebook.
library(tidyverse)

# Read the three course CSV files from the local datasets/ directory.
# NOTE(review): the commented examples later in this file refer to
# amir_deals, world_happiness and food_consumption; this cell creates
# deals, happiness and food instead -- confirm which names are intended.
deals <- read.csv("datasets/amir_deals.csv")
happiness <- read.csv("datasets/world_happiness.csv")
food <- read.csv("datasets/food_consumption.csv")

#mean and median
	#MEAN
		#mean(belgium_consumption$consumption)

	#MEDIAN
		#median(belgium_consumption$consumption)



#quantiles, quartiles and quintiles
	#QUANTILES
	#Quantiles are a great way of summarizing numerical data since they can be used to measure center and spread, as well as to get a sense of where a data point stands in relation to the rest of the dataset. For example, you might want to give a discount to the 10% most active users on a website.
	#quantile(base de datos$columna)	
	#quantile(food_consumption$co2_emission)
	#quantile(food_consumption$co2_emission, probs = seq(0,1,0.2)) <- para quintiles
	#quantile(food_consumption$co2_emission, probs = seq(0,1,0.1)) <- para deciles

	## Compute the first and third quartiles and IQR (interquartile range) of total_emission
		#q1 <- quantile(emissions_by_country$total_emission, 0.25)
		#q3 <- quantile(emissions_by_country$total_emission, 0.75)
		#iqr <- q3 - q1

#measuring chances (probability)
		#sample_n(1) <- para elegir un numero al azar (2,3,etc para elegir varios numeros)
			#sample_n(1, replace=TRUE) <- para que el segundo numero al azar pueda repetirse
		#set.seed(1) <- para elegir el mismo numero al azar siempre
		#sample(vector, size=# de elementos a elegir, replace=TRUE)

#probabilities
	#The punif() function calculates the probability of an outcome less than its first argument
	#Calculate probability of waiting 10-20 mins
			#prob_between_10_and_20 <- punif(20, min=min, max=max) - punif(10, min=min, max=max)

	#The runif() function takes in the number of wait times you want to generate, as well as the min and max of the distribution you're working with.

	# Generate 1000 wait times between 0 and 30 mins, save in time column
		#mutate(time = runif(1000, min=0, max=30))

	#rbinom(# of trials, # of coins (n), probability of heads/success (p))
		#rbinom(1,1,0.5) <- 1 prueba, 1 moneda, 50% de probabilidad
	#dbinom(# de valores que coincida, # de veces que sucederá el acontecimiento, probabilidad de que suceda)
		#dbinom(7,10,0.5) <- 7 veces suceda acto A, 10 veces sucederá la prueba,50% de probabilidad
			#pbinom(7,10,0.5, lower.tail=FALSE) <- P(X > 7) = 1 - pbinom(7,10,0.5); dbinom no acepta lower.tail, para la cola se usa pbinom


#desviación estandar (normal distribution)
	#pnorm(valor, mean=#, sd=#) <- mean: promedio, sd: standard deviation
		#para sacar el valor restante, (1-pnorm) se utiliza al final de la fórmula un "lower.tail=FALSE"
			#pnorm(valor, mean=#, sd=#, lower.tail=FALSE)
		#para sacar el valor entre dos valores, se hace una resta 
			#pnorm(valor_mayor, mean=#, sd=#) - pnorm(valor_menor, mean=#, sd=#) <- misma función pnorm y mismos mean y sd en ambos términos

	#para sacar el valor que corresponde a un porcentaje (cuantil)
		#qnorm(porcentaje, mean=#, sd=#)
		#ej: qnorm(.9, mean=161, sd=7), así como arriba en pnorm, podemos agregar el lower.tail=FALSE y la resta en caso de que el dato buscado sea este.

	#replicate(#, formula), sirve para hacer lo que tienes dentro del paréntesis las veces que quieras

	#SAMPLE
		#sample(amir_deals$num_users, size=20, replace=TRUE)
		#sample(dataframe$columna, size=#, replace=TRUE)

	#POISSON
	#lambda= average number of events per time interval

	#dpois(# de eventos exactos en el tiempo, lambda=#)
	#ej. dpois(5, lambda=8) - (# de adopciones = 5)

	#ppois(# de eventos o menos en el tiempo, lambda=#)
	#ej. ppois(5, lambda=8) - (# de adopciones <= 5)

	#ppois(# de eventos, lambda=#, lower.tail=FALSE) <- probabilidad de MÁS de ese # de eventos en el tiempo (estrictamente mayor)
	#ej. ppois(5, lambda=8, lower.tail=FALSE) - (# de adopciones > 5)

	#rpois(# de valores aleatorios a generar, lambda=#) - para simular conteos de eventos en el tiempo

	#pexp(evento, rate=#) <- (rate = 1 / tiempo promedio entre eventos)
	#ej. pexp(1, rate=0.5) - (variable menor a 1)

	#pexp(evento, rate=#, lower.tail=FALSE)
	#ej. pexp(4, rate=0.5, lower.tail=FALSE) - (variable mayor a 4)

	#para sacar la probabilidad de que algo suceda entre dos valores, estos se restan
	#pexp(4, rate=0.5) - pexp(1, rate=0.5)

	#Correlation
	#x= independiente y=dependiente
	#magnitude= strength of relationship (mientras más cerca a 1, más correlación tienen), si es positivo (+) la recta se inclina a la derecha y si es negativo (-) la recta se inclina a la izquierda (geom_point() + geom_smooth(method="lm", se=FALSE) para el ggplot)
	#computing correlation
	#cor(variable$x, variable$y), To ignore data points where one or both values are missing, set the use argument of cor to "pairwise.complete.obs".
	#cor(variable$x, variable$y, use= "pairwise.complete.obs")
	#cor(world_happiness$life_exp,world_happiness$happiness_score) <- cor(base de datos$variable , base de datos$variable 2)

	#vocabulary
		#treatment: explanatory / independent variable
		#response: response / dependent variable








Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.

Add your notes here

# Add your code snippets here