# Skip to content
#
# Course notes: Introduction to Statistics in Python
#
# Why does data type matter?

# Write and run code here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the three course datasets from the local datasets/ folder.
# pd.read_csv is applied to each path in order and the three resulting
# DataFrames are unpacked into their module-level names.
amir_deals, food_consumption, world_happiness = map(
    pd.read_csv,
    (
        "datasets/amir_deals.csv",
        "datasets/food_consumption.csv",
        "datasets/world_happiness.csv",
    ),
)
# Split out the rows for the two countries being compared.
country_column = food_consumption["country"]
be_consumption = food_consumption[country_column == "Belgium"]
usa_consumption = food_consumption[country_column == "USA"]

# Print the mean, then the median, of the "consumption" column:
# Belgium first, followed by the USA.
for country_rows in (be_consumption, usa_consumption):
    consumed = country_rows["consumption"]
    print(np.mean(consumed))
    print(np.median(consumed))
# pyplot is (re-)imported here so this cell can be run on its own.
import matplotlib.pyplot as plt

# Keep only the rows whose food_category is rice.
is_rice = food_consumption['food_category'] == 'rice'
rice_consumption = food_consumption[is_rice]

# Draw a 20-bin histogram of the rice co2_emission values, label it,
# and display the figure.
rice_co2 = rice_consumption['co2_emission']
plt.hist(rice_co2, bins=20, edgecolor='black')
plt.title('Histogram of CO2 Emission for Rice')
plt.xlabel('CO2 Emission (kg/person/year)')
plt.ylabel('Frequency')
plt.show()
# Quartiles of co2_emission: the quantiles at 0, 0.25, 0.5, 0.75 and 1,
# generated as five evenly spaced points on [0, 1].
quartile_points = np.linspace(0, 1, 5)
print(np.quantile(food_consumption['co2_emission'], quartile_points))
# Variance and standard deviation of co2_emission within each food_category.
#
# The aggregations are requested by name ('var', 'std') instead of passing
# np.var / np.std. Handing NumPy callables to groupby .agg() is deprecated
# (FutureWarning since pandas 2.1): pandas currently swaps them for its own
# groupby var/std, and a later version calls the NumPy functions directly,
# whose default ddof differs -- so the numbers would change silently.
# The string aliases always use pandas' groupby methods, and the resulting
# columns are still labelled 'var' and 'std'.
variance_std = (
    food_consumption
    .groupby('food_category')['co2_emission']
    .agg(['var', 'std'])
)
print(variance_std)

# pyplot is (re-)imported here so this cell can be run on its own.
import matplotlib.pyplot as plt

# Row subsets for the two food categories being compared.
category_column = food_consumption['food_category']
beef_data = food_consumption[category_column == 'beef']
eggs_data = food_consumption[category_column == 'eggs']

# One 20-bin histogram of co2_emission per category, beef first and then
# eggs; each is labelled and shown before the next one is drawn.
for category_rows, plot_title in (
    (beef_data, 'Histogram of CO2 Emission for Beef'),
    (eggs_data, 'Histogram of CO2 Emission for Eggs'),
):
    plt.hist(category_rows['co2_emission'], bins=20, edgecolor='black')
    plt.xlabel('CO2 Emission (kg/person/year)')
    plt.ylabel('Frequency')
    plt.title(plot_title)
    plt.show()
# Total co2_emission per country: group the rows by country, then sum the
# co2_emission values inside each group.
rows_by_country = food_consumption.groupby('country')
emissions_by_country = rows_by_country['co2_emission'].sum()
print(emissions_by_country)
# Total co2_emission per country.
emissions_by_country = (
    food_consumption
    .groupby('country')['co2_emission']
    .sum()
)

# First and third quartiles of the per-country totals; the interquartile
# range is the distance between them.
q1 = emissions_by_country.quantile(q=0.25)
q3 = emissions_by_country.quantile(q=0.75)
iqr = q3 - q1

# Report the three statistics, one per line.
for label, statistic in (
    ("First Quartile (Q1):", q1),
    ("Third Quartile (Q3):", q3),
    ("Interquartile Range (IQR):", iqr),
):
    print(label, statistic)
# Total co2_emission per country.
emissions_by_country = (
    food_consumption
    .groupby('country')['co2_emission']
    .sum()
)

# Both quartiles in a single np.quantile call, then the IQR between them.
q1, q3 = np.quantile(emissions_by_country, [0.25, 0.75])
iqr = q3 - q1

# Outlier cutoffs sit 1.5 IQRs below Q1 and 1.5 IQRs above Q3.
iqr_margin = 1.5 * iqr
lower = q1 - iqr_margin
upper = q3 + iqr_margin

# Report both cutoffs.
print("Lower Cutoff for Outliers:", lower)
print("Upper Cutoff for Outliers:", upper)
# Total co2_emission per country.
country_groups = food_consumption.groupby('country')
emissions_by_country = country_groups['co2_emission'].sum()

# Quartiles and interquartile range of the per-country totals.
q1, q3 = np.quantile(emissions_by_country, [0.25, 0.75])
iqr = q3 - q1

# Cutoffs 1.5 IQRs outside the quartiles.
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

# A country is an outlier when its total lies strictly below the lower
# cutoff or strictly above the upper cutoff.
below_lower = emissions_by_country < lower
above_upper = emissions_by_country > upper
outliers = emissions_by_country[above_upper | below_lower]

# Show the outlying countries together with their totals.
print("Countries with Outliers in Total Emissions:")
print(outliers)
# Number of deals recorded for each product.
product_column = amir_deals["product"]
counts = product_column.value_counts()
print(counts)

# Probability of picking a deal with each product: that product's deal
# count divided by the total number of rows in amir_deals.
total_deals = len(amir_deals)
probs = counts / total_deals
print(probs)
# Seed NumPy's global random state before drawing the sample.
np.random.seed(24)

# Draw 5 deals; replace=False means no row can be picked more than once.
sample_without_replacement = amir_deals.sample(n=5, replace=False)
print(sample_without_replacement)
# Seed NumPy's global random state before drawing the sample.
np.random.seed(24)

# Draw 5 deals; replace=True allows the same row to be picked repeatedly.
sample_with_replacement = amir_deals.sample(n=5, replace=True)
print(sample_with_replacement)