Skip to content

Introduction to Statistics in Python

Run the hidden code cell below to import the data used in this course.

# Importing numpy and pandas
import numpy as np
import pandas as pd

# Load each course dataset from the relative "datasets" directory into its own DataFrame
amir_deals = pd.read_csv("datasets/amir_deals.csv")
happiness = pd.read_csv("datasets/world_happiness.csv")
food_consumption = pd.read_csv("datasets/food_consumption.csv")
restaurant_groups = pd.read_csv("datasets/restaurant_groups.csv")

Ch1 Summary Statistics

--Descriptive and inferential statistics

--Data type classification

--Mean and median

If the data is skewed (i.e., not symmetrical), the median is usually a better measure of center than the mean, because the mean is pulled toward extreme values while the median is not.

# Show the full food consumption dataset
print(food_consumption)

# Filter for Belgium
be_consumption = food_consumption[food_consumption["country"] == "Belgium"]

# Filter for USA
usa_consumption = food_consumption[food_consumption["country"] == "USA"]

# Calculate mean and median consumption in Belgium
print(np.mean(be_consumption["consumption"]))
print(np.median(be_consumption["consumption"]))

# Calculate mean and median consumption in USA
print(np.mean(usa_consumption["consumption"]))
print(np.median(usa_consumption["consumption"]))

# Subset for Belgium and USA only; isin() keeps the rows whose country is
# any of the listed values, replacing the chained "==" comparisons
be_and_usa = food_consumption[food_consumption["country"].isin(["Belgium", "USA"])]

# Group by country, select consumption column, and compute mean and median.
# The aggregations are given by name: passing the np.mean / np.median
# callables to agg() is deprecated in recent pandas and raises a FutureWarning,
# while the string names produce the same "mean" and "median" columns.
print(be_and_usa.groupby("country")["consumption"].agg(["mean", "median"]))

--Mean vs. median

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Keep only the rows whose food_category is "rice"
rice_consumption = food_consumption.loc[food_consumption["food_category"].eq("rice")]

# Draw a histogram of the co2_emission values for rice, then display the figure
rice_consumption["co2_emission"].hist()
plt.show()