Skip to content
Introduction to Statistics in Python
Run the hidden code cell below to import the data used in this course.
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets here
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd

# Load the per-country food consumption data
food_consumption = pd.read_csv("datasets/food_consumption.csv")
print(food_consumption)

# Split off the rows for each country of interest
be_consumption = food_consumption[food_consumption['country'] == 'Belgium']
usa_consumption = food_consumption[food_consumption['country'] == 'USA']

# Mean and median consumption in Belgium
print(be_consumption['consumption'].mean())
print(be_consumption['consumption'].median())
# Mean and median consumption in the USA
print(usa_consumption['consumption'].mean())
print(usa_consumption['consumption'].median())
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Read CSV file
food_consumption = pd.read_csv("datasets/food_consumption.csv")
print(food_consumption)

# Subset for food_category equals rice
rice_consumption = food_consumption[food_consumption['food_category'] == 'rice']

# Histogram of co2_emission for rice and show plot
plt.hist(rice_consumption['co2_emission'])
plt.show()

# Calculate mean and median of co2_emission with .agg()
# FIX: aggregate only the numeric co2_emission column — the original
# called .agg() on the whole DataFrame, which fails on the string
# country/food_category columns in modern pandas. Also use the string
# names 'mean'/'median': passing np.mean/np.median to .agg() is
# deprecated since pandas 2.1.
print(rice_consumption['co2_emission'].agg(['mean', 'median']))
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Read CSV file
food_consumption = pd.read_csv("datasets/food_consumption.csv")

# Print variance and sd of co2_emission for each food_category.
# FIX: use the string aggregation names — passing np.var/np.std to
# .agg() is deprecated (pandas >= 2.1), and the strings keep the
# intended sample statistics (ddof=1) that pandas' var/std compute.
print(food_consumption.groupby('food_category')['co2_emission'].agg(['var', 'std']))

# Create histogram of co2_emission for food_category 'beef'
food_consumption[food_consumption['food_category'] == 'beef']['co2_emission'].hist()
# Show plot
plt.show()

# Create histogram of co2_emission for food_category 'eggs'
food_consumption[food_consumption['food_category'] == 'eggs']['co2_emission'].hist()
# Show plot
plt.show()
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd

# Read CSV file
food_consumption = pd.read_csv("datasets/food_consumption.csv")

# Total co2_emission per country: emissions_by_country
emissions_by_country = food_consumption.groupby('country')['co2_emission'].sum()

# First and third quartiles, and the interquartile range
q1, q3 = np.quantile(emissions_by_country, [0.25, 0.75])
iqr = q3 - q1

# Tukey fences: values beyond 1.5 * IQR from the quartiles are outliers
lower_cutoff = q1 - 1.5 * iqr
upper_cutoff = q3 + 1.5 * iqr

# Countries whose total emissions fall outside the fences
is_outlier = (emissions_by_country < lower_cutoff) | (emissions_by_country > upper_cutoff)
outliers = emissions_by_country[is_outlier]
print(outliers)
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd

# Load Amir's deals
amir_deals = pd.read_csv("datasets/amir_deals.csv")
print(amir_deals)

# How many deals involve each product?
counts = amir_deals['product'].value_counts()
print(counts)

# Probability of picking a deal with each product: count / total deals
n_deals = len(amir_deals)
probs = counts / n_deals
print(probs)

# Fix the RNG so the samples below are reproducible
np.random.seed(24)

# Sample 5 deals without replacement
sample_without_replacement = amir_deals.sample(n=5)
print(sample_without_replacement)

# Sample 5 deals with replacement
sample_with_replacement = amir_deals.sample(n=5, replace=True)
print(sample_with_replacement)
import pandas as pd
import numpy as np

# Create restaurant_groups DataFrame from scratch
restaurant_groups = pd.DataFrame({
    'group_id': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'group_size': [2, 4, 6, 2, 2, 2, 3, 2, 4, 2],
})

# Probability distribution: relative frequency of each group size
size_dist = restaurant_groups['group_size'].value_counts() / restaurant_groups.shape[0]

# Reset index and rename columns so each row is (group_size, prob)
size_dist = size_dist.reset_index()
size_dist.columns = ['group_size', 'prob']

# Expected value of the distribution: sum of size * probability
expected_value = np.sum(size_dist['group_size'] * size_dist['prob'])
# FIX: the expected value was computed but never reported
print(expected_value)

# Subset groups of size 4 or more
groups_4_or_more = size_dist[size_dist['group_size'] >= 4]

# P(group size >= 4): sum of the probabilities of those rows
prob_4_or_more = groups_4_or_more['prob'].sum()
print(prob_4_or_more)
# Import uniform from scipy.stats
from scipy.stats import uniform

# The back-up runs every 30 min, so the wait is uniform on [0, 30]
min_time = 0
max_time = 30

# P(10 <= wait <= 20) = F(20) - F(10), where F is the uniform CDF.
# NOTE: uniform.cdf's 2nd/3rd args are loc and scale; with min_time = 0
# the scale coincides with max_time, so this is correct here.
cdf_at_20 = uniform.cdf(20, min_time, max_time)
cdf_at_10 = uniform.cdf(10, min_time, max_time)
prob_between_10_and_20 = cdf_at_20 - cdf_at_10
print(prob_between_10_and_20)
import matplotlib.pyplot as plt
# Import uniform
from scipy.stats import uniform

# Set random seed to 334 so the simulated draws are reproducible
np.random.seed(334)

# Generate 1000 wait times drawn uniformly from [0, 30] mins
# (uniform.rvs takes loc=0 and scale=30)
wait_times = uniform.rvs(0, 30, size=1000)

# Histogram of the simulated wait times
plt.hist(wait_times)
plt.show()
# Import binom from scipy.stats
from scipy.stats import binom

# Set random seed to 10 so the simulation is reproducible
np.random.seed(10)

# Simulate 52 weeks, each with 3 deals at a 30% win probability
deals = binom.rvs(3, 0.3, size=52)

# Print mean deals won per week
print(np.mean(deals))