Introduction to Statistics in Python
Run the hidden code cell below to import the data used in this course.
1 hidden cell
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets here
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd
# Read CSV file
food_consumption = pd.read_csv("datasets/food_consumption.csv")
print(food_consumption)
# Filter for Belgium
be_consumption = food_consumption[food_consumption['country'] == 'Belgium']
# Filter for USA
usa_consumption = food_consumption[food_consumption['country'] == 'USA']
# Calculate mean and median consumption in Belgium
print(np.mean(be_consumption['consumption']))
print(np.median(be_consumption['consumption']))
# Calculate mean and median consumption in USA
print(np.mean(usa_consumption['consumption']))
print(np.median(usa_consumption['consumption']))
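As a side note, both statistics can be computed for both countries at once; a minimal sketch, assuming the same food_consumption DataFrame:
# Mean and median consumption for Belgium and the USA in one grouped aggregation
both_countries = food_consumption[food_consumption['country'].isin(['Belgium', 'USA'])]
print(both_countries.groupby('country')['consumption'].agg(['mean', 'median']))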
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd
# Read CSV file
food_consumption = pd.read_csv("datasets/food_consumption.csv")
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
print(food_consumption)
# Subset for food_category equals rice
rice_consumption = food_consumption[food_consumption['food_category'] == 'rice']
# Histogram of co2_emission for rice and show plot
plt.hist(rice_consumption['co2_emission'])
plt.show()
# Subset for food_category equals rice
rice_consumption = food_consumption[food_consumption['food_category'] == 'rice']
# Calculate mean and median of co2_emission with .agg()
print(rice_consumption['co2_emission'].agg([np.mean, np.median]))
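Newer pandas versions may warn when NumPy functions are passed to .agg(); string names give the same result (a minimal sketch):
# Equivalent aggregation using string names instead of NumPy functions
print(rice_consumption['co2_emission'].agg(['mean', 'median']))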
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd
# Read CSV file
food_consumption = pd.read_csv("datasets/food_consumption.csv")
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Print variance and sd of co2_emission for each food_category
print(food_consumption.groupby('food_category')['co2_emission'].agg([np.var, np.std]))
# Create histogram of co2_emission for food_category 'beef'
food_consumption[food_consumption['food_category'] == 'beef']['co2_emission'].hist()
# Show plot
plt.show()
# Create histogram of co2_emission for food_category 'eggs'
food_consumption[food_consumption['food_category'] == 'eggs']['co2_emission'].hist()
# Show plot
plt.show()
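Because the two histograms are shown back to back, a title makes them easier to tell apart; a minimal sketch for the beef plot (eggs would follow the same pattern):
# Histogram of beef co2_emission with a title for context
food_consumption[food_consumption['food_category'] == 'beef']['co2_emission'].hist()
plt.title('co2_emission for beef')
plt.show()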
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd
# Read CSV file
food_consumption = pd.read_csv("datasets/food_consumption.csv")
# Calculate total co2_emission per country: emissions_by_country
emissions_by_country = food_consumption.groupby('country')['co2_emission'].sum()
# Compute the first and third quantiles and IQR of emissions_by_country
q1 = np.quantile(emissions_by_country, 0.25)
q3 = np.quantile(emissions_by_country, 0.75)
iqr = q3 - q1
# Calculate the lower and upper cutoffs for outliers
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
# Subset emissions_by_country to find outliers
outliers = emissions_by_country[(emissions_by_country < lower) | (emissions_by_country > upper)]
print(outliers)
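A boxplot shows the same cutoffs graphically; matplotlib's default whiskers extend 1.5 * IQR past the quartiles, so points drawn beyond them match the outliers printed above (a minimal sketch):
import matplotlib.pyplot as plt
# Boxplot of total emissions per country; points beyond the whiskers are the IQR-based outliers
plt.boxplot(emissions_by_country)
plt.show()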
# Import numpy with alias np
import numpy as np
# Import pandas
import pandas as pd
# Read CSV file
amir_deals = pd.read_csv("datasets/amir_deals.csv")
print(amir_deals)
# Count the deals for each product
counts = amir_deals['product'].value_counts()
print(counts)
# Calculate probability of picking a deal with each product
probs = counts / amir_deals.shape[0]
print(probs)
# Set random seed to 24 so the samples are reproducible
np.random.seed(24)
# Sample 5 deals without replacement
sample_without_replacement = amir_deals.sample(5)
print(sample_without_replacement)
# Sample 5 deals with replacement
sample_with_replacement = amir_deals.sample(5, replace=True)
print(sample_with_replacement)
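Alternatively, .sample() accepts its own random_state argument, which keeps a draw reproducible without setting the global NumPy seed (a minimal sketch; the seed value 24 is arbitrary):
# Reproducible sample of 5 deals using random_state instead of np.random.seed
print(amir_deals.sample(5, random_state=24))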
import pandas as pd
import numpy as np
# Create restaurant_groups DataFrame from scratch
restaurant_groups = pd.DataFrame({
    'group_id': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'group_size': [2, 4, 6, 2, 2, 2, 3, 2, 4, 2]
})
# Create probability distribution
size_dist = restaurant_groups['group_size'].value_counts() / restaurant_groups.shape[0]
# Reset index and rename columns
size_dist = size_dist.reset_index()
size_dist.columns = ['group_size', 'prob']
# Expected value of group size
expected_value = np.sum(size_dist['group_size'] * size_dist['prob'])
print(expected_value)
# Subset groups of size 4 or more
groups_4_or_more = size_dist[size_dist['group_size'] >= 4]
# Sum the probabilities of groups_4_or_more
prob_4_or_more = groups_4_or_more['prob'].sum()
print(prob_4_or_more)
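As a quick sanity check, the probabilities of a valid distribution should sum to 1 (a minimal sketch):
# Probabilities in size_dist should sum to 1
print(size_dist['prob'].sum())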
# Min and max wait times for back-up that happens every 30 min
min_time = 0
max_time = 30
# Import uniform from scipy.stats
from scipy.stats import uniform
# Calculate probability of waiting 10-20 mins
prob_between_10_and_20 = uniform.cdf(20, min_time, max_time) - uniform.cdf(10, min_time, max_time)
print(prob_between_10_and_20)
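The same cdf handles one-sided probabilities, for example the chance of waiting more than 20 minutes under the same uniform(0, 30) model (a minimal sketch):
# Probability of waiting more than 20 minutes: complement of the CDF at 20
prob_greater_than_20 = 1 - uniform.cdf(20, min_time, max_time)
print(prob_greater_than_20)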
import matplotlib.pyplot as plt
# Set random seed to 334
np.random.seed(334)
# Import uniform
from scipy.stats import uniform
# Generate 1000 wait times between 0 and 30 mins
wait_times = uniform.rvs(0, 30, size=1000)
# Create a histogram of simulated times and show plot
plt.hist(wait_times)
plt.show()
# Import binom from scipy.stats
from scipy.stats import binom
# Set random seed to 10
np.random.seed(10)
# Simulate 52 weeks of 3 deals
deals = binom.rvs(3, 0.3, size=52)
# Print mean deals won per week
print(deals.mean())
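For exact rather than simulated probabilities, binom.pmf and binom.cdf apply directly with the same n = 3 and p = 0.3 (a minimal sketch):
# Probability of winning all 3 deals in a single week
print(binom.pmf(3, 3, 0.3))
# Probability of winning 1 deal or fewer in a single week
print(binom.cdf(1, 3, 0.3))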