Introduction to Statistics in Python
Run the hidden code cell below to import the data used in this course.
# Importing numpy and pandas
import numpy as np
import pandas as pd
# Importing the course datasets
# The exercises below refer to the deals table as `amir_deals`, so it is
# loaded under that name. `deals` is kept as an alias for backward
# compatibility (it is rebound to simulated data in the binomial section).
amir_deals = pd.read_csv("datasets/amir_deals.csv")
deals = amir_deals
happiness = pd.read_csv("datasets/world_happiness.csv")
food = pd.read_csv("datasets/food_consumption.csv")
Computing probabilities
# Count the deals for each product
# NOTE(review): assumes amir_deals is the DataFrame read from
# datasets/amir_deals.csv — confirm it is bound under this name.
counts = amir_deals['product'].value_counts()

# Calculate probability of picking a deal with each product
probs = counts / amir_deals.shape[0]
print(probs)
Discrete function
# Create probability distribution: share of rows for each group size
# NOTE(review): restaurant_groups is not defined in the visible part of this
# file — confirm where it is loaded before running this cell.
group_sizes = restaurant_groups["group_size"]
size_dist = group_sizes.value_counts() / group_sizes.shape[0]

# Reset index and rename columns
# Columns are renamed positionally: first the group size, then its probability.
size_dist = size_dist.reset_index()
size_dist.columns = ["group_size", "prob"]
print(size_dist)
Expected value
# Expected value: each group size weighted by its probability, summed
expected_value = np.sum(size_dist["group_size"] * size_dist["prob"])
print(expected_value)

# Subset groups of size 4 or more
groups_4_or_more = size_dist[size_dist["group_size"] >= 4]

# Sum the probabilities of groups_4_or_more
prob_4_or_more = np.sum(groups_4_or_more["prob"])
print(prob_4_or_more)
Continuous distribution
# Min and max wait times for back-up that happens every 30 min
min_time = 0
max_time = 30

# Import uniform from scipy.stats
from scipy.stats import uniform

# scipy's uniform is parameterised by (loc, scale) and covers
# [loc, loc + scale], so the second argument is the interval width rather
# than its upper bound. The two coincide here only because min_time is 0.
wait_range = max_time - min_time

# Calculate probability of waiting more than 5 mins
prob_greater_than_5 = 1 - uniform.cdf(5, min_time, wait_range)
print(prob_greater_than_5)

# Calculate probability of waiting 10-20 mins
prob_between_10_and_20 = (
    uniform.cdf(20, min_time, wait_range) - uniform.cdf(10, min_time, wait_range)
)
print(prob_between_10_and_20)

# Generate 1000 wait times between min_time and max_time (0 and 30 mins)
wait_times = uniform.rvs(min_time, wait_range, size=1000)
# Create a histogram of simulated times and show plot
# NOTE(review): `plt` is not imported anywhere in the visible file — it
# presumably refers to matplotlib.pyplot; confirm the import exists upstream.
plt.hist(wait_times)
plt.show()
Binomial distribution
# Import binom from scipy.stats
from scipy.stats import binom

# Set random seed to 10
# binom.rvs draws from NumPy's global random state when no random_state is
# passed, so seeding NumPy makes the simulations below reproducible.
np.random.seed(10)

# Simulate 1 week of 3 deals
print(binom.rvs(3, 0.3, size=1))

# Simulate 52 weeks of 3 deals
# NOTE: this rebinds `deals`, which the data-loading cell also assigns.
deals = binom.rvs(3, 0.3, size=52)

# Print mean deals won per week
print(np.mean(deals))
# Probability of closing 3 out of 3 deals
prob_3 = binom.pmf(3, 3, 0.3)
print(prob_3)

# Probability of closing <= 1 deal out of 3 deals
prob_less_than_or_equal_1 = binom.cdf(1, 3, 0.3)
print(prob_less_than_or_equal_1)

# Probability of closing > 1 deal out of 3 deals
# (complement of the "<= 1" probability above)
prob_greater_than_1 = 1 - binom.cdf(1, 3, 0.3)
print(prob_greater_than_1)

# Expected number won with 30% win rate (n * p for a binomial)
won_30pct = 3 * 0.3
print(won_30pct)

# Expected number won with 25% win rate
won_25pct = 3 * 0.25
print(won_25pct)
Normal distribution
# Import norm from scipy.stats (not imported elsewhere in the visible file)
from scipy.stats import norm

# Deal amounts are modelled as normal with mean 5000 and standard deviation 2000.

# Probability of deal < 7500
prob_less_7500 = norm.cdf(7500, 5000, 2000)

# Probability of deal > 1000
prob_over_1000 = 1 - norm.cdf(1000, 5000, 2000)

# Probability of deal between 3000 and 7000
prob_3000_to_7000 = norm.cdf(7000, 5000, 2000) - norm.cdf(3000, 5000, 2000)

# Calculate amount that 25% of deals will be less than
pct_25 = norm.ppf(0.25, 5000, 2000)
###################################################################################
# Import norm from scipy.stats (not imported elsewhere in the visible file)
from scipy.stats import norm

# Calculate new average amount (20% above the current mean of 5000)
new_mean = 5000 * 1.2

# Calculate new standard deviation (30% above the current 2000)
new_sd = 2000 * 1.3

# Simulate 36 new sales
new_sales = norm.rvs(new_mean, new_sd, size=36)
# Create histogram and show
# NOTE(review): `plt` is not imported anywhere in the visible file — it
# presumably refers to matplotlib.pyplot; confirm the import exists upstream.
plt.hist(new_sales)
plt.show()
Central limit theorem
# Set seed to 104
np.random.seed(104)

# Sample 20 num_users with replacement from amir_deals
samp_20 = amir_deals['num_users'].sample(20, replace=True)

# Take mean of samp_20
print(np.mean(samp_20))
# Collect the mean of each repeated sample
sample_means = []

# Loop 100 times
for _ in range(100):
    # Take sample of 20 num_users
    samp_20 = amir_deals['num_users'].sample(20, replace=True)
    # Calculate mean of samp_20
    samp_20_mean = np.mean(samp_20)
    # Append samp_20_mean to sample_means
    sample_means.append(samp_20_mean)
# Convert to Series and plot histogram
sample_means_series = pd.Series(sample_means)
sample_means_series.hist()

# Show plot
# NOTE(review): `plt` is not imported anywhere in the visible file — it
# presumably refers to matplotlib.pyplot; confirm the import exists upstream.
plt.show()