# Introduction to Statistics in Python
#
# Run the hidden code cell below to import the data used in this course.

# Importing numpy and pandas
import numpy as np
import pandas as pd

# Importing the course datasets
# The exercises below refer to the deals table as `amir_deals`, so it is bound
# under that name; `deals` is kept as an alias for the original binding.
amir_deals = pd.read_csv("datasets/amir_deals.csv")
deals = amir_deals
happiness = pd.read_csv("datasets/world_happiness.csv")
food = pd.read_csv("datasets/food_consumption.csv")

# Computing probabilities

# Count the deals for each product
counts = amir_deals['product'].value_counts()

# Calculate probability of picking a deal with each product
probs = counts / amir_deals.shape[0]
print(probs)

# Discrete function

# NOTE(review): restaurant_groups is not defined in the visible part of this
# file; it has to be loaded or created before this section runs.

# Create probability distribution
group_sizes = restaurant_groups["group_size"]
size_dist = group_sizes.value_counts() / group_sizes.shape[0]

# Reset index and rename columns
size_dist = size_dist.reset_index()
size_dist.columns = ["group_size", "prob"]
print(size_dist)

# Expected value: each group size weighted by its probability
expected_value = np.sum(size_dist["group_size"] * size_dist["prob"])
print(expected_value)

# Subset groups of size 4 or more
groups_4_or_more = size_dist[size_dist["group_size"] >= 4]

# Sum the probabilities of groups_4_or_more
prob_4_or_more = np.sum(groups_4_or_more["prob"])
print(prob_4_or_more)

# Continuous distribution

# Import uniform from scipy.stats
from scipy.stats import uniform

# Min and max wait times for back-up that happens every 30 min
min_time = 0
max_time = 30

# scipy's uniform is parameterised as (loc, scale) and covers
# [loc, loc + scale], so the interval width is passed as scale rather than
# the upper bound. The two only coincide while min_time is 0.
wait_width = max_time - min_time

# Calculate probability of waiting more than 5 mins
prob_greater_than_5 = uniform.sf(5, min_time, wait_width)
print(prob_greater_than_5)

# Calculate probability of waiting 10-20 mins
prob_between_10_and_20 = (
    uniform.cdf(20, min_time, wait_width) - uniform.cdf(10, min_time, wait_width)
)
print(prob_between_10_and_20)

# Generate 1000 wait times between 0 and 30 mins
wait_times = uniform.rvs(min_time, wait_width, size=1000)

# Create a histogram of simulated times and show plot
# NOTE(review): plt (presumably matplotlib.pyplot) is not imported in the
# visible part of this file; it must be in scope before this runs.
plt.hist(wait_times)
plt.show()

# Binomial distribution

# Import binom from scipy.stats
from scipy.stats import binom

# Set random seed to 10
np.random.seed(10)

# Simulate 1 week of 3 deals
print(binom.rvs(3, 0.3, size=1))

# Simulate 52 weeks of 3 deals
# NOTE(review): this rebinds `deals`, the name the data-loading cell assigns
# the amir_deals.csv table to.
deals = binom.rvs(3, 0.3, size=52)

# Print mean deals won per week
print(np.mean(deals))

# Probability of closing 3 out of 3 deals
prob_3 = binom.pmf(3, 3, 0.3)
print(prob_3)

# Probability of closing <= 1 deal out of 3 deals
prob_less_than_or_equal_1 = binom.cdf(1, 3, 0.3)
print(prob_less_than_or_equal_1)

# Probability of closing > 1 deal out of 3 deals
# sf is the survival function, i.e. 1 - cdf, computed directly.
prob_greater_than_1 = binom.sf(1, 3, 0.3)
print(prob_greater_than_1)

# Expected number won with 30% win rate (number of deals * win probability)
won_30pct = 3 * 0.3
print(won_30pct)

# Expected number won with 25% win rate
won_25pct = 3 * 0.25
print(won_25pct)

# Normal distribution

# Import norm from scipy.stats (it is used below but not imported earlier)
from scipy.stats import norm

# All calls below use a normal distribution with mean 5000 and
# standard deviation 2000.

# Probability of deal < 7500
prob_less_7500 = norm.cdf(7500, 5000, 2000)

# Probability of deal > 1000
# sf is the survival function, i.e. 1 - cdf, computed directly.
prob_over_1000 = norm.sf(1000, 5000, 2000)

# Probability of deal between 3000 and 7000
prob_3000_to_7000 = norm.cdf(7000, 5000, 2000) - norm.cdf(3000, 5000, 2000)

# Calculate amount that 25% of deals will be less than
pct_25 = norm.ppf(0.25, 5000, 2000)

###################################################################################

# Import norm from scipy.stats (it is used below but not imported earlier)
from scipy.stats import norm

# Calculate new average amount (5000 scaled by 1.2)
new_mean = 5000 * 1.2

# Calculate new standard deviation (2000 scaled by 1.3)
new_sd = 2000 * 1.3

# Simulate 36 new sales
new_sales = norm.rvs(new_mean, new_sd, size=36)

# Create histogram and show
# NOTE(review): plt (presumably matplotlib.pyplot) is not imported in the
# visible part of this file; it must be in scope before this runs.
plt.hist(new_sales)
plt.show()

# Central limit theorem

# Set seed to 104
np.random.seed(104)

# Sample 20 num_users with replacement from amir_deals
samp_20 = amir_deals['num_users'].sample(20, replace=True)

# Take mean of samp_20
print(np.mean(samp_20))

# Collect the mean of each resample
sample_means = []

# Loop 100 times
for i in range(100):
    # Take sample of 20 num_users
    samp_20 = amir_deals['num_users'].sample(20, replace=True)

    # Calculate mean of samp_20
    samp_20_mean = np.mean(samp_20)

    # Append samp_20_mean to sample_means
    sample_means.append(samp_20_mean)

# Convert to Series and plot histogram
sample_means_series = pd.Series(sample_means)
sample_means_series.hist()

# Show plot
# NOTE(review): plt (presumably matplotlib.pyplot) is not imported in the
# visible part of this file; it must be in scope before this runs.
plt.show()