## Introduction to Statistics in Python

Run the hidden code cell below to import the data used in this course.

```
# Importing numpy and pandas
import numpy as np
import pandas as pd

# Importing the course datasets
# The deals table is bound to `amir_deals`, the name every later section reads it by.
amir_deals = pd.read_csv("datasets/amir_deals.csv")
# `deals` stays available as an alias for code written against the old name
# (the binomial section later rebinds `deals` to simulated counts).
deals = amir_deals
happiness = pd.read_csv("datasets/world_happiness.csv")
food = pd.read_csv("datasets/food_consumption.csv")
# NOTE(review): the "Discrete function" section reads `restaurant_groups`, which is
# not loaded here; its source file is not visible in this document — confirm and add it.
```

## Computing probabilities

# Number of deals recorded for each product
counts = amir_deals["product"].value_counts()

### Share of all deals that each product accounts for

total_deals = len(amir_deals)

probs = counts / total_deals

print(probs)

## Discrete function

### Create probability distribution

# Relative frequency of each group size: count per size divided by the number of rows
size_dist = restaurant_groups["group_size"].value_counts() / restaurant_groups["group_size"].shape[0]

### Reset index and rename columns

size_dist = size_dist.reset_index()

size_dist.columns = ["group_size", "prob"]

print(size_dist)

### Expected value

# Each group size weighted by its probability, then summed
expected_value = np.sum(size_dist["group_size"] * size_dist["prob"])

print(expected_value)

### Subset groups of size 4 or more

groups_4_or_more = size_dist[size_dist["group_size"] >= 4]

### Sum the probabilities of groups_4_or_more

prob_4_or_more = np.sum(groups_4_or_more["prob"])

print(prob_4_or_more)

## Continuous distribution

### Min and max wait times for back-up that happens every 30 min

min_time = 0

max_time = 30

### Import uniform from scipy.stats

from scipy.stats import uniform

### Width of the wait-time interval

# scipy's uniform is parameterised by loc (lower bound) and scale (width), i.e. it
# covers [loc, loc + scale], so the width must be passed rather than the upper bound.
wait_range = max_time - min_time

### Calculate probability of waiting more than 5 mins

prob_greater_than_5 = 1 - uniform.cdf(5, loc=min_time, scale=wait_range)

print(prob_greater_than_5)

### Calculate probability of waiting 10-20 mins

prob_between_10_and_20 = uniform.cdf(20, loc=min_time, scale=wait_range) - uniform.cdf(10, loc=min_time, scale=wait_range)

print(prob_between_10_and_20)

### Generate 1000 wait times between 0 and 30 mins

wait_times = uniform.rvs(loc=min_time, scale=wait_range, size=1000)

### Create a histogram of simulated times and show plot

# NOTE(review): `plt` is not imported anywhere in the visible file — presumably
# matplotlib.pyplot supplied by the environment; confirm it is in scope.
plt.hist(wait_times)

plt.show()

## Binomial distribution

### Bring in the binomial distribution

from scipy.stats import binom

### Fix the seed at 10 so the simulated draws are repeatable

np.random.seed(10)

### Shared parameters: 3 deals per week, each won with probability 0.3

n_deals = 3

win_rate = 0.3

### One simulated week of 3 deals

print(binom.rvs(n=n_deals, p=win_rate, size=1))

### 52 simulated weeks of 3 deals each

deals = binom.rvs(n=n_deals, p=win_rate, size=52)

### Average number of deals won per week

print(deals.mean())

### Chance of winning all 3 of the 3 deals

prob_3 = binom.pmf(3, n=n_deals, p=win_rate)

print(prob_3)

### Chance of winning at most 1 of the 3 deals

prob_less_than_or_equal_1 = binom.cdf(1, n=n_deals, p=win_rate)

print(prob_less_than_or_equal_1)

### Chance of winning more than 1 of the 3 deals (complement of the above)

prob_greater_than_1 = 1 - prob_less_than_or_equal_1

print(prob_greater_than_1)

### Expected wins at a 30% win rate

won_30pct = n_deals * win_rate

print(won_30pct)

### Expected wins at a 25% win rate

won_25pct = n_deals * 0.25

print(won_25pct)

## Normal distribution

### Import norm from scipy.stats

from scipy.stats import norm

### Probability of deal < 7500

# All calls below use a normal distribution with mean 5000 and standard deviation 2000
prob_less_7500 = norm.cdf(7500, 5000, 2000)

### Probability of deal > 1000

prob_over_1000 = 1 - norm.cdf(1000, 5000, 2000)

### Probability of deal between 3000 and 7000

prob_3000_to_7000 = norm.cdf(7000, 5000, 2000) - norm.cdf(3000, 5000, 2000)

### Calculate amount that 25% of deals will be less than

# ppf is the inverse of cdf: the amount below which 25% of the distribution lies
pct_25 = norm.ppf(0.25, 5000, 2000)

###################################################################################

### Scale the current average (5000) up by 20%

new_mean = 1.2 * 5000

### Scale the current standard deviation (2000) up by 30%

new_sd = 1.3 * 2000

### Draw 36 simulated sales from the normal distribution with the new parameters

new_sales = norm.rvs(loc=new_mean, scale=new_sd, size=36)

### Plot the simulated sales as a histogram

plt.hist(new_sales)

plt.show()

## Central limit theorem

### Set seed to 104

# pandas' sample() draws from NumPy's global random state when no random_state is
# passed, so seeding NumPy here makes the samples below repeatable.
np.random.seed(104)

### Sample 20 num_users with replacement from amir_deals

samp_20 = amir_deals['num_users'].sample(20, replace=True)

### Take mean of samp_20

print(np.mean(samp_20))

sample_means = []

### Loop 100 times

for _ in range(100):
    # Take sample of 20 num_users
    samp_20 = amir_deals['num_users'].sample(20, replace=True)
    # Calculate mean of samp_20
    samp_20_mean = np.mean(samp_20)
    # Append samp_20_mean to sample_means
    sample_means.append(samp_20_mean)

### Convert to Series and plot histogram

sample_means_series = pd.Series(sample_means)

sample_means_series.hist()

### Show plot

# NOTE(review): `plt` is not imported anywhere in the visible file — presumably
# matplotlib.pyplot supplied by the environment; confirm it is in scope.
plt.show()