Introduction to Statistics in Python

    Run the hidden code cell below to import the data used in this course.

    Take Notes

    Add notes about the concepts you've learned and code cells with code you want to keep.

    Add your notes here

    # Add your code snippets here
    # Import numpy with alias np
    import numpy as np
    # Import pandas
    import pandas as pd
    
    # Read CSV file
    food_consumption = pd.read_csv("datasets/food_consumption.csv")
    
    print(food_consumption)
    
    # Filter for Belgium
    be_consumption = food_consumption[food_consumption['country']=='Belgium']
    
    # Filter for USA
    usa_consumption = food_consumption[food_consumption['country']=='USA']
    
    # Calculate mean and median consumption in Belgium
    print(np.mean(be_consumption['consumption']))
    print(np.median(be_consumption['consumption']))
    
    # Calculate mean and median consumption in USA
    print(np.mean(usa_consumption['consumption']))
    print(np.median(usa_consumption['consumption']))
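    
    # (Optional sketch) The same comparison can be done in one step with groupby,
    # assuming the same 'country' and 'consumption' columns used above.
    be_usa = food_consumption[food_consumption['country'].isin(['Belgium', 'USA'])]
    print(be_usa.groupby('country')['consumption'].agg([np.mean, np.median]))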
    # Import numpy with alias np
    import numpy as np
    # Import pandas
    import pandas as pd
    
    # Read CSV file
    food_consumption = pd.read_csv("datasets/food_consumption.csv")
    
    # Import matplotlib.pyplot with alias plt
    import matplotlib.pyplot as plt
    
    print(food_consumption)
    
    # Subset for food_category equals rice
    rice_consumption = food_consumption[food_consumption['food_category']=='rice']
    
    # Histogram of co2_emission for rice and show plot
    plt.hist(rice_consumption['co2_emission'])
    plt.show()
    
    # Subset for food_category equals rice
    rice_consumption = food_consumption[food_consumption['food_category'] == 'rice']
    
    # Calculate mean and median of co2_emission with .agg()
    print(rice_consumption['co2_emission'].agg([np.mean, np.median]))
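    
    # (Optional sketch) Newer pandas versions prefer string names over numpy
    # functions inside .agg(); this is assumed to give the same result as above.
    print(rice_consumption['co2_emission'].agg(['mean', 'median']))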
    # Import numpy with alias np
    import numpy as np
    # Import pandas
    import pandas as pd
    
    # Read CSV file
    food_consumption = pd.read_csv("datasets/food_consumption.csv")
    
    # Import matplotlib.pyplot with alias plt
    import matplotlib.pyplot as plt
    
    # Print variance and sd of co2_emission for each food_category
    print(food_consumption.groupby('food_category')['co2_emission'].agg([np.var, np.std]))
    
    # Create histogram of co2_emission for food_category 'beef'
    food_consumption[food_consumption['food_category'] == 'beef']['co2_emission'].hist()
    # Show plot
    plt.show()
    
    # Create histogram of co2_emission for food_category 'eggs'
    food_consumption[food_consumption['food_category'] == 'eggs']['co2_emission'].hist()
    # Show plot
    plt.show()
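    
    # (Optional sketch) Quantiles are another way to describe spread; this splits
    # co2_emission into quartiles with np.linspace (not part of the original cell).
    print(np.quantile(food_consumption['co2_emission'], np.linspace(0, 1, 5)))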
    # Import numpy with alias np
    import numpy as np
    # Import pandas
    import pandas as pd
    
    # Read CSV file
    food_consumption = pd.read_csv("datasets/food_consumption.csv")
    
    # Calculate total co2_emission per country: emissions_by_country
    emissions_by_country = food_consumption.groupby('country')['co2_emission'].sum()
    
    # Compute the first and third quantiles and IQR of emissions_by_country
    q1 = np.quantile(emissions_by_country, 0.25)
    q3 = np.quantile(emissions_by_country, 0.75)
    iqr = q3 - q1
    
    # Calculate the lower and upper cutoffs for outliers
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    
    # Subset emissions_by_country to find outliers
    outliers = emissions_by_country[(emissions_by_country < lower) | (emissions_by_country > upper)]
    print(outliers)
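    
    # (Optional sketch) pandas' own .quantile() method should give the same
    # cutoff values as np.quantile above.
    print(emissions_by_country.quantile(0.25), emissions_by_country.quantile(0.75))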
    # Import numpy with alias np
    import numpy as np
    # Import pandas
    import pandas as pd
    
    # Read CSV file
    amir_deals = pd.read_csv("datasets/amir_deals.csv")
    
    print(amir_deals)
    
    # Count the deals for each product
    counts = amir_deals['product'].value_counts()
    print(counts)
    
    # Calculate probability of picking a deal with each product
    probs = counts / amir_deals.shape[0]
    print(probs)
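    
    # (Optional sketch) value_counts(normalize=True) computes the same
    # probabilities directly, without dividing by the row count.
    print(amir_deals['product'].value_counts(normalize=True))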
    np.random.seed(24)
    
    # Sample 5 deals without replacement
    sample_without_replacement = amir_deals.sample(5)
    print(sample_without_replacement)
    
    # Sample 5 deals with replacement
    sample_with_replacement = amir_deals.sample(5, replace=True)
    print(sample_with_replacement)
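    
    # (Optional sketch) Passing random_state to .sample() makes a single draw
    # reproducible without setting the global numpy seed.
    print(amir_deals.sample(5, random_state=24))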
    import pandas as pd
    import numpy as np
    
    # Create restaurant_groups DataFrame from scratch
    restaurant_groups = pd.DataFrame({
        'group_id': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
        'group_size': [2, 4, 6, 2, 2, 2, 3, 2, 4, 2]
    })
    
    # Create probability distribution
    size_dist = restaurant_groups['group_size'].value_counts() / restaurant_groups.shape[0]
    # Reset index and rename columns
    size_dist = size_dist.reset_index()
    size_dist.columns = ['group_size', 'prob']
    
    # Expected value
    expected_value = np.sum(size_dist['group_size'] * size_dist['prob'])
    print(expected_value)
    
    # Subset groups of size 4 or more
    groups_4_or_more = size_dist[size_dist['group_size']>=4]
    
    # Sum the probabilities of groups_4_or_more
    prob_4_or_more = groups_4_or_more['prob'].sum()
    print(prob_4_or_more)
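    
    # (Optional sketch) Since every group is equally likely to be picked, the
    # expected value should match the mean group size in the raw data.
    print(restaurant_groups['group_size'].mean())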
    # Min and max wait times for back-up that happens every 30 min
    min_time = 0
    max_time = 30
    
    # Import uniform from scipy.stats
    from scipy.stats import uniform
    
    # Calculate probability of waiting 10-20 mins
    prob_between_10_and_20 = uniform.cdf(20, min_time, max_time) - uniform.cdf(10, min_time, max_time)
    print(prob_between_10_and_20)
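    
    # (Optional sketch) Other interval probabilities follow the same pattern,
    # e.g. waiting less than 5 minutes or more than 20 minutes.
    print(uniform.cdf(5, min_time, max_time))
    print(1 - uniform.cdf(20, min_time, max_time))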
    import matplotlib.pyplot as plt
    
    # Set random seed to 334
    np.random.seed(334)
    
    # Import uniform
    from scipy.stats import uniform
    
    # Generate 1000 wait times between 0 and 30 mins
    wait_times = uniform.rvs(0, 30, size=1000)
    
    # Create a histogram of simulated times and show plot
    plt.hist(wait_times)
    plt.show()
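    
    # (Optional sketch) The simulated mean should be close to the theoretical
    # mean of a uniform(0, 30) distribution, (0 + 30) / 2 = 15.
    print(np.mean(wait_times))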
    # Import binom from scipy.stats
    from scipy.stats import binom
    
    # Set random seed to 10
    np.random.seed(10)
    
    # Simulate 52 weeks of 3 deals
    deals = binom.rvs(3, 0.3, size=52)
    
    # Print mean deals won per week
    print(deals.mean())
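    
    # (Optional sketch) The theoretical mean is n * p = 3 * 0.3 = 0.9 deals per
    # week; binom.pmf gives the chance of winning all 3 deals in a week.
    print(3 * 0.3)
    print(binom.pmf(3, 3, 0.3))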