Project: Exploring Experimental Design in the Energy Field

The energy sector involves intricate experiments to improve efficiency and sustainability. Proper experimental design helps to maximize insights and minimize errors. Two common types of experimental design are factorial designs, which study multiple independent variables within a single experiment, and randomized block designs, which group similar experimental units into blocks to control for variance. Understanding when to use each design is crucial for energy-related studies.

An environmental research team is investigating the impact of various fuel sources on CO2 emissions across different geographical regions. The goal is to understand which assigned fuel source contributes the most to CO2 emissions and whether this varies depending on location. The team has collected data from four distinct geographical regions: North, South, East, and West. In each region, multiple fuel sources—Natural Gas, Biofuel, and Coal—are used to generate energy. The resulting CO2 emissions are measured to evaluate the environmental impact of each fuel source.

As the data scientist on this project, you have access to two datasets, each representing data from one of the two experimental designs described above. The aim is to determine whether a factorial design or a randomized block design was used for the experimental setup described above, and to analyze the corresponding dataset to identify key patterns and insights.

# Import required libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import f_oneway, ttest_ind
from statsmodels.sandbox.stats.multicomp import multipletests

# Load datasets
energy_design_a = pd.read_csv("energy_design_a.csv")
energy_design_b = pd.read_csv("energy_design_b.csv")

# Preview the first ten rows of each design
print(energy_design_a.head(10))
print(energy_design_b.head(10))

# Start coding here (use as many cells as you need)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
energy_design_a.info()
energy_design_b.info()

# List the distinct levels of every grouping column
print(energy_design_a['Fuel_Source'].unique())
print(energy_design_b['Fuel_Source'].unique())
print(energy_design_a['Building_Type'].unique())
print(energy_design_b['Geographical_Region'].unique())

# Store the grouping columns of design A as categoricals, then re-check dtypes
energy_design_a['Fuel_Source'] = energy_design_a['Fuel_Source'].astype('category')
energy_design_a['Building_Type'] = energy_design_a['Building_Type'].astype('category')
energy_design_a.info()

# Same conversion for design B
energy_design_b['Fuel_Source'] = energy_design_b['Fuel_Source'].astype('category')
energy_design_b['Geographical_Region'] = energy_design_b['Geographical_Region'].astype('category')
energy_design_b.info()

import matplotlib.pyplot as plt
import seaborn as sns

# Bar plot of CO2 emissions per geographical region, split by fuel source.
# The Axes is passed explicitly so the plot and its title do not depend on
# matplotlib's implicit "current axes" state.
fig, ax = plt.subplots()
sns.barplot(
    data=energy_design_b,
    x='Geographical_Region',
    y='CO2_Emissions',
    hue='Fuel_Source',
    ax=ax,
)
ax.set_title('CO2 emissions according to region and fuel type-bar plot')
plt.show()

# Box plot of the same breakdown, showing the spread of emissions per group
fig, ax = plt.subplots()
sns.boxplot(
    data=energy_design_b,
    x='Geographical_Region',
    y='CO2_Emissions',
    hue='Fuel_Source',
    ax=ax,
)
ax.set_title('CO2 emissions according to region and fuel type-box plot')
plt.show()

# Identify the (region, fuel source) pair with the highest median CO2 emissions

# Median emissions for every region / fuel source combination
median_co2 = energy_design_b.groupby(
    ['Geographical_Region', 'Fuel_Source'], as_index=False
)['CO2_Emissions'].median()

# Rank the combinations from highest to lowest median and keep the top row
ranked_co2 = median_co2.sort_values(by='CO2_Emissions', ascending=False)
max_co2 = ranked_co2.head(1)
print(max_co2)

# Pull the labels of the winning combination out of the single-row frame
highest_co2_region = max_co2['Geographical_Region'].iloc[0]
highest_co2_source = max_co2['Fuel_Source'].iloc[0]
print(f'highest co2 region: {highest_co2_region}')
print(f'highest co2 source: {highest_co2_source}')


# Extract the CO2 emission samples for every (region, fuel source) pair
co2_by_region = energy_design_b.groupby(['Geographical_Region','Fuel_Source'])['CO2_Emissions'].apply(list).reset_index()

# Index the samples by (region, fuel source) so each lookup below is a plain
# dict access instead of a repeated boolean-mask scan of the frame.
co2_samples = {
    (region, fuel): emissions
    for region, fuel, emissions in zip(
        co2_by_region['Geographical_Region'],
        co2_by_region['Fuel_Source'],
        co2_by_region['CO2_Emissions'],
    )
}

# Regions and fuel sources to test, in reporting order.
# NOTE(review): these labels are hard-coded and must match the values in the
# CSV exactly (e.g. 'Natural_Gas' with an underscore) - confirm against the data.
cols_gr = ['North', 'South', 'East', 'West']
cols_fuel = ['Natural_Gas', 'Biofuel', 'Coal']

# One-way ANOVA per region. Null hypothesis: within the region there is no
# difference in CO2 emissions between the fuel sources.
# flag becomes 1 as soon as any region rejects the null at the 0.05 level.
flag = 0
print('One way ANOVA results:')
for col1 in cols_gr:
    region_samples = [co2_samples[(col1, col2)] for col2 in cols_fuel]
    result = f_oneway(*region_samples)
    print(f'{col1}: {result[0],result[1]}')
    if result[1] < 0.05:
        flag = 1

# Pairwise t-tests between fuel sources within each region. They are only run
# when at least one regional ANOVA was significant. The raw p-values are
# collected in `bonferroni`, which is always defined so that the correction
# step below cannot fail with a NameError when no test was run.
bonferroni = []
diff_results = None
print('\nPairwise ttest results:')
if flag == 1:
    for nsew in cols_gr:
        for i, fuel1 in enumerate(cols_fuel):
            # Only pair fuel1 with the fuels after it, so each unordered pair
            # is tested exactly once.
            for fuel2 in cols_fuel[i + 1:]:
                ttest_result = ttest_ind(
                    co2_samples[(nsew, fuel1)],
                    co2_samples[(nsew, fuel2)],
                )
                p_val = ttest_result[1]
                bonferroni.append(p_val)
                print(nsew,':', fuel1, ' ', fuel2,' ','p_val:', str(p_val.round(6)))

# Print the Bonferroni-corrected p-values (same order as the pairwise output)
if bonferroni:
    diff_results = multipletests(bonferroni, alpha=0.05, method='bonferroni')
    print("\nBonferroni Corrected P-values:\n", diff_results[1])
else:
    print("\nNo region showed a significant ANOVA result; no pairwise p-values to correct.")