Skip to content
0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
ozone = pd.read_csv('data/ozone.csv')
ozone.head()
ozone.isnull().sum()


#Phase 1. Organizing the data
ozone_initial = ozone.shape[0]

      
#fixing column titles
ozone.columns = ozone.columns.str.replace(' ', '_').str.replace('-', '_').str.lower()
print(ozone.columns)

#dropna in concentration
ozone.dropna(subset=['daily_max_8_hour_ozone_concentration', 'daily_aqi_value'], inplace=True)
ozone_after_na_drop = ozone.shape[0]

print(f"DataFrame now has {ozone_after_na_drop} rows.")

#dropduplicates
ozone.drop_duplicates(inplace=True)
ozone_dropped_duplicates = ozone.shape[0]

#NA in method_code - replace with mode
mode_method = ozone['method_code'].mode()[0] 
ozone['method_code'].fillna(mode_method, inplace=True)

#converting "date" to datetime format

ozone['date'] = pd.to_datetime(ozone['date'], errors='coerce')
print(f"New 'date' dtype: {ozone['date'].dtype}")
parsed_date_nas = ozone['date'].isnull().sum()
if parsed_date_nas > 0:
    print(f"WARNING: {parsed_date_nas} 'date' values failed to parse and were converted to NaT.")

#Drop rows with NaT dates
    ozone_before_date_na_drop = ozone.shape[0]
    ozone.dropna(subset=['date'], inplace=True)
    ozone_after_date_na_drop = ozone.shape[0]
    print(f"Dropped {ozone_before_date_na_drop - ozone_after_date_na_drop} rows with unparseable dates.")
    print(f"DataFrame now has {ozone_after_date_na_drop} rows.")
else:
    print("All 'date' values parsed successfully.")


print(ozone["date"].isnull().sum())
unique_DTvalues = np.unique(ozone['date'])
#handle CBSA variables - fill numerical CBSA_code with 0 and CBSA_name with 'Non-CBSA'

ozone['cbsa_code'].fillna(0, inplace=True)
ozone['cbsa_name'].fillna('Non-CBSA', inplace=True)


# --- Final Check of Missing Values and Info ---
print("\n--- Final missing Values Count after cleaning ---")
print(ozone.isnull().sum())
print("\n--- Final DF info after cleaning ---")
ozone.info()
from scipy import stats
import numpy as np
import pandas as pd 

z_concentration = np.abs(stats.zscore(ozone['daily_max_8_hour_ozone_concentration']))

threshold_z = 2

ozone_no_outliers_concentration = ozone[z_concentration <= threshold_z].copy()


print("Original DataFrame Shape:", ozone.shape)
print("DataFrame Shape after Removing Outliers (Concentration):", ozone_no_outliers_concentration.shape)



z_aqi = np.abs(stats.zscore(ozone['daily_aqi_value']))
ozone_no_outliers_aqi = ozone_no_outliers_concentration[z_aqi <= threshold_z].copy() # Apply to the already filtered DF

print("DataFrame Shape after Removing Outliers (AQI as well):", ozone_no_outliers_aqi.shape)

ozone = ozone_no_outliers_aqi.copy()

print("\nFinal DataFrame Shape after all outlier removal:", ozone.shape)
print("Outlier removal complete.")


#Phase 2: Exploratory Data Analysis (EDA)

# Look at seasonal patterns

ozone['date'] = pd.to_datetime(ozone['date'])
ozone['month'] = ozone['date'].dt.month
ozone['day_of_year'] = ozone['date'].dt.dayofyear


monthly_ozone = ozone.groupby("month")['daily_max_8_hour_ozone_concentration'].agg(['mean', 'std', 'count'])
print("Monthly ozone patterns:")
print(monthly_ozone)

# Check for unusual values and outliers
print("Checking for unusual ozone values:")
print("Zero ozone readings:", (ozone['daily_max_8_hour_ozone_concentration'] == 0).sum())
print("Very high ozone readings (>0.1 ppm):", (ozone['daily_max_8_hour_ozone_concentration'] > 0.1).sum())

# Look at the highest ozone days
high_ozone = ozone[ozone['daily_max_8_hour_ozone_concentration'] > 0.1].sort_values('daily_max_8_hour_ozone_concentration', ascending=False)
print("\
Top 10 highest ozone days:")
print(high_ozone[['date', 'local_site_name', 'daily_max_8_hour_ozone_concentration', 'daily_aqi_value']].head(10))


# Calculate correlation coefficient
correlation = ozone['daily_max_8_hour_ozone_concentration'].corr(ozone['daily_aqi_value'])
print("Correlation between ozone concentration and AQI:", correlation)

# Look at geographic patterns
print("\
Top 10 locations by average ozone concentration:")
location_ozone = ozone.groupby('local_site_name')['daily_max_8_hour_ozone_concentration'].agg(['mean', 'count']).sort_values('mean', ascending=False)
print(location_ozone.head(10))
#Now we can visualize monthly trends 

plt.figure(figsize=(12, 7)) # Adjust figure size for better readability
sns.boxplot(data=ozone, x='month', y="daily_max_8_hour_ozone_concentration", palette='coolwarm_r')
sns.lineplot(data=ozone, x='month', y="daily_max_8_hour_ozone_concentration", color='darkred', linewidth=2)

plt.title('Daily Max 8-hour Ozone Concentration by Month', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Ozone Concentration (ppm)', fontsize=12)
plt.xticks(ticks=range(12), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']) # More readable month labels
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
#How does daily maximum 8-hour ozone concentration vary over time and regions?

# Group locations by major regions in California
print("Available geographic columns:")
print("- county:", ozone['county'].nunique(), "unique counties")
print("- county_fips_code:", ozone['county_fips_code'].nunique(), "unique FIPS codes")
print("- cbsa_name:", ozone['cbsa_name'].nunique(), "unique CBSA areas")
print("- cbsa_code:", ozone['cbsa_code'].nunique(), "unique CBSA codes")

print("\
County distribution:")
county_counts = ozone['county'].value_counts()
print(county_counts.head(15))

print("\
CBSA (Metropolitan Area) distribution:")
cbsa_counts = ozone['cbsa_name'].value_counts()
print(cbsa_counts.head(10))

# --- WEEKEND VS. WEEKDAY ANALYSIS ---
# Create a new column to identify weekends
ozone['is_weekend'] = ozone['date'].dt.dayofweek >= 5

print("Ozone concentration by weekend vs. weekday:")
weekend_weekday_summary = ozone.groupby('is_weekend')['daily_max_8_hour_ozone_concentration'].describe()
print(weekend_weekday_summary)

# Visualization
plt.figure(figsize=(8, 6))
sns.boxplot(x='is_weekend', y='daily_max_8_hour_ozone_concentration', data=ozone)
plt.title('Daily Max 8-hour Ozone Concentration: Weekday vs. Weekend')
plt.xlabel('Is Weekend? (False=Weekday, True=Weekend)')
plt.ylabel('Ozone Concentration (ppm)')
plt.xticks([0, 1], ['Weekday', 'Weekend'])
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


# --- METHODOLOGY COMPARISON
print("\nOzone concentration by measurement method:")
# Group by 'Method Code' and get summary statistics
method_summary = ozone.groupby('method_code')['daily_max_8_hour_ozone_concentration'].describe()
print(method_summary)

# Visualize the comparison using a boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x='method_code', y='daily_max_8_hour_ozone_concentration', data=ozone)
plt.title('Daily Max 8-hour Ozone Concentration by Measurement Method')
plt.xlabel('Method Code')
plt.ylabel('Ozone Concentration (ppm)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
#mapping

# Create site-level aggregated data for mapping
import geopandas as gpd
from shapely.geometry import Point

# Aggregate data by site location
site_data = ozone.groupby(['site_latitude', 'site_longitude', 'local_site_name', 'county']).agg({
    'daily_max_8_hour_ozone_concentration': ['mean', 'max', 'count'],
    'daily_aqi_value': ['mean', 'max']
}).round(4)

# Flatten column names
site_data.columns = ['_'.join(col).strip() for col in site_data.columns]
site_data = site_data.reset_index()

# Rename columns for clarity
site_data.rename(columns={
    'daily_max_8_hour_ozone_concentration_mean': 'avg_ozone_ppm',
    'daily_max_8_hour_ozone_concentration_max': 'max_ozone_ppm', 
    'daily_max_8_hour_ozone_concentration_count': 'measurement_count',
    'daily_aqi_value_mean': 'avg_aqi',
    'daily_aqi_value_max': 'max_aqi'
}, inplace=True)

print("Site-level data created:")
print("Shape:", site_data.shape)
print("Latitude range:", site_data['site_latitude'].min(), "to", site_data['site_latitude'].max())
print("Longitude range:", site_data['site_longitude'].min(), "to", site_data['site_longitude'].max())
print("Measurement count range:", site_data['measurement_count'].min(), "to", site_data['measurement_count'].max())
print("Average ozone range:", site_data['avg_ozone_ppm'].min(), "to", site_data['avg_ozone_ppm'].max())

print("\
Top 10 sites by average ozone concentration:")
print(site_data.nlargest(10, 'avg_ozone_ppm')[['local_site_name', 'county', 'avg_ozone_ppm', 'measurement_count']])
geometry = [Point(xy) for xy in zip(site_data['site_longitude'], site_data['site_latitude'])]
gdf_sites = gpd.GeoDataFrame(site_data, geometry=geometry, crs='EPSG:4326')

print("GeoDataFrame created with", len(gdf_sites), "sites")

# Try to get California county boundaries from a built-in source
try:
    # Download California counties from the US Census Bureau
    import requests
    import zipfile
    import io
    
    # US Counties shapefile URL
    url = "https://www2.census.gov/geo/tiger/GENZ2021/shp/cb_2021_us_county_20m.zip"
    
    print("Downloading US counties shapefile...")
    response = requests.get(url)
    
    if response.status_code == 200:
        # Extract the zip file
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
            zip_file.extractall("counties_data")
        
        # Load the shapefile
        counties = gpd.read_file("counties_data/cb_2021_us_county_20m.shp")
        
        # Filter for California (STATEFP = '06')
        ca_counties = counties[counties['STATEFP'] == '06'].copy()
        ca_counties = ca_counties.to_crs('EPSG:4326')  # Ensure same CRS
        
        print("California counties loaded:", len(ca_counties), "counties")
        print("Sample county names:", ca_counties['NAME'].head().tolist())
        
    else:
        print("Failed to download counties data, status code:", response.status_code)
        ca_counties = None
        
except Exception as e:
    print("Error loading counties:", str(e))
    ca_counties = None
# Folium?
import folium
import numpy as np


# --- 1. SITE DATA
site_data = ozone.groupby(['site_id', 'local_site_name', 'county', 'site_latitude', 'site_longitude']).agg(
    avg_ozone_ppm=('daily_max_8_hour_ozone_concentration', 'mean'),
    max_ozone_ppm=('daily_max_8_hour_ozone_concentration', 'max'),
    measurement_count=('date', 'count')
).reset_index()

# --- 2. Creating a base map centered on California ---
ca_center = [37.0, -119.5]  # Approximate center of California
m = folium.Map(location=ca_center, zoom_start=6, tiles='OpenStreetMap')

# Add county boundaries
if ca_counties is not None:
    folium.GeoJson(
        ca_counties,
        name='California Counties',
        style_function=lambda x: {
            'fillColor': 'none', # No fill, just borders
            'color': 'darkgray',
            'weight': 0.5,
            'dashArray': '5, 5'
        }
    ).add_to(m)





# --- 3. Creating color function---
def get_color(ozone_ppm):
    if ozone_ppm < 0.04:
        return 'green'
    elif ozone_ppm < 0.05:
        return 'orange'
    else:
        return 'red'

# --- 4. Add markers for each site ---
min_ozone = site_data['avg_ozone_ppm'].min()
max_ozone = site_data['avg_ozone_ppm'].max()

site_data_filtered = site_data[site_data['measurement_count'] >= 30].copy() # set threshold


for idx, row in site_data.iterrows():
    # Calculate normalized ozone for size, using full range for normalization
    normalized_ozone_for_size = (row['avg_ozone_ppm'] - min_ozone) / (max_ozone - min_ozone)
    color = get_color(row['avg_ozone_ppm']) # Use direct ppm for color

    # Create popup text
    popup_text = f"""
    <b>{row['local_site_name']}</b><br>
    County: {row['county']}<br>
    Avg Ozone: {row['avg_ozone_ppm']:.4f} ppm<br>
    Max Ozone: {row['max_ozone_ppm']:.4f} ppm<br>
    Measurements: {row['measurement_count']}
    """

    folium.CircleMarker(
        location=[row['site_latitude'], row['site_longitude']],
        radius=6 + (normalized_ozone_for_size * 8),  # Size based on ozone level
        popup=folium.Popup(popup_text, max_width=300),
        color='black',
        weight=1,
        fillColor=color,
        fillOpacity=0.7
    ).add_to(m)

# Prepare data for heatmap: [lat, lon, intensity]
from folium.plugins import HeatMap
# Let's use avg_ozone_ppm for intensity
heat_data = site_data[['site_latitude', 'site_longitude', 'avg_ozone_ppm']].values.tolist()

HeatMap(heat_data, radius=15, blur=10, max_zoom=10).add_to(m)
folium.LayerControl().add_to(m) # Add layer control to toggle heatmap/markers



min_ozone_val = site_data['avg_ozone_ppm'].min()
max_ozone_val = site_data['avg_ozone_ppm'].max()
avg_ozone_val = site_data['avg_ozone_ppm'].mean()

# Calculate normalized ozone for these specific values for size scaling
# Assuming your radius calculation is: radius = 6 + (normalized_ozone * 8)
def calculate_radius(ozone_ppm, min_val, max_val):
    if (max_val - min_val) == 0: # Avoid division by zero if all values are the same
        return 6
    normalized_ozone = (ozone_ppm - min_val) / (max_val - min_val)
    return 6 + (normalized_ozone * 10)

radius_low = calculate_radius(min_ozone_val, min_ozone_val, max_ozone_val)
radius_mid = calculate_radius(avg_ozone_val, min_ozone_val, max_ozone_val)
radius_high = calculate_radius(max_ozone_val, min_ozone_val, max_ozone_val)


# --- 5. Add a legend ---


low_radius_legend = 8 # Represents the small size
mid_radius_legend = 12 # Represents the medium size
high_radius_legend = 16 # Represents the large size

legend_html = f'''
<div style="position: fixed;
             bottom: 50px; left: 50px; width: 200px; height: 280px; 
             background-color: white; border:2px solid grey; z-index:9999;
             font-size:14px; padding: 10px">
<p><b>Average Ozone (ppm)</b></p>
<p><i class="fa fa-circle" style="color:green"></i> Low (< 0.04)</p>
<p><i class="fa fa-circle" style="color:orange"></i> Medium (0.04 - 0.05)</p>
<p><i class="fa fa-circle" style="color:red"></i> High (>= 0.05)</p>
<br>
<p><b>Marker Size by Avg Ozone</b></p>
<p style="margin-left: 10px;">
    <span style="display:inline-block; border-radius:50%; background-color:black; width:{low_radius_legend}px; height:{low_radius_legend}px; margin-right:5px; vertical-align:middle;"></span> Low
</p>
<p style="margin-left: 10px;">
    <span style="display:inline-block; border-radius:50%; background-color:black; width:{mid_radius_legend}px; height:{mid_radius_legend}px; margin-right:5px; vertical-align:middle;"></span> Medium
</p>
<p style="margin-left: 10px;">
    <span style="display:inline-block; border-radius:50%; background-color:black; width:{high_radius_legend}px; height:{high_radius_legend}px; margin-right:5px; vertical-align:middle;"></span> High
</p>
</div>
'''

m.get_root().html.add_child(folium.Element(legend_html))

# Display the map
m
m.get_root().html.add_child(folium.Element(legend_html))
m.save('california_ozone_sites_map.html')
# --- 6. DISPLAY THE MAP 

m

Executive Summary

This report provides a comprehensive analysis of daily maximum 8-hour ozone concentrations in California, focusing on identifying temporal and regional trends, potential influencing factors, and areas of concern.

The key findings are:

Temporal Trends: Ozone concentrations follow a clear seasonal pattern, with levels rising significantly in the spring and peaking during the summer months (June-August).

Regional Hotspots: Geographically, the highest average ozone concentrations are found in inland and mountainous regions of Southern California and the Central Valley, particularly at sites like Sequoia & Kings Canyon NPs and Crestline.

Measurement Bias: A significant finding is that ozone levels reported by one specific measurement method (Method Code 53.0) are consistently higher than those from other methods, suggesting a potential methodological bias that should be investigated.

Urban Activity: Contrary to some hypotheses, there is no significant difference in ozone levels between weekdays and weekends, indicating that factors other than day-to-day urban activity may be the primary drivers of ozone formation in this dataset.

Based on these findings, it is recommended:

Focusing mitigation and monitoring efforts on the identified high-ozone regions during the summer months.

Conducting a thorough review and calibration study of the measurement methods, particularly Method Code 53.0, to ensure data consistency.

Investigating other factors, such as meteorological conditions (temperature, sunlight) and long-range transport of pollutants, which may explain the lack of a weekday/weekend effect.

1. EDA and Data Cleaning Process

The initial dataset contained 49,419 rows and 17 columns of information on daily ozone measurements. The following key steps were taken to prepare the data for analysis:

Data Type Conversion: The date column, which was initially of an object data type, was successfully converted to a datetime object, enabling us to perform time-series analysis and create features like month and is_weekend.

Handling Missing Values: Columns like daily_max_8_hour_ozone_concentration, daily_aqi_value, percent_complete, and cbsa_code contained missing values. These were carefully addressed, for example, by filling missing cbsa_name values with "Non-CBSA" to ensure all rows were usable.

Final Data Integrity: After cleaning, the final DataFrame had 45,869 rows, and as shown in the table below, all key columns had a full count of non-null values, ensuring the integrity and reliability of our analysis.

ColumnNon-Null CountData Type
date45869datetime64[ns]
daily_max_8_hour_ozone_concentration45869float64
daily_aqi_value45869float64
cbsa_name45869object
method_code45869float64

2. Daily Ozone Variation Over Time and Regions

The analysis of ozone levels over time reveals a strong seasonal cycle. As shown in the box plot below, ozone concentrations are lowest in the winter and rise sharply as temperatures increase, peaking in the summer before falling again in the autumn. This is a typical pattern for ozone, a secondary pollutant whose formation is driven by sunlight and heat.

3. High Ozone Concentrations and Method Comparison

High-Ozone Areas To identify areas with consistently high ozone, the data was aggregated to the site level, calculating the average ozone concentration for each monitoring station. The top 10 sites with the highest average ozone were predominantly located in mountainous and inland areas, which are known to trap pollutants.

Top 5 Sites by Average Ozone (ppm):

Sequoia & Kings Canyon NPs - Lower Kaweah: 0.0607

Crestline: 0.0573

Redlands: 0.0558

San Bernardino: 0.0554

Morongo Air Monitoring Station: 0.0538

Comparison of Measurement Methods A comparison of daily maximum 8-hour ozone concentrations across different measurement methods reveals a clear difference in reported values.

The box plot above shows that Method Code 53.0 reports significantly higher ozone concentrations than the other methods, particularly Method Codes 47.0 and 87.0, which account for the majority of the data. This finding suggests a potential systematic bias in the data that warrants further investigation to ensure the accuracy of the overall dataset.

4. Urban Activity (Weekend vs. Weekday)

The hypothesis that urban activity affects ozone levels was tested by comparing concentrations on weekdays versus weekends.

The box plot above shows virtually no difference in the distribution of ozone concentrations between weekdays and weekends. The median and quartile values are almost identical, suggesting that ozone levels are not strongly influenced by the reduced traffic and industrial activity typically seen on weekends. This points to other factors, such as weather patterns or regional pollutant transport, as being more dominant drivers of ozone formation.

5. Geospatial Heatmap (Bonus)

To visualize the geographic distribution of ozone concentrations, an interactive map of California was created. This map uses colored markers, with color indicating the average ozone level (green for low, orange for medium, red for high) and size representing the magnitude of the average concentration. This visualization allows for a quick and intuitive identification of high-ozone regions.

The map is available as a separate HTML file titled california_ozone_sites_map.html.