import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
ozone = pd.read_csv('data/ozone.csv')
ozone.head()
ozone.isnull().sum()
#Phase 1. Organizing the data
ozone_initial = ozone.shape[0]
#fixing column titles
ozone.columns = ozone.columns.str.replace(' ', '_').str.replace('-', '_').str.lower()
print(ozone.columns)
#dropna in concentration
ozone.dropna(subset=['daily_max_8_hour_ozone_concentration', 'daily_aqi_value'], inplace=True)
ozone_after_na_drop = ozone.shape[0]
print(f"DataFrame now has {ozone_after_na_drop} rows.")
#dropduplicates
ozone.drop_duplicates(inplace=True)
ozone_dropped_duplicates = ozone.shape[0]
#NA in method_code - replace with mode
mode_method = ozone['method_code'].mode()[0]
ozone['method_code'].fillna(mode_method, inplace=True)
#converting "date" to datetime format
ozone['date'] = pd.to_datetime(ozone['date'], errors='coerce')
print(f"New 'date' dtype: {ozone['date'].dtype}")
parsed_date_nas = ozone['date'].isnull().sum()
if parsed_date_nas > 0:
print(f"WARNING: {parsed_date_nas} 'date' values failed to parse and were converted to NaT.")
#Drop rows with NaT dates
ozone_before_date_na_drop = ozone.shape[0]
ozone.dropna(subset=['date'], inplace=True)
ozone_after_date_na_drop = ozone.shape[0]
print(f"Dropped {ozone_before_date_na_drop - ozone_after_date_na_drop} rows with unparseable dates.")
print(f"DataFrame now has {ozone_after_date_na_drop} rows.")
else:
print("All 'date' values parsed successfully.")
print(ozone["date"].isnull().sum())
unique_DTvalues = np.unique(ozone['date'])
#handle CBSA variables - fill numerical CBSA_code with 0 and CBSA_name with 'Non-CBSA'
ozone['cbsa_code'].fillna(0, inplace=True)
ozone['cbsa_name'].fillna('Non-CBSA', inplace=True)
# --- Final Check of Missing Values and Info ---
print("\n--- Final missing Values Count after cleaning ---")
print(ozone.isnull().sum())
print("\n--- Final DF info after cleaning ---")
ozone.info()
from scipy import stats
import numpy as np
import pandas as pd
z_concentration = np.abs(stats.zscore(ozone['daily_max_8_hour_ozone_concentration']))
threshold_z = 2
ozone_no_outliers_concentration = ozone[z_concentration <= threshold_z].copy()
print("Original DataFrame Shape:", ozone.shape)
print("DataFrame Shape after Removing Outliers (Concentration):", ozone_no_outliers_concentration.shape)
z_aqi = np.abs(stats.zscore(ozone['daily_aqi_value']))
ozone_no_outliers_aqi = ozone_no_outliers_concentration[z_aqi <= threshold_z].copy() # Apply to the already filtered DF
print("DataFrame Shape after Removing Outliers (AQI as well):", ozone_no_outliers_aqi.shape)
ozone = ozone_no_outliers_aqi.copy()
print("\nFinal DataFrame Shape after all outlier removal:", ozone.shape)
print("Outlier removal complete.")
#Phase 2: Exploratory Data Analysis (EDA)
# Look at seasonal patterns
ozone['date'] = pd.to_datetime(ozone['date'])
ozone['month'] = ozone['date'].dt.month
ozone['day_of_year'] = ozone['date'].dt.dayofyear
monthly_ozone = ozone.groupby("month")['daily_max_8_hour_ozone_concentration'].agg(['mean', 'std', 'count'])
print("Monthly ozone patterns:")
print(monthly_ozone)
# Check for unusual values and outliers
print("Checking for unusual ozone values:")
print("Zero ozone readings:", (ozone['daily_max_8_hour_ozone_concentration'] == 0).sum())
print("Very high ozone readings (>0.1 ppm):", (ozone['daily_max_8_hour_ozone_concentration'] > 0.1).sum())
# Look at the highest ozone days
high_ozone = ozone[ozone['daily_max_8_hour_ozone_concentration'] > 0.1].sort_values('daily_max_8_hour_ozone_concentration', ascending=False)
print("\
Top 10 highest ozone days:")
print(high_ozone[['date', 'local_site_name', 'daily_max_8_hour_ozone_concentration', 'daily_aqi_value']].head(10))
# Calculate correlation coefficient
correlation = ozone['daily_max_8_hour_ozone_concentration'].corr(ozone['daily_aqi_value'])
print("Correlation between ozone concentration and AQI:", correlation)
# Look at geographic patterns
print("\
Top 10 locations by average ozone concentration:")
location_ozone = ozone.groupby('local_site_name')['daily_max_8_hour_ozone_concentration'].agg(['mean', 'count']).sort_values('mean', ascending=False)
print(location_ozone.head(10))
#Now we can visualize monthly trends
plt.figure(figsize=(12, 7)) # Adjust figure size for better readability
sns.boxplot(data=ozone, x='month', y="daily_max_8_hour_ozone_concentration", palette='coolwarm_r')
sns.lineplot(data=ozone, x='month', y="daily_max_8_hour_ozone_concentration", color='darkred', linewidth=2)
plt.title('Daily Max 8-hour Ozone Concentration by Month', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Ozone Concentration (ppm)', fontsize=12)
plt.xticks(ticks=range(12), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']) # More readable month labels
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
#How does daily maximum 8-hour ozone concentration vary over time and regions?
# Group locations by major regions in California
print("Available geographic columns:")
print("- county:", ozone['county'].nunique(), "unique counties")
print("- county_fips_code:", ozone['county_fips_code'].nunique(), "unique FIPS codes")
print("- cbsa_name:", ozone['cbsa_name'].nunique(), "unique CBSA areas")
print("- cbsa_code:", ozone['cbsa_code'].nunique(), "unique CBSA codes")
print("\
County distribution:")
county_counts = ozone['county'].value_counts()
print(county_counts.head(15))
print("\
CBSA (Metropolitan Area) distribution:")
cbsa_counts = ozone['cbsa_name'].value_counts()
print(cbsa_counts.head(10))
# --- WEEKEND VS. WEEKDAY ANALYSIS ---
# Create a new column to identify weekends
ozone['is_weekend'] = ozone['date'].dt.dayofweek >= 5
print("Ozone concentration by weekend vs. weekday:")
weekend_weekday_summary = ozone.groupby('is_weekend')['daily_max_8_hour_ozone_concentration'].describe()
print(weekend_weekday_summary)
# Visualization
plt.figure(figsize=(8, 6))
sns.boxplot(x='is_weekend', y='daily_max_8_hour_ozone_concentration', data=ozone)
plt.title('Daily Max 8-hour Ozone Concentration: Weekday vs. Weekend')
plt.xlabel('Is Weekend? (False=Weekday, True=Weekend)')
plt.ylabel('Ozone Concentration (ppm)')
plt.xticks([0, 1], ['Weekday', 'Weekend'])
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# --- METHODOLOGY COMPARISON
print("\nOzone concentration by measurement method:")
# Group by 'Method Code' and get summary statistics
method_summary = ozone.groupby('method_code')['daily_max_8_hour_ozone_concentration'].describe()
print(method_summary)
# Visualize the comparison using a boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x='method_code', y='daily_max_8_hour_ozone_concentration', data=ozone)
plt.title('Daily Max 8-hour Ozone Concentration by Measurement Method')
plt.xlabel('Method Code')
plt.ylabel('Ozone Concentration (ppm)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
#mapping
# Create site-level aggregated data for mapping
import geopandas as gpd
from shapely.geometry import Point
# Aggregate data by site location
site_data = ozone.groupby(['site_latitude', 'site_longitude', 'local_site_name', 'county']).agg({
'daily_max_8_hour_ozone_concentration': ['mean', 'max', 'count'],
'daily_aqi_value': ['mean', 'max']
}).round(4)
# Flatten column names
site_data.columns = ['_'.join(col).strip() for col in site_data.columns]
site_data = site_data.reset_index()
# Rename columns for clarity
site_data.rename(columns={
'daily_max_8_hour_ozone_concentration_mean': 'avg_ozone_ppm',
'daily_max_8_hour_ozone_concentration_max': 'max_ozone_ppm',
'daily_max_8_hour_ozone_concentration_count': 'measurement_count',
'daily_aqi_value_mean': 'avg_aqi',
'daily_aqi_value_max': 'max_aqi'
}, inplace=True)
print("Site-level data created:")
print("Shape:", site_data.shape)
print("Latitude range:", site_data['site_latitude'].min(), "to", site_data['site_latitude'].max())
print("Longitude range:", site_data['site_longitude'].min(), "to", site_data['site_longitude'].max())
print("Measurement count range:", site_data['measurement_count'].min(), "to", site_data['measurement_count'].max())
print("Average ozone range:", site_data['avg_ozone_ppm'].min(), "to", site_data['avg_ozone_ppm'].max())
print("\
Top 10 sites by average ozone concentration:")
print(site_data.nlargest(10, 'avg_ozone_ppm')[['local_site_name', 'county', 'avg_ozone_ppm', 'measurement_count']])
geometry = [Point(xy) for xy in zip(site_data['site_longitude'], site_data['site_latitude'])]
gdf_sites = gpd.GeoDataFrame(site_data, geometry=geometry, crs='EPSG:4326')
print("GeoDataFrame created with", len(gdf_sites), "sites")
# Try to get California county boundaries from a built-in source
try:
# Download California counties from the US Census Bureau
import requests
import zipfile
import io
# US Counties shapefile URL
url = "https://www2.census.gov/geo/tiger/GENZ2021/shp/cb_2021_us_county_20m.zip"
print("Downloading US counties shapefile...")
response = requests.get(url)
if response.status_code == 200:
# Extract the zip file
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
zip_file.extractall("counties_data")
# Load the shapefile
counties = gpd.read_file("counties_data/cb_2021_us_county_20m.shp")
# Filter for California (STATEFP = '06')
ca_counties = counties[counties['STATEFP'] == '06'].copy()
ca_counties = ca_counties.to_crs('EPSG:4326') # Ensure same CRS
print("California counties loaded:", len(ca_counties), "counties")
print("Sample county names:", ca_counties['NAME'].head().tolist())
else:
print("Failed to download counties data, status code:", response.status_code)
ca_counties = None
except Exception as e:
print("Error loading counties:", str(e))
ca_counties = None
# Folium?
import folium
import numpy as np
# --- 1. SITE DATA
site_data = ozone.groupby(['site_id', 'local_site_name', 'county', 'site_latitude', 'site_longitude']).agg(
avg_ozone_ppm=('daily_max_8_hour_ozone_concentration', 'mean'),
max_ozone_ppm=('daily_max_8_hour_ozone_concentration', 'max'),
measurement_count=('date', 'count')
).reset_index()
# --- 2. Creating a base map centered on California ---
ca_center = [37.0, -119.5] # Approximate center of California
m = folium.Map(location=ca_center, zoom_start=6, tiles='OpenStreetMap')
# Add county boundaries
if ca_counties is not None:
folium.GeoJson(
ca_counties,
name='California Counties',
style_function=lambda x: {
'fillColor': 'none', # No fill, just borders
'color': 'darkgray',
'weight': 0.5,
'dashArray': '5, 5'
}
).add_to(m)
# --- 3. Creating color function---
def get_color(ozone_ppm):
if ozone_ppm < 0.04:
return 'green'
elif ozone_ppm < 0.05:
return 'orange'
else:
return 'red'
# --- 4. Add markers for each site ---
min_ozone = site_data['avg_ozone_ppm'].min()
max_ozone = site_data['avg_ozone_ppm'].max()
site_data_filtered = site_data[site_data['measurement_count'] >= 30].copy() # set threshold
for idx, row in site_data.iterrows():
# Calculate normalized ozone for size, using full range for normalization
normalized_ozone_for_size = (row['avg_ozone_ppm'] - min_ozone) / (max_ozone - min_ozone)
color = get_color(row['avg_ozone_ppm']) # Use direct ppm for color
# Create popup text
popup_text = f"""
<b>{row['local_site_name']}</b><br>
County: {row['county']}<br>
Avg Ozone: {row['avg_ozone_ppm']:.4f} ppm<br>
Max Ozone: {row['max_ozone_ppm']:.4f} ppm<br>
Measurements: {row['measurement_count']}
"""
folium.CircleMarker(
location=[row['site_latitude'], row['site_longitude']],
radius=6 + (normalized_ozone_for_size * 8), # Size based on ozone level
popup=folium.Popup(popup_text, max_width=300),
color='black',
weight=1,
fillColor=color,
fillOpacity=0.7
).add_to(m)
# Prepare data for heatmap: [lat, lon, intensity]
from folium.plugins import HeatMap
# Let's use avg_ozone_ppm for intensity
heat_data = site_data[['site_latitude', 'site_longitude', 'avg_ozone_ppm']].values.tolist()
HeatMap(heat_data, radius=15, blur=10, max_zoom=10).add_to(m)
folium.LayerControl().add_to(m) # Add layer control to toggle heatmap/markers
min_ozone_val = site_data['avg_ozone_ppm'].min()
max_ozone_val = site_data['avg_ozone_ppm'].max()
avg_ozone_val = site_data['avg_ozone_ppm'].mean()
# Calculate normalized ozone for these specific values for size scaling
# Assuming your radius calculation is: radius = 6 + (normalized_ozone * 8)
def calculate_radius(ozone_ppm, min_val, max_val):
if (max_val - min_val) == 0: # Avoid division by zero if all values are the same
return 6
normalized_ozone = (ozone_ppm - min_val) / (max_val - min_val)
return 6 + (normalized_ozone * 10)
radius_low = calculate_radius(min_ozone_val, min_ozone_val, max_ozone_val)
radius_mid = calculate_radius(avg_ozone_val, min_ozone_val, max_ozone_val)
radius_high = calculate_radius(max_ozone_val, min_ozone_val, max_ozone_val)
# --- 5. Add a legend ---
low_radius_legend = 8 # Represents the small size
mid_radius_legend = 12 # Represents the medium size
high_radius_legend = 16 # Represents the large size
legend_html = f'''
<div style="position: fixed;
bottom: 50px; left: 50px; width: 200px; height: 280px;
background-color: white; border:2px solid grey; z-index:9999;
font-size:14px; padding: 10px">
<p><b>Average Ozone (ppm)</b></p>
<p><i class="fa fa-circle" style="color:green"></i> Low (< 0.04)</p>
<p><i class="fa fa-circle" style="color:orange"></i> Medium (0.04 - 0.05)</p>
<p><i class="fa fa-circle" style="color:red"></i> High (>= 0.05)</p>
<br>
<p><b>Marker Size by Avg Ozone</b></p>
<p style="margin-left: 10px;">
<span style="display:inline-block; border-radius:50%; background-color:black; width:{low_radius_legend}px; height:{low_radius_legend}px; margin-right:5px; vertical-align:middle;"></span> Low
</p>
<p style="margin-left: 10px;">
<span style="display:inline-block; border-radius:50%; background-color:black; width:{mid_radius_legend}px; height:{mid_radius_legend}px; margin-right:5px; vertical-align:middle;"></span> Medium
</p>
<p style="margin-left: 10px;">
<span style="display:inline-block; border-radius:50%; background-color:black; width:{high_radius_legend}px; height:{high_radius_legend}px; margin-right:5px; vertical-align:middle;"></span> High
</p>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))
# Display the map
m
m.get_root().html.add_child(folium.Element(legend_html))
m.save('california_ozone_sites_map.html')
# --- 6. DISPLAY THE MAP
m
Executive Summary
This report provides a comprehensive analysis of daily maximum 8-hour ozone concentrations in California, focusing on identifying temporal and regional trends, potential influencing factors, and areas of concern.
The key findings are:
Temporal Trends: Ozone concentrations follow a clear seasonal pattern, with levels rising significantly in the spring and peaking during the summer months (June-August).
Regional Hotspots: Geographically, the highest average ozone concentrations are found in inland and mountainous regions of Southern California and the Central Valley, particularly at sites like Sequoia & Kings Canyon NPs and Crestline.
Measurement Bias: A significant finding is that ozone levels reported by one specific measurement method (Method Code 53.0) are consistently higher than those from other methods, suggesting a potential methodological bias that should be investigated.
Urban Activity: Contrary to some hypotheses, there is no significant difference in ozone levels between weekdays and weekends, indicating that factors other than day-to-day urban activity may be the primary drivers of ozone formation in this dataset.
Based on these findings, it is recommended:
Focusing mitigation and monitoring efforts on the identified high-ozone regions during the summer months.
Conducting a thorough review and calibration study of the measurement methods, particularly Method Code 53.0, to ensure data consistency.
Investigating other factors, such as meteorological conditions (temperature, sunlight) and long-range transport of pollutants, which may explain the lack of a weekday/weekend effect.
1. EDA and Data Cleaning Process
The initial dataset contained 49,419 rows and 17 columns of information on daily ozone measurements. The following key steps were taken to prepare the data for analysis:
Data Type Conversion: The date column, which was initially of an object data type, was successfully converted to a datetime object, enabling us to perform time-series analysis and create features like month and is_weekend.
Handling Missing Values: Columns like daily_max_8_hour_ozone_concentration, daily_aqi_value, percent_complete, and cbsa_code contained missing values. These were carefully addressed, for example, by filling missing cbsa_name values with "Non-CBSA" to ensure all rows were usable.
Final Data Integrity: After cleaning, the final DataFrame had 45,869 rows, and as shown in the table below, all key columns had a full count of non-null values, ensuring the integrity and reliability of our analysis.
Column | Non-Null Count | Data Type |
---|---|---|
date | 45869 | datetime64[ns] |
daily_max_8_hour_ozone_concentration | 45869 | float64 |
daily_aqi_value | 45869 | float64 |
cbsa_name | 45869 | object |
method_code | 45869 | float64 |
2. Daily Ozone Variation Over Time and Regions
The analysis of ozone levels over time reveals a strong seasonal cycle. As shown in the box plot below, ozone concentrations are lowest in the winter and rise sharply as temperatures increase, peaking in the summer before falling again in the autumn. This is a typical pattern for ozone, a secondary pollutant whose formation is driven by sunlight and heat.
3. High Ozone Concentrations and Method Comparison
High-Ozone Areas To identify areas with consistently high ozone, the data was aggregated to the site level, calculating the average ozone concentration for each monitoring station. The top 10 sites with the highest average ozone were predominantly located in mountainous and inland areas, which are known to trap pollutants.
Top 5 Sites by Average Ozone (ppm):
Sequoia & Kings Canyon NPs - Lower Kaweah: 0.0607
Crestline: 0.0573
Redlands: 0.0558
San Bernardino: 0.0554
Morongo Air Monitoring Station: 0.0538
Comparison of Measurement Methods A comparison of daily maximum 8-hour ozone concentrations across different measurement methods reveals a clear difference in reported values.
The box plot above shows that Method Code 53.0 reports significantly higher ozone concentrations than the other methods, particularly Method Codes 47.0 and 87.0, which account for the majority of the data. This finding suggests a potential systematic bias in the data that warrants further investigation to ensure the accuracy of the overall dataset.
4. Urban Activity (Weekend vs. Weekday)
The hypothesis that urban activity affects ozone levels was tested by comparing concentrations on weekdays versus weekends.
The box plot above shows virtually no difference in the distribution of ozone concentrations between weekdays and weekends. The median and quartile values are almost identical, suggesting that ozone levels are not strongly influenced by the reduced traffic and industrial activity typically seen on weekends. This points to other factors, such as weather patterns or regional pollutant transport, as being more dominant drivers of ozone formation.
5. Geospatial Heatmap (Bonus)
To visualize the geographic distribution of ozone concentrations, an interactive map of California was created. This map uses colored markers, with color indicating the average ozone level (green for low, orange for medium, red for high) and size representing the magnitude of the average concentration. This visualization allows for a quick and intuitive identification of high-ozone regions.
The map is available as a separate HTML file titled california_ozone_sites_map.html.