Big Data Project - Revised

1 hidden cell

import pandas as pd
import os

# Collect every CSV file in the current working directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the row order of the combined dataset reproducible from run to run.
csv_files = sorted(file for file in os.listdir() if file.endswith('.csv'))

# Load each CSV file into its own DataFrame
dfs = [pd.read_csv(file) for file in csv_files]

# Stack all DataFrames into a single table with a fresh 0..n-1 index.
# pd.concat raises ValueError if no CSV file was found above.
combined_data = pd.concat(dfs, ignore_index=True)

# Add a 'year' column holding the calendar year parsed from each record's 'date'
combined_data['year'] = pd.to_datetime(combined_data['date']).dt.year

# Display the first few rows of the combined dataset
print(combined_data.head())

import matplotlib.pyplot as plt

# Yearly average of the 'airtemp' column (one value per 'year' group)
mean_temp_yearly = combined_data.groupby('year')['airtemp'].mean()

# Line chart of the yearly averages: solid blue line with a dot at every year
plt.figure(figsize=(12, 6))
ax = mean_temp_yearly.plot(marker='o', color='b', linestyle='-')
ax.set_title('Mean Air Temperature Over the Years (1976-2024)')
ax.set_xlabel('Year')
ax.set_ylabel('Mean Air Temperature (°C)')
ax.grid(True)
plt.show()

import os
import pandas as pd

# Rebuild the combined dataset from every CSV file in the working directory.
# The names are sorted because os.listdir() gives no ordering guarantee;
# without sorting, the row order of the result can differ between runs.
csv_files = sorted(name for name in os.listdir() if name.endswith('.csv'))

# Read each file into its own DataFrame
dfs = [pd.read_csv(name) for name in csv_files]

# Concatenate everything into one table and renumber the rows from 0.
# pd.concat raises ValueError when the list of DataFrames is empty.
combined_data = pd.concat(dfs, ignore_index=True)

# Add a 'year' column holding the calendar year parsed from each record's 'date'
combined_data['year'] = pd.to_datetime(combined_data['date']).dt.year

# Display the first few rows of the combined dataset
print(combined_data.head())

import matplotlib.pyplot as plt

# Yearly average of the 'winddir' column (one value per 'year' group).
# NOTE: the result is stored under the name 'mean_temp_yearly' even though it
# holds wind-direction averages, not temperatures.
# NOTE(review): if 'winddir' is a compass bearing, an arithmetic mean is not
# meaningful across the 0/360 wrap-around - confirm the column's units.
mean_temp_yearly = combined_data.groupby('year')['winddir'].mean()

# Line chart of the yearly averages: solid blue line with a dot at every year
plt.figure(figsize=(12, 6))
wind_ax = mean_temp_yearly.plot(marker='o', color='b', linestyle='-')
wind_ax.set_title('Mean Wind Direction Over the Years (1976-2024)')
wind_ax.set_xlabel('Year')
wind_ax.set_ylabel('Mean Wind Direction')
wind_ax.grid(True)
plt.show()

Hidden code

3 hidden cells

import seaborn as sns

# Keep only the numeric columns before computing pairwise correlations.
# The dtype alias 'number' selects the same columns as np.number but does not
# need the name `np` in scope; the only visible `import numpy as np` sits in a
# later cell, so using np.number here could fail with a NameError.
numeric_data = combined_data.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()

# Visualize the correlation matrix as an annotated heatmap
# (two-decimal labels, diverging 'coolwarm' palette, thin cell borders)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# (Re)derive the calendar year of every record from its 'date' column
combined_data['year'] = pd.to_datetime(combined_data['date']).dt.year

# One row per year with the yearly mean air temperature and air pressure
yearly_avg = (
    combined_data
    .groupby('year')[['airtemp', 'atmpress']]
    .mean()
    .reset_index()
)

# Feature matrix (the year as a single column) and the two regression targets
X = yearly_avg[['year']].values
y_temp = yearly_avg['airtemp'].values
y_press = yearly_avg['atmpress'].values

# Hold out 20% of the years for each target; random_state=0 makes the split reproducible
(X_train_temp, X_test_temp,
 y_train_temp, y_test_temp) = train_test_split(X, y_temp, test_size=0.2, random_state=0)
(X_train_press, X_test_press,
 y_train_press, y_test_press) = train_test_split(X, y_press, test_size=0.2, random_state=0)

# Fit one linear regression per target on its training portion
model_temp = LinearRegression().fit(X_train_temp, y_train_temp)
model_press = LinearRegression().fit(X_train_press, y_train_press)

# Extrapolate both models over the 30 years 2025..2054
future_years = np.arange(2025, 2055).reshape(-1, 1)
future_temps = model_temp.predict(future_years)
future_press = model_press.predict(future_years)

# Two panels side by side: temperature on the left, pressure on the right
plt.figure(figsize=(12, 6))

# Left panel: observed yearly means, fitted line and dashed extrapolation
temp_ax = plt.subplot(1, 2, 1)
temp_ax.scatter(X, y_temp, color='blue', label='Actual Temperature Data')
temp_ax.plot(X, model_temp.predict(X), color='red', label='Fitted line for Temperature')
temp_ax.plot(future_years, future_temps, color='green', linestyle='--', label='Future Temperature Predictions')
temp_ax.set_title('Air Temperature for the Next 30 Years')
temp_ax.set_xlabel('Year')
temp_ax.set_ylabel('Mean Air Temperature')
temp_ax.legend()

# Right panel: the same three layers for air pressure
press_ax = plt.subplot(1, 2, 2)
press_ax.scatter(X, y_press, color='purple', label='Actual Pressure Data')
press_ax.plot(X, model_press.predict(X), color='orange', label='Fitted line for Pressure')
press_ax.plot(future_years, future_press, color='brown', linestyle='--', label='Future Pressure Predictions')
press_ax.set_title('Air Pressure for the Next 30 Years')
press_ax.set_xlabel('Year')
press_ax.set_ylabel('Mean Air Pressure')
press_ax.legend()

plt.tight_layout()
plt.show()

from sklearn.metrics import accuracy_score
import pandas as pd

# This cell does not create its inputs: y_test_temp, y_test_wind, y_test_sky,
# y_pred_temp, y_pred_wind and y_pred_sky are assumed to be defined elsewhere
# (the y_test_* names are produced by the train_test_split calls of the
# regression cells; the y_pred_* names are presumably model predictions on the
# matching X_test_* arrays - verify before running).

# accuracy_score is a classification metric: it rejects continuous targets.
# The regression targets are averaged measurements (floats), so BOTH the
# predictions and the true values are rounded to the nearest integer class.
# Rounding only the predictions leaves y_true continuous and makes
# accuracy_score raise a ValueError.
y_pred_temp_class = [round(temp) for temp in y_pred_temp]
y_pred_wind_class = [round(wind) for wind in y_pred_wind]
y_pred_sky_class = [round(sky) for sky in y_pred_sky]

y_test_temp_class = [round(temp) for temp in y_test_temp]
y_test_wind_class = [round(wind) for wind in y_test_wind]
y_test_sky_class = [round(sky) for sky in y_test_sky]

# Fraction of test samples whose rounded prediction equals the rounded truth.
# normalize=False is intentionally not passed: it does not affect multiclass
# handling, it only switches the result from a fraction to a raw count of
# matching samples, which would not fit the 'Accuracy Score' column below.
accuracy_temp = accuracy_score(y_test_temp_class, y_pred_temp_class)
accuracy_wind = accuracy_score(y_test_wind_class, y_pred_wind_class)
accuracy_sky = accuracy_score(y_test_sky_class, y_pred_sky_class)

# Collect the three scores in a small table for display
accuracy_scores = pd.DataFrame({
    'Prediction': ['Air Temperature', 'Wind Direction', 'Sky Conditions'],
    'Accuracy Score': [accuracy_temp, accuracy_wind, accuracy_sky]
})

accuracy_scores

Run cancelled

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Parse the 'date' column once and derive both calendar fields from it
record_dates = pd.to_datetime(combined_data['date'])
combined_data['year'] = record_dates.dt.year
combined_data['month'] = record_dates.dt.month

# One row per (year, month) with the monthly mean of each target column.
# groupby sorts by its keys, so the rows come out in chronological order.
# NOTE(review): if 'winddir' is a compass bearing, an arithmetic mean is not
# meaningful across the 0/360 wrap-around - confirm the column's units.
monthly_avg = combined_data.groupby(['year', 'month']).agg({'airtemp': 'mean', 'winddir': 'mean', 'sky': 'mean'}).reset_index()

# Prepare the data for training the models
X = monthly_avg[['year', 'month']].values  # Features: column 0 = year, column 1 = month
y_temp = monthly_avg['airtemp'].values  # Target variable for temperature
y_wind = monthly_avg['winddir'].values  # Target variable for wind direction
y_sky = monthly_avg['sky'].values  # Target variable for sky conditions

# Hold out 20% of the months for each target; random_state=0 makes the split reproducible
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y_temp, test_size=0.2, random_state=0)
X_train_wind, X_test_wind, y_train_wind, y_test_wind = train_test_split(X, y_wind, test_size=0.2, random_state=0)
X_train_sky, X_test_sky, y_train_sky, y_test_sky = train_test_split(X, y_sky, test_size=0.2, random_state=0)

# Creating Linear Regression models for all targets
model_temp = LinearRegression()
model_wind = LinearRegression()
model_sky = LinearRegression()

# Training the models on the training portion only
model_temp.fit(X_train_temp, y_train_temp)
model_wind.fit(X_train_wind, y_train_wind)
model_sky.fit(X_train_sky, y_train_sky)

# Build the (year, month) grid for 2025..2054 and predict all three targets on it
future_years = np.repeat(np.arange(2025, 2055), 12)  # Each year repeated once per month
future_months = np.tile(np.arange(1, 13), 30)  # Months 1..12 repeated for each of the 30 years
future_data = np.column_stack((future_years, future_months))
future_temps = model_temp.predict(future_data)
future_wind = model_wind.predict(future_data)
future_sky = model_sky.predict(future_data)

# Decimal-year positions (January = year + 0, December = year + 11/12).
# Plotting against the year alone would put all 12 months of a year on the
# same x coordinate, so the "line" plots would zig-zag vertically instead of
# following time.
time_axis = X[:, 0] + (X[:, 1] - 1) / 12
future_time = future_years + (future_months - 1) / 12

# Three panels side by side: temperature, wind direction, sky conditions
plt.figure(figsize=(18, 6))

# Plotting the results for air temperature
plt.subplot(1, 3, 1)  # 1 row, 3 columns, 1st subplot
plt.scatter(time_axis, y_temp, color='blue', label='Actual Temperature Data')
plt.plot(time_axis, model_temp.predict(X), color='red', label='Fitted line for Temperature')
plt.plot(future_time, future_temps, color='green', linestyle='--', label='Future Temperature Predictions')
plt.title('Air Temperature Prediction for the Next 30 Years')
plt.xlabel('Year')
plt.ylabel('Mean Air Temperature')
plt.legend()

# Plotting the results for wind direction
plt.subplot(1, 3, 2)  # 1 row, 3 columns, 2nd subplot
plt.scatter(time_axis, y_wind, color='orange', label='Actual Wind Direction Data')
plt.plot(time_axis, model_wind.predict(X), color='purple', label='Fitted line for Wind Direction')
plt.plot(future_time, future_wind, color='brown', linestyle='--', label='Future Wind Direction Predictions')
plt.title('Wind Direction Prediction for the Next 30 Years')
plt.xlabel('Year')
plt.ylabel('Mean Wind Direction')
plt.legend()

# Plotting the results for sky conditions
plt.subplot(1, 3, 3)  # 1 row, 3 columns, 3rd subplot
plt.scatter(time_axis, y_sky, color='green', label='Actual Sky Conditions Data')
plt.plot(time_axis, model_sky.predict(X), color='blue', label='Fitted line for Sky Conditions')
plt.plot(future_time, future_sky, color='orange', linestyle='--', label='Future Sky Conditions Predictions')
plt.title('Sky Conditions Prediction for the Next 30 Years')
plt.xlabel('Year')
plt.ylabel('Mean Sky Conditions')
plt.legend()

# Keep titles, axis labels and legends of neighbouring panels from overlapping
plt.tight_layout()
plt.show()

Run cancelled

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# r2_score is called below but is not part of this cell's import lines, and
# pyplot is only imported by earlier cells; import both here so the cell runs
# on its own.
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# One CSV file per year: "1976.csv" through "2024.csv"
data_files = [f"{year}.csv" for year in range(1976, 2025)]

# Read every yearly file and stack them into one table with a fresh index
dfs = [pd.read_csv(file) for file in data_files]

combined_data = pd.concat(dfs, ignore_index=True)
combined_data

# NOTE(review): the feature and the target are the same 'airtemp' column, so
# the regression only has to learn y = x; the MAE of ~0 and R2 of ~1 reported
# below follow from that by construction and say nothing about predictive
# skill. Replace X with real predictor columns for a meaningful evaluation.
X = combined_data[['airtemp']]
y_temp = combined_data['airtemp']

# Step 3: Model Training
model_temp = LinearRegression()
model_temp.fit(X, y_temp)

# Step 4: Model Evaluation (on the same rows the model was trained on)
temp_predictions = model_temp.predict(X)
temp_mae = mean_absolute_error(y_temp, temp_predictions)
temp_r2 = r2_score(y_temp, temp_predictions)

print("Air Temperature MAE:", temp_mae)
print("Air Temperature R-squared (R2) Score:", temp_r2)

# Plotting the results: x axis = input temperature, y axis = actual (blue
# points) and predicted (red line) temperature
plt.figure(figsize=(10, 6))
plt.scatter(X, y_temp, color='blue', label='Actual Temperature')
plt.plot(X, temp_predictions, color='red', label='Predicted Temperature')
plt.xlabel('Actual Temperature')
plt.ylabel('Predicted Temperature')
plt.title('Actual vs Predicted Temperature')
plt.legend()
plt.show()