# Game Trend Forecasting: monthly all-time peak CCU with an ARIMA model

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt

# Pipeline: load per-game records, aggregate all-time peak CCU by release
# month, fit ARIMA(1, 1, 1) on the first 80% of months, forecast the rest,
# then report MAE / RMSE / MAPE and plot actual vs. forecast.


def aggregate_monthly(data: pd.DataFrame) -> pd.DataFrame:
    """Sum ``all_time_peak_ccu`` per calendar month of ``release_date``.

    Args:
        data: Frame with a ``release_date`` column (anything
            ``pd.to_datetime`` accepts) and a numeric ``all_time_peak_ccu``
            column. The frame is not modified.

    Returns:
        A frame with one row per month that has at least one release, with
        columns ``month`` (timestamp of the month start) and
        ``all_time_peak_ccu`` (the monthly sum), ordered by month.
    """
    prepared = data.assign(
        release_date=pd.to_datetime(data['release_date'])
    ).sort_values('release_date')
    prepared['month'] = prepared['release_date'].dt.to_period('M')

    monthly = (
        prepared.groupby('month')
        .agg({'all_time_peak_ccu': 'sum'})
        .reset_index()
    )
    # Convert the period back to a timestamp for modeling and plotting.
    monthly['month'] = monthly['month'].dt.to_timestamp()
    # NOTE(review): months without any release are absent from the result,
    # so the series may be irregularly spaced while ARIMA treats consecutive
    # rows as equally spaced -- confirm whether gaps should be filled.
    return monthly


def add_features(monthly: pd.DataFrame) -> pd.DataFrame:
    """Add calendar, lag and rolling-mean columns and drop warm-up rows.

    Adds ``year``, ``month_number``, ``lag_1``, ``lag_2`` and
    ``rolling_mean_3`` (3-row rolling mean of ``all_time_peak_ccu``), then
    drops every row containing a NaN (the first two rows, which lack
    lag/rolling history).

    The index is reset to 0..n-1 afterwards. Without this the surviving rows
    keep labels starting at 2, which statsmodels does not treat as a
    supported index, so forecasts would come back labelled differently from
    the held-out rows.

    Args:
        monthly: Output of :func:`aggregate_monthly`. Not modified.

    Returns:
        A new frame with the extra columns and a zero-based integer index.
    """
    peak = monthly['all_time_peak_ccu']
    featured = monthly.assign(
        year=monthly['month'].dt.year,
        month_number=monthly['month'].dt.month,
        lag_1=peak.shift(1),
        lag_2=peak.shift(2),
        rolling_mean_3=peak.rolling(window=3).mean(),
    )
    return featured.dropna().reset_index(drop=True)


def split_train_test(frame: pd.DataFrame, train_fraction: float = 0.8) -> tuple:
    """Split ``frame`` chronologically into leading train and trailing test rows.

    Args:
        frame: Time-ordered frame.
        train_fraction: Share of rows (from the start) used for training.

    Returns:
        ``(train, test)`` where ``train`` holds the first
        ``int(len(frame) * train_fraction)`` rows and ``test`` the remainder.

    Raises:
        ValueError: If either part would be empty, since the model can
            neither be fitted nor evaluated in that case.
    """
    train_size = int(len(frame) * train_fraction)
    train, test = frame.iloc[:train_size], frame.iloc[train_size:]
    if train.empty or test.empty:
        raise ValueError(
            f"Cannot split {len(frame)} row(s) with train_fraction="
            f"{train_fraction}: both train and test need at least one row."
        )
    return train, test


def compute_mape(actual, predicted) -> float:
    """Return the mean absolute percentage error, in percent.

    Values are paired by position, never by pandas index label, so a
    forecast whose index differs from the actuals' index is still compared
    element by element. Observations whose actual value is zero are skipped
    because their percentage error is undefined.

    Args:
        actual: Observed values (sequence, array or Series).
        predicted: Forecast values of the same length.

    Returns:
        MAPE as a percentage, or NaN when every actual value is zero.

    Raises:
        ValueError: If the two inputs differ in shape.
    """
    actual_values = np.asarray(actual, dtype=float)
    predicted_values = np.asarray(predicted, dtype=float)
    if actual_values.shape != predicted_values.shape:
        raise ValueError(
            f"Shape mismatch: actual {actual_values.shape} vs "
            f"predicted {predicted_values.shape}."
        )

    nonzero = actual_values != 0
    if not nonzero.any():
        return float('nan')

    relative_errors = np.abs(
        (actual_values[nonzero] - predicted_values[nonzero])
        / actual_values[nonzero]
    )
    return float(np.mean(relative_errors) * 100)


if __name__ == "__main__":
    # Step 1: Data Preparation
    # Load the dataset (example CSV file, replace with your actual data path)
    data = pd.read_csv('00_convertcsv.csv')
    monthly_data = aggregate_monthly(data)

    # Step 2: Feature Engineering
    # The lag/rolling columns are not consumed by ARIMA below; they are kept
    # so the modeled rows are the same ones the original script used.
    monthly_data = add_features(monthly_data)

    # Step 3: Model Selection (ARIMA example)
    train, test = split_train_test(monthly_data, train_fraction=0.8)

    # Fit ARIMA model
    model = ARIMA(train['all_time_peak_ccu'], order=(1, 1, 1))  # ARIMA(p, d, q)
    model_fit = model.fit()

    # Step 4: Training and Prediction
    # Multi-step forecast covering the whole held-out period.
    forecast = model_fit.forecast(steps=len(test))

    # Step 5: Evaluation
    mae = mean_absolute_error(test['all_time_peak_ccu'], forecast)
    rmse = np.sqrt(mean_squared_error(test['all_time_peak_ccu'], forecast))
    mape = compute_mape(test['all_time_peak_ccu'], forecast)

    # Print evaluation metrics
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

    # Visualize actual vs predicted
    plt.figure(figsize=(10, 6))
    plt.plot(train['month'], train['all_time_peak_ccu'], label='Train')
    plt.plot(test['month'], test['all_time_peak_ccu'], label='Test')
    plt.plot(test['month'], forecast, label='Forecast')
    plt.legend()
    plt.title('Actual vs Predicted Game Trends')
    plt.xlabel('Month')
    plt.ylabel('All-Time Peak CCU')
    plt.show()