# Kaggle · Playground Prediction Competition - Forecasting Sticker Sales
import numpy as np
import pandas as pd
import sympy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Load the competition files; paths are relative to the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Fill missing targets with the mean of the same (country, store, product)
# group. transform('mean') returns one group mean per row, aligned to
# train_df's index, so it can be passed straight to fillna.
train_df['num_sold'] = train_df['num_sold'].fillna(
    train_df.groupby(['country', 'store', 'product'])['num_sold'].transform('mean')
)
# A group whose targets are all missing has no mean of its own, so anything
# still missing falls back to the overall mean.
train_df['num_sold'] = train_df['num_sold'].fillna(train_df['num_sold'].mean())

# Missing-value count per column. This is a bare expression, so the result is
# only displayed when run interactively (e.g. as a notebook cell).
train_df.isna().sum()

# seasonality
# Calendar features, derived the same way for the train and test frames.
for _frame in (train_df, test_df):
    # Parse the date column once and reuse it for every derived feature.
    _dates = pd.to_datetime(_frame['date'])
    _frame['month'] = _dates.dt.month
    _frame['year'] = _dates.dt.year
    # Timestamp of the first day of each row's month. Going through a monthly
    # period avoids building "YYYY-M" strings and re-parsing them.
    _frame['month_year'] = _dates.dt.to_period('M').dt.to_timestamp()

# Figure for the sales-over-time line plot drawn by the statements that follow.
plt.figure(figsize=(12, 6))
# Sales over time, one line per product, drawn on the current figure.
sns.lineplot(
    data=train_df,
    x="month_year",
    y="num_sold",
    hue="product",
)
# Rotate and align the date tick labels so they do not overlap.
plt.gcf().autofmt_xdate()
plt.show()

# Mean sales for every (product, calendar month) pair, as a flat table with
# columns product, month, num_sold.
monthly_seasonality = train_df.groupby(['product', 'month'])['num_sold'].mean().reset_index()

# Figure for the monthly seasonality plot drawn by the statements that follow.
plt.figure(figsize=(12, 6))
# Monthly seasonality: mean sales per calendar month, one line per product.
sns.lineplot(
    data=monthly_seasonality,
    x="month",
    y="num_sold",
    hue="product",
    marker="o",
)
plt.title("Ürün Bazında Aylık Sezonsallık")
plt.xlabel("Ay")
plt.ylabel("Ortalama Satış")
plt.xticks(range(1, 13))  # Display months as 1-12
plt.grid()
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Product", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()

# Total sales with calendar months as rows and products as columns.
product_monthly_sales = train_df.groupby(['month', 'product'])['num_sold'].sum().unstack()
# Pairwise correlation between the product columns of the monthly totals table.
correlation_matrix = product_monthly_sales.corr()
print(correlation_matrix)

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, linewidths=0.5)
plt.title("Ürünler Arası Korelasyon Matrisi")
plt.xlabel("Ürünler")
plt.ylabel("Ürünler")
plt.tight_layout()
plt.show()
# Per-row feature: mean sales of the row's product in the row's calendar month.
train_df['monthly_seasonality'] = (
    train_df.groupby(['product', 'month'])['num_sold'].transform('mean')
)

# Total sales by country
country_sales = (
    train_df.groupby('country')['num_sold']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# Plot total sales by country
plt.figure(figsize=(8, 6))
sns.barplot(
    data=country_sales,
    x='num_sold',
    y='country',
    palette="viridis",
).set(
    title="Ülke Bazında Toplam Satışlar",
    xlabel="Toplam Satış",
    ylabel="Ülke",
)
plt.tight_layout()
plt.show()

# Total sales by product and country
# One row per (country, product) pair holding that pair's summed sales.
product_country_sales = (
    train_df.groupby(['country', 'product'])['num_sold']
    .sum()
    .reset_index()
)

# Plot sales by product and country
plt.figure(figsize=(12, 8))
sns.barplot(data=product_country_sales, x='num_sold', y='country', hue='product', palette="viridis").set(
    title="Ülke ve Ürün Bazında Toplam Satışlar",
    xlabel="Toplam Satış",
    ylabel="Ülke",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Ürün", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()

# Total sales by store
# Summed sales per store, largest first, as a two-column table.
store_sales = (
    train_df.groupby('store')['num_sold']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# Plot total sales by store
plt.figure(figsize=(8, 6))
sns.barplot(
    data=store_sales,
    x='num_sold',
    y='store',
    palette="coolwarm",
).set(
    title="Mağaza Bazında Toplam Satışlar",
    xlabel="Toplam Satış",
    ylabel="Mağaza Türü",
)
plt.tight_layout()
plt.show()

# Total sales by store and product
# One row per (store, product) pair holding that pair's summed sales.
store_product_sales = (
    train_df.groupby(['store', 'product'])['num_sold']
    .sum()
    .reset_index()
)

# Plot sales by store and product
plt.figure(figsize=(12, 8))
sns.barplot(data=store_product_sales, x='num_sold', y='store', hue='product', palette="coolwarm").set(
    title="Mağaza ve Ürün Bazında Toplam Satışlar",
    xlabel="Toplam Satış",
    ylabel="Mağaza Türü",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Ürün", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()