# Skip to content  (web-page navigation residue; kept as a comment so the file parses)
import numpy as np
import pandas as pd
import sympy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
# Read the three input CSV files from the working directory, in the same
# order as before: training rows, test rows, and the submission template.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Impute missing `num_sold` values in two passes:
#   1. fill each gap with the mean of its (country, store, product) group;
#   2. fill whatever is still missing (e.g. a group with no observed values)
#      with the overall mean of the column.
# The built-in 'mean' transform replaces a Python lambda evaluated per group,
# and `fillna` only writes into missing entries, so observed values are kept.
group_mean_sold = train_df.groupby(['country', 'store', 'product'])['num_sold'].transform('mean')
train_df['num_sold'] = train_df['num_sold'].fillna(group_mean_sold)
train_df['num_sold'] = train_df['num_sold'].fillna(train_df['num_sold'].mean())

# A bare expression is only displayed as the last line of a notebook cell;
# print the per-column null counts so the check is also visible in a script.
print(train_df.isna().sum())
# Seasonality features: calendar month, year, and month-start timestamp.
def _add_calendar_features(df):
    """Add ``month``, ``year`` and ``month_year`` columns to ``df`` in place.

    The ``date`` column is parsed once and all three features are derived
    from the parsed values.

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns:
        None. ``df`` is modified in place:
        ``month`` is the calendar month number, ``year`` the calendar year,
        and ``month_year`` the timestamp of the first day of the date's month.
    """
    dates = pd.to_datetime(df['date'])
    df['month'] = dates.dt.month
    df['year'] = dates.dt.year
    # Round down to the start of the month via a monthly period instead of
    # formatting "<year>-<month>" strings and parsing them back.
    df['month_year'] = dates.dt.to_period('M').dt.to_timestamp()


# Apply the same feature derivation to both frames so they stay consistent.
_add_calendar_features(train_df)
_add_calendar_features(test_df)
# Plot num_sold against month_year, with one line per product.
trend_fig = plt.figure(figsize=(12, 6))
sns.lineplot(data=train_df, x="month_year", y="num_sold", hue="product")

# Let matplotlib format the date tick labels on the figure just drawn.
trend_fig.autofmt_xdate()
plt.show()
# Mean num_sold for every (product, calendar month) pair, as a flat table.
monthly_seasonality = (
    train_df
    .groupby(['product', 'month'])['num_sold']
    .mean()
    .reset_index()
)

# One marked line per product across the months of the year.
seasonality_fig = plt.figure(figsize=(12, 6))
seasonality_ax = sns.lineplot(
    data=monthly_seasonality, x="month", y="num_sold", hue="product", marker="o"
)

seasonality_ax.set_title("Ürün Bazında Aylık Sezonsallık")
seasonality_ax.set_xlabel("Ay")
seasonality_ax.set_ylabel("Ortalama Satış")
seasonality_ax.set_xticks(range(1, 13))  # one tick for each month, 1-12
seasonality_ax.grid()
# Anchor the legend outside the plotting area, to the right of the axes.
seasonality_ax.legend(title="Product", bbox_to_anchor=(1.05, 1), loc="upper left")
seasonality_fig.tight_layout()
plt.show()
# Total num_sold per (month, product), then pivot products into columns so
# that each row is a calendar month and each column a product.
month_product_totals = train_df.groupby(['month', 'product'])['num_sold'].sum()
product_monthly_sales = month_product_totals.unstack()

# Pairwise correlation between the product columns.
correlation_matrix = product_monthly_sales.corr()
print(correlation_matrix)

# Render the correlation matrix as an annotated heatmap.
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    linewidths=0.5,
)
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title("Ürünler Arası Korelasyon Matrisi")
plt.xlabel("Ürünler")
plt.ylabel("Ürünler")
plt.tight_layout()
plt.show()
# Give every training row the mean num_sold of its (product, month) group.
# NOTE(review): this is computed from the target column, and test_df is not
# given the same column in the code visible here; confirm how it is used.
product_month_mean = train_df.groupby(['product', 'month'])['num_sold'].transform('mean')
train_df['monthly_seasonality'] = product_month_mean
# Sum num_sold per country and order the countries from largest to smallest.
sales_per_country = train_df.groupby('country')['num_sold'].sum()
country_sales = sales_per_country.sort_values(ascending=False).reset_index()

# Horizontal bar chart: one bar per country.
# NOTE(review): newer seaborn releases appear to warn when `palette` is given
# without `hue`; check against the installed version before changing the call.
country_fig = plt.figure(figsize=(8, 6))
country_ax = sns.barplot(data=country_sales, x='num_sold', y='country', palette="viridis")

country_ax.set_title("Ülke Bazında Toplam Satışlar")
country_ax.set_xlabel("Toplam Satış")
country_ax.set_ylabel("Ülke")
country_fig.tight_layout()
plt.show()
# Sum num_sold for every (country, product) pair, as a flat table.
product_country_sales = (
    train_df
    .groupby(['country', 'product'])['num_sold']
    .sum()
    .reset_index()
)

# Horizontal grouped bars: countries on the y-axis, one colour per product.
country_product_fig = plt.figure(figsize=(12, 8))
country_product_ax = sns.barplot(
    data=product_country_sales, x='num_sold', y='country', hue='product', palette="viridis"
)

country_product_ax.set_title("Ülke ve Ürün Bazında Toplam Satışlar")
country_product_ax.set_xlabel("Toplam Satış")
country_product_ax.set_ylabel("Ülke")
# Anchor the legend outside the plotting area, to the right of the axes.
country_product_ax.legend(title="Ürün", bbox_to_anchor=(1.05, 1), loc="upper left")
country_product_fig.tight_layout()
plt.show()
# Sum num_sold per store and order the stores from largest to smallest.
sales_per_store = train_df.groupby('store')['num_sold'].sum()
store_sales = sales_per_store.sort_values(ascending=False).reset_index()

# Horizontal bar chart: one bar per store.
# NOTE(review): newer seaborn releases appear to warn when `palette` is given
# without `hue`; check against the installed version before changing the call.
store_fig = plt.figure(figsize=(8, 6))
store_ax = sns.barplot(data=store_sales, x='num_sold', y='store', palette="coolwarm")

store_ax.set_title("Mağaza Bazında Toplam Satışlar")
store_ax.set_xlabel("Toplam Satış")
store_ax.set_ylabel("Mağaza Türü")
store_fig.tight_layout()
plt.show()
# Sum num_sold for every (store, product) pair, as a flat table.
store_product_sales = (
    train_df
    .groupby(['store', 'product'])['num_sold']
    .sum()
    .reset_index()
)

# Horizontal grouped bars: stores on the y-axis, one colour per product.
store_product_fig = plt.figure(figsize=(12, 8))
store_product_ax = sns.barplot(
    data=store_product_sales, x='num_sold', y='store', hue='product', palette="coolwarm"
)

store_product_ax.set_title("Mağaza ve Ürün Bazında Toplam Satışlar")
store_product_ax.set_xlabel("Toplam Satış")
store_product_ax.set_ylabel("Mağaza Türü")
# Anchor the legend outside the plotting area, to the right of the axes.
store_product_ax.legend(title="Ürün", bbox_to_anchor=(1.05, 1), loc="upper left")
store_product_fig.tight_layout()
plt.show()