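# "Skip to content" -- web-page navigation residue left over from an export; kept as a comment so the file parses as Python.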
import numpy as np
import pandas as pd
import sympy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
# Load the three CSV inputs from the current working directory, in this order:
# the training rows, the test rows, and the sample submission file.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Impute missing 'num_sold' values with the mean of the row's
# (country, store, product) group.
# The built-in 'mean' transform broadcasts each group's mean back onto its rows
# without calling a Python lambda once per group; non-missing values are left
# untouched by fillna.
group_mean_sold = train_df.groupby(['country', 'store', 'product'])['num_sold'].transform('mean')
train_df['num_sold'] = train_df['num_sold'].fillna(group_mean_sold)

# A group whose values are all NaN has a NaN mean, so its rows are still
# missing at this point; fall back to the overall mean of 'num_sold'.
train_df['num_sold'] = train_df['num_sold'].fillna(train_df['num_sold'].mean())

# Count the remaining missing values in each column of train_df.
# NOTE: as a bare expression this is only displayed in a notebook/REPL; when
# the file runs as a plain script the result is discarded.
train_df.isna().sum()
def _add_date_features(df):
    """Add 'month', 'year' and 'month_year' columns derived from df['date'].

    The 'date' column is parsed once and reused for all three features.
    'month_year' is the first day of the row's month as a datetime, obtained
    by truncating the parsed date to a monthly period instead of building a
    'YYYY-M' string and re-parsing it.

    The DataFrame is modified in place; nothing is returned.
    """
    parsed_dates = pd.to_datetime(df['date'])
    df['month'] = parsed_dates.dt.month
    df['year'] = parsed_dates.dt.year
    # Period('YYYY-MM') -> timestamp at the start of that month.
    df['month_year'] = parsed_dates.dt.to_period('M').dt.to_timestamp()


# Apply the same date-derived features to both the training and test frames.
_add_date_features(train_df)
_add_date_features(test_df)
# Sales over time, one line per product, on a 12x6 inch figure.
# x is the month bucket ('month_year'), y is 'num_sold'; rows sharing the same
# month are aggregated by seaborn's lineplot defaults.
trend_fig = plt.figure(figsize=(12, 6))
sns.lineplot(data=train_df, x="month_year", y="num_sold", hue="product")

# Rotate/align the date tick labels so they stay readable, then render.
trend_fig.autofmt_xdate()
plt.show()
# Average 'num_sold' per (product, month), returned as a flat DataFrame with
# 'product', 'month' and 'num_sold' columns.
monthly_seasonality = train_df.groupby(['product', 'month'], as_index=False)['num_sold'].mean()

# Plot the monthly seasonality profile: one line per product with circle
# markers on a 12x6 inch figure.
seasonality_fig = plt.figure(figsize=(12, 6))
seasonality_ax = sns.lineplot(
    data=monthly_seasonality,
    x="month",
    y="num_sold",
    hue="product",
    marker="o",
)

# Title and axis labels (Turkish: "Monthly seasonality by product",
# "Month", "Average sales").
seasonality_ax.set_title("Ürün Bazında Aylık Sezonsallık")
seasonality_ax.set_xlabel("Ay")
seasonality_ax.set_ylabel("Ortalama Satış")

# One tick per calendar month (1-12) plus a grid for readability.
seasonality_ax.set_xticks(range(1, 13))
seasonality_ax.grid()

# Anchor the legend just outside the axes, at the upper left of that point.
seasonality_ax.legend(title="Product", bbox_to_anchor=(1.05, 1), loc="upper left")

# Tighten the layout so nothing overlaps, then render.
seasonality_fig.tight_layout()
plt.show()
# Total 'num_sold' for every (month, product) pair, as a Series with a
# (month, product) MultiIndex.
monthly_product_totals = train_df.groupby(['month', 'product'])['num_sold'].sum()

# Pivot the product level into columns: rows are months, one column per product.
product_monthly_sales = monthly_product_totals.unstack()

# Pairwise correlation between the product columns, i.e. how similarly the
# products' monthly totals move across the months.
correlation_matrix = product_monthly_sales.corr()

# Show the matrix (only displayed when run in a notebook/REPL).
correlation_matrix
# Heatmap of the product-to-product correlation matrix on an 8x6 inch figure.
heatmap_fig = plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,        # write the coefficient inside each cell
    fmt=".2f",         # ... rounded to two decimals
    cmap="coolwarm",   # colormap used to encode the values
    cbar=True,         # keep the color bar
    linewidths=0.5,    # thin separator lines between cells
)

# Title and axis labels (Turkish: "Correlation matrix between products",
# "Products").
heatmap_ax.set_title("Ürünler Arası Korelasyon Matrisi")
heatmap_ax.set_xlabel("Ürünler")
heatmap_ax.set_ylabel("Ürünler")

# Tighten the layout so nothing overlaps, then render.
heatmap_fig.tight_layout()
plt.show()
# New feature column 'monthly_seasonality': for every row of train_df, the mean
# 'num_sold' of all rows sharing that row's (product, month) combination.
sold_by_product_month = train_df.groupby(['product', 'month'])['num_sold']
train_df['monthly_seasonality'] = sold_by_product_month.transform('mean')
# Total 'num_sold' per country, ordered from the largest total to the smallest
# and flattened back into a two-column DataFrame ('country', 'num_sold').
totals_by_country = train_df.groupby('country')['num_sold'].sum()
country_sales = totals_by_country.sort_values(ascending=False).reset_index()

# Horizontal bars: total sales on x, country on y, 'viridis' colors,
# on an 8x6 inch figure.
# NOTE(review): newer seaborn releases deprecate `palette` without `hue`;
# confirm against the installed version before upgrading.
country_fig = plt.figure(figsize=(8, 6))
country_ax = sns.barplot(
    data=country_sales,
    x='num_sold',
    y='country',
    palette="viridis",
)

# Title and axis labels (Turkish: "Total sales by country", "Total sales",
# "Country").
country_ax.set_title("Ülke Bazında Toplam Satışlar")
country_ax.set_xlabel("Toplam Satış")
country_ax.set_ylabel("Ülke")

# Tighten the layout so nothing overlaps, then render.
country_fig.tight_layout()
plt.show()
# Total 'num_sold' for every (country, product) pair as a flat DataFrame.
totals_by_country_product = train_df.groupby(['country', 'product'])['num_sold'].sum()
product_country_sales = totals_by_country_product.reset_index()

# Horizontal grouped bars: total sales on x, country on y, one 'viridis'
# color per product, on a 12x8 inch figure.
product_country_fig = plt.figure(figsize=(12, 8))
product_country_ax = sns.barplot(
    data=product_country_sales,
    x='num_sold',
    y='country',
    hue='product',
    palette="viridis",
)

# Title and axis labels (Turkish: "Total sales by country and product",
# "Total sales", "Country").
product_country_ax.set_title("Ülke ve Ürün Bazında Toplam Satışlar")
product_country_ax.set_xlabel("Toplam Satış")
product_country_ax.set_ylabel("Ülke")

# Product legend anchored just outside the axes.
product_country_ax.legend(title="Ürün", bbox_to_anchor=(1.05, 1), loc="upper left")

# Tighten the layout so nothing overlaps, then render.
product_country_fig.tight_layout()
plt.show()
# Total 'num_sold' per store, ordered from the largest total to the smallest
# and flattened back into a two-column DataFrame ('store', 'num_sold').
totals_by_store = train_df.groupby('store')['num_sold'].sum()
store_sales = totals_by_store.sort_values(ascending=False).reset_index()

# Horizontal bars: total sales on x, store on y, 'coolwarm' colors,
# on an 8x6 inch figure.
# NOTE(review): newer seaborn releases deprecate `palette` without `hue`;
# confirm against the installed version before upgrading.
store_fig = plt.figure(figsize=(8, 6))
store_ax = sns.barplot(
    data=store_sales,
    x='num_sold',
    y='store',
    palette="coolwarm",
)

# Title and axis labels (Turkish: "Total sales by store", "Total sales",
# "Store type").
store_ax.set_title("Mağaza Bazında Toplam Satışlar")
store_ax.set_xlabel("Toplam Satış")
store_ax.set_ylabel("Mağaza Türü")

# Tighten the layout so nothing overlaps, then render.
store_fig.tight_layout()
plt.show()
# Total 'num_sold' for every (store, product) pair as a flat DataFrame.
totals_by_store_product = train_df.groupby(['store', 'product'])['num_sold'].sum()
store_product_sales = totals_by_store_product.reset_index()

# Horizontal grouped bars: total sales on x, store on y, one 'coolwarm'
# color per product, on a 12x8 inch figure.
store_product_fig = plt.figure(figsize=(12, 8))
store_product_ax = sns.barplot(
    data=store_product_sales,
    x='num_sold',
    y='store',
    hue='product',
    palette="coolwarm",
)

# Title and axis labels (Turkish: "Total sales by store and product",
# "Total sales", "Store type").
store_product_ax.set_title("Mağaza ve Ürün Bazında Toplam Satışlar")
store_product_ax.set_xlabel("Toplam Satış")
store_product_ax.set_ylabel("Mağaza Türü")

# Product legend anchored just outside the axes.
store_product_ax.legend(title="Ürün", bbox_to_anchor=(1.05, 1), loc="upper left")

# Tighten the layout so nothing overlaps, then render.
store_product_fig.tight_layout()
plt.show()