# Kaggle · Playground Prediction Competition - Forecasting Sticker Sales
import numpy as np
import pandas as pd
import sympy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor# Read the training data from 'train.csv' into a pandas DataFrame
# Load the three competition files from the working directory, in this order:
# training data, test data, and the sample submission template.
train_df, test_df, sample_submission = map(
    pd.read_csv,
    ("train.csv", "test.csv", "sample_submission.csv"),
)
# Fill missing values in 'num_sold' column by group mean for each (country, store, product) combination
# Impute missing 'num_sold' values in two passes:
#   1. with the mean of the row's (country, store, product) group;
#   2. with the overall mean, for rows whose group has no observed sales at all.
# transform('mean') uses pandas' built-in group aggregation instead of calling a
# Python lambda once per group, and fillna() only touches rows that are actually
# missing, so observed values are never overwritten.
group_mean_sold = train_df.groupby(['country', 'store', 'product'])['num_sold'].transform('mean')
train_df['num_sold'] = train_df['num_sold'].fillna(group_mean_sold)
# A group that is entirely NaN has a NaN mean, so fall back to the overall mean.
train_df['num_sold'] = train_df['num_sold'].fillna(train_df['num_sold'].mean())
# Count the remaining missing values per column. This bare expression only
# displays its result when run as the last statement of a notebook cell.
train_df.isna().sum()
# Extract the month from the 'date' column and create a new 'month' column in train_df
def _add_calendar_features(df):
    """Add 'month', 'year' and 'month_year' columns derived from df['date'], in place.

    'month' and 'year' hold the calendar month and year of each date.
    'month_year' is a datetime set to the first day of that date's month.

    The 'date' column is parsed once, and 'month_year' is derived with period
    arithmetic rather than by building an unpadded 'YYYY-M' string and
    re-parsing it, which relied on date-format inference.
    """
    parsed_dates = pd.to_datetime(df['date'])
    df['month'] = parsed_dates.dt.month
    df['year'] = parsed_dates.dt.year
    # Truncate each date to its month, then convert back to a month-start timestamp.
    df['month_year'] = parsed_dates.dt.to_period('M').dt.to_timestamp()


# Apply the same feature engineering to both the training and the test data.
_add_calendar_features(train_df)
_add_calendar_features(test_df)
# Set the size of the plot to 12 inches wide by 6 inches tall
# Sales over time: 'month_year' on the x-axis, 'num_sold' on the y-axis,
# and one line per 'product', drawn on a 12x6 inch figure.
sales_trend_fig = plt.figure(figsize=(12, 6))
sns.lineplot(data=train_df, x="month_year", y="num_sold", hue="product")
# Let matplotlib rotate and align the date tick labels so they stay readable.
sales_trend_fig.autofmt_xdate()
plt.show()
# Group the train_df DataFrame by 'product' and 'month' columns
# Mean 'num_sold' for every (product, month) pair. reset_index() turns the
# group keys back into ordinary 'product' and 'month' columns.
sold_by_product_month = train_df.groupby(['product', 'month'])['num_sold']
monthly_seasonality = sold_by_product_month.mean().reset_index()
# Set the size of the plot to 12 inches wide by 6 inches tall
# Monthly seasonality per product: month number on the x-axis, average units
# sold on the y-axis, one line per product, with a circle at each data point.
plt.figure(figsize=(12, 6))
seasonality_ax = sns.lineplot(
    data=monthly_seasonality,
    x="month",
    y="num_sold",
    hue="product",
    marker="o",
)
# Turkish labels: title "Monthly Seasonality by Product", x "Month", y "Average Sales".
seasonality_ax.set_title("Ürün Bazında Aylık Sezonsallık")
seasonality_ax.set_xlabel("Ay")
seasonality_ax.set_ylabel("Ortalama Satış")
# One tick per calendar month (1-12), plus a background grid.
seasonality_ax.set_xticks(range(1, 13))
seasonality_ax.grid()
# Anchor the legend just outside the right edge of the axes.
seasonality_ax.legend(title="Product", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
# Group the train_df DataFrame by 'month' and 'product' columns
# Total 'num_sold' for every (month, product) pair. unstack() then pivots the
# 'product' level into columns, so the result has one row per month and one
# column per product.
sold_by_month_product = train_df.groupby(['month', 'product'])['num_sold'].sum()
product_monthly_sales = sold_by_month_product.unstack()
# Pairwise correlation between the product columns across months,
# computed with DataFrame.corr() and its default settings.
correlation_matrix = product_monthly_sales.corr()
# This bare expression only displays the matrix when run as the last statement
# of a notebook cell.
correlation_matrix
# Set the figure size for the heatmap
# Heatmap of the product-to-product correlation matrix on an 8x6 inch figure.
heatmap_options = dict(
    annot=True,        # write the coefficient inside each cell
    fmt=".2f",         # annotations rounded to two decimals
    cmap="coolwarm",   # colormap used to encode the values
    cbar=True,         # show the colour bar
    linewidths=0.5,    # width of the lines separating cells
)
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
# Turkish labels: title "Correlation Matrix Between Products", both axes "Products".
plt.title("Ürünler Arası Korelasyon Matrisi")
plt.xlabel("Ürünler")
plt.ylabel("Ürünler")
plt.tight_layout()
plt.show()
# Calculate the average number of items sold ('num_sold') for each combination of 'product' and 'month'
# Give every training row the mean 'num_sold' of its (product, month) group.
# transform('mean') returns one value per original row, so the result aligns with train_df.
seasonality_keys = ['product', 'month']
train_df['monthly_seasonality'] = train_df.groupby(seasonality_keys)['num_sold'].transform('mean')
# Group the data by 'country' and sum the 'num_sold' values for each country
# Total 'num_sold' per country, largest first. reset_index() yields a frame
# with 'country' and 'num_sold' columns.
country_totals = train_df.groupby('country')['num_sold'].sum()
country_sales = country_totals.sort_values(ascending=False).reset_index()

# Bar chart: total sales on the x-axis, one bar per country on the y-axis.
# NOTE(review): newer seaborn releases may warn when `palette` is passed
# without `hue` - confirm against the installed seaborn version.
plt.figure(figsize=(8, 6))
country_ax = sns.barplot(data=country_sales, x='num_sold', y='country', palette="viridis")
# Turkish labels: title "Total Sales by Country", x "Total Sales", y "Country".
country_ax.set_title("Ülke Bazında Toplam Satışlar")
country_ax.set_xlabel("Toplam Satış")
country_ax.set_ylabel("Ülke")
plt.tight_layout()
plt.show()
# Group the data by both 'country' and 'product', then sum the 'num_sold' values for each group
# Total 'num_sold' for every (country, product) pair, with the keys as regular columns.
sold_by_country_product = train_df.groupby(['country', 'product'])['num_sold']
product_country_sales = sold_by_country_product.sum().reset_index()

# Grouped bar chart: total sales on the x-axis, countries on the y-axis,
# one colour per product.
plt.figure(figsize=(12, 8))
country_product_ax = sns.barplot(
    data=product_country_sales,
    x='num_sold',
    y='country',
    hue='product',
    palette="viridis",
)
# Turkish labels: title "Total Sales by Country and Product", x "Total Sales", y "Country".
country_product_ax.set_title("Ülke ve Ürün Bazında Toplam Satışlar")
country_product_ax.set_xlabel("Toplam Satış")
country_product_ax.set_ylabel("Ülke")
# Legend (titled "Product" in Turkish) anchored just outside the right edge of the axes.
country_product_ax.legend(title="Ürün", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
# Group the data by 'store' and sum the 'num_sold' values for each store
# Total 'num_sold' per store, largest first. reset_index() yields a frame
# with 'store' and 'num_sold' columns.
store_totals = train_df.groupby('store')['num_sold'].sum()
store_sales = store_totals.sort_values(ascending=False).reset_index()

# Bar chart: total sales on the x-axis, one bar per store on the y-axis.
# NOTE(review): newer seaborn releases may warn when `palette` is passed
# without `hue` - confirm against the installed seaborn version.
store_bar_options = dict(data=store_sales, x='num_sold', y='store', palette="coolwarm")
plt.figure(figsize=(8, 6))
sns.barplot(**store_bar_options)
# Turkish labels: title "Total Sales by Store", x "Total Sales", y "Store Type".
plt.title("Mağaza Bazında Toplam Satışlar")
plt.xlabel("Toplam Satış")
plt.ylabel("Mağaza Türü")
plt.tight_layout()
plt.show()
# Group the data by 'store' and 'product', then sum the 'num_sold' values for each group
# Total 'num_sold' for every (store, product) pair, with the keys as regular columns.
sold_by_store_product = train_df.groupby(['store', 'product'])['num_sold']
store_product_sales = sold_by_store_product.sum().reset_index()

# Grouped bar chart: total sales on the x-axis, stores on the y-axis,
# one colour per product.
plt.figure(figsize=(12, 8))
store_product_ax = sns.barplot(
    data=store_product_sales,
    x='num_sold',
    y='store',
    hue='product',
    palette="coolwarm",
)
# Turkish labels: title "Total Sales by Store and Product", x "Total Sales", y "Store Type".
store_product_ax.set_title("Mağaza ve Ürün Bazında Toplam Satışlar")
store_product_ax.set_xlabel("Toplam Satış")
store_product_ax.set_ylabel("Mağaza Türü")
# Legend (titled "Product" in Turkish) anchored just outside the right edge of the axes.
store_product_ax.legend(title="Ürün", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()