Skip to content
Machine Learning Classification Model: Predicting Customer Reorders Based on the First Order
Branton Stanley
**This was a project I completed for my Master's degree in Data Science.** It suffers from a lack of data: I had only about 4,000 customers, but over 3,500 products.
Importing Data and Libraries
import pandas as pd
import numpy as np
import os
#ML Models
import tensorflow as tf
from tensorflow import keras
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
#Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#ML Model Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score
#Dimensionality Reduction
from sklearn.decomposition import PCA
#Grid Search for parameter optimization
from sklearn.model_selection import GridSearchCV
#import keras_tuner
from tensorflow.keras import layers
# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
Importing the data
# Load the raw sales transactions from the CSV file.
data = pd.read_csv('Business Sales Transaction.csv')
data
# Keep only rows with a positive quantity (a negative quantity represents a return).
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so the column assignment below is unambiguous and does not trigger
# pandas' SettingWithCopyWarning.
data = data[data['Quantity'] > 0].copy()
data
# Convert TransactionNo to integer dtype.
# NOTE(review): assumes every remaining TransactionNo value is numeric - confirm against the CSV.
data['TransactionNo'] = data['TransactionNo'].astype(int)
data.dtypes
Data Preparation
# Work on an independent (deep) copy so later transformations leave `data` untouched.
df = data.copy(deep=True)
First Order Information and Target Column
The following section adds the 'second_order' target column (1 if the customer placed more than one order, otherwise 0) and removes all order information after the first order.
# Per-customer summary, indexed by CustomerNo:
#   first_order - smallest TransactionNo seen for the customer
#   orders      - number of distinct TransactionNo values for the customer
# NOTE(review): using the minimum TransactionNo as "the first order" presumes
# transaction numbers increase over time - confirm against the data.
first_order_spec = pd.NamedAgg(column='TransactionNo', aggfunc='min')
order_count_spec = pd.NamedAgg(column='TransactionNo', aggfunc='nunique')
df_orders = df.groupby(by='CustomerNo').agg(
    first_order=first_order_spec,
    orders=order_count_spec,
)
df_orders
# Target column: 1 when the customer has more than one distinct transaction, else 0.
placed_another_order = df_orders['orders'] > 1
df_orders['second_order'] = np.where(placed_another_order, 1, 0)
df_orders
# Inner-join the transaction rows to the per-customer summary on
# (CustomerNo, TransactionNo) == (CustomerNo, first_order): only rows belonging to
# each customer's first order survive, and they gain the summary/target columns.
df_first_order = df.merge(
    df_orders,
    how='inner',
    left_on=['CustomerNo', 'TransactionNo'],
    right_on=['CustomerNo', 'first_order'],
)
df_first_order