Skip to content

Loading libraries and the data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

pd.set_option('display.max_columns', None)
#importing the first dataset df1
df1 = pd.read_excel("QVI_transaction_data.xlsx")
#reading the excel file takes too long. We will save the df as csv and we'll use the csv file moving forward.
#df1.to_csv('QVI_transaction_data.csv', index=False)
#re-importing the df
df1 = pd.read_csv('QVI_transaction_data.csv')
#Exploring df1's shape and first rows
print("Shape:", df1.shape, '\n')
df1.head()
#importing the second dataset
df2 = pd.read_csv('QVI_purchase_behaviour.csv')
#Exploring df2's shape and first rows
print("Shape:", df2.shape, '\n')
df2.head()
#combining the two datasets into one
df = df1.merge(df2, how='left', on='LYLTY_CARD_NBR')
df.head()
print("Merged shape:", df.shape)

Data Cleaning

#Formula to convert excel date integer to pandas datetime object
def xldate_to_datetime(xldate):
    temp = dt.datetime(1899, 12, 30)
    delta = dt.timedelta(days=xldate)
    return temp+delta
#Reassigning the Date column with correct datetime values
df['DATE'] = df['DATE'].apply(xldate_to_datetime)
#Verifying that the data type for date changed
df.info()
#descriptive statistics
df.describe()