Skip to content
Competition - Supply Chain Analytics
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime
import sys
#!{sys.executable} -m pip install geopandas
#!{sys.executable} -m pip install geopy
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.distance import distance# Import data
df_main=pd.read_csv("data/orders_and_shipments.csv")
df_main.head()# Examine shape of df_main
df_main.shape# Generate summary statistics for orders dataset
df_main.describe(include='all')# Check for missing values (remove head method to see all results), though no vals are missing
df_main.isnull().sum().head()# Some columns have a leading/trailing spaces in their names, let's remove those
df_main=df_main.rename(columns=lambda x: x.strip())
df_main.columns# Replace dashes in discount percent column with 0s
# First, strip spaces from Discount % column
df_main['Discount %']=df_main['Discount %'].str.strip()
# Make replacements
df_main['Discount %']=np.where(df_main['Discount %']=="-", 0, df_main['Discount %'])
# Change dtype of Discount %
df_main=df_main.astype({'Discount %':'float'})# For dashboard build, having a datetime value in df_main will be useful
# Concat order year, month, day, and time into a format that pd.to_datetime will accept
df_main['Order DT']=df_main['Order Year'].astype(str)+'-'+df_main['Order Month'].astype(str)+'-'+df_main['Order Day'].astype(str)+' '+df_main['Order Time'].astype(str)
# Convert new col to DT
df_main['Order DT']=pd.to_datetime(df_main['Order DT'])
df_main.head()# Use geolocator to assign coordinates to following cols: (1) warehouse and (2) customer country
# These values will be approximations because they use countries instead of cities, but good enough for our analysis
# Extract unique vals from warehouse country, customer country
wh_unique=df_main['Warehouse Country'].unique()
cust_unique=df_main['Customer Country'].unique()
# Use geopy to create dictionary values of latitudes, longitudes
geolocator=Nominatim(user_agent="my_geocoder")
# Wh latitiudes
wh_lat={}
for i in range(0, len(wh_unique)):
wh_lat[wh_unique[i]]=geolocator.geocode(wh_unique[i]).latitude
# Country latitudes
cust_lat={}
for i in range(0, len(cust_unique)):
cust_lat[cust_unique[i]]=geolocator.geocode(cust_unique[i]).latitude# Wh lon
wh_lon={}
for i in range(0, len(wh_unique)):
wh_lon[wh_unique[i]]=geolocator.geocode(wh_unique[i]).longitude
# Cust lon
cust_lon={}
for i in range(0, len(cust_unique)):
cust_lon[cust_unique[i]]=geolocator.geocode(cust_unique[i]).longitude# Create new cols in df_main using dicts
df_main['Warehouse Latitude']=df_main['Warehouse Country'].map(wh_lat)
df_main['Warehouse Longitude']=df_main['Warehouse Country'].map(wh_lon)# Continue...
df_main['Customer Latitude']=df_main['Customer Country'].map(cust_lat)
df_main['Customer Longitude']=df_main['Customer Country'].map(cust_lon)# Calculate distance between wh and customer. To do this, we need to make tuples
# Create wh coords column, cust coords column
df_main['Warehouse Coords']=list(zip(df_main['Warehouse Latitude'], df_main['Warehouse Longitude']))
df_main['Customer Coords']=list(zip(df_main['Customer Latitude'], df_main['Customer Longitude']))
df_main.head()# Create distance column
df_main['Distance in KM (WH to Cust)']=""
for i in range(0, len(df_main['Warehouse Coords'])):
df_main['Distance in KM (WH to Cust)'][i]=distance(df_main['Warehouse Coords'][i], df_main['Customer Coords'][i]).kilometers# Finally, to finish the geographic part, let's see if there are any orders that should have been filled by the opposite wh
pr_lat=18.2247706
us_lat=39.7837304
pr_lon=-66.4858295
us_lon=-100.445882
# Create alternate lat, lon cols
df_main['Alternate WH Lat']=np.where(df_main['Warehouse Country']=='USA', pr_lat, us_lat)
df_main['Alternate WH Lon']=np.where(df_main['Warehouse Country']=='USA', pr_lon, us_lon)
# Zip cols as above
df_main['Alternate WH Coords']=list(zip(df_main['Alternate WH Lat'], df_main['Alternate WH Lon']))
# Create column that calculates distance from alternate WH to cust
df_main['Alt Distance in KM (Alt WH to Cust)']=""
for i in range(0, len(df_main['Alternate WH Coords'])):
df_main['Alt Distance in KM (Alt WH to Cust)'][i]=distance(df_main['Alternate WH Coords'][i], df_main['Customer Coords'][i]).kilometers
# Last, create "Is_Optimal" col to see if item was fulfilled from best WH
df_main['Optimal Ful (Y/N)']=np.where(df_main['Distance in KM (WH to Cust)']>df_main['Alt Distance in KM (Alt WH to Cust)'], 'N', 'Y')
df_main.head()