Skip to content
0
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime
import sys
#!{sys.executable} -m pip install geopandas
#!{sys.executable} -m pip install geopy
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.distance import distance
# Import data
df_main=pd.read_csv("data/orders_and_shipments.csv")
df_main.head()
# Examine shape of df_main
df_main.shape
# Generate summary statistics for orders dataset
df_main.describe(include='all')
# Check for missing values (remove head method to see all results), though no vals are missing
df_main.isnull().sum().head()
# Some columns have a leading/trailing spaces in their names, let's remove those
df_main=df_main.rename(columns=lambda x: x.strip())
df_main.columns
# Replace dashes in discount percent column with 0s

# First, strip spaces from Discount % column
df_main['Discount %']=df_main['Discount %'].str.strip()

# Make replacements
df_main['Discount %']=np.where(df_main['Discount %']=="-", 0, df_main['Discount %'])

# Change dtype of Discount %
df_main=df_main.astype({'Discount %':'float'})
# For dashboard build, having a datetime value in df_main will be useful
# Concat order year, month, day, and time into a format that pd.to_datetime will accept
df_main['Order DT']=df_main['Order Year'].astype(str)+'-'+df_main['Order Month'].astype(str)+'-'+df_main['Order Day'].astype(str)+' '+df_main['Order Time'].astype(str)

# Convert new col to DT
df_main['Order DT']=pd.to_datetime(df_main['Order DT'])
df_main.head()
# Use geolocator to assign coordinates to following cols: (1) warehouse and (2) customer country
# These values will be approximations because they use countries instead of cities, but good enough for our analysis

# Extract unique vals from warehouse country, customer country
wh_unique=df_main['Warehouse Country'].unique()
cust_unique=df_main['Customer Country'].unique()

# Use geopy to create dictionary values of latitudes, longitudes
geolocator=Nominatim(user_agent="my_geocoder")

# Wh latitiudes
wh_lat={}
for i in range(0, len(wh_unique)):
    wh_lat[wh_unique[i]]=geolocator.geocode(wh_unique[i]).latitude

# Country latitudes
cust_lat={}
for i in range(0, len(cust_unique)):
    cust_lat[cust_unique[i]]=geolocator.geocode(cust_unique[i]).latitude
# Wh lon
wh_lon={}
for i in range(0, len(wh_unique)):
    wh_lon[wh_unique[i]]=geolocator.geocode(wh_unique[i]).longitude

# Cust lon    
cust_lon={}
for i in range(0, len(cust_unique)):
    cust_lon[cust_unique[i]]=geolocator.geocode(cust_unique[i]).longitude
# Create new cols in df_main using dicts
df_main['Warehouse Latitude']=df_main['Warehouse Country'].map(wh_lat)
df_main['Warehouse Longitude']=df_main['Warehouse Country'].map(wh_lon)
# Continue...
df_main['Customer Latitude']=df_main['Customer Country'].map(cust_lat)
df_main['Customer Longitude']=df_main['Customer Country'].map(cust_lon)
# Calculate distance between wh and customer. To do this, we need to make tuples
# Create wh coords column, cust coords column
df_main['Warehouse Coords']=list(zip(df_main['Warehouse Latitude'], df_main['Warehouse Longitude']))
df_main['Customer Coords']=list(zip(df_main['Customer Latitude'], df_main['Customer Longitude']))
df_main.head()
# Create distance column
df_main['Distance in KM (WH to Cust)']=""
for i in range(0, len(df_main['Warehouse Coords'])):
    df_main['Distance in KM (WH to Cust)'][i]=distance(df_main['Warehouse Coords'][i], df_main['Customer Coords'][i]).kilometers
# Finally, to finish the geographic part, let's see if there are any orders that should have been filled by the opposite wh
pr_lat=18.2247706
us_lat=39.7837304
pr_lon=-66.4858295
us_lon=-100.445882

# Create alternate lat, lon cols
df_main['Alternate WH Lat']=np.where(df_main['Warehouse Country']=='USA', pr_lat, us_lat)
df_main['Alternate WH Lon']=np.where(df_main['Warehouse Country']=='USA', pr_lon, us_lon)

# Zip cols as above
df_main['Alternate WH Coords']=list(zip(df_main['Alternate WH Lat'], df_main['Alternate WH Lon']))

# Create column that calculates distance from alternate WH to cust
df_main['Alt Distance in KM (Alt WH to Cust)']=""
for i in range(0, len(df_main['Alternate WH Coords'])):
    df_main['Alt Distance in KM (Alt WH to Cust)'][i]=distance(df_main['Alternate WH Coords'][i], df_main['Customer Coords'][i]).kilometers
    
# Last, create "Is_Optimal" col to see if item was fulfilled from best WH
df_main['Optimal Ful (Y/N)']=np.where(df_main['Distance in KM (WH to Cust)']>df_main['Alt Distance in KM (Alt WH to Cust)'], 'N', 'Y')
df_main.head()