Skip to content
0
# loading the data
import pandas as pd
data = pd.read_csv("data/orders_and_shipments.csv")
data.head(3)
# check the column names
print(data.columns)
# remove extra spaces from column names
data.columns = data.columns.str.strip()
print(data.columns)
def create_date_columns(df, day_col, month_col, year_col, col_name):
    """
    Create a new column with a datetime object from three columns with day, month and year.
    """
    df[col_name] = pd.to_datetime(df[month_col].astype(str) + '-' + df[day_col].astype(str) + '-' + df[year_col].astype(str))
# create order date column
create_date_columns(data, 'Order Day', 'Order Month', 'Order Year', 'Order Date')

# create shipment date column
create_date_columns(data, 'Shipment Day', 'Shipment Month', 'Shipment Year', 'Shipment Date')
def create_datetime_columns(df, day_col, month_col, year_col, time_col, col_name):
    """
    Create a new column with a datetime object from three columns with day, month and year.
    """
    df[col_name] = pd.to_datetime(df[month_col].astype(str) + '-' + df[day_col].astype(str) + '-' + df[year_col].astype(str) + ' ' + df[time_col].astype(str))
# create order datetime column
create_datetime_columns(data, 'Order Day', 'Order Month', 'Order Year', 'Order Time', 'Order Datetime')
# create shipment days column [actual time to place shipment]
data['Shipment Days'] = (data['Shipment Date'] - data['Order Date']).dt.days
import numpy as np

# create shipment delay column
data['Shipment Delay'] = np.where(data['Shipment Days - Scheduled'] > data['Shipment Days'], 0, data['Shipment Days'] - data['Shipment Days - Scheduled'])
# if shipment delay is negative there might be error in the shipment month
fil_df = np.where(data['Shipment Days'] < 0 )
filtered_shipment_date = data.loc[fil_df, 'Shipment Date']
filtered_order_date = data.loc[fil_df, 'Order Date']
filtered_shipment_day = data.loc[fil_df, 'Shipment Day']
filtered_shipment_month = data.loc[fil_df, 'Shipment Month']
filtered_shipment_year = data.loc[fil_df, 'Shipment Year']
filtered_order_day = data.loc[fil_df, 'Order Day']
filtered_order_month = data.loc[fil_df, 'Order Month']
filtered_order_year = data.loc[fil_df, 'Order Year']

# Create a new DataFrame to combine the 'Shipment Date' and 'Order Date'
combined_data = pd.DataFrame({'Order Day': filtered_order_day, 'Order Month': filtered_order_month, 'Order Year': filtered_order_year, 
                              'Order Date': filtered_order_date, 
                              'Shipment Day': filtered_shipment_day, 'Shipment Month': filtered_shipment_month, 'Shipment Year': filtered_shipment_year,
                              'Shipment Date': filtered_shipment_date, })

# Print the combined DataFrame
combined_data

So there are 2735 orders which shipment dates preceed the order dates which is not possible in a real life scenario. So, I have to remove those from data.

wrong_indices = np.where(data['Shipment Date'] < data['Order Date'])
data.loc[wrong_indices, 'Shipment Date'].count()

# Remove the wrong rows from the DataFrame
mask = np.where(data['Shipment Date'] >= data['Order Date'])
data = data.loc[mask]
# check the customer country column values
data['Customer Country'].unique()
data['Customer Country'] = np.where(data['Customer Country'] == 'Cote d�Ivoire', "Cote d'Ivoire", data['Customer Country'])
data['Customer Country'] = np.where(data['Customer Country'] == 'Dominican�Republic', 'Dominican Republic', data['Customer Country'])
data['Customer Country'] = np.where(data['Customer Country'] == 'Per�', 'Peru', data['Customer Country'])
data['Customer Country'] = np.where(data['Customer Country'] == 'Algeria�', 'Algeria', data['Customer Country'])
data['Customer Country'] = np.where(data['Customer Country'] == 'Israel�', 'Israel', data['Customer Country'])
data['Customer Country'] = np.where(data['Customer Country'] == 'Ben�n', 'Benin', data['Customer Country'])