Skip to content
DoorDash Data EDA and Feature Engineering
  • AI Chat
  • Code
  • Report
  • import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    # Load the historical orders from CSV and preview the first rows.
    data = pd.read_csv('historical_data.csv')
    data.head()
    # Inspect column dtypes and non-null counts before any conversion.
    data.info()
    # Parse both timestamp columns into pandas datetimes.
    for date_column in ('created_at', 'actual_delivery_time'):
        data[date_column] = pd.to_datetime(data[date_column])
    from datetime import datetime
    # Delivery duration: (actual_delivery_time - created_at) as a timedelta,
    # converted to total seconds.
    data['actual_total_delivery_duration'] = (
        data['actual_delivery_time'].sub(data['created_at']).dt.total_seconds()
    )

    # Share of on-shift dashers that are busy.
    # NOTE(review): rows where total_onshift_dashers is 0 yield inf/NaN here --
    # confirm they are handled downstream.
    data['busy_dashers_ratio'] = data['total_busy_dashers'].div(data['total_onshift_dashers'])
    # Estimated time not spent on preparation: driving plus order placement.
    data['estimated_non_prep_duration'] = data['estimated_store_to_consumer_driving_duration'].add(
        data['estimated_order_place_duration']
    )
    # Count distinct values of each candidate column to decide which to encode.
    data['market_id'].nunique()
    data['store_id'].nunique()
    data['order_protocol'].nunique()
    # One-hot encode order_protocol; columns are prefixed with 'order_protocol_'.
    order_protocol_dummies = pd.get_dummies(data['order_protocol']).add_prefix('order_protocol_')
    order_protocol_dummies.head()
    # One-hot encode market_id; columns are prefixed with 'market_id_'.
    market_id_dummies = pd.get_dummies(data['market_id']).add_prefix('market_id_')
    market_id_dummies.head()
    # Map every store_id to the mode(s) of its store_primary_category so that
    # null categories can later be filled from other rows of the same store.
    store_id_unique = data['store_id'].unique().tolist()
    # A single groupby pass replaces one full-frame boolean filter per store.
    # sort=False keeps stores in first-appearance order, like unique() above.
    # Each value is still the Series returned by mode() (empty when a store has
    # no non-null category). Rows with a null store_id form no group, so that
    # key is simply absent from the mapping.
    store_id_and_category = {
        store_id: categories.mode()
        for store_id, categories in data.groupby('store_id', sort=False)['store_primary_category']
    }
    import numpy as np
    
    def fill(store_id):
        """Return the most frequent primary category recorded for a store.

        Looks up ``store_id`` in the module-level ``store_id_and_category``
        mapping and returns the first entry of the stored mode result.

        Args:
            store_id: Key to look up in ``store_id_and_category``.

        Returns:
            The first mode value for the store, or ``np.nan`` when the store is
            not in the mapping or its mode result is empty.
        """
        try:
            return store_id_and_category[store_id].values[0]
        except (KeyError, IndexError):
            # KeyError: store_id is absent from the mapping.
            # IndexError: the stored mode result is empty (no non-null category).
            # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
            return np.nan
    
    # Build a category column by looking up each row's store_id with fill();
    # every row gets the value fill() returns for its store, not only null rows.
    data['nan_free_store_primary_category'] = data['store_id'].apply(fill)
    data['nan_free_store_primary_category'].nunique()
    # One-hot encode the filled category; columns are prefixed with 'category_'.
    store_primary_category_dummies = pd.get_dummies(
        data['nan_free_store_primary_category']
    ).add_prefix('category_')
    store_primary_category_dummies.head()