Skip to content

Explore a DataFrame

Welcome to your workspace! In this walkthrough, you will learn the basics of Workspace as you load data and explore it with Python!

Keep an eye out for 💪  icons throughout the notebook. These will indicate opportunities for you to try out Workspace for yourself!

import pandas as pd
import seaborn as sns
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
data_2018.head()
data_2018 = pd.read_csv('last_year_merchants.csv').reset_index()
data_2019 = pd.read_csv('merchant_transactions.csv').reset_index()

data_2018 = data_2018.drop(labels='index', axis=1)
print('\n')
data_2019 = data_2019.drop(labels='index', axis=1)

data_2019.head()
data_2018['live_date'] = pd.to_datetime(data_2018.live_date)
data_2018['last_trx_date'] = pd.to_datetime(data_2018.last_trx_date)
data_2019['trx_date'] = pd.to_datetime(data_2019.trx_date)
data_2019['dow'] = data_2019.trx_date.dt.day_name()
data_2019['month'] = data_2019.trx_date.dt.month_name()
data_2019['date_filter'] = data_2019.trx_date.dt.day
data_2019['cycles'] = data_2019.trx_date.dt.week
def merch_cat(live_date, last_trx_date):
    if (last_trx_date - live_date).days > 30:
        return 'active'
    if (last_trx_date - live_date).days <= 30:
        return 'churn'
    if (last_trx_date - live_date).days <= 30 & completed_trx <= 1:
        return 'inactive'
    else:
        return 'undefined'

data_2018['status'] = data_2018.apply(lambda row: merch_cat(row['live_date'], row['last_trx_date']), axis=1)
data_2018.head()
data_2019.columns
d = {('trx_id', 'customer_id'):pd.Series.nunique, ('trx_items', 'trx_gmv', 'trx_gbv', 'trx_gmv_after_promo'):'sum'}

data_19_sum = data_2019.groupby('merchant_id').agg({k: v for t, v in d.items() for k in t})
data_19_sum = data_19_sum.reset_index()
datelive = pd.DataFrame(data_2019.groupby('merchant_id')['trx_date'].agg(['min', 'max'])).reset_index()

data_19_sum = data_19_sum.merge(datelive, on='merchant_id', how='left')
data_19_sum.head()
data_19_sum = data_19_sum.rename(columns={'min':'first_19', 'max':'last_19'})
data_2019.head()
data_2019['city_id'] = data_2019.merchant_id.str[1:4]

data_19_sum = data_19_sum.merge(data_2019[['merchant_id','city_id']], on='merchant_id', how='left')
data_19_sum.head()
def merch_cat(first_19, last_19):
    if (last_19 - first_19).days > 30:
        return 'active'
    if (last_19 - first_19).days <= 30:
        return 'churn'
    if (last_19 - first_19).days <= 30 & trx_id <= 1:
        return 'inactive'
    else:
        return 'undefined'

data_19_sum['status'] = data_19_sum.apply(lambda row: merch_cat(row['first_19'], row['last_19']), axis=1)
data_19_sum.head()
churning = data_2018[data_2018.city_id == 'C11'][['merchant_id','status']].merge(data_19_sum[['merchant_id', 'status']], on='merchant_id', how='left')
churning = churning.rename(columns={'status_x':'status_18', 'status_y':'status_19'})
churning.pivot_table(index='status_18', columns='status_19', values='merchant_id', aggfunc=pd.Series.nunique, fill_value=0)