Skip to content
import urllib.request
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import numpy as np
import seaborn as sns
import geopandas as gpd
import six
from datetime import datetime
%matplotlib inline

# df = pd.read_excel('master_file_final.xlsx', sheet_name='Sheet1')

# df.to_csv('master_data.csv')
df = pd.read_csv('master_data.csv')

#Set Index master_data Nomor and Remove Unnamed: 0
del df['Unnamed: 0']
df.set_index('Nomor', inplace=True)
display(df.tail())
df['Bahan'] = df['Bahan'].astype('category')
df['Kota'] = df['Kota'].astype('category')
df['Gudang Pengiriman'] = df['Gudang Pengiriman'].astype('category')
df['Nama Campaign'] = df['Nama Campaign'].astype('category')
df['Provinsi'] = df['Provinsi'].astype('category')
df['Nama Kurir'] = df['Nama Kurir'].astype('category')
df['Tipe Pengiriman (regular, same day, etc)'] = df['Tipe Pengiriman (regular, same day, etc)'].astype('category')

df['Tanggal Pesanan Selesai'] = df['Tanggal Pesanan Selesai'].apply(lambda x: datetime.strptime(x,'%d-%m-%Y'))
df['Tanggal Pembayaran'] = df['Tanggal Pembayaran'].apply(lambda x: datetime.strptime(x,'%d-%m-%Y %H:%M:%S'))
df['Hour'] = df['Tanggal Pembayaran'].dt.hour
df['Total Penjualan'] = df['Jumlah Produk Dibeli'] * df['Harga Jual (IDR)']
df['Order Month'] = df['Tanggal Pembayaran'].dt.strftime('%Y-%m')
df['Order Month'] = pd.to_datetime(df['Order Month'])



replace_values = {'Jakarta Gading':'Jakarta', 
                  'Surabaya Kaliwaron':'Surabaya', 'Bandung Cinambo':'Bandung', 
                  'Bandung Gedebage':'Bandung', 'Jakarta Pulo Gadung':'Jakarta', 'Makassar Teuku Umar':'Makassar', 
                  'Palembang Ratu Sianum':'Palembang', 'Palembang Kimarogan':'Palembang', 'Surabaya Berbek':'Surabaya', 
                  'Bandung Soetta':'Bandung', 'Surabaya Osowilangun':'Surabaya'}

df = df.replace({"Gudang Pengiriman":replace_values})

cols = list(df.columns.values)
df = df[['Nomor Invoice', 'Tanggal Pembayaran', 'Hour', 'Order Month', 'Status Terakhir', 'Tanggal Pesanan Selesai', 
         'Waktu Pesanan Selesai', 'Nama Produk', 'Ukuran', 'Warna', 'Kategori', 'Bahan', 'Tipe Produk', 'Jumlah Produk Dibeli', 
         'Harga Awal (IDR)', 'Diskon Produk (IDR)', 'Harga Jual (IDR)', 'Total Penjualan', 'Jumlah Subsidi Tokopedia (IDR)', 
         'Nilai Voucher Toko Terpakai (IDR)', 'Jenis Voucher Toko Terpakai', 'Kode Voucher Toko Yang Digunakan', 
         'Biaya Pengiriman Tunai (IDR)', 'Biaya Asuransi Pengiriman (IDR)', 'Total Biaya Pengiriman (IDR)', 'Nama Pembeli', 
         'No Telp Pembeli', 'Nama Penerima', 'No Telp Penerima', 'Alamat Pengiriman', 'Kota', 'Provinsi', 'Nama Kurir', 
         'Tipe Pengiriman (regular, same day, etc)', 'No Resi / Kode Booking', 'Tanggal Pengiriman Barang', 'Waktu Pengiriman Barang', 
         'Gudang Pengiriman', 'Nama Campaign', 'Bebas Ongkir', 'COD']]

print(cols)
print(df.info())
display(df.head(5))

Most Sales in the Category with Bar Plot

total_bahan_terjual = df.groupby('Bahan').agg({'Jumlah Produk Dibeli':'count'}).sort_values(by='Jumlah Produk Dibeli', ascending=False)
# print(total_bahan_terjual)

# Figure Size
fig, ax = plt.subplots(figsize=(16, 9))

# Horizontal Bar Plot
ax.barh(total_bahan_terjual.index, total_bahan_terjual['Jumlah Produk Dibeli'])

# Remove axes splines
for s in ['top', 'bottom', 'left', 'right']:
    ax.spines[s].set_visible(False)
    
# Remove x, y Ticks
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Add padding between axes and labels
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Add x, y gridlines
ax.grid(b=True, color='grey',
        linestyle='-.', linewidth=0.5,
        alpha=0.2)
  
# Show top values
ax.invert_yaxis()
  
# Add annotation to bars
for i in ax.patches:
    plt.text(i.get_width()+0.2, i.get_y()+0.5,
             str(round((i.get_width()), 2)),
             fontsize=10, fontweight='bold',
             color='grey')

# Add Plot Title
ax.set_title('Total Penjualan per-Bahan Periode Jul 2020 - Jul 2022',
             loc='left')
    
plt.show()

Most Sales Distribution in Percent Pie Chart

# fig = plt.figure(figsize =(12, 9))
# # Creating plot
# plt.pie(total_bahan_terjual['Jumlah Produk Dibeli'], labels = total_bahan_terjual.index)

# plt.show()

Most Delivery by Courier Name

total_pengiriman_courier = df.groupby('Nama Kurir').agg({'Tipe Pengiriman (regular, same day, etc)':'count'}).sort_values(by='Tipe Pengiriman (regular, same day, etc)', ascending=False)
# print(total_pengiriman_courier)

# Figure Size
fig, ax = plt.subplots(figsize=(16, 9))

# Horizontal Bar Plot
ax.barh(total_pengiriman_courier.index, total_pengiriman_courier['Tipe Pengiriman (regular, same day, etc)'])

# Remove axes splines
for s in ['top', 'bottom', 'left', 'right']:
    ax.spines[s].set_visible(False)
    
# Remove x, y Ticks
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Add padding between axes and labels
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Add x, y gridlines
ax.grid(b=True, color='grey',
        linestyle='-.', linewidth=0.5,
        alpha=0.2)

# Show top values
ax.invert_yaxis()

# Add Plot Title
ax.set_title('Total pengiriman Kurir Jul 2020 - Jul 2022',
             loc='left')

# Add annotation to bars
for i in ax.patches:
    plt.text(i.get_width()+0.2, i.get_y()+0.5,
             str(round((i.get_width()), 2)),
             fontsize=10, fontweight='bold',
             color='grey')

plt.show()

Most Buyers Name of All time (Count) (Top 10)

top10_buyer = df.groupby('Nama Pembeli').agg({'Jumlah Produk Dibeli':'count'}).sort_values('Jumlah Produk Dibeli', ascending=False).head(10)

print(top10_buyer)

Choropleth Map Distribution of Buyer (Count)

fp = "bps/idn_admbnda_adm1_bps_20200401.shp"
map_df = gpd.read_file(fp)
# print(map_df)
# Rename Non Match province between two dataframe
map_df = map_df.replace("Dki Jakarta","DKI Jakarta")

df = df.replace("D.I. Aceh","Aceh")
df = df.replace("D.I. Yogyakarta", "Daerah Istimewa Yogyakarta")


# Function to show non match Province
def get_difference(list_a, list_b):
    return set(list_a)-set(list_b)

list_a = df.Provinsi.unique().tolist()
list_b = map_df.ADM1_EN.unique().tolist()

non_match = list(get_difference(list_a, list_b))
print("No match elements: ", non_match)
province_sum = df.groupby('Provinsi').agg({"Jumlah Produk Dibeli":"sum"}).sort_values(by="Jumlah Produk Dibeli", ascending=False).reset_index()
# print(province_sum.head())