Skip to content
JustDiceCase
import pandas as pd
import numpy as np
adspend = pd.read_csv("adspend.csv")
installs = pd.read_csv("installs.csv")
payouts = pd.read_csv("payouts.csv")
revenue = pd.read_csv("revenue.csv")
print(adspend.head())
print(adspend.info())
print(adspend.isnull().sum())
print(adspend["client_id"].unique())
adspend["event_date"] = pd.to_datetime(adspend["event_date"])
col_names = adspend.columns.to_list()
col_names[col_names.index("client_id")] = "app_id"
adspend.columns = col_names
adspend.info()
print(installs.head())
print(installs.info())
print(installs.isnull().sum())
print(installs["country_id"].describe())
print(installs["country_id"].unique())
print(installs["app_id"].unique())
print(installs["network_id"].unique())
a = installs["device_os_version"].unique()
a.sort()
print(a)
installs["event_date"] = pd.to_datetime(installs["event_date"])
#Add a new col for device age based on OS
#installs["device_cat"]
#Revenue/Payout/Counts by country, network
#Top 10 Apps by Revenue/Payout/Counts
#AOV by country/network/app
print(payouts.head())
print(payouts.info())
print(payouts.isnull().sum())
payouts["event_date"] = pd.to_datetime(payouts["event_date"])
#Payout by day, weekday/weekend
print(revenue.head())
print(revenue.info())
print(revenue.isnull().sum())
revenue["event_date"] = pd.to_datetime(revenue["event_date"])
#Convert dates to date data type
#None of the dataset have any null values, data in the columns is also in the expected range
#Revenue by date, by weekday/weekend
Todo #Add a new col for device age based on OS
Deliverables
- Revenue/Payout/Counts by country, network, device age
- Top 10 Apps by Revenue/Payout/Counts
- AOV/CTA by country/network/app/Device age
- Payout/Revenue/Counts by day, weekday/weekend
- Profitability by country/network (Ads spend vs revenue)
Dimensions Country, network, device age (no adpsend) , top 10 apps (no adspend), clients (ads only). Client ID may be same thing as app_id
Metrics Revenue, Payout, Counts (no of downloads), AOV, CTA
#Make note about user quality and commericial conditions which are not present but may be useful
#Joins
\Installs with revenue and payoutsZA
installs_revenue = installs.merge(revenue[["install_id","value_usd"]], on="install_id", how="inner")
installs_payouts = installs.merge(payouts[["install_id","value_usd"]], on="install_id", how="inner")
print(installs_revenue.info(show_counts=True))
print(installs_payouts.info(show_counts=True))
#Revenue/Payout/Counts by country, network, device age
#Top 10 Apps by Revenue/Payout/Counts
#AOV by country/network/app/Device age (Rev/Count of rev for that channel)
#Payout/Revenue/Counts by day, weekday/weekend
print(installs_revenue.groupby("country_id")["value_usd"].sum())
print(installs_payouts.groupby("country_id")["value_usd"].sum())
print(installs_revenue.groupby("country_id")["value_usd"].count())
print(installs_payouts.groupby("country_id")["value_usd"].count())
#CTA (Count of installs/Adspend for that channel)
print(adspend.groupby("country_id")["value_usd"].sum())
print(installs.groupby("country_id")["install_id"].count())
#Profitability (Revenue/Adspend for that channel)
print(adspend.groupby("country_id")["value_usd"].sum())
print(installs.groupby("country_id")["install_id"].count())
print(installs_payouts.groupby("country_id")["value_usd"])
dimensions = ["country_id","network_id", "device_cat", "app_id"]
country = pd.DataFrame({
"revenue": installs_revenue.groupby("country_id")["value_usd"].sum(),
"adspend": adspend.groupby("country_id")["value_usd"].sum(),
"payouts": installs_payouts.groupby("country_id")["value_usd"].sum(),
"installs": installs.groupby("country_id")["event_date"].count()
}
).reset_index()
print(country)