COS Case Study — DataLab

#Import Packages
import numpy as np
import pandas as pd

#Importing the files
case_study_provider_jd_data = pd.read_csv('case_study_provider_jd_data.txt', sep='|')
case_study_npi_file = pd.read_csv('case_study_npi_file.csv')
case_study_prescription_data = pd.read_csv('case study_prescription_data.csv')

#Handy List for when an operation needs to be performed on all dataframes
df_list = [case_study_provider_jd_data, case_study_prescription_data, case_study_npi_file]

#Dropping Unnecessary Columns and correcting naming conventions
case_study_provider_jd_data.drop(["Unnamed: 0"], axis=1, inplace=True)
case_study_prescription_data.drop(["Unnamed: 8"], axis=1, inplace=True)
for i in df_list:
    i.columns = i.columns.str.strip()
    i.columns = i.columns.str.title()
    i.columns = i.columns.str.replace(" ", "_")
    
#Clean/Transform License Numbers
#Assumption: License Numbers need to be in the format : License Type-Number 
case_study_provider_jd_data['License_Number'] = case_study_provider_jd_data["License_Type"].map(str) + "-" + case_study_provider_jd_data['License_Number'].astype('str').str.extract('(\d+)')[0]

case_study_npi_file['Provider_License_Number'] = case_study_npi_file["Medicare_Provider_Type_Desc"].map(str) + "-" + case_study_npi_file['Provider_License_Number'].astype('str').str.extract('(\d+)')[0]

case_study_prescription_data['License_Number'] = case_study_prescription_data["License_Type"].map(str) + "-" + case_study_prescription_data['License_Number'].astype('str').str.extract('(\d+)')[0]

#Title Casing
case_cols1=["Full_Name","First_Name","Middle_Name","Last_Name","City"]
case_study_provider_jd_data[case_cols1] = case_study_provider_jd_data[case_cols1].apply(lambda x: x.astype(str).str.title())

case_cols2=["Provider_First_Name","Provider_Middle_Name","Provider_Last_Name"]
case_study_npi_file[case_cols2] = case_study_npi_file[case_cols2].apply(lambda x: x.astype(str).str.title())

case_cols3=["Full_Name","City"]
case_study_prescription_data[case_cols3] = case_study_prescription_data[case_cols3].apply(lambda x: x.astype(str).str.title())

#Date Data Types
date_cols1=["License_First_Issue_Date","License_Last_Renewed_Date","License_Expiration_Date"]
for i in date_cols1:
    case_study_provider_jd_data[i] = pd.to_datetime(case_study_provider_jd_data[i])

case_study_prescription_data["Last_Registration_Date"] = pd.to_datetime(case_study_prescription_data["Last_Registration_Date"])

#Drop any duplicates/Null rows
case_study_provider_jd_data = case_study_provider_jd_data.drop_duplicates().dropna(how="all")
case_study_npi_file = case_study_npi_file.drop_duplicates().dropna(how="all")
case_study_prescription_data = case_study_prescription_data.drop_duplicates().dropna(how="all")

#Display a list of columns in all 3 dataframes and a sample of the data
print(case_study_provider_jd_data.info())
print(case_study_npi_file.info())
print(case_study_prescription_data.info())
print(case_study_provider_jd_data.head())
print(case_study_npi_file.head())
print(case_study_prescription_data.head())

#case_study_provider_jd_data.isnull().sum()
#case_study_provider_jd_data[case_study_provider_jd_data["City"].isnull()]
mask = case_study_provider_jd_data[["City","State","Mail_Zip_Code"]].isnull().any(axis=1)
case_study_provider_jd_data["License_Number"].unique()
#case_study_npi_file["Provider_License_Number"].unique()
#case_study_prescription_data["License_Number"].unique()
case_study_provider_jd_data["Address_Line_2"].unique()

#Merging All 3 data frames on License Numbers
case_df1 = case_study_provider_jd_data.merge(case_study_npi_file, left_on="License_Number", right_on="Provider_License_Number", how="outer", suffixes=["_jd", "_npi"])\
.merge(case_study_prescription_data, left_on="License_Number", right_on="License_Number", how="outer", suffixes=["_jd", "_pres"])

#Create a FullName Column from the names in NPI File
case_df1["Provider_Full_Name"] = case_df1["Provider_First_Name"] + " " + case_df1["Provider_Middle_Name"] + " " + case_df1["Provider_Last_Name"]

#Coalesce fields that are repeated across the different files
case_df1["Full_Name"] = case_df1[["Full_Name_jd","Provider_Full_Name","Full_Name_pres"]].bfill(axis=1).iloc[:,0]
case_df1["City"] = case_df1[["City_jd","City_pres"]].bfill(axis=1).iloc[:,0]
case_df1["State"] = case_df1[["State_jd","State_pres"]].bfill(axis=1).iloc[:,0]
case_df1["County"] = case_df1[["County_jd","County_pres"]].bfill(axis=1).iloc[:,0]
case_df1["License_Type"] = case_df1[["License_Type_jd","License_Type_pres"]].bfill(axis=1).iloc[:,0]
case_df1["License_Number"] = case_df1[["License_Number","Provider_License_Number"]].bfill(axis=1).iloc[:,0]

#Drop extra fields after coalescing and reorder the remaining fields
drop_list =["Provider_First_Name","Provider_Middle_Name","Provider_Last_Name","Provider_Full_Name","First_Name","Middle_Name","Last_Name","Full_Name_jd","Full_Name_pres","City_jd","City_pres","State_jd","State_pres","County_jd","County_pres","License_Type_jd","License_Type_pres","Provider_License_Number"]
case_df1 = case_df1.drop(drop_list, axis=1)
case_df1 = case_df1[["Full_Name","Address_Line_1","Address_Line_2","City","County","State","Mail_Zip_Code","Npi","License_Type","License_Number","Practice_Type","Prescription_Id","Last_Registration_Date","License_First_Issue_Date","License_Last_Renewed_Date","License_Expiration_Date","License_Status_Description","Healthcare_Provider_Taxonomy_Code","Provider_License_Number_State_Code","Healthcare_Provider_Primary_Taxonomy_Switch","Medicare_Specialty_Code","Medicare_Provider_Type_Desc","Provider_Taxonomy_Code","Provider_Taxonomy_Code_Desc","Record_Type"]]

#Create a new DF for NPI and Pres IDs that are assigned to more than 1 provider
npis = case_df1.drop_duplicates(subset=["Npi","Full_Name"])
duplicate_npis = npis[npis.duplicated(subset=["Npi"], keep=False) & (~npis["Npi"].isnull())]

pres = case_df1.drop_duplicates(subset=["Prescription_Id","Full_Name"])
duplicate_pres = pres[pres.duplicated(subset=["Prescription_Id"], keep=False) & (~pres["Prescription_Id"].isnull())]

duplicates = pd.concat([duplicate_npis,duplicate_pres])

#Anti Join duplicates with the original merge to exclude the duplicated NPI and Prescription IDs
final_df = case_df1.merge(duplicates, on="License_Number", indicator=True, how="outer")

drop_list = case_df1.columns.to_list()
drop_list = [i + "_y" for i in drop_list]
drop_list+= ["_merge"]
drop_list.remove("License_Number_y")
final_df = final_df[final_df["_merge"]=="left_only"].drop(drop_list, axis=1)
final_df.columns = final_df.columns.str.replace("_x","")

#case_study_provider_jd_data["County"].describe()
#case_study_npi_file["Medicare_Provider_Type_Desc"].value_counts()
#case_study_provider_jd_data.columns = case_study_provider_jd_data.columns.str.strip()
#case_study_provider_jd_data.info()

#case_df1 = case_study_provider_jd_data.merge(case_study_npi_file, left_on="License_Number", right_on="Provider_License_Number", how="inner", suffixes=["_jd", "_npi"])\
#.merge(case_study_prescription_data, on="License_Number", how="inner", suffixes=["_jd","_pre"])
#case_df1.info()

#case_df2 = case_study_provider_jd_data.merge(case_study_npi_file, left_on=["Last_Name","First_Name"], right_on=["Provider_Last_Name","Provider_First_Name"], how="outer", suffixes=["_jd", "_npi"])

#names = pd.DataFrame()
#names[["Last_Name", "fm_name"]] = case_study_prescription_data["Full_Name"].str.split(pat = ',', expand=True)
#names[["First_Name","Middle_Name"]] = names["fm_name"].str.split(n=1, expand=True)
#names.drop(["fm_name"], axis=1)
#case_study_prescription_data[["Last_Name","First_Name", "Middle_Name"]] = names[["Last_Name","First_Name", "Middle_Name"]]

#case_df2 = case_study_provider_jd_data.merge(case_study_npi_file, left_on=["Last_Name","First_Name"], right_on=["Provider_Last_Name","Provider_First_Name"], how="outer", suffixes=["_jd", "_npi"]).merge(case_study_provider_jd_data, on=["Last_Name","First_Name"])

#case_study_prescription_data["Last_Name"] = case_study_prescription_data["Full_Name"].str.split(pat = ',')[]
#.merge(case_study_prescription_data, on="Full_NameNumber", how="inner", suffixes=["_jd","_pre"])
#case_study_prescription_data.head()
#case_df2.info()

names = pd.DataFrame()
names[["Last_Name3", "fm_name"]] = case_study_prescription_data["Full_Name"].str.split(pat = ',', expand=True)
names[["First_Name3","Middle_Name3"]] = names["fm_name"].str.split(n=1, expand=True)
names.drop(["fm_name"], axis=1)
case_study_prescription_data[["Last_Name_pres","First_Name_pres", "Middle_Name_pres"]] = names[["Last_Name","First_Name", "Middle_Name"]]

case_study_npi_file["Provider_Full_Name"] = case_study_npi_file["Provider_First_Name"].str.cat(case_study_npi_file["Provider_Middle_Name"], sep=" ").str.cat(case_study_npi_file["Provider_Last_Name"])

case_df1 = case_study_provider_jd_data.merge(case_study_npi_file, left_on="License_Number", right_on="Provider_License_Number", how="outer", suffixes=["_jd", "_npi"])\
.merge(case_study_prescription_data, left_on="License_Number", right_on="License_Number", how="outer", suffixes=["_jd", "_pres"])
case_df1["First_Name"] = case_df1[["First_Name_jd","Provider_First_Name", "First_Name_pres"]].bfill(axis=1).iloc[:,0]
case_df1["Last_Name"] = case_df1[["Last_Name_jd","Provider_Last_Name", "Last_Name_pres"]].bfill(axis=1).iloc[:,0]
case_df1["Middle_Name"] = case_df1[["Middle_Name_jd","Provider_Middle_Name","Middle_Name_pres"]].bfill(axis=1).iloc[:,0]


case_df1["City"] = case_df1[["City_jd","City_pres"]].bfill(axis=1).iloc[:,0]
case_df1["State"] = case_df1[["State_jd","State_pres"]].bfill(axis=1).iloc[:,0]
case_df1["County"] = case_df1[["County_jd","County_pres"]].bfill(axis=1).iloc[:,0]
case_df1["License_Type"] = case_df1[["License_Type_jd","License_Type_pres"]].bfill(axis=1).iloc[:,0]
case_df1["License_Number"] = case_df1[["License_Number","Provider_License_Number"]].bfill(axis=1).iloc[:,0]

drop_list =["Provider_First_Name","Provider_Middle_Name","Provider_Last_Name","Provider_Full_Name","First_Name_jd","Middle_Name_jd","Last_Name_jd","Full_Name_jd","First_Name_pres","Middle_Name_pres","Last_Name_pres","Full_Name_pres","City_jd","City_pres","State_jd","State_pres","County_jd","County_pres","License_Type_jd","License_Type_pres","Provider_License_Number"]
case_df1 = case_df1.drop(drop_list, axis=1)
case_df1 = case_df1.drop_duplicates()
case_df1 = case_df1[["Full_Name","First_Name","Middle_Name","Last_Name","Address_Line_1","Address_Line_2","City","County","State","Mail_Zip_Code","Npi","License_Type","License_Number","Practice_Type","Prescription_Id","Last_Registration_Date","License_First_Issue_Date","License_Last_Renewed_Date","License_Expiration_Date","License_Status_Description","Healthcare_Provider_Taxonomy_Code","Provider_License_Number_State_Code","Healthcare_Provider_Primary_Taxonomy_Switch","Medicare_Specialty_Code","Medicare_Provider_Type_Desc","Provider_Taxonomy_Code","Provider_Taxonomy_Code_Desc","Record_Type"]]
case_df1.info()




#case_df1 = case_df1[~(case_df1.duplicated(subset=["NPI"], keep=False) | case_df1.duplicated(subset=["prescription_id"], keep=False))]
#case_df1.info()

#case_df2 = case_df1[case_df1.duplicated(subset=["NPI"], keep=False) | case_df1.duplicated(subset=["Prescription_Id"], keep=False)]
case_df1

DataFrameas

df3

variable

Run cancelled

SELECT
	c.CustomerID,
	c.CustomerName,
	p.ProductID,
	p.ProductName
	FROM
	'Q3_Customers.csv' AS c
	LEFT JOIN
	(
	SELECT 
		CustomerID,
		ProductID,
		RANK() OVER(
			PARTITION BY CustomerID 
			ORDER BY SUM(Quantity) DESC 
			RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
		) AS r_no
		FROM 'Q3_Invoices.csv'
		GROUP BY CustomerID, ProductID
		ORDER BY CustomerID, r_no
	) AS s1
	ON c.CustomerID = s1.CustomerID
	LEFT JOIN
	'Q3_Products2.csv' AS p
	ON p.ProductID = s1.ProductID
	WHERE r_no <=2
	ORDER BY c.CustomerID, s1.r_no, p.Price

DataFrameas

df2

variable

Run cancelled

SELECT
	CustomerID,
	ProductID
FROM
	(
	SELECT 
		CustomerID,
		ProductID,
		ROW_NUMBER() OVER(PARTITION BY CustomerID ORDER BY s1.n_orders DESC) AS r_no
		FROM 
		(
		SELECT
			CustomerID,
			ProductID,
			SUM(Quantity) AS n_orders
		FROM 'Q3_Invoices.csv'
		GROUP BY CustomerID,ProductID
			) as s1
		ORDER BY CustomerID, r_no
	) as s2
WHERE r_no <=2

DataFrameas

df4

variable

Run cancelled

SELECT
	c.CustomerID,
	c.CustomerName,
	p.ProductID,
	p.ProductName,
	s1.instances_purchased
	FROM
	'Q3_Customers.csv' AS c
	LEFT JOIN
	(
	SELECT 
		CustomerID,
		ProductID,
		SUM(Quantity) AS instances_purchased,
		RANK() OVER(
			PARTITION BY CustomerID 
			ORDER BY SUM(Quantity) DESC
			RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
		) AS r_no
		FROM 'Q3_Invoices.csv'
		GROUP BY CustomerID, ProductID
		ORDER BY CustomerID, r_no
	) AS s1
	ON c.CustomerID = s1.CustomerID
	LEFT JOIN
	'Q3_Products2.csv' AS p
	ON p.ProductID = s1.ProductID
	WHERE r_no <=2
	ORDER BY c.CustomerID, s1.r_no, p.Price

Run cancelled

#Q2

def test_valid(id_num, id_name):
    return (test_name(id_num, id_name), test_num(id_num))

def test_name(id_num, id_name):
    if id_num[1] == id_name[0]:
        return "Valid"
    return "Invalid"

def test_num(id_num):
    if((
        (
            int(id_num[3]) +
            int(id_num[5]) +
            int(id_num[7])
        ) * 2 +
        (
            int(id_num[2]) +
            int(id_num[4]) +
            int(id_num[6])
        )
    )%10 == int(id_num[8])):
        return "Valid"
    return "Invalid"

pres = pd.read_csv("Q2_Pres.csv")
pres[["name_match_test","id_num_test"]] = pd.DataFrame(
  pres.apply(lambda row:test_valid(row["id_number"], row["Name"]), axis=1).tolist()
)
pres
#pres["name_match_test"] = pres.apply(lambda row:test_name(row["id_number"],row["Name"]), axis=1)
#pres["id_num_test"] = pres.apply(lambda row:test_num(row["id_number"]), axis=1)

Run cancelled

names = pd.DataFrame()
names[["Last_Name", "fm_name"]] = case_study_prescription_data["Full_Name"].str.split(pat = ',', expand=True)
names[["First_Name","Middle_Name"]] = names["fm_name"].str.split(n=1, expand=True)
names.drop(["fm_name"], axis=1)