Skip to content
COS Case Study
  • AI Chat
  • Code
  • Report
  • #Import Packages
    import numpy as np
    import pandas as pd
    
    #Importing the files
    case_study_provider_jd_data = pd.read_csv('case_study_provider_jd_data.txt', sep='|')
    case_study_npi_file = pd.read_csv('case_study_npi_file.csv')
    case_study_prescription_data = pd.read_csv('case study_prescription_data.csv')
    
    #Handy List for when an operation needs to be performed on all dataframes
    df_list = [case_study_provider_jd_data, case_study_prescription_data, case_study_npi_file]
    
    #Dropping Unnecessary Columns and correcting naming conventions
    case_study_provider_jd_data.drop(["Unnamed: 0"], axis=1, inplace=True)
    case_study_prescription_data.drop(["Unnamed: 8"], axis=1, inplace=True)
    for i in df_list:
        i.columns = i.columns.str.strip()
        i.columns = i.columns.str.title()
        i.columns = i.columns.str.replace(" ", "_")
        
    #Clean/Transform License Numbers
    #Assumption: License Numbers need to be in the format : License Type-Number 
    case_study_provider_jd_data['License_Number'] = case_study_provider_jd_data["License_Type"].map(str) + "-" + case_study_provider_jd_data['License_Number'].astype('str').str.extract('(\d+)')[0]
    
    case_study_npi_file['Provider_License_Number'] = case_study_npi_file["Medicare_Provider_Type_Desc"].map(str) + "-" + case_study_npi_file['Provider_License_Number'].astype('str').str.extract('(\d+)')[0]
    
    case_study_prescription_data['License_Number'] = case_study_prescription_data["License_Type"].map(str) + "-" + case_study_prescription_data['License_Number'].astype('str').str.extract('(\d+)')[0]
    
    #Title Casing
    case_cols1=["Full_Name","First_Name","Middle_Name","Last_Name","City"]
    case_study_provider_jd_data[case_cols1] = case_study_provider_jd_data[case_cols1].apply(lambda x: x.astype(str).str.title())
    
    case_cols2=["Provider_First_Name","Provider_Middle_Name","Provider_Last_Name"]
    case_study_npi_file[case_cols2] = case_study_npi_file[case_cols2].apply(lambda x: x.astype(str).str.title())
    
    case_cols3=["Full_Name","City"]
    case_study_prescription_data[case_cols3] = case_study_prescription_data[case_cols3].apply(lambda x: x.astype(str).str.title())
    
    #Date Data Types
    date_cols1=["License_First_Issue_Date","License_Last_Renewed_Date","License_Expiration_Date"]
    for i in date_cols1:
        case_study_provider_jd_data[i] = pd.to_datetime(case_study_provider_jd_data[i])
    
    case_study_prescription_data["Last_Registration_Date"] = pd.to_datetime(case_study_prescription_data["Last_Registration_Date"])
    
    #Drop any duplicates/Null rows
    case_study_provider_jd_data = case_study_provider_jd_data.drop_duplicates().dropna(how="all")
    case_study_npi_file = case_study_npi_file.drop_duplicates().dropna(how="all")
    case_study_prescription_data = case_study_prescription_data.drop_duplicates().dropna(how="all")
    
    #Display a list of columns in all 3 dataframes and a sample of the data
    print(case_study_provider_jd_data.info())
    print(case_study_npi_file.info())
    print(case_study_prescription_data.info())
    print(case_study_provider_jd_data.head())
    print(case_study_npi_file.head())
    print(case_study_prescription_data.head())
    #case_study_provider_jd_data.isnull().sum()
    #case_study_provider_jd_data[case_study_provider_jd_data["City"].isnull()]
    mask = case_study_provider_jd_data[["City","State","Mail_Zip_Code"]].isnull().any(axis=1)
    case_study_provider_jd_data["License_Number"].unique()
    #case_study_npi_file["Provider_License_Number"].unique()
    #case_study_prescription_data["License_Number"].unique()
    case_study_provider_jd_data["Address_Line_2"].unique()
    
    #Merging All 3 data frames on License Numbers
    case_df1 = case_study_provider_jd_data.merge(case_study_npi_file, left_on="License_Number", right_on="Provider_License_Number", how="outer", suffixes=["_jd", "_npi"])\
    .merge(case_study_prescription_data, left_on="License_Number", right_on="License_Number", how="outer", suffixes=["_jd", "_pres"])
    
    #Create a FullName Column from the names in NPI File
    case_df1["Provider_Full_Name"] = case_df1["Provider_First_Name"] + " " + case_df1["Provider_Middle_Name"] + " " + case_df1["Provider_Last_Name"]
    
    #Coalesce fields that are repeated across the different files
    case_df1["Full_Name"] = case_df1[["Full_Name_jd","Provider_Full_Name","Full_Name_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["City"] = case_df1[["City_jd","City_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["State"] = case_df1[["State_jd","State_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["County"] = case_df1[["County_jd","County_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["License_Type"] = case_df1[["License_Type_jd","License_Type_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["License_Number"] = case_df1[["License_Number","Provider_License_Number"]].bfill(axis=1).iloc[:,0]
    
    #Drop extra fields after coalescing and reorder the remaining fields
    drop_list =["Provider_First_Name","Provider_Middle_Name","Provider_Last_Name","Provider_Full_Name","First_Name","Middle_Name","Last_Name","Full_Name_jd","Full_Name_pres","City_jd","City_pres","State_jd","State_pres","County_jd","County_pres","License_Type_jd","License_Type_pres","Provider_License_Number"]
    case_df1 = case_df1.drop(drop_list, axis=1)
    case_df1 = case_df1[["Full_Name","Address_Line_1","Address_Line_2","City","County","State","Mail_Zip_Code","Npi","License_Type","License_Number","Practice_Type","Prescription_Id","Last_Registration_Date","License_First_Issue_Date","License_Last_Renewed_Date","License_Expiration_Date","License_Status_Description","Healthcare_Provider_Taxonomy_Code","Provider_License_Number_State_Code","Healthcare_Provider_Primary_Taxonomy_Switch","Medicare_Specialty_Code","Medicare_Provider_Type_Desc","Provider_Taxonomy_Code","Provider_Taxonomy_Code_Desc","Record_Type"]]
    
    #Create a new DF for NPI and Pres IDs that are assigned to more than 1 provider
    npis = case_df1.drop_duplicates(subset=["Npi","Full_Name"])
    duplicate_npis = npis[npis.duplicated(subset=["Npi"], keep=False) & (~npis["Npi"].isnull())]
    
    pres = case_df1.drop_duplicates(subset=["Prescription_Id","Full_Name"])
    duplicate_pres = pres[pres.duplicated(subset=["Prescription_Id"], keep=False) & (~pres["Prescription_Id"].isnull())]
    
    duplicates = pd.concat([duplicate_npis,duplicate_pres])
    
    #Anti Join duplicates with the original merge to exclude the duplicated NPI and Prescription IDs
    final_df = case_df1.merge(duplicates, on="License_Number", indicator=True, how="outer")
    
    drop_list = case_df1.columns.to_list()
    drop_list = [i + "_y" for i in drop_list]
    drop_list+= ["_merge"]
    drop_list.remove("License_Number_y")
    final_df = final_df[final_df["_merge"]=="left_only"].drop(drop_list, axis=1)
    final_df.columns = final_df.columns.str.replace("_x","")
    #case_study_provider_jd_data["County"].describe()
    #case_study_npi_file["Medicare_Provider_Type_Desc"].value_counts()
    #case_study_provider_jd_data.columns = case_study_provider_jd_data.columns.str.strip()
    #case_study_provider_jd_data.info()
    
    #case_df1 = case_study_provider_jd_data.merge(case_study_npi_file, left_on="License_Number", right_on="Provider_License_Number", how="inner", suffixes=["_jd", "_npi"])\
    #.merge(case_study_prescription_data, on="License_Number", how="inner", suffixes=["_jd","_pre"])
    #case_df1.info()
    
    #case_df2 = case_study_provider_jd_data.merge(case_study_npi_file, left_on=["Last_Name","First_Name"], right_on=["Provider_Last_Name","Provider_First_Name"], how="outer", suffixes=["_jd", "_npi"])
    
    #names = pd.DataFrame()
    #names[["Last_Name", "fm_name"]] = case_study_prescription_data["Full_Name"].str.split(pat = ',', expand=True)
    #names[["First_Name","Middle_Name"]] = names["fm_name"].str.split(n=1, expand=True)
    #names.drop(["fm_name"], axis=1)
    #case_study_prescription_data[["Last_Name","First_Name", "Middle_Name"]] = names[["Last_Name","First_Name", "Middle_Name"]]
    
    #case_df2 = case_study_provider_jd_data.merge(case_study_npi_file, left_on=["Last_Name","First_Name"], right_on=["Provider_Last_Name","Provider_First_Name"], how="outer", suffixes=["_jd", "_npi"]).merge(case_study_provider_jd_data, on=["Last_Name","First_Name"])
    
    #case_study_prescription_data["Last_Name"] = case_study_prescription_data["Full_Name"].str.split(pat = ',')[]
    #.merge(case_study_prescription_data, on="Full_NameNumber", how="inner", suffixes=["_jd","_pre"])
    #case_study_prescription_data.head()
    #case_df2.info()
    names = pd.DataFrame()
    names[["Last_Name3", "fm_name"]] = case_study_prescription_data["Full_Name"].str.split(pat = ',', expand=True)
    names[["First_Name3","Middle_Name3"]] = names["fm_name"].str.split(n=1, expand=True)
    names.drop(["fm_name"], axis=1)
    case_study_prescription_data[["Last_Name_pres","First_Name_pres", "Middle_Name_pres"]] = names[["Last_Name","First_Name", "Middle_Name"]]
    
    case_study_npi_file["Provider_Full_Name"] = case_study_npi_file["Provider_First_Name"].str.cat(case_study_npi_file["Provider_Middle_Name"], sep=" ").str.cat(case_study_npi_file["Provider_Last_Name"])
    
    case_df1 = case_study_provider_jd_data.merge(case_study_npi_file, left_on="License_Number", right_on="Provider_License_Number", how="outer", suffixes=["_jd", "_npi"])\
    .merge(case_study_prescription_data, left_on="License_Number", right_on="License_Number", how="outer", suffixes=["_jd", "_pres"])
    case_df1["First_Name"] = case_df1[["First_Name_jd","Provider_First_Name", "First_Name_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["Last_Name"] = case_df1[["Last_Name_jd","Provider_Last_Name", "Last_Name_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["Middle_Name"] = case_df1[["Middle_Name_jd","Provider_Middle_Name","Middle_Name_pres"]].bfill(axis=1).iloc[:,0]
    
    
    case_df1["City"] = case_df1[["City_jd","City_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["State"] = case_df1[["State_jd","State_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["County"] = case_df1[["County_jd","County_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["License_Type"] = case_df1[["License_Type_jd","License_Type_pres"]].bfill(axis=1).iloc[:,0]
    case_df1["License_Number"] = case_df1[["License_Number","Provider_License_Number"]].bfill(axis=1).iloc[:,0]
    
    drop_list =["Provider_First_Name","Provider_Middle_Name","Provider_Last_Name","Provider_Full_Name","First_Name_jd","Middle_Name_jd","Last_Name_jd","Full_Name_jd","First_Name_pres","Middle_Name_pres","Last_Name_pres","Full_Name_pres","City_jd","City_pres","State_jd","State_pres","County_jd","County_pres","License_Type_jd","License_Type_pres","Provider_License_Number"]
    case_df1 = case_df1.drop(drop_list, axis=1)
    case_df1 = case_df1.drop_duplicates()
    case_df1 = case_df1[["Full_Name","First_Name","Middle_Name","Last_Name","Address_Line_1","Address_Line_2","City","County","State","Mail_Zip_Code","Npi","License_Type","License_Number","Practice_Type","Prescription_Id","Last_Registration_Date","License_First_Issue_Date","License_Last_Renewed_Date","License_Expiration_Date","License_Status_Description","Healthcare_Provider_Taxonomy_Code","Provider_License_Number_State_Code","Healthcare_Provider_Primary_Taxonomy_Switch","Medicare_Specialty_Code","Medicare_Provider_Type_Desc","Provider_Taxonomy_Code","Provider_Taxonomy_Code_Desc","Record_Type"]]
    case_df1.info()
    
    
    
    
    #case_df1 = case_df1[~(case_df1.duplicated(subset=["NPI"], keep=False) | case_df1.duplicated(subset=["prescription_id"], keep=False))]
    #case_df1.info()
    
    #case_df2 = case_df1[case_df1.duplicated(subset=["NPI"], keep=False) | case_df1.duplicated(subset=["Prescription_Id"], keep=False)]
    case_df1
    Unknown integration
    DataFrameavailable as
    df3
    variable
    Run cancelled
    SELECT
    	c.CustomerID,
    	c.CustomerName,
    	p.ProductID,
    	p.ProductName
    	FROM
    	'Q3_Customers.csv' AS c
    	LEFT JOIN
    	(
    	SELECT 
    		CustomerID,
    		ProductID,
    		RANK() OVER(
    			PARTITION BY CustomerID 
    			ORDER BY SUM(Quantity) DESC 
    			RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    		) AS r_no
    		FROM 'Q3_Invoices.csv'
    		GROUP BY CustomerID, ProductID
    		ORDER BY CustomerID, r_no
    	) AS s1
    	ON c.CustomerID = s1.CustomerID
    	LEFT JOIN
    	'Q3_Products2.csv' AS p
    	ON p.ProductID = s1.ProductID
    	WHERE r_no <=2
    	ORDER BY c.CustomerID, s1.r_no, p.Price
    This query is taking long to finish...Consider adding a LIMIT clause or switching to Query mode to preview the result.
    Unknown integration
    DataFrameavailable as
    df2
    variable
    Run cancelled
    SELECT
    	CustomerID,
    	ProductID
    FROM
    	(
    	SELECT 
    		CustomerID,
    		ProductID,
    		ROW_NUMBER() OVER(PARTITION BY CustomerID ORDER BY s1.n_orders DESC) AS r_no
    		FROM 
    		(
    		SELECT
    			CustomerID,
    			ProductID,
    			SUM(Quantity) AS n_orders
    		FROM 'Q3_Invoices.csv'
    		GROUP BY CustomerID,ProductID
    			) as s1
    		ORDER BY CustomerID, r_no
    	) as s2
    WHERE r_no <=2
    This query is taking long to finish...Consider adding a LIMIT clause or switching to Query mode to preview the result.
    Unknown integration
    DataFrameavailable as
    df4
    variable
    Run cancelled
    SELECT
    	c.CustomerID,
    	c.CustomerName,
    	p.ProductID,
    	p.ProductName,
    	s1.instances_purchased
    	FROM
    	'Q3_Customers.csv' AS c
    	LEFT JOIN
    	(
    	SELECT 
    		CustomerID,
    		ProductID,
    		SUM(Quantity) AS instances_purchased,
    		RANK() OVER(
    			PARTITION BY CustomerID 
    			ORDER BY SUM(Quantity) DESC
    			RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    		) AS r_no
    		FROM 'Q3_Invoices.csv'
    		GROUP BY CustomerID, ProductID
    		ORDER BY CustomerID, r_no
    	) AS s1
    	ON c.CustomerID = s1.CustomerID
    	LEFT JOIN
    	'Q3_Products2.csv' AS p
    	ON p.ProductID = s1.ProductID
    	WHERE r_no <=2
    	ORDER BY c.CustomerID, s1.r_no, p.Price
    This query is taking long to finish...Consider adding a LIMIT clause or switching to Query mode to preview the result.
    Run cancelled
    #Q2
    
    def test_valid(id_num, id_name):
        return (test_name(id_num, id_name), test_num(id_num))
    
    def test_name(id_num, id_name):
        if id_num[1] == id_name[0]:
            return "Valid"
        return "Invalid"
    
    def test_num(id_num):
        if((
            (
                int(id_num[3]) +
                int(id_num[5]) +
                int(id_num[7])
            ) * 2 +
            (
                int(id_num[2]) +
                int(id_num[4]) +
                int(id_num[6])
            )
        )%10 == int(id_num[8])):
            return "Valid"
        return "Invalid"
    
    pres = pd.read_csv("Q2_Pres.csv")
    pres[["name_match_test","id_num_test"]] = pd.DataFrame(
      pres.apply(lambda row:test_valid(row["id_number"], row["Name"]), axis=1).tolist()
    )
    pres
    #pres["name_match_test"] = pres.apply(lambda row:test_name(row["id_number"],row["Name"]), axis=1)
    #pres["id_num_test"] = pres.apply(lambda row:test_num(row["id_number"]), axis=1)
    Run cancelled
    names = pd.DataFrame()
    names[["Last_Name", "fm_name"]] = case_study_prescription_data["Full_Name"].str.split(pat = ',', expand=True)
    names[["First_Name","Middle_Name"]] = names["fm_name"].str.split(n=1, expand=True)
    names.drop(["fm_name"], axis=1)