Skip to content

Sales Data Analyst Take-Home Test

Part 1: CMS Data Exercise

# import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# load staffing dataset
staffing_data = pd.read_csv('PBJ_Daily_Nurse_Staffing_Q1_2024.csv', encoding='ISO-8859-1')
staffing_data.shape
#load provider dataset
provider_info = pd.read_csv('NH_ProviderInfo_Oct2024.csv', encoding='ISO-8859-1')
provider_info.shape
# check for missing values 
staffing_data.isna().sum()
# Rename the 1st column as it is has unwanted strings 
staffing_data.rename(columns={'PROVNUM': 'PROVNUM'}, inplace=True)

# Explicitly cast columns to appropriate data types
staffing_data = staffing_data.astype({
    'PROVNUM': 'string',
    'PROVNAME': 'string',
    'CITY': 'string',
    'STATE': 'string',
    'COUNTY_NAME': 'string',
    'CY_Qtr': 'string',
    'COUNTY_FIPS': 'int64',
    'WorkDate': 'int64',
    'MDScensus': 'int64'
})

# Ensure all float columns are of type float
float_columns = [
    col for col in staffing_data.columns 
    if col.startswith('Hrs_') or col.startswith('Contractor_Prop_')
]
staffing_data[float_columns] = staffing_data[float_columns].astype('float64')

# Verify the updated data types
print(staffing_data.dtypes)
staffing_data.head(10)
# Define roles to summarize
roles = ['RNDON', 'RNadmin', 'RN', 'LPNadmin', 'LPN', 'CNA', 'NAtrn', 'MedAide']

# Create a dictionary for aggregated totals for all roles
aggregations = {}
aggregations['PROVNAME'] = 'first'  # Keep the first (and only) PROVNAME for each PROVNUM

# Add aggregation for each role: total hours, employee hours, and contractor hours
for role in roles:
    aggregations[f'Hrs_{role}'] = 'sum'          # Total hours for the role
    aggregations[f'Hrs_{role}_emp'] = 'sum'      # Total employee hours for the role
    aggregations[f'Hrs_{role}_ctr'] = 'sum'      # Total contractor hours for the role

# Group by PROVNUM and calculate the summary
summary_df = staffing_data.groupby('PROVNUM').agg(aggregations).reset_index()

# Add totals across all roles
summary_df['Total_Hours_AllRoles'] = summary_df[[f'Hrs_{role}' for role in roles]].sum(axis=1)
summary_df['Total_Employee_Hours'] = summary_df[[f'Hrs_{role}_emp' for role in roles]].sum(axis=1)
summary_df['Total_Contractor_Hours'] = summary_df[[f'Hrs_{role}_ctr' for role in roles]].sum(axis=1)

# Calculate contractor utilization ratio
summary_df['Contractor_Utilization_Ratio'] = summary_df['Total_Contractor_Hours'] / summary_df['Total_Hours_AllRoles']

# Display the resulting summary DataFrame
#summary_df.head()
# Sort by Total_Contractor_Hours in descending order
#sorted_summary = summary_df.sort_values(by='Contractor_Utilization_Ratio', ascending=False).reset_index(drop=True)

# Display the top results
#sorted_summary.head()

Contractor Utilization Distribution

# Plot the distribution of Contractor Utilization Ratio
plt.figure(figsize=(10, 6))
plt.hist(summary_df['Contractor_Utilization_Ratio'], bins=10, color='skyblue', edgecolor='black')
plt.title('Distribution of Contractor Utilization Ratio')
plt.xlabel('Contractor Utilization Ratio')
plt.ylabel('Number of Providers')
plt.grid(True)
plt.show()

Explanation: Contractor_Utilization_Ratio: This is the key metric representing the fraction of total hours worked by contractors. The histogram shows the general distribution of contractor utilization across all providers. Most providers do not use the contract they use employees.

Analysis of the Histogram: Distribution of Contractor Utilization Ratio

Observations:

  1. Dominance of Low Contractor Utilization:
  • The majority of providers have a contractor utilization ratio close to 0.0, indicating a heavy reliance on employee staff rather than contractors.
  • The distribution is heavily right-skewed, with most providers using minimal or no contractors.
  1. Gradual Decline in Higher Utilization Ratios:
  • A smaller number of providers have contractor utilization ratios in the range of 0.1 to 0.3.
  • Very few providers exceed a contractor utilization ratio of 0.4, suggesting limited reliance on contractors.
  1. Rare Full Contractor Usage:
  • Almost no providers have a contractor utilization ratio approaching 1.0, indicating that fully contractor-based - staffing is not common in the industry.

Top Providers with High Contractor Utilization

# Sort by Contractor Utilization Ratio to find the top 10 providers with the highest utilization
top_providers_contractor_utilization = summary_df[['PROVNUM', 'PROVNAME', 'Contractor_Utilization_Ratio']].sort_values('Contractor_Utilization_Ratio', ascending=False).head(20)

top_providers_contractor_utilization