
Dive into the heart of data science with a project that combines healthcare insights and predictive analytics. As a Data Scientist at a top Health Insurance company, you have the opportunity to predict customer healthcare costs using the power of machine learning. Your insights will help tailor services and guide customers in planning their healthcare expenses more effectively.

Dataset Summary

Meet your primary tool: the insurance.csv dataset. Packed with information on health insurance customers, this dataset is your key to unlocking patterns in healthcare costs. Here's what you need to know about the data you'll be working with:

insurance.csv

| Column   | Data Type | Description                                                               |
|----------|-----------|---------------------------------------------------------------------------|
| age      | int       | Age of the primary beneficiary.                                           |
| sex      | object    | Gender of the insurance contractor (male or female).                      |
| bmi      | float     | Body mass index, a key indicator of body fat based on height and weight.  |
| children | int       | Number of dependents covered by the insurance plan.                       |
| smoker   | object    | Indicates whether the beneficiary smokes (yes or no).                     |
| region   | object    | The beneficiary's residential area in the US, divided into four regions.  |
| charges  | float     | Individual medical costs billed by health insurance.                      |

A bit of data cleaning is needed before the dataset is ready for modeling. Once your model is built using the insurance.csv dataset, the next step is to apply it to validation_dataset.csv. This new dataset, similar to your training data but without the charges column, tests your model's accuracy and real-world utility by predicting costs for new customers.

Let's Get Started!

This project is your playground for applying data science in a meaningful way, offering insights that have real-world applications. Ready to explore the data and uncover insights that could revolutionize healthcare planning? Let's begin this exciting journey!

# Re-run this cell
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Loading the insurance dataset
insurance_data_path = 'insurance.csv'
insurance = pd.read_csv(insurance_data_path)
insurance.head()
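Before cleaning, it is worth a quick, non-destructive look at the raw values, since the cleaning functions below are written for a particular kind of messiness. This sketch assumes what those functions imply: negative or float-formatted ages, several spellings of the sex labels, mixed-case regions, and "$"-prefixed charges.

# Peek at the raw values the cleaning functions below are written for
print(insurance.dtypes)
print(insurance["sex"].unique())
print(insurance["region"].unique())
print(insurance["charges"].head())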

# Drop rows with missing values before cleaning
insurance.dropna(inplace=True)

# Functions to clean the dataset
def clean_age(insurance):
    # Ages can arrive as strings such as "-25" or "25.0"; treat the minus
    # sign as a data-entry error and coerce to whole years
    insurance["age"] = pd.to_numeric(insurance["age"]).abs().astype(int)

def clean_sex(insurance):
    # Normalize the alternative spellings of the two sex labels
    sex_rep = {"woman": "female", "F": "female", "man": "male", "M": "male"}
    insurance["sex"] = insurance["sex"].replace(sex_rep)

def clean_bmi(insurance):
    # Strip stray whitespace and convert BMI to numeric
    insurance["bmi"] = pd.to_numeric(insurance["bmi"].astype(str).str.strip())

def clean_children(insurance):
    # Same idea as clean_age: drop spurious minus signs and decimals
    insurance["children"] = pd.to_numeric(insurance["children"]).abs().astype(int)

def clean_region(insurance):
    # Normalize region names to lowercase
    insurance["region"] = insurance["region"].str.lower()

def clean_charges(insurance):
    # Strip the "$" prefix and convert charges to float
    insurance["charges"] = insurance["charges"].str.replace("$", "", regex=False).astype(float)
    
#Data cleaning process
clean_age(insurance)
clean_bmi(insurance)
clean_charges(insurance)
clean_region(insurance)
clean_children(insurance)
clean_sex(insurance)

# Drop any rows that still contain NaN values after cleaning
insurance.dropna(inplace=True)

# Function to one-hot encode the categorical features
def dumm(insurance):
    dum_var = ["sex", "smoker", "region"]
    return pd.get_dummies(insurance, columns=dum_var)
    
#Encoding process
insurance = dumm(insurance)

#Features and target 
features = insurance.drop("charges", axis=1)
target = insurance["charges"]

#scaling the features 
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
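One caveat: fitting the scaler on all rows before cross-validation lets each validation fold see statistics computed partly from its own rows. A leakage-free alternative is to scale inside each fold with a scikit-learn Pipeline. A minimal sketch, reusing Ridge with alpha=0.4 purely for illustration:

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

# The scaler is re-fitted on the training rows of each CV fold, so
# validation rows never influence its mean/std
pipe = make_pipeline(StandardScaler(), Ridge(alpha=0.4, random_state=42))
leak_free_scores = cross_val_score(pipe, features, target, cv=5, scoring="r2")
print(leak_free_scores.mean())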



# Implement model creation and training here
# Use as many cells as you need
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor 


# Models to train on the data
linreg = LinearRegression()
lass = Lasso(alpha=0.4, random_state=42)
ridge = Ridge(alpha=0.4, random_state=42)
dt = DecisionTreeRegressor(max_depth=3, min_samples_leaf=2, min_samples_split=2, random_state=42)
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, random_state=42)
knr = KNeighborsRegressor(n_neighbors=5)
models = {"LinearRegression": linreg, "Lasso": lass, "Ridge": ridge,
          "DecisionTreeRegressor": dt, "RandomForest": rf,
          "GradientBoosting": gbr, "KNeighbors": knr}

# Cross-validated R2 score for each model
model_score_dict = {}
for name, model in models.items():
    score = cross_val_score(model, features_scaled, target, cv=5, scoring="r2", error_score="raise")
    model_score_dict[name] = score.mean()
print(f"These are the R2 scores of the models:\n{model_score_dict}")


#r2 score for the linear regression    
r2_score = model_score_dict["LinearRegression"]
print(f"\nThis is the r2 score of LinearRegression: {r2_score}")  
    
#Cleaning the validation data
valid = pd.read_csv("validation_dataset.csv")
clean_age(valid)
clean_bmi(valid)
clean_region(valid)
clean_sex(valid)
clean_children(valid)

# Encoding the validation data
valid_dum = dumm(valid)

# Reuse the scaler fitted on the training features; re-fitting here would
# scale the validation data with different statistics
valid_scaled = scaler.transform(valid_dum)
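If the validation file is missing any category seen in training (say, only three regions appear), get_dummies will produce fewer columns than the scaler expects. A defensive sketch that aligns the validation columns to the training layout before scaling, where features is the encoded training frame from above:

# Add any missing dummy columns as zeros and enforce training column order
valid_dum = valid_dum.reindex(columns=features.columns, fill_value=0)
valid_scaled = scaler.transform(valid_dum)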

# Predict charges for the validation data with each trained model
pred_dict = {}
for name,model in models.items():
    model.fit(features_scaled, target)
    pred = model.predict(valid_scaled)
    pred_dict[name] = pred
    
# Build one DataFrame per model with its predicted charges appended,
# flooring every prediction at a minimum charge of 1000
def make_pred_df(valid_data, predict_dict):
    data_model = {}
    for name, pred in predict_dict.items():
        new_df = valid_data.copy()
        new_df[name] = pred
        new_df[name] = new_df[name].clip(lower=1000)
        data_model[name] = new_df
    return data_model

#Calling the function
pred_df_dict = make_pred_df(valid_data=valid,predict_dict=pred_dict)


# DataFrame of predicted charges from the linear regression
validation_data = pred_df_dict["LinearRegression"]
validation_data.rename(columns={"LinearRegression": "predicted_charges"}, inplace=True)

      

# The regression with the highest R2 score
reg_max_r2score = max(model_score_dict, key=model_score_dict.get)
print(f"\nThis is the regression with the highest R2 score: {reg_max_r2score}: {model_score_dict[reg_max_r2score]}")
max_r2score_df = pred_df_dict[reg_max_r2score].rename(columns={reg_max_r2score: "predicted_charges"})
print(f"\nThis is the dataframe of predicted charges from the model with the max R2 score:\n{max_r2score_df}")