Skip to content

Analyzing employee survey data

In this workspace, we'll explore how the workplace factors reported in an employee survey relate to an employee's intent to quit, and check those factors for multicollinearity.

import pandas as pd

# Load the raw employee survey responses
df = pd.read_csv("employee_survey.csv")

# Print a summary of the frame (columns, dtypes, non-null counts).
# `info` is a method, so it has to be called; a bare `df.info` only
# references the bound method and prints nothing when run as a script.
df.info()
df.columns
# To clean the data, we can start by dropping any rows with missing values
df.dropna(inplace=True)

# Replace the terse survey column codes with descriptive names
descriptive_names = {
    "agree": "job_satisfaction",
    "workloc": "work_location",
    "workleng": "work_length",
    "manag_resp": "manager_responsibility",
    "involved": "employee_involvement",
    "autonomy": "autonomy_level",
    "direct_manag": "direct_manager_effectiveness",
    "integrated": "team_integration",
    "welfare": "employee_welfare",
    "training": "training_opportunities",
    "reflexive": "reflexivity",
    "innovative": "innovation",
    "customers": "customer_focus",
    "work_press": "work_pressure",
    "overall": "overall_rating",
    "intent_quit": "intent_to_quit",
    "sentiment": "employee_sentiment",
}
df.rename(columns=descriptive_names, inplace=True)
df.columns
# Columns carried forward into the analysis (renamed survey fields)
analysis_columns = [
    "work_location",
    "work_length",
    "manager_responsibility",
    "employee_involvement",
    "autonomy_level",
    "direct_manager_effectiveness",
    "team_integration",
    "employee_welfare",
    "training_opportunities",
    "reflexivity",
    "innovation",
    "customer_focus",
    "work_pressure",
    "overall_rating",
    "intent_to_quit",
    "employee_sentiment",
]
df_new = df[analysis_columns]
# Summary statistics for the selected columns
df_new.describe()
# Pairwise correlation matrix of df_new, rounded to 2 decimal places
corr_matrix = df_new.corr().round(2)

# Display the correlation matrix
corr_matrix
import seaborn as sns

# Set the default figure size to 15x8 for the plots that follow
figure_settings = {"figure.figsize": (15, 8)}
sns.set(rc=figure_settings)

# Draw the correlation matrix as a heatmap, writing each value in its cell
sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm")
import seaborn as sns

# Plot the pairwise relationships between the columns of df_new
sns.pairplot(data=df_new)
# Dependent variable: the "intent_to_quit" column
y = df_new["intent_to_quit"]

# Independent variables: every other column of df_new
X = df_new.drop(columns="intent_to_quit")
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor (VIF) for every independent variable in X.
# NOTE(review): no intercept column is added to X before the VIF is
# computed — confirm that this is intended.
design_matrix = X.values
vif = pd.DataFrame(
    {
        "VIF Factor": [
            variance_inflation_factor(design_matrix, col)
            for col in range(X.shape[1])
        ],
        "features": X.columns,
    }
)

# Show the VIF table (one row per independent variable)
vif
# Rebuild df_new from df, this time leaving "employee_sentiment" out
retained_columns = [
    "work_location",
    "work_length",
    "manager_responsibility",
    "employee_involvement",
    "autonomy_level",
    "direct_manager_effectiveness",
    "team_integration",
    "employee_welfare",
    "training_opportunities",
    "reflexivity",
    "innovation",
    "customer_focus",
    "work_pressure",
    "overall_rating",
    "intent_to_quit",
]
df_new = df[retained_columns]
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate the variance inflation factor for each independent variable in df_new.
# The dependent variable "intent_to_quit" is excluded: VIF measures how well
# each predictor is explained by the other predictors, so leaving the target
# in would change every factor and add a meaningless row for the target itself.
predictors = df_new.drop(columns="intent_to_quit")
predictor_values = predictors.values  # convert once, not once per column
vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(predictor_values, i)
    for i in range(predictors.shape[1])
]
vif["features"] = predictors.columns

# Display the variance inflation factor for each independent variable
vif
from statsmodels.stats.outliers_influence import variance_inflation_factor

def _vif_table(predictors):
    """Return the variance inflation factor of every column of *predictors*.

    The result has one row per column of *predictors*, in column order,
    with the factor in "VIF Factor" and the column name in "features".
    """
    values = predictors.values  # convert once, not once per column
    table = pd.DataFrame()
    table["VIF Factor"] = [
        variance_inflation_factor(values, i) for i in range(values.shape[1])
    ]
    table["features"] = predictors.columns
    return table


# Calculate the variance inflation factor for each independent variable in df_new.
# The dependent variable "intent_to_quit" is kept out of the computation so it
# can neither change the predictors' factors nor be dropped by the loop below.
# NOTE(review): no intercept column is added before the VIF is computed —
# confirm that this is intended.
predictors = df_new.drop(columns="intent_to_quit")
vif = _vif_table(predictors)

# Remove variables with VIF greater than or equal to 10, one per pass (highest
# first). The column-count guard stops the loop before the last predictor
# could be removed, so it always terminates with a usable frame.
while predictors.shape[1] > 1 and vif["VIF Factor"].max() >= 10:
    max_vif_feature = vif.loc[vif["VIF Factor"].idxmax(), "features"]
    predictors = predictors.drop(columns=max_vif_feature)
    # Keep df_new in step: it still carries the target plus surviving predictors
    df_new = df_new.drop(columns=max_vif_feature)
    vif = _vif_table(predictors)



Open the video in a new tab