Skip to content

Analyzing employee survey data
In this workspace, we'll be exploring an employee survey dataset to understand which workplace factors are associated with employees' intent to quit.
import pandas as pd

# Load the employee survey responses for analysis.
df = pd.read_csv("employee_survey.csv")

# Inspect the dataframe's structure and column names
df.info()
df.columns

# To clean the data, we can start by dropping any rows with missing values
df.dropna(inplace=True)

# Finally, we can rename some of the columns to make them more descriptive
df.rename(columns={"agree": "job_satisfaction", "workloc": "work_location", "workleng": "work_length", "manag_resp": "manager_responsibility", "involved": "employee_involvement", "autonomy": "autonomy_level", "direct_manag": "direct_manager_effectiveness", "integrated": "team_integration", "welfare": "employee_welfare", "training": "training_opportunities", "reflexive": "reflexivity", "innovative": "innovation", "customers": "customer_focus", "work_press": "work_pressure", "overall": "overall_rating", "intent_quit": "intent_to_quit", "sentiment": "employee_sentiment"}, inplace=True)
df.columns

# Create a new dataframe with only the columns needed
df_new = df[["work_location", "work_length", "manager_responsibility", "employee_involvement", "autonomy_level", "direct_manager_effectiveness", "team_integration", "employee_welfare", "training_opportunities", "reflexivity", "innovation", "customer_focus", "work_pressure", "overall_rating", "intent_to_quit", "employee_sentiment"]]

# Calculate descriptive statistics for the numerical columns in the dataframe
df_new.describe()
# Calculate the correlation matrix for the numerical columns in the dataframe
corr_matrix = df_new.corr()
# Round the correlation values to 2 decimal places
corr_matrix = corr_matrix.round(2)
# Display the correlation matrix
corr_matrix

import seaborn as sns

# Setting the size of the plot
sns.set(rc={'figure.figsize': (15, 8)})
# Create a heatmap of the correlation matrix
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")

# Create a pairplot of the numerical columns in the dataframe
# (pairwise scatterplots plus per-column distributions)
sns.pairplot(df_new)
# Set the independent variables as all columns in df_new except for "intent_to_quit"
# Independent variables: every survey column except the target
X = df_new.drop("intent_to_quit", axis=1)
# Set the dependent variable as "intent_to_quit"
y = df_new["intent_to_quit"]

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate the variance inflation factor (VIF) for each independent
# variable in X to screen for multicollinearity among the predictors.
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
# Display the variance inflation factor for each independent variable
vif
# Remove variables that are not relevant to the analysis or are highly correlated with other variables
# Rebuild df_new without "employee_sentiment" (dropped from the earlier selection).
df_new = df[["work_location", "work_length", "manager_responsibility", "employee_involvement", "autonomy_level", "direct_manager_effectiveness", "team_integration", "employee_welfare", "training_opportunities", "reflexivity", "innovation", "customer_focus", "work_pressure", "overall_rating", "intent_to_quit"]]

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate the variance inflation factor for each variable in df_new.
# NOTE(review): df_new here still contains the target column "intent_to_quit",
# so its VIF is computed alongside the predictors — confirm this is intended.
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(df_new.values, i) for i in range(df_new.shape[1])]
vif["features"] = df_new.columns
# Display the variance inflation factor for each independent variable
vif

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate the variance inflation factor for each variable in df_new
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(df_new.values, i) for i in range(df_new.shape[1])]
vif["features"] = df_new.columns

# Iteratively remove multicollinear variables: drop the single worst
# offender, recompute all VIFs on the reduced dataframe, and repeat
# until every remaining variable has VIF below 10.
while vif["VIF Factor"].max() >= 10:
    max_vif_feature = vif.loc[vif["VIF Factor"].idxmax(), "features"]
    df_new = df_new.drop(max_vif_feature, axis=1)
    # Recompute VIFs from scratch — removing one column changes the others' VIFs
    vif = pd.DataFrame()
    vif["VIF Factor"] = [variance_inflation_factor(df_new.values, i) for i in range(df_new.shape[1])]
    vif["features"] = df_new.columns