Skip to content
Analyzing unicorn company data
  • AI Chat
  • Code
  • Report
  • Analyzing unicorn company data

    In this workspace, we'll be exploring the relationship between total funding a company receives and its valuation.

    import pandas as pd

    # Load the survey data from the CSV file into a DataFrame.
    df = pd.read_csv("employee_survey.csv")

    # Print a summary of the frame. `info` is a method, so it has to be called:
    # the bare attribute reference `df.info` never produced any output.
    df.info()
    # Bare expression: only rendered when evaluated interactively (e.g. as the
    # last statement of a notebook cell).
    df.columns
    # Clean the data by dropping every row that contains a missing value.
    df.dropna(inplace=True)

    # Rename the terse column codes to more descriptive names. Keys that do not
    # match an existing column are silently ignored by `rename`.
    df.rename(
        columns={
            "agree": "job_satisfaction",
            "workloc": "work_location",
            "workleng": "work_length",
            "manag_resp": "manager_responsibility",
            "involved": "employee_involvement",
            "autonomy": "autonomy_level",
            "direct_manag": "direct_manager_effectiveness",
            "integrated": "team_integration",
            "welfare": "employee_welfare",
            "training": "training_opportunities",
            "reflexive": "reflexivity",
            "innovative": "innovation",
            "customers": "customer_focus",
            "work_press": "work_pressure",
            "overall": "overall_rating",
            "intent_quit": "intent_to_quit",
            "sentiment": "employee_sentiment",
        },
        inplace=True,
    )
    df.columns
    # Build the working frame from the renamed columns; "job_satisfaction" is the
    # only renamed column that is not carried over.
    df_new = df[
        [
            "work_location",
            "work_length",
            "manager_responsibility",
            "employee_involvement",
            "autonomy_level",
            "direct_manager_effectiveness",
            "team_integration",
            "employee_welfare",
            "training_opportunities",
            "reflexivity",
            "innovation",
            "customer_focus",
            "work_pressure",
            "overall_rating",
            "intent_to_quit",
            "employee_sentiment",
        ]
    ]
    # Descriptive statistics for the working frame (bare expression: rendered
    # only when evaluated interactively).
    df_new.describe()

    # Pairwise correlation matrix of the working frame, rounded to 2 decimals.
    corr_matrix = df_new.corr().round(2)

    # Display the rounded correlation matrix.
    corr_matrix
    import seaborn as sns

    # Configure the default figure size (15, 8) used by the plots below.
    sns.set(rc={"figure.figsize": (15, 8)})

    # Heatmap of the rounded correlation matrix, with each cell annotated with
    # its value and coloured on the "coolwarm" palette.
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")

    # Grid of pairwise plots for the columns of the working frame.
    sns.pairplot(df_new)
    # Independent variables: every column of df_new except the target
    # "intent_to_quit".
    X = df_new.drop(columns="intent_to_quit")

    # Dependent variable: "intent_to_quit".
    y = df_new["intent_to_quit"]
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # variance_inflation_factor regresses one column of the matrix on all the
    # others and does not add an intercept itself. Append a constant column so
    # the auxiliary regressions include one; without it the VIFs are based on
    # uncentered R^2 and come out misleadingly large.
    design = X.assign(const=1.0).to_numpy(dtype=float)

    # One VIF per independent variable. The constant sits in the last column of
    # `design`, so indices 0..X.shape[1]-1 still line up with X.columns and the
    # constant itself is not reported.
    vif = pd.DataFrame()
    vif["VIF Factor"] = [variance_inflation_factor(design, i) for i in range(X.shape[1])]
    vif["features"] = X.columns

    # Display the variance inflation factor for each independent variable.
    vif
    # Rebuild the working frame from df without "employee_sentiment"; the target
    # "intent_to_quit" is kept as a column of df_new.
    df_new = df[
        [
            "work_location",
            "work_length",
            "manager_responsibility",
            "employee_involvement",
            "autonomy_level",
            "direct_manager_effectiveness",
            "team_integration",
            "employee_welfare",
            "training_opportunities",
            "reflexivity",
            "innovation",
            "customer_focus",
            "work_pressure",
            "overall_rating",
            "intent_to_quit",
        ]
    ]
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # VIF is a diagnostic for the independent variables only, so the dependent
    # variable "intent_to_quit" is left out of the design matrix (it was
    # previously included, which distorts every other column's VIF).
    predictors = df_new.drop(columns="intent_to_quit")

    # variance_inflation_factor does not add an intercept itself; append a
    # constant column so the auxiliary regressions include one. It is the last
    # column, so indices 0..predictors.shape[1]-1 map onto predictors.columns.
    design = predictors.assign(const=1.0).to_numpy(dtype=float)

    # One VIF per independent variable (the constant is not reported).
    vif = pd.DataFrame()
    vif["VIF Factor"] = [variance_inflation_factor(design, i) for i in range(predictors.shape[1])]
    vif["features"] = predictors.columns

    # Display the variance inflation factor for each independent variable.
    vif
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # Iteratively prune collinear predictors from df_new: recompute the VIF table,
    # and while the largest VIF is >= 10 drop that feature and repeat.
    #
    # Only independent variables take part. The target "intent_to_quit" stays in
    # df_new but is excluded from the VIF matrix, so it can neither distort the
    # other columns' VIFs nor be dropped by the loop itself.
    while True:
        predictors = df_new.drop(columns="intent_to_quit")

        # variance_inflation_factor does not add an intercept itself; append a
        # constant as the last column so indices 0..n-1 still map onto
        # predictors.columns and the constant is never a removal candidate.
        design = predictors.assign(const=1.0).to_numpy(dtype=float)

        vif = pd.DataFrame()
        vif["VIF Factor"] = [variance_inflation_factor(design, i) for i in range(predictors.shape[1])]
        vif["features"] = predictors.columns

        # Written as `not (max >= 10)` so that an empty or all-NaN table (max is
        # NaN) ends the loop instead of reaching idxmax.
        if not (vif["VIF Factor"].max() >= 10):
            break

        # Remove the single worst offender, then recompute from scratch because
        # every remaining VIF changes once a column is gone.
        max_vif_feature = vif.loc[vif["VIF Factor"].idxmax(), "features"]
        df_new = df_new.drop(columns=max_vif_feature)



    Open the video in a new tab