# HR Analytics: Exploring Employee Data with R

# Import libraries needed
library(dplyr)
library(tidyr)
library(readr)
library(broom)
library(ggplot2)
# Remote locations of the three CSV files used in this analysis
hr_data_url <-
  "https://assets.datacamp.com/production/course_5977/datasets/hr_data.csv"
fair_pay_data_url <-
  "https://assets.datacamp.com/production/course_5977/datasets/fair_pay_data.csv"
survey_data_url <-
  "https://assets.datacamp.com/production/course_5977/datasets/survey_data.csv"

# Download and parse each file; show_col_types = FALSE keeps readr from
# printing the guessed column specification for every read.
hr_data <- read_csv(file = hr_data_url, show_col_types = FALSE)
fair_pay_data <- read_csv(file = fair_pay_data_url, show_col_types = FALSE)
survey_data <- read_csv(file = survey_data_url, show_col_types = FALSE)

# Preview every raw table: the first six rows, then a column-by-column summary

# hr_data
head(hr_data, n = 6L)
glimpse(hr_data)

# fair_pay_data
head(fair_pay_data, n = 6L)
glimpse(fair_pay_data)

# survey_data
head(survey_data, n = 6L)
glimpse(survey_data)
# Combine the survey answers with the fair-pay table. all.x = TRUE keeps every
# survey row, even when no fair-pay row shares the same key triple.
merged <- merge(
  x = survey_data,
  y = fair_pay_data,
  by = c("employee_id", "department", "salary"),
  all.x = TRUE
)

head(merged)
glimpse(merged)

# Attach the HR records. No all* flag is set here, so merge() keeps only the
# rows whose keys are present in both inputs.
data <- merge(
  x = merged,
  y = hr_data,
  by = c("employee_id", "department", "job_level")
)

head(data)
glimpse(data)

# Total number of NA cells in the combined table
sum(is.na(data))
# Average salary per department, drawn as one coloured column per department
mean_salary_for_dep <- summarize(
  group_by(data, department),
  mean_salary = mean(salary)
)

ggplot(
  data = mean_salary_for_dep,
  mapping = aes(x = department, y = mean_salary, fill = department)
) +
  geom_col() +
  scale_fill_manual(values = c("darkred", "darkgreen", "darkblue")) +
  # fill repeats what the x axis already shows, so the legend is hidden
  theme(legend.position = "none")
# Are employees in finance earning the same as others?
# Flag Finance rows with 1 and every other department with 0 so salary can be
# compared between the two groups.
data <- data %>%
  mutate(is_finance = ifelse(department == "Finance", 1, 0))

# Two-sample t-test of salary by the Finance flag; tidy() turns the htest
# result into a one-row data frame with statistic, p.value, etc.
test <- t.test(salary ~ is_finance, data = data) %>%
  tidy()

# Show the test statistic and p-value as labelled columns. select() is used
# instead of pull(p.value, statistic): pull()'s second argument is `name`, so
# that call only returned the p-value with the statistic as its element name.
test %>%
  select(statistic, p.value)

# Decision at the 5% significance level
if (test$p.value < 0.05) {
  print("Null hypothesis rejected at 95% CL")
} else {
  print("Failed to reject null hypothesis at 95% CL")
}
# Average salary for every department/gender combination.
# .groups = "drop" returns an ungrouped tibble; without it summarize() keeps
# the result grouped by department and prints a regrouping message.
mean_salary_for_dep_and_gender <- data %>%
  group_by(department, gender) %>%
  summarize(mean_salary = mean(salary), .groups = "drop")

# One panel per gender, one coloured column per department
ggplot(
  mean_salary_for_dep_and_gender,
  aes(x = department, y = mean_salary, fill = department)
) +
  geom_col() +
  facet_wrap(~gender) +
  scale_fill_manual(values = c("darkred", "darkgreen", "darkblue")) +
  # fill repeats what the x axis already shows, so the legend is hidden
  theme(legend.position = "none")
# Are gender and department independent?
# Chi-squared test on the department x gender cross-tabulation; tidy() turns
# the htest result into a one-row data frame.
test <- chisq.test(data$department, data$gender) %>%
  tidy()

# Show the test statistic and p-value as labelled columns. select() is used
# instead of pull(p.value, statistic): pull()'s second argument is `name`, so
# that call only returned the p-value with the statistic as its element name.
test %>%
  select(statistic, p.value)

# Decision at the 5% significance level
if (test$p.value < 0.05) {
  print("Null hypothesis rejected at 95% CL")
} else {
  print("Failed to reject null hypothesis at 95% CL")
}
# Average salary for every department/job-level combination.
# .groups = "drop" returns an ungrouped tibble; without it summarize() keeps
# the result grouped by department and prints a regrouping message.
mean_salary_for_dep_and_level <- data %>%
  group_by(department, job_level) %>%
  summarize(mean_salary = mean(salary), .groups = "drop")

# One panel per department, one coloured column per job level
ggplot(
  mean_salary_for_dep_and_level,
  aes(x = job_level, y = mean_salary, fill = job_level)
) +
  geom_col() +
  facet_wrap(~department) +
  scale_fill_manual(values = c("lightblue", "darkolivegreen", "darkcyan")) +
  # fill repeats what the x axis already shows, so the legend is hidden
  theme(legend.position = "none")
# Are salaried employees earning the same as hourly ones?
# Drop managers before comparing. The formula t-test needs exactly two groups,
# so this assumes only two job levels remain after the filter (TODO confirm
# against the data).
salaried_hourly <- data %>%
  filter(job_level != "Manager")

# Two-sample t-test of salary by job level; tidy() turns the htest result into
# a one-row data frame.
test <- t.test(salary ~ job_level, data = salaried_hourly) %>%
  tidy()

# Show the test statistic and p-value as labelled columns. select() is used
# instead of pull(p.value, statistic): pull()'s second argument is `name`, so
# that call only returned the p-value with the statistic as its element name.
test %>%
  select(statistic, p.value)

# Decision at the 5% significance level
if (test$p.value < 0.05) {
  print("Null hypothesis rejected at 95% CL")
} else {
  print("Failed to reject null hypothesis at 95% CL")
}
# Row counts per gender, with each bar split by job level
ggplot(data = data, mapping = aes(x = gender, fill = job_level)) +
  geom_bar() +
  scale_fill_manual(values = c("lightblue", "darkolivegreen", "darkcyan"))
# Are gender and job level independent?
# Chi-squared test on the gender x job_level cross-tabulation; tidy() turns
# the htest result into a one-row data frame.
test <- chisq.test(data$gender, data$job_level) %>%
  tidy()

# Show the test statistic and p-value as labelled columns. select() is used
# instead of pull(p.value, statistic): pull()'s second argument is `name`, so
# that call only returned the p-value with the statistic as its element name.
test %>%
  select(statistic, p.value)

# Decision at the 5% significance level
if (test$p.value < 0.05) {
  print("Null hypothesis rejected at 95% CL")
} else {
  print("Failed to reject null hypothesis at 95% CL")
}