Skip to content

As a Data Analyst at a leading global HR consultancy, your mission is to delve into an extensive database of resumes to identify suitable candidates for tech-focused roles. This task involves using regular expressions to extract key data points and applying data preprocessing techniques to organize this information effectively.

Dataset Summary

resumes.csv

| Column | Data Type | Description |
|---|---|---|
| ID | float | Unique identifier for each resume. |
| Resume_str | object | Full text of the resume, rich with details for analysis. |
| Category | object | Job category of the resume, indicating the field of expertise. |

Let's Get Started!

Embark on this analytical journey to harness advanced data analysis techniques for real-world HR challenges. This project is your chance to impact the hiring process by ensuring that tech talent finds their ideal job. Let's begin this exciting journey!

import pandas as pd
import re

# Read resumes.csv into a DataFrame, then preview three randomly chosen rows
resumes = pd.read_csv('resumes.csv')
resumes.sample(n=3)
# Start coding here!
# Use as many cells as you need.

1 - Define regex patterns

# Technical skills: whole-word matches only. The extraction code applies this
# pattern with re.IGNORECASE, so the alternatives are written in lower case.
skill_regex = r'\b(python|sql|r|excel)\b'
# Job title: the run of upper-case letters, whitespace and . , - punctuation
# at the very start of the resume text, ending on a word boundary.
job_regex = r'^([A-Z\s\.\,\-]+)\b'
# Educational background: degree names and abbreviations as whole words.
# MSc/BSc are the standard abbreviations; the transposed spellings MCs/BCs are
# kept so that anything matched by the earlier pattern is still matched.
edu_regex = r'\b(PhD|MSc|MCs|Master|BSc|BCs|Bachelor)\b'

2 - Implement regex for data extraction

# skills = []
# jobs = []
# edus = []

# for resume in resumes['Resume_str']:
    
#     job_title_match = re.search(job_regex, resume)
#     if job_title_match is not None:
#         job_title = job_title_match.group(0).strip()
#     else:
#         job_title = ''
#     jobs.append(job_title)
    
#     skills_matches = re.findall(skill_regex, resume, flags=re.IGNORECASE)
#     unique_skills = []
#     for skill in skills_matches:
#         skill_title = skill.title()
#         if skill_title not in unique_skills:
#             unique_skills.append(skill_title)
#     skills.append(','.join(unique_skills))
    
#     edus_matches = re.findall(edu_regex, resume, flags=re.IGNORECASE)
#     unique_edus = []
#     for edu in edus_matches:
#         edu_title = edu.title()
#         if edu_title not in unique_edus:
#             unique_edus.append(edu_title)
#     edus.append(','.join(unique_edus))
# Extract a job title, the skills and the degrees from every resume.
# Each resume contributes exactly one entry to each of the three lists, so
# they stay aligned row-for-row with the `resumes` DataFrame.
skills = []
jobs = []
edus = []


def _unique_title_cased(pattern, text):
    """Return the distinct case-insensitive matches of *pattern* in *text*.

    Matches are converted to title case, de-duplicated while keeping the
    order of first appearance, and joined into a comma-separated string.
    An empty string is returned when nothing matches.
    """
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    # dict.fromkeys keeps insertion order, so the first occurrence wins
    return ','.join(dict.fromkeys(match.title() for match in matches))


for resume in resumes['Resume_str']:
    # A missing cell is read from the CSV as NaN (a float), which the re
    # functions reject with TypeError; treat any non-string as empty text so
    # the row still gets (empty) entries instead of aborting the whole loop.
    if not isinstance(resume, str):
        resume = ''

    # Job title: the leading upper-case run at the start of the resume text;
    # an empty string marks resumes where no such run is found.
    job_title_match = re.search(job_regex, resume)
    if job_title_match is not None:
        jobs.append(job_title_match.group(0).strip())
    else:
        jobs.append('')

    # Skills and degrees: unique, title-cased, comma-separated
    skills.append(_unique_title_cased(skill_regex, resume))
    edus.append(_unique_title_cased(edu_regex, resume))
jobs[:5]
skills[:5]
edus[:5]
resumes.head()

3 - Structure data into a DataFrame