Skip to content
# Tools
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud as wc
from mpl_toolkits.axes_grid1 import ImageGrid
import regex as re
import spacy
from collections import Counter
from nltk.tokenize import word_tokenize

About Notebook

Provided that the following tables are supplied with the schemas shown below, this notebook can be replicated for other job queries.

  • job_postings_links
  • job_postings_descriptions

Pre-Processing

# Load the job-posting links table and preview its first two rows
job_postings_links = pd.read_excel(
    'Systems Engineer Data/Systems Integration Engineer Dec 21 Query.xlsx'
)
display(job_postings_links.head(n=2))

# Load the job-posting descriptions table and preview its first two rows
job_postings_descriptions = pd.read_excel(
    'Systems Engineer Data/Systems Integration Engineer Dec 21 Query Results.xlsx'
)
display(job_postings_descriptions.head(n=2))

# Row count of the descriptions table exactly as loaded
total_orignal_records = len(job_postings_descriptions)

Data Validation

# Validate that every column of both tables was read with dtype `object`
# before any conversion is applied. An explicit raise is used instead of
# `assert` so the check still runs under `python -O`, and so the error names
# the table and the columns that failed.
for _table_name, _table in (
    ('job_postings_links', job_postings_links),
    ('job_postings_descriptions', job_postings_descriptions),
):
    _is_object = _table.dtypes == object
    if not _is_object.all():
        _offending = {
            column: str(_table.dtypes[column])
            for column in _is_object.index[~_is_object]
        }
        raise TypeError(
            f'{_table_name} has columns that are not of dtype object: {_offending}'
        )

# Convert the posting date to datetime so it can be plotted as a distribution.
job_postings_links['Date'] = pd.to_datetime(job_postings_links['Date'])

Descriptive Statistics

Visual 1

# Numerical variable: empirical CDF of posting dates with a marginal histogram.
# .show() is called explicitly because a bare figure expression is rendered
# only when it is the last expression of a notebook cell; followed by other
# statements, the figure would be silently discarded.
px.ecdf(
    job_postings_links,
    x='Date',
    width=500,
    height=500,
    marginal="histogram",
    title='Job Posting Dates Distribution',
).show()

# Categorical variables
print('job_postings_links:')
# Every column except the last one is summarised -- presumably the last column
# is 'Date'; TODO confirm against the workbook schema.
display(job_postings_links[job_postings_links.columns[:-1]].describe())
print('job_postings_descriptions:')
display(job_postings_descriptions.describe())

# Checking for missing values: per-column count of missing entries in each table
print(job_postings_links.isna().sum())
print(job_postings_descriptions.isna().sum())

If a record's description value is missing, that record is removed.