Skip to content
Counter Bag-of-Words Model for LinkedIn Job Position: Systems Integration Engineer
# Tools
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud as wc
from mpl_toolkits.axes_grid1 import ImageGrid
import regex as re
import spacy
from collections import Counter
from nltk.tokenize import word_tokenize
About Notebook
This notebook can be replicated for other job queries, as long as the following tables are provided with the schemas shown below:
- job_postings_links
- job_postings_descriptions
Pre-Processing
# Load the two query exports from Excel and preview the first two rows of each.
job_postings_links = pd.read_excel(
    'Systems Engineer Data/Systems Integration Engineer Dec 21 Query.xlsx'
)
display(job_postings_links.head(2))

job_postings_descriptions = pd.read_excel(
    'Systems Engineer Data/Systems Integration Engineer Dec 21 Query Results.xlsx'
)
display(job_postings_descriptions.head(2))

# Number of description rows as loaded, captured here before any later changes.
total_orignal_records = len(job_postings_descriptions)
Data Validation
# Sanity check: every column of both tables must have dtype `object`.
assert (job_postings_links.dtypes == object).all()
assert (job_postings_descriptions.dtypes == object).all()

# Convert the 'Date' column to pandas datetimes.
job_postings_links['Date'] = pd.to_datetime(job_postings_links['Date'])
Descriptive Statistics
Visual 1
# Numerical variable: empirical CDF of the 'Date' column with a marginal histogram.
px.ecdf(
    job_postings_links,
    x='Date',
    width=500,
    height=500,
    marginal="histogram",
    title='Job Posting Dates Distribution',
)
# Categorical variables: summary statistics for each table.
print('job_postings_links:')
# Every column except the last one (presumably 'Date' -- confirm against the schema).
display(job_postings_links.iloc[:, :-1].describe())

print('job_postings_descriptions:')
display(job_postings_descriptions.describe())

# Missing values: per-column count of NaN entries in each table.
print(job_postings_links.isna().sum())
print(job_postings_descriptions.isna().sum())
If a record's description value is missing, that record is removed.