Skip to content
Sample Data Scientist Associate Solution (copy)
Data Scientist Associate Practical Exam Submission
GoalZone Booking Attendance Analysis
Task 1
The dataset contains 1500 rows and 8 columns with some missing values before cleaning. I have validated all the variables against the set criteria in task description:
- booking_id: same as described
- months_as_member: same as described and no missing values
- weight: same as described, with 20 missing values that were filled with the average of the column
- days_before: no missing values; used the string replace method to remove the " days" suffix present in some of the values
- day_of_week: same as described and no missing values
- time: same as described and no missing values
- category: same as described and no missing values
After the data validation, the dataset contains the same number of records and fields
Loading original Dataset
# Data Validation
# import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Data validation: load the raw bookings file and print its column summary
# (the author found all variables to be as described in the task list).
df = pd.read_csv(filepath_or_buffer='fitness_class_2212.csv')
df.info()
# Count the missing values in each column.
df.isnull().sum()
Validate the categorical variables
# Names of the categorical columns; reused later when converting dtypes.
cat_columns = ['day_of_week', 'time', 'category']

# Print the smallest value found in each of the quantity-like columns.
# NOTE(review): days_before is still raw text at this point (its " days"
# suffix is stripped further down), so its minimum is presumably a string
# comparison rather than a numeric one - confirm.
print("Minimum values")
for name in ('months_as_member', 'days_before', 'weight'):
    smallest = df[name].min()
    print(f"{name}: {smallest} ")
Filling the missing values
# fill missing values with predefined values
def fill_missing(df):
    """Fill missing values per column, then drop rows that are still incomplete.

    Fill rules (columns absent from ``df`` are simply skipped):
      * 'day_of_week', 'time', 'category' -> the string 'unknown'
      * 'months_as_member', 'weight'      -> the column mean
      * 'days_before'                     -> zero

    Any row that still contains a missing value in some other column
    (e.g. 'attended') is removed.

    Parameters:
        df: the bookings DataFrame. It is modified in place.

    Returns:
        The same DataFrame object, for convenient reassignment/chaining.
    """
    fill_with_unknown = ('day_of_week', 'time', 'category')
    fill_with_mean = ('months_as_member', 'weight')
    fill_with_zero = ('days_before',)

    for col in df.columns:
        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: the in-place call acts on an
        # intermediate Series, which pandas warns about and which does not
        # update ``df`` when Copy-on-Write is enabled.
        if col in fill_with_unknown:
            df[col] = df[col].fillna('unknown')
        elif col in fill_with_mean:
            df[col] = df[col].fillna(df[col].mean())
        elif col in fill_with_zero:
            # Keep the column homogeneous: a text column (values such as
            # '12 days') gets the string '0' so later ``.str`` processing
            # does not turn the filled entries into NaN.
            zero = 0 if pd.api.types.is_numeric_dtype(df[col]) else '0'
            df[col] = df[col].fillna(zero)

    # Columns without a fill rule: drop the rows that are missing a value.
    df.dropna(inplace=True)
    return df
df = fill_missing(df)
# Print the distinct values of each categorical column so they can be
# compared against the task description.
for name in ('day_of_week', 'time', 'category'):
    distinct = df[name].unique()
    print(f"{name} \n {distinct}")
df['days_before'].unique()
Cleaning the data
# clean values to fit description - day_of_week(wed, mon, fri), category(-, to unknown)
def clean_day_of_week(df):
    """Normalise 'day_of_week' labels that start with Wed, Mon or Fri.

    Any value beginning with one of those prefixes (for example
    'Wednesday' or 'Fri.') is replaced by the bare three-letter prefix.
    All other values are left untouched.

    Parameters:
        df: DataFrame with a string column 'day_of_week'. Modified in place.

    Returns:
        The same DataFrame object.
    """
    days = df['day_of_week']
    for prefix in ('Wed', 'Mon', 'Fri'):
        # na=False treats missing entries as "no match" so they pass through.
        days = days.mask(days.str.startswith(prefix, na=False), prefix)
    # Assign back explicitly: ``df[col].mask(..., inplace=True)`` mutates an
    # intermediate Series and does not update ``df`` under Copy-on-Write.
    df['day_of_week'] = days
    return df
def clean_category(df):
    """Replace the '-' placeholder in 'category' with 'unknown'.

    Parameters:
        df: DataFrame with a 'category' column. Modified in place.

    Returns:
        The same DataFrame object.
    """
    categories = df['category']
    # Assign back explicitly: ``df[col].mask(..., inplace=True)`` mutates an
    # intermediate Series and does not update ``df`` under Copy-on-Write.
    df['category'] = categories.mask(categories == '-', 'unknown')
    return df
def clean_types(df):
    """Convert column dtypes for easier visualisation and modelling.

    Every column named in the module-level ``cat_columns`` list is cast from
    object to the pandas 'category' dtype, and 'days_before' is cast to
    'int64'. The DataFrame is modified in place and also returned.
    """
    for name in cat_columns:
        as_category = df[name].astype('category')
        df[name] = as_category
    days_as_int = df['days_before'].astype('int64')
    df['days_before'] = days_as_int
    return df
def clean_days_defore(df):
    """Strip the ' days' suffix from the 'days_before' column.

    Each non-missing value is converted to text and every occurrence of the
    literal ' days' is removed (e.g. '12 days' -> '12'). Missing values are
    left as they are.

    Parameters:
        df: DataFrame with a 'days_before' column. Modified in place.

    Returns:
        The same DataFrame object.
    """
    # Going through str() instead of the ``.str`` accessor keeps non-string
    # entries (such as an integer 0 used as a fill value) instead of turning
    # them into NaN, and also works when the column is already numeric.
    df['days_before'] = df['days_before'].map(
        lambda value: str(value).replace(' days', ''),
        na_action='ignore',
    )
    return df
# Run the cleaning steps in order. The ' days' suffix has to be stripped by
# clean_days_defore before clean_types casts days_before to int64.
df = (
    df.pipe(clean_day_of_week)
    .pipe(clean_category)
    .pipe(clean_days_defore)
    .pipe(clean_types)
)
df.info()