Skip to content
0

Data processing


1 hidden cell

Import modules

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_theme(context='notebook', 
              style='dark', 
              font_scale=1.2)

Read the dataset

# read the dataset 'accidents', parse 'date' and 'time' as datetime objects, 
# drop 'accident_reference' and 'accident_year' columns as 'accident_index' column contains them both, 
# set the index column
accidents = (pd.read_csv(r'./data/accident-data.csv',
                         parse_dates=[['date', 'time']])
               .set_index('accident_index')
               .drop(columns=['accident_reference', 'accident_year']))

# extract 'hour' column with hour values of 'date_time' column
accidents['hour'] = accidents.date_time.dt.hour

# add column 'level' that indicates if the accident is major or minor
accidents['level'] = [1 if (accident['number_of_casualties'] >= 3)
                         & (accident['accident_severity'] == 1) 
                        else 0
                      for index, accident in accidents.iterrows()]
accidents.isna().sum()

Read the discriptions

# read the lookup file
lookup = pd.read_csv(r'./data/road-safety-lookups.csv',
                     keep_default_na=True, 
                     error_bad_lines=False, 
                     skip_blank_lines=True)

lookup = lookup.drop(columns='note').dropna()

# add discription for 'minor' and 'major' values of column 'level' as rows in 'lookup' DataFrame
lookup = lookup.append({'table': 'Accident', 
                        'field name': 'level', 
                        'code/format': 0, 
                        'label': 'minor'}, 
                       ignore_index=True)

lookup = lookup.append({'table': 'Accident', 
                        'field name': 'level', 
                        'code/format': 1, 
                        'label': 'major'}, 
                       ignore_index=True)

# drop redundant items from lookup including badly coded 'speed_limit'
lookup = lookup[~lookup['field name'].isin(['accident_index',
                                            'accident_year',
                                            'accident_reference', 
                                            'date', 'time', 
                                            'first_road_number', 
                                            'second_road_number',
                                            'speed_limit'])]

# recreate 'speed_limit' as categories in 'lookup'
for limit in range(20, 80, 10):
    lookup = lookup.append({'table': 'Accident',
                            'field name': 'speed_limit',
                            'code/format': limit,
                            'label': limit},
                           ignore_index=True)

Create boolean masks for dtypes

# columns with corresponding labels in 'lookup' are grouped as 'categoricals'
categoricals = lookup['field name'].unique()

# columns that are not from categoricals are masked as 'numericals'
numericals = accidents.columns[~np.isin(accidents.columns, 
                                        categoricals)].drop(
                                            labels=['date_time', 
                                                    'longitude', 
                                                    'latitude', 
                                                    'first_road_number', 
                                                    'second_road_number'])

Deal with missing values

Looking at value counts of 'junction_details' and 'junction_control' revealed that same accidents that were labeled 0 for 'not at junction' in 'junction_control' were labeled -1 for 'missing data' in 'junction_details'. Assuming that the first is correct we can rename the missing values to 0 in the second.

accidents.junction_control = accidents.junction_control.replace(-1, 0)
accidents.speed_limit = accidents.speed_limit.replace(-1, 30)
accidents = accidents[(accidents.iloc[:, :] != -1).all(axis=1)]

Write a function to decode categories

def decode_categorical(df, categorical):
    '''
    Take categorical column from DataFrame df and return 
    decoded values using codes with corresponding labels from lookup DataFrame.
    
    df: (DataFrame)
    categorical: (string)
    '''
    labels = lookup['label'].loc[
                lookup['field name'] == categorical].values
    
    code_format = lookup['code/format'].loc[
                    lookup['field name'] == categorical].astype('int').values
   
    dtype = pd.CategoricalDtype(code_format, 
                                ordered=True)
    
    decoded = pd.Categorical(df[categorical].values,
                             dtype=dtype)
    
    return decoded.rename_categories(labels)

Subset the data using dtype masks, merge and decode categorical variables