Competition - high fatality accidents

### load packages
library(tidyverse)
library(lubridate)
# Install the randomForest package if not already installed
if (!require(randomForest)) {
  install.packages("randomForest")
  library(randomForest)
}
library(maps)

Load the data

### read accidents dataset
accidents <- readr::read_csv('./data/accident-data.csv')

### since it is only 14 rows with NA in dataset, we can drop these rows without imputation
accidents <- na.omit(accidents)

Create new vars:

month
hour
dayperiod (grouped hours of day in "morning","afternoon","evening","night")
major accid var as accidents with number of casualties >= 3 and fatal severity

### create month, hour, dayperiod and major_accid vars
accid <- accidents %>%
  mutate(month = month(as.Date(date, format = "%d/%m/%Y")),
         hour = format(as.POSIXct(time), format = "%H"),
         dayperiod = ifelse(hour %in% c("00","01","02","03","04","05"),"night",
                      ifelse(hour %in% c("06","07","08","09","10","11"),"morning",
                      ifelse(hour %in% c("12","13","14","15","16","17"),"afternoon","evening"))),
         major_accid = ifelse(accident_severity==1 & number_of_casualties >=3,
                              "yes","no"))

### convert some vars to factors
vars <- c("accident_severity","day_of_week","first_road_class",
          "road_type","second_road_class","junction_detail","junction_control",
          "pedestrian_crossing_human_control","pedestrian_crossing_physical_facilities",
          "light_conditions","weather_conditions", "road_surface_conditions","special_conditions_at_site",
          "carriageway_hazards","urban_or_rural_area","month","hour","dayperiod", "major_accid")
accid[,vars] <- lapply(accid[,vars] , factor)

Let's create a report that covers the following:

What time of day and day of the week do most major incidents happen?
Are there any patterns in the time of day/ day of the week when major incidents occur?
What characteristics stand out in major incidents compared with other accidents?
On what areas would you recommend the planning team focus their brainstorming efforts to reduce major incidents?

1. What time of day and day of the week do most major incidents happen?

Major accidents by hour of day

### major_accid by hour of day divided into two clusters
hour_gr <- accid %>% filter(major_accid == "yes") %>% group_by(hour) %>% summarize(count = n())
hour_cl <- kmeans(hour_gr$count, centers = 2)
hour_gr$cluster <- factor(hour_cl$cluster)
ggplot(hour_gr, aes(x = hour, y = count, fill = cluster))+
  geom_bar(stat = 'identity',position = position_dodge())+
  ggtitle("Fig.1. Major accidents by hour of the day")+
  theme(axis.text.x = element_text(face="bold", color="#993333", 
                                   size=10, angle=90))

On the fig.1 It is clear that most major accidents occurred in the second part of the day (most likely due to traffic increasing)

Major accidents by day of the week

### day of the week
accid %>% filter(major_accid == "yes") %>% group_by(day_of_week) %>% summarize(count = n()) %>%
  ggplot(aes(x = day_of_week, y = count))+geom_col()+
  scale_x_discrete(labels=c("1" = "Sunday", "2" = "Monday", "3" = "Tuesday",
                            "4" = "Wednesday", "5" = "Thursday",
                            "6" = "Friday", "7" = "Saturday"))+
  ggtitle("Fig.2. Major accidents by day of the week")+
  theme(axis.text.x = element_text(face="bold", color="#993333", 
                                   size=10, angle=30))

According to the day of week (Fig.2) the most major accidents occurred in Saturday, Sunday and Thursday

2. Are there any patterns in the time of day/ day of the week when major incidents occur?

Major accidents by day of the week and period of day

### day of the week and period of day
accid %>% filter(major_accid == "yes") %>% group_by(day_of_week, dayperiod) %>% summarize(count = n(), across()) %>%
  ggplot(aes(x = day_of_week, y = count, fill = dayperiod))+geom_bar(stat = 'identity',position = position_dodge())+
  scale_x_discrete(labels=c("1" = "Sunday", "2" = "Monday", "3" = "Tuesday",
                            "4" = "Wednesday", "5" = "Thursday",
                            "6" = "Friday", "7" = "Saturday"))+
  ggtitle("Fig.3. Major accidents by day of the week and period of day")+
  theme(axis.text.x = element_text(face="bold", color="#993333", 
                                   size=10, angle=0))

According to the Fig.3 the most major accidents of Sunday, Monday and Tuesday occurred in afternoon while major accidents of Wednesday, Thursday and Saturday occurred in evening

3. What characteristics stand out in major incidents compared with other accidents?

To find most important vars in respect to major_accid var let's build the Random Forest model without train/test splitting


table(accid$major_accid)

### the target var looks imbalance
### let's use "undersample majority class" technique to reduce "no" rows down to 250
no_major_accid_df = accid %>% filter(major_accid == "no")
set.seed(42)
sample_no_major_accid_df <- sample_n(no_major_accid_df, 250)
### create major_accid_df
major_accid_df = accid %>% filter(major_accid == "yes")
### combine two data frames
df <- rbind(major_accid_df, sample_no_major_accid_df)

### drop some columns before fit the model
drop <- c("accident_index","accident_year","accident_reference",
          "accident_severity","number_of_casualties","number_of_vehicles",
          "longitude","latitude","date","time", "dayperiod")
df = df[,!(names(df) %in% drop)]
df <- as.data.frame(df)

### tune the random forest model
set.seed(123)
tuneRF(df[,-20], df[,20],
               stepFactor = 0.5,
               plot = FALSE,
               ntreeTry = 300,
               trace = FALSE,
               improve = 0.05)

### fit the random forest model
set.seed(3)
rf <- randomForest(major_accid ~., data = df,
                   ntree = 300, mtry = 8, importance = TRUE, proximity = TRUE)

### we got the results with OOB error close to 27%

‌
‌
‌