# Load necessary libraries
library(dplyr)
library(tidyr)
# Load the three input tables from CSV files in the working directory.
# The reads are independent of each other; the lookup tables come first.
airlines <- read.csv(file = "airlines.csv")
airports <- read.csv(file = "airports.csv")
flights <- read.csv(file = "flights2022-h2.csv")
# 1. Most frequent (airline, destination airport) pair for flights from NYC
# Keep only departures from the three NYC airports.
nyc_departures <- filter(flights, origin %in% c("JFK", "LGA", "EWR"))

# One row per carrier/destination combination with its number of flights.
pair_counts <- nyc_departures %>%
  group_by(carrier, dest) %>%
  summarise(flight_count = n(), .groups = "drop")

# Sort busiest first and keep the top row only.
frequent_pair <- pair_counts %>%
  arrange(desc(flight_count)) %>%
  slice_head(n = 1)
# Attach human-readable airline and airport names to the top pair.
# Each lookup is reduced to its key plus an explicitly named `name` column
# before joining, so the result does not depend on join suffixes.
airline_lookup <- select(airlines, carrier, airline_name = name)
airport_lookup <- select(airports, faa, airport_name = name)

frequent <- frequent_pair %>%
  left_join(airline_lookup, by = "carrier") %>%
  left_join(airport_lookup, by = c(dest = "faa")) %>%
  select(carrier, dest, airline_name, airport_name)
# Average duration of the most frequent pair, stored on `frequent` as
# `avg_duration_hours`.
# The pair was selected from flights departing JFK/LGA/EWR, so the same
# origin restriction is applied here; otherwise flights of this carrier to
# this destination from any other origin would be averaged in as well.
# `air_time` is divided by 60 to express the mean in hours
# (presumably `air_time` is recorded in minutes -- confirm against the data).
avg_duration <- flights %>%
  filter(
    origin %in% c("JFK", "LGA", "EWR"),
    carrier == frequent$carrier[1],
    dest == frequent$dest[1]
  ) %>%
  summarise(avg_duration = mean(air_time, na.rm = TRUE) / 60)

frequent$avg_duration_hours <- avg_duration$avg_duration
# 2. Longest average flight duration from NYC
# Averages are computed per destination airport AND airline (see Step 3), so
# `longest` is the single airport/airline group with the highest mean
# `air_time`, not an average over all airlines serving an airport.

# Step 1: attach airline and airport names to every flight.
# The later steps refer to the airline name as `name.x` and the airport name
# as `name.y`, i.e. they rely on dplyr's default suffixes for the `name`
# column present in both lookup tables.
joined_data <- flights %>%
  left_join(airlines, by = c(carrier = "carrier")) %>%
  left_join(airports, by = c(dest = "faa"))

# Step 2: keep only departures from the three NYC airports.
nyc_flights <- filter(joined_data, origin %in% c("JFK", "LGA", "EWR"))

# Step 3: per destination/airline group, count flights and average the
# air time (divided by 60; missing air times are ignored).
summary_data <- nyc_flights %>%
  group_by(dest, name.x, name.y) %>%
  summarise(
    flight_count = n(),
    avg_duration = mean(air_time, na.rm = TRUE) / 60,
    .groups = "drop"
  )

# Step 4: take the group with the highest average. No rows are filtered out
# here; arrange() places missing averages last, so a group without a usable
# duration is only returned if no group has one.
longest <- summary_data %>%
  arrange(desc(avg_duration)) %>%
  slice_head(n = 1) %>%
  rename(airline_name = name.x, airport_name = name.y)
# 3. Least frequented destination for flights departing from JFK
# Number of JFK departures per destination code.
jfk_dest_counts <- flights %>%
  filter(origin == "JFK") %>%
  group_by(dest) %>%
  summarise(flight_count = n(), .groups = "drop")

# Destination with the fewest flights (first row after ascending sort).
rarest_dest <- jfk_dest_counts %>%
  arrange(flight_count) %>%
  slice_head(n = 1)

# Look up the airport name for that code; `least` is that name
# (NA when the code has no match in `airports`).
least <- rarest_dest %>%
  left_join(airports, by = c(dest = "faa")) %>%
  pull(name)
# Results: print each heading followed by the corresponding result object,
# in the same order as the three questions above.
results <- list(
  "Most frequent airline and airport pair from NYC:" = frequent,
  "Airport with longest average flight duration from NYC:" = longest,
  "Least frequented destination from JFK:" = least
)
for (heading in names(results)) {
  print(heading)
  print(results[[heading]])
}