An Exploratory Analysis of Hotel Bookings
Introduction
This project will undertake an analysis of a dataset of hotel bookings, with the aim of identifying possible factors linked with the cancelation of such bookings. Knowing these factors can be useful for identifying and accounting for unused rooms, and ultimately aiming to reduce the number of unused rooms and maximize revenue. This analysis will involve an exploratory analysis and cleaning of the initial dataset, visualization of relevant statistics, and a concluding statement presenting a summary of the project's findings.
# Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Perform initial exploratory analysis
hotel_bookings = pd.read_csv('hotel_bookings.csv')
print(hotel_bookings.head())
print(hotel_bookings.info())
print(hotel_bookings.shape)
print(hotel_bookings.isna().sum())
for column in hotel_bookings.columns:
print(hotel_bookings[column].value_counts())Initial Exploratory Analysis
Upon importing and analyzing the dataset's source file, we can see that the dataset includes a large number of columns with binary data. A closer look reveals that many of these binary columns present data as a simple value of a broader variable shared amongst columns, and are exclusive to one another. This is useful when looking at a singular record at a time, where it can present attributes as being present or absent, or when looking at aggregate numbers of a particular value within a variable, but is less useful when comparing multiple records, and we can easily consolidate them into singular columns with string entries to more easily analyze the data.
Additionally, the remaining binary columns can be better analyzed by converting to a different data type, in this case to boolean data. Furthermore, a handful of columns present useful data by themselves, but would be more useful by creating an additional column with an aggregate function of those columns; thus, we will create new columns combining week night and weekend night totals to sum total nights stayed, multiplying total nights stayed by average daily rate to get total revenue, and combining adult, children, and baby counts to sum total number of guests in each party.
# Consolidate multiple binary columns into singular string column
# Consolidate into booked_by column
hotel_bookings.loc[hotel_bookings['booked_by_company'] == 1, 'booked_by'] = "Company"
hotel_bookings.loc[hotel_bookings['booked_by_agent'] == 1, 'booked_by'] = "Agent"
hotel_bookings.loc[(hotel_bookings['booked_by_company'] == 0) & (hotel_bookings['booked_by_agent'] == 0), 'booked_by'] = "Other"
# Consolidate into hotel column
hotel_bookings.loc[hotel_bookings['hotel_City'] == 1, 'hotel'] = "City"
hotel_bookings.loc[hotel_bookings['hotel_Resort'] == 1, 'hotel'] = "Resort"
# Consolidate into meal column
hotel_bookings.loc[hotel_bookings['meal_BB'] == 1, 'meal'] = "Bed_and_Breakfast"
hotel_bookings.loc[hotel_bookings['meal_FB'] == 1, 'meal'] = "Full_Board"
hotel_bookings.loc[hotel_bookings['meal_HB'] == 1, 'meal'] = "Half_Board"
hotel_bookings.loc[hotel_bookings['meal_No_meal'] == 1, 'meal'] = "None"
# Consolidate into market_segment column
hotel_bookings.loc[hotel_bookings['market_segment_Aviation'] == 1, 'market_segment'] = "Aviation"
hotel_bookings.loc[hotel_bookings['market_segment_Complementary'] == 1, 'market_segment'] = "Complementary"
hotel_bookings.loc[hotel_bookings['market_segment_Corporate'] == 1, 'market_segment'] = "Corporate"
hotel_bookings.loc[hotel_bookings['market_segment_Direct'] == 1, 'market_segment'] = "Direct"
hotel_bookings.loc[hotel_bookings['market_segment_Groups'] == 1, 'market_segment'] = "Groups"
hotel_bookings.loc[hotel_bookings['market_segment_Offline_TA_TO'] == 1, 'market_segment'] = "Offline_Travel-Agent_Tour-Operator"
hotel_bookings.loc[hotel_bookings['market_segment_Online_TA'] == 1, 'market_segment'] = "Online_Travel-Agent"
hotel_bookings.loc[hotel_bookings['market_segment_Undefined'] == 1, 'market_segment'] = "Undefined"
# Consolidate into distribution_channel column
hotel_bookings.loc[hotel_bookings['distribution_channel_Corporate'] == 1, 'distribution_channel'] = "Corporate"
hotel_bookings.loc[hotel_bookings['distribution_channel_Direct'] == 1, 'distribution_channel'] = "Direct"
hotel_bookings.loc[hotel_bookings['distribution_channel_GDS'] == 1, 'distribution_channel'] = "Global_Distribution_System"
hotel_bookings.loc[hotel_bookings['distribution_channel_TA_TO'] == 1, 'distribution_channel'] = "Travel-Agent_Tour-Operator"
hotel_bookings.loc[hotel_bookings['distribution_channel_Undefined'] == 1, 'distribution_channel'] = "Undefined"
# Consolidate into reserved_room_type column
hotel_bookings.loc[hotel_bookings['reserved_room_type_A'] == 1, 'reserved_room_type'] = "A"
hotel_bookings.loc[hotel_bookings['reserved_room_type_B'] == 1, 'reserved_room_type'] = "B"
hotel_bookings.loc[hotel_bookings['reserved_room_type_C'] == 1, 'reserved_room_type'] = "C"
hotel_bookings.loc[hotel_bookings['reserved_room_type_D'] == 1, 'reserved_room_type'] = "D"
hotel_bookings.loc[hotel_bookings['reserved_room_type_E'] == 1, 'reserved_room_type'] = "E"
hotel_bookings.loc[hotel_bookings['reserved_room_type_F'] == 1, 'reserved_room_type'] = "F"
hotel_bookings.loc[hotel_bookings['reserved_room_type_G'] == 1, 'reserved_room_type'] = "G"
hotel_bookings.loc[hotel_bookings['reserved_room_type_H'] == 1, 'reserved_room_type'] = "H"
hotel_bookings.loc[hotel_bookings['reserved_room_type_L'] == 1, 'reserved_room_type'] = "L"
# Consolidate into deposit_type column
hotel_bookings.loc[hotel_bookings['deposit_type_No_Deposit'] == 1, 'deposit_type'] = "No_Deposit"
hotel_bookings.loc[hotel_bookings['deposit_type_Non_Refund'] == 1, 'deposit_type'] = "Non_Refund"
hotel_bookings.loc[hotel_bookings['deposit_type_Refundable'] == 1, 'deposit_type'] = "Refundable"
# Consolidate into customer_type column
hotel_bookings.loc[hotel_bookings['customer_type_Contract'] == 1, 'customer_type'] = "Contract"
hotel_bookings.loc[hotel_bookings['customer_type_Group'] == 1, 'customer_type'] = "Group"
hotel_bookings.loc[hotel_bookings['customer_type_Transient'] == 1, 'customer_type'] = "Transient"
hotel_bookings.loc[hotel_bookings['customer_type_Transient-Party'] == 1, 'customer_type'] = "Transient-Party"
# Drop newly-redundant columns
hotel_bookings.drop(hotel_bookings.iloc[:, 16:53], inplace=True, axis=1)
# Convert remaining binary columns to boolean data type
hotel_bookings['is_canceled'] = hotel_bookings['is_canceled'].astype(bool)
hotel_bookings['is_repeated_guest'] = hotel_bookings['is_repeated_guest'].astype(bool)
# Add stays_in_total_nights column, position immediately after component columns
hotel_bookings.insert(7, 'stays_in_total_nights', (hotel_bookings['stays_in_weekend_nights'] + hotel_bookings['stays_in_week_nights']))
# Add total_party_size column, position immediately after component columns
hotel_bookings.insert(11, 'total_party_size', (hotel_bookings['adults'] + hotel_bookings['children'] + hotel_bookings['babies']))
# Add total_revenue column, position immediately after avg_daily_rate column
hotel_bookings.insert(18, 'total_revenue', (hotel_bookings['avg_daily_rate'] * hotel_bookings['stays_in_total_nights']))
# Perform exploratory analysis to confirm updated data
print(hotel_bookings.info())
print(hotel_bookings.isna().sum())
print(hotel_bookings.dtypes)
for column in hotel_bookings.columns:
print(hotel_bookings[column].value_counts())Broad Data Cleaning
After performing a first broad round of data cleaning, the resulting dataset is much more compact, comprising 27 columns rather than 53 columns as previously. Additionally, having consolidated columns makes visualization of a particular variable easier. One new column, booked_by, did require further attention, as both of the pre-consolidation columns gave binary 0 values for certain records, representing the absence of these values. For these records, we created a new value, Other, and assigned it when both columns returned binary 0 values.
From the resulting dataset, we can now more closely analyze the data and conduct further cleaning, particularly checking for outliers in the data. For this we will be looking at three particular variables, average daily rate, total nights stayed, and total party size; the latter two having been created from aggregate functions based on variables already given.
# Plot & analyze avg_daily_rate column
sns.histplot(x="avg_daily_rate", data=hotel_bookings, binwidth=20)
plt.title("Distribution of Average Daily Rate")
plt.xlabel("Average Daily Rate ($)")
plt.show()
print(hotel_bookings['avg_daily_rate'].describe())
# Determine outliers on avg_daily_rate
plt.clf()
sns.boxplot(y="avg_daily_rate", data=hotel_bookings)
plt.title("Box Plot of Average Daily Rate")
plt.ylabel("Average Daily Rate")
plt.show()
adr_seventy_fifth = hotel_bookings['avg_daily_rate'].quantile(0.75)
adr_twenty_fifth = hotel_bookings['avg_daily_rate'].quantile(0.25)
avg_daily_rate_iqr = adr_seventy_fifth - adr_twenty_fifth
print("Interquartile Range of Average Daily Rate - " + str(avg_daily_rate_iqr))
adr_upper = adr_seventy_fifth + (1.5 * avg_daily_rate_iqr)
adr_lower = adr_twenty_fifth - (1.5 * avg_daily_rate_iqr)
print("Upper & Lower Threshold of Outliers - " + str((adr_upper, adr_lower)))
# Determine avg_daily_rate by reserved_room_type
print(hotel_bookings.groupby('reserved_room_type')['avg_daily_rate'].mean())
# Determine number of rooms booked with average daily rate of $0
print("Number of rooms booked with ADR of $0 - " + str(len(hotel_bookings[hotel_bookings['avg_daily_rate'] == 0])))
# Check info on complementary market_segment
print(hotel_bookings.loc[hotel_bookings['market_segment'] == "Complementary"].melt())Focused Analysis & Cleaning
Average Daily Rate
Delving deeper into certain variables, we find the average daily rate of booking within this dataset to be $110.79. This naturally differs based on room type, with the average rate of the cheapest room type being $97.20, and that of the most expensive room type being $159.47. The distribution of rates closely resembles a normal distribution, and there are a small number of outliers. The most expensive outlier rate is $225.00, slightly higher than the upper threshold of $191.50. Additionally, there are 7 bookings with a rate of $0, indicating a free stay. There could be a number of plausible reasons for these records, such as a loyalty program or courtesies afforded to guests. The purpose of cleaning outliers from the dataset is to ensure there is no unintended weighing in the dataset's statistics. Given that there are outliers on either side of the threshold, ensuring minimal if any weighing in the stats, and that the outliers may lend themselves to understanding the primary question, we would be justified in not removing outliers in this instance.
# Plot & analyze stays_in_total_nights column
sns.histplot(x="stays_in_total_nights", data=hotel_bookings, binwidth=2)
plt.title("Distribution of Total Nights Stayed")
plt.xlabel("Total Nights Stayed")
plt.show()
print(hotel_bookings['stays_in_total_nights'].describe())
# Determine outliers on stays_in_total_nights
plt.clf()
sns.boxplot(y="stays_in_total_nights", data=hotel_bookings)
plt.title("Box Plot of Total Nights Stayed")
plt.ylabel("Total Nights Stayed")
plt.show()
sitn_seventy_fifth = hotel_bookings['stays_in_total_nights'].quantile(0.75)
sitn_twenty_fifth = hotel_bookings['stays_in_total_nights'].quantile(0.25)
stays_in_total_nights_iqr = sitn_seventy_fifth - sitn_twenty_fifth
print("Interquartile Range of Total Nights Stayed - " + str(stays_in_total_nights_iqr))
sitn_upper = sitn_seventy_fifth + (1.5 * stays_in_total_nights_iqr)
sitn_lower = sitn_twenty_fifth - (1.5 * stays_in_total_nights_iqr)
print("Upper & Lower Threshold of Outliers - " + str((sitn_upper, sitn_lower)))
# Determine number of rooms booked with total nights stayed of 21
print("Number of rooms booked with 21 night stay - " + str(len(hotel_bookings[hotel_bookings['stays_in_total_nights'] == 21])))
# Remove row with stays_in_total_nights == 21 as outlier
hotel_bookings = hotel_bookings.drop(hotel_bookings.loc[hotel_bookings['stays_in_total_nights'] == 21, :].index)
# Plot & analyze total_party_size column after removing outlier
plt.clf()
sns.histplot(x="stays_in_total_nights", data=hotel_bookings, binwidth=2)
plt.title("Distribution of Total Nights Stayed")
plt.xlabel("Total Nights Stayed")
plt.show()
print(hotel_bookings['stays_in_total_nights'].describe())
Total Nights Stayed
Moving onto an analysis of total nights stayed, we see that the average booking has a stay of 4.92 nights. The distribution of nights stayed is generally right-tailed, and the upper threshold of outliers is 14.5 nights. There is one booking with a total nights stayed of 21, meaning a total period of 3 weeks. While this is noticeably higher than both the next highest value and the upper threshold of outliers, it is not absurdly higher. Regardless, without a clear reason to keep the outlier in the dataset, it would be advisable to remove it, so we will.
# Plot & analyze total_party_size column
sns.histplot(x="total_party_size", data=hotel_bookings)
plt.title("Distribution of Total Party Size")
plt.xlabel("Total Party Size")
plt.show()
print(hotel_bookings['total_party_size'].describe())
# Determine outliers on avg_daily_rate
tps_seventy_fifth = hotel_bookings['total_party_size'].quantile(0.75)
tps_twenty_fifth = hotel_bookings['total_party_size'].quantile(0.25)
total_party_size_iqr = tps_seventy_fifth - tps_twenty_fifth
print("Interquartile Range of Total Party Size - " + str(total_party_size_iqr))
tps_upper = tps_seventy_fifth + (1.5 * total_party_size_iqr)
tps_lower = tps_twenty_fifth - (1.5 * total_party_size_iqr)
print("Upper & Lower Threshold of Outliers - " + str((tps_upper, tps_lower)))
# Get more info on outlier with total_party_size == 12
print(hotel_bookings.loc[hotel_bookings['total_party_size'] == 12, :].melt())
# Determine avg_daily_rate by reserved_room_type
print(hotel_bookings.groupby('reserved_room_type')['total_party_size'].mean())
# Remove row with total_party_size == 12 due to suspected error in data collection
hotel_bookings = hotel_bookings.drop(hotel_bookings.loc[hotel_bookings['total_party_size'] == 12, :].index)
# Plot & analyze total_party_size column after removing outlier
plt.clf()
sns.histplot(x="total_party_size", data=hotel_bookings, bins=5)
plt.title("Distribution of Total Party Size")
plt.xlabel("Total Party Size")
plt.show()
print(hotel_bookings['total_party_size'].describe())
print(hotel_bookings.groupby('reserved_room_type')['total_party_size'].mean())Total Party Size
Looking at total party size, in the vast majority of cases, each party is comprised of 2 people, with less than a third including other cases. On intial exploratory analysis, we were able to identify one record with an apparent data outlier, a total party size of 12, including 10 children and 2 adults. To confirm this, we performed further analysis on the total party size variable, including plotting a histogram, computing summary statistics, and determining the outlier threshold. This confirmed that this record was indeed an outlier, and not a particularly close outlier at that. This by itself does not indicate an error in the data, so we decided to look through this record in detail.
A booking with a large party size is not unheard of, but when it does happen it normally does not look similar to other bookings with small party sizes, and when looking through the record, it does look similar to other bookings. Of note is the indicated room type, D, which is the second most common room type among the dataset, and which has a mean total party size of 2.3, just slightly higher than the overall mean total party size. So the booking with a party size of 12, comprised of 2 adults and 10 children, is accommodated in a room that in terms of daily rate and party size is overall pretty average, as opposed to a higher classed room. There is the possibility that this booking could include multiple rooms, which is a variable not accounted for in the dataset, and the booking does include a special request, which could conceivably include, for example, additional cots. However, the data on balance appears to be suggestive of an error, and in this case we would be inclined to remove the record from the dataset.
# Plot & analyze lead_time column
sns.histplot(x="lead_time", data=hotel_bookings, binwidth=30)
plt.title("Distribution of Lead Time")
plt.xlabel("Lead Time (days)")
plt.show()
print(hotel_bookings['lead_time'].describe())
# Determine outliers on avg_daily_rate
sns.boxplot(y="lead_time", data=hotel_bookings)
plt.title("Box Plot of Lead Time")
plt.ylabel("Lead Time (Days)")
plt.show()
lt_seventy_fifth = hotel_bookings['lead_time'].quantile(0.75)
lt_twenty_fifth = hotel_bookings['lead_time'].quantile(0.25)
lead_time_iqr = lt_seventy_fifth - lt_twenty_fifth
print("Interquartile Range of Lead Time - " + str(lead_time_iqr))
lt_upper = lt_seventy_fifth + (1.5 * lead_time_iqr)
lt_lower = lt_twenty_fifth - (1.5 * lead_time_iqr)
print("Upper & Lower Threshold of Outliers - " + str((lt_upper, lt_lower)))
# Remove rows with lead_time > 365 as outliers
hotel_bookings = hotel_bookings.drop(hotel_bookings.loc[hotel_bookings['lead_time'] > 365, :].index)
# Plot & analyze lead_time column after removing outliers
sns.histplot(x="lead_time", data=hotel_bookings, binwidth=20)
plt.title("Distribution of Lead Time")
plt.xlabel("Lead Time (Days)")
plt.show()
print(hotel_bookings['lead_time'].describe())
sns.boxplot(y="lead_time", data=hotel_bookings)
plt.title("Box Plot of Lead Time")
plt.ylabel("Lead Time (Days)")
plt.show()Lead Time
Finally, in looking at lead time, we can see that bookings on average occur 64 days, or just over 2 months, prior to the check-in date. The vast majority of bookings occur within 92 days of check-in, or approximately 3 months. However, the dataset also shows some notable outliers, including one booking that occurs 737 days prior, or just over 2 years. The distance between the lead time of that booking, and that of the next closest booking, is large enough that we might conceivably suspect an error. While there is not enough additional info to make a judgment call about the validity of this data point, we can safely remove it as not being representative of the whole data. And while doing so, we can also remove some additional outliers. While not removing all outliers that lie outside the threshold, as a rule, we can consider that people may typically plan and book vacations up to 1 year (or 365 days) in advance, and that those made outside of this time are not typical. Thus, we would be justified in dropping bookings made more than 365 days prior to check-in.
# Plot booking status, daily rate, and lead time as a scatter plot
sns.scatterplot(x="lead_time", y="avg_daily_rate", data=hotel_bookings, hue="is_canceled", hue_order=[True, False])
plt.title("Booking Status, Daily Rate, and Lead Time Compared")
plt.xlabel("Lead Time (Days)")
plt.ylabel("Average Daily Rate ($)")
plt.show()
# Plot booking status, daily rate, and room type as a bar plot
sns.catplot(x="reserved_room_type", y="avg_daily_rate", data=hotel_bookings, hue="is_canceled", hue_order=[True, False], kind="bar", order=["A", "C", "D", "E", "F", "G", "H", "L"])
plt.title("Booking Status, Daily Rate, and Room Type Compared")
plt.xlabel("Room Type")
plt.ylabel("Average Daily Rate ($)")
plt.show()
# Plot reserved room type counts by booking status as a count plot
sns.countplot(x="reserved_room_type", data=hotel_bookings, hue="is_canceled", hue_order=[True, False], order=["A", "C", "D", "E", "F", "G", "H", "L"])
plt.title("Room Type Counts by Booking Status")
plt.xlabel("Reserved Room Type")
plt.ylabel("# of Rooms")
plt.plot()
# Plot booking status, lead time, and party size as a point plot
sns.catplot(x="total_party_size", y="lead_time", data=hotel_bookings, hue="is_canceled", hue_order=[True, False], kind="point")
plt.title("Booking Status, Lead Time, and Party Size Compared")
plt.xlabel("Total Party Size")
plt.ylabel("Lead Time (Days)")
plt.show()
# Subset for relevant columns, determine correlations in dataset
hotel_bookings_condensed = hotel_bookings[['is_canceled', 'lead_time', 'stays_in_total_nights', 'stays_in_week_nights', 'stays_in_weekend_nights', 'total_party_size', 'total_of_special_requests', 'avg_daily_rate', 'total_revenue']]
plt.clf()
sns.heatmap(hotel_bookings_condensed.corr(), annot=True)
plt.title("Correlations Between Selected Variables")
plt.show()