Skip to content
CancelLess: Insights from Hotel Booking Data
1. Data review
# import libraries, read csv
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
hotels_before_drop = pd.read_csv("data/hotel_bookings.csv")Hidden output
display(hotels_before_drop.shape)
display(hotels_before_drop.describe())
display(hotels_before_drop.info())# Check catigorical data and specific weight of values
fig, ax = plt.subplots(nrows=2, ncols=2)
hotels_before_drop['type_of_meal_plan'].value_counts().plot.bar(ax=ax[0,0])
ax[0,0].set_title('Meal plan')
hotels_before_drop['room_type_reserved'].value_counts().plot.bar(ax=ax[0,1])
ax[0,1].set_title('Room type')
hotels_before_drop['market_segment_type'].value_counts().plot.bar(ax=ax[1,0])
ax[1,0].set_title('Market segment')
hotels_before_drop['no_of_special_requests'].value_counts().plot.bar(ax=ax[1,1])
ax[1,1].set_title('Special requests')
plt.tight_layout()
plt.show()Scale of problem: about 1/2 of bookings was canceled
# How much cancellations
plt.figure(figsize=(14,2))
sns.histplot(y=hotels_before_drop['booking_status'])
plt.title('Number of cancellation')
plt.show()Check empty values and clean table
solutions: drop some garbage rows, replace Nan with 0 where it make sence
# Check Nans
display('Hotels original shape: ', hotels_before_drop.shape)
# drop rows with more then 75% values is Nan
hotels = hotels_before_drop.dropna(thresh=13)
display('Hotels shape after dropna: ', hotels.shape)
threshold = len(hotels) * 0.05
print('threshold is', threshold)
# only one value fails the 5% criterion - Required parking
# it is acceptable to change to zero, assuming that the client does not specify this parameter when walking
# the same logic take place in special request option
hotels['required_car_parking_space'].fillna(0, inplace=True)
hotels['no_of_special_requests'].fillna(0, inplace=True)2. Mean values assessment
Compare mean values subsetting by Cancelled / Not Cancelled
# Replace column names
convinient_columns = ['Booking ID', 'Adults', 'Children', 'Weekend nights',
'Week nights', 'Meal plan', 'Parking space',
'Room type', 'Lead time', 'Arrival year', 'Arrival month',
'Arrival date', 'Market type', 'Repeated guest',
'Previous cancellations', 'Prev. books not canceled',
'Avg price per room', 'Special requests', 'Booking status']
hotels.columns = convinient_columns
# Replace catigorical variables
hotels['Booking status'] = hotels['Booking status'].replace({'Canceled': 0, 'Not_Canceled': 1})
# Replace catigorical values with numbers
hotels['Room type'].replace({'Room_Type 1': 1, 'Room_Type 2': 2,
'Room_Type 3': 3, 'Room_Type 4': 4,
'Room_Type 5': 5, 'Room_Type 6': 6, 'Room_Type 7': 7}, inplace=True)
hotels['Meal plan'].replace({'Not Selected': 0, 'Meal Plan 1': 1,
'Meal Plan 2': 2, 'Meal Plan 3': 3}, inplace=True)
# Group by booking status
hotels_groups = hotels.groupby('Booking status').mean()
hotels_tr = hotels_groups.transpose()
hotels_tr
# find difference in %
hotels_tr['compare_percentage'] = ((hotels_tr[0]-hotels_tr[1])/hotels_tr[0]*100).round(1)
hotels_cancel_impact = hotels_tr.sort_values('compare_percentage', ascending=False)
hotels_cancel_impact['compare_percentage'] = abs(hotels_cancel_impact['compare_percentage'])hotelshotels_dr.columnsCreate a plot of mean values for each factor vs booking status