Skip to content
Competition - predict hotel cancellation
Predicting Hotel Cancellations
🏨 Background
You are supporting a hotel with a project aimed at increasing revenue from their room bookings. They believe that they can use data science to help them reduce the number of cancellations. This is where you come in!
They have asked you to use any appropriate methodology to identify what contributes to whether a booking will be fulfilled or cancelled. They intend to use the results of your work to reduce the chance someone cancels their booking.
The Data
They have provided you with their bookings data in a file called hotel_bookings.csv, which contains the following:
| Column | Description |
|---|---|
Booking_ID | Unique identifier of the booking. |
no_of_adults | The number of adults. |
no_of_children | The number of children. |
no_of_weekend_nights | Number of weekend nights (Saturday or Sunday). |
no_of_week_nights | Number of week nights (Monday to Friday). |
type_of_meal_plan | Type of meal plan included in the booking. |
required_car_parking_space | Whether a car parking space is required. |
room_type_reserved | The type of room reserved. |
lead_time | Number of days before the arrival date the booking was made. |
arrival_year | Year of arrival. |
arrival_month | Month of arrival. |
arrival_date | Date of the month for arrival. |
market_segment_type | How the booking was made. |
repeated_guest | Whether the guest has previously stayed at the hotel. |
no_of_previous_cancellations | Number of previous cancellations. |
no_of_previous_bookings_not_canceled | Number of previous bookings that were not canceled. |
avg_price_per_room | Average price per day of the booking. |
no_of_special_requests | Count of special requests made as part of the booking. |
booking_status | Whether the booking was cancelled or not. |
Source (data has been modified): https://www.kaggle.com/datasets/ahsan81/hotel-reservations-classification-dataset
# Import the necessary library
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets import make_blobs
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
%matplotlib inline# Read the CSV file into a DataFrame
hotels = pd.read_csv("data/hotel_bookings.csv")
hotels# Set the style for Seaborn
sns.set(style="whitegrid")
# Plot a histogram with a KDE overlay for each numerical column
numerical_columns = ['no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights',
                     'lead_time', 'arrival_year', 'arrival_month', 'arrival_date',
                     'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
                     'avg_price_per_room', 'no_of_special_requests']
plt.figure(figsize=(18, 16))
# Subplot positions are 1-based, hence enumerate(..., 1); the 3x5 grid
# has more slots than columns, so the trailing slots stay empty.
for i, col in enumerate(numerical_columns, 1):
    plt.subplot(3, 5, i)
    sns.histplot(data=hotels, x=col, kde=True)
    plt.title(col)
# Space the panels so titles and axis labels of neighbouring plots do not
# overlap; this must run while the histogram figure is still current.
plt.tight_layout()
# Draw one count plot per categorical column on a 2x3 grid of axes
categorical_columns = ['booking_status', 'type_of_meal_plan', 'required_car_parking_space',
                       'room_type_reserved', 'market_segment_type', 'repeated_guest']
fig, axes = plt.subplots(2, 3, figsize=(18, 16))
# axes.flat walks the grid row by row, matching subplot positions 1..6
for ax, column in zip(axes.flat, categorical_columns):
    sns.countplot(data=hotels, x=column, ax=ax)
    ax.set_title(column)
fig.tight_layout()
plt.show()
# Count bookings per booking status, with bars split by the repeated_guest flag
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=hotels, x='booking_status', hue='repeated_guest')
ax.set_title('Correlation between Booking Status and Repeated Guest')
ax.set_xlabel('Booking Status')
ax.set_ylabel('Count')
ax.legend(title='Repeated Guest', loc='upper right')
plt.show()
The Challenge
- Use your skills to produce recommendations for the hotel on what factors affect whether customers cancel their booking.
# Box plot of lead_time grouped by booking status
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=hotels, x='booking_status', y='lead_time', ax=ax)
ax.set_title('Correlation between Booking Status and Lead Time')
ax.set_xlabel('Booking Status')
ax.set_ylabel('Lead Time (days)')
plt.show()
# Plot Correlation between Booking Status and Number of Week Nights
# (box plot of no_of_week_nights grouped by booking status)
plt.figure(figsize=(12, 8))
sns.boxplot(x='booking_status', y='no_of_week_nights', data=hotels)
plt.title('Correlation between Booking Status and Number of Week Nights')
plt.xlabel('Booking Status')
# The y-axis shows week nights, so the label must say so (it previously
# read "Weekend Nights", copied from the sibling plot).
plt.ylabel('Number of Week Nights')
plt.show()
# Box plot of no_of_weekend_nights grouped by booking status
plt.figure(figsize=(12, 8))
weekend_ax = sns.boxplot(data=hotels, x='booking_status', y='no_of_weekend_nights')
weekend_ax.set_title('Correlation between Booking Status and Number of Weekend Nights')
weekend_ax.set_xlabel('Booking Status')
weekend_ax.set_ylabel('Number of Weekend Nights')
plt.show()
# Box plot of avg_price_per_room grouped by booking status
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=hotels, x='booking_status', y='avg_price_per_room', ax=ax)
ax.set(
    title='Correlation between Booking Status and Average Price per Room',
    xlabel='Booking Status',
    ylabel='Average Price per Room',
)
plt.show()
# Pull out the lead_time column that the histogram below is built from
lead_time_values = hotels['lead_time']

# Histogram of lead time using 30 bins, with horizontal grid lines only
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(lead_time_values, bins=30, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Lead Time')
ax.set_xlabel('Lead Time (days)')
ax.set_ylabel('Frequency')
ax.grid(axis='y', alpha=0.75)
plt.show()
# Count bookings per booking status, with bars split by reserved room type
plt.figure(figsize=(12, 8))
room_ax = sns.countplot(data=hotels, x='booking_status', hue='room_type_reserved')
room_ax.set_title('Correlation between Booking Status and Room Type')
room_ax.set_xlabel('Booking Status')
room_ax.set_ylabel('Count')
room_ax.legend(title='Room Type Reserved', loc='upper right')
plt.show()
# Scatter plot of no_of_children against the reserved room type
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=hotels, x='room_type_reserved', y='no_of_children', ax=ax)
ax.set_title('Correlation between Room Type and Number of Children')
ax.set_xlabel('Room Type Reserved')
ax.set_ylabel('Number of Children')
plt.show()
# Plot Correlation between Number of Children and Average Price per Room:
# scatter points with a fitted regression line, annotated with the
# correlation coefficient of the two columns.
plt.figure(figsize=(12, 8))
ax = sns.regplot(x='no_of_children', y='avg_price_per_room', data=hotels, scatter_kws={'s': 20})
ax.set_title('Correlation between Number of Children and Average Price per Room')
ax.set_xlabel('Number of Children')
ax.set_ylabel('Average Price per Room')
# Series.corr with default arguments (Pearson correlation)
correlation_coefficient = hotels['no_of_children'].corr(hotels['avg_price_per_room'])
# Anchor the annotation in axes-fraction coordinates (top-right corner)
# rather than at fixed data coordinates, so it stays inside the plot area
# whatever the range of the data is.
ax.text(
    0.95, 0.95, f'Correlation: {correlation_coefficient:.2f}',
    transform=ax.transAxes, ha='right', va='top', fontsize=12, color='red',
)
plt.show()