Welcome to New York City, one of the most-visited cities in the world. There are many Airbnb listings in New York City to meet the high demand for temporary lodging for travelers, which can be anywhere between a few nights to many months. In this project, we will take a closer look at the New York Airbnb market by combining data from multiple file types like .csv, .tsv, and .xlsx.
Recall that CSV, TSV, and Excel files are three common formats for storing data. Three files containing data on 2019 Airbnb listings are available to you:
data/airbnb_price.csv This is a CSV file containing data on Airbnb listing prices and locations.
listing_id: unique identifier of listingprice: nightly listing price in USDnbhood_full: name of borough and neighborhood where listing is located
data/airbnb_room_type.xlsx This is an Excel file containing data on Airbnb listing descriptions and room types.
listing_id: unique identifier of listingdescription: listing descriptionroom_type: Airbnb has three types of rooms: shared rooms, private rooms, and entire homes/apartments
data/airbnb_last_review.tsv This is a TSV file containing data on Airbnb host names and review dates.
listing_id: unique identifier of listinghost_name: name of listing hostlast_review: date when the listing was last reviewed
# Import necessary packages
import pandas as pd
import numpy as npairbnb_price_df = pd.read_csv("data/airbnb_price.csv")# airbnb_price_dffrom pathlib import Path
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def load_airbnb_data(data_path: str) -> dict:
"""
Load Airbnb data from CSV, Excel, and TSV files into pandas DataFrames.
Parameters:
-----------
data_path : str
Path to the directory containing the data files
Returns:
--------
dict
Dictionary containing DataFrames with keys:
- 'price': DataFrame from airbnb_price.csv
- 'room_type': DataFrame from airbnb_room_type.xlsx
- 'last_review': DataFrame from airbnb_last_review.tsv
Raises:
-------
FileNotFoundError
If any of the expected files are not found
Exception
For other unexpected errors during file loading
"""
try:
# Initialize dictionary to store DataFrames
dataframes = {}
# Define file paths
data_path = Path(data_path)
price_file = data_path / 'airbnb_price.csv'
room_type_file = data_path / 'airbnb_room_type.xlsx'
last_review_file = data_path / 'airbnb_last_review.tsv'
# Load CSV file
if price_file.exists():
dataframes['price'] = pd.read_csv(price_file)
logger.info(f"Successfully loaded {price_file}")
else:
raise FileNotFoundError(f"File not found: {price_file}")
# Load Excel file
if room_type_file.exists():
dataframes['room_type'] = pd.read_excel(room_type_file)
logger.info(f"Successfully loaded {room_type_file}")
else:
raise FileNotFoundError(f"File not found: {room_type_file}")
# Load TSV file
if last_review_file.exists():
dataframes['last_review'] = pd.read_csv(last_review_file, sep='\t')
logger.info(f"Successfully loaded {last_review_file}")
else:
raise FileNotFoundError(f"File not found: {last_review_file}")
return dataframes
except FileNotFoundError as e:
logger.error(f"Error: {str(e)}")
raise
except Exception as e:
logger.error(f"Unexpected error while loading files: {str(e)}")
raise
# Example usage to load and display the DataFrames
if __name__ == "__main__":
try:
# Specify the path to your data folder
data_folder = "./data"
# Load the data
airbnb_data = load_airbnb_data(data_folder)
# Access and display the DataFrames
price_df = airbnb_data['price']
room_type_df = airbnb_data['room_type']
last_review_df = airbnb_data['last_review']
# Display basic information about each DataFrame
print("\nPrice DataFrame Info:")
print("-" * 50)
print(price_df.info())
print("\nFirst few rows:")
print(price_df.head())
print("\nRoom Type DataFrame Info:")
print("-" * 50)
print(room_type_df.info())
print("\nFirst few rows:")
print(room_type_df.head())
print("\nLast Review DataFrame Info:")
print("-" * 50)
print(last_review_df.info())
print("\nFirst few rows:")
print(last_review_df.head())
except Exception as e:
print(f"Failed to load or display data: {str(e)}")price_df.head()room_type_df.head()last_review_df.head()def merge_airbnb_data(price_df: pd.DataFrame, room_type_df: pd.DataFrame, last_review_df: pd.DataFrame) -> pd.DataFrame:
"""
Merge three Airbnb DataFrames (price, room_type, last_review) on the 'listing_id' column.
Parameters:
-----------
price_df : pd.DataFrame
DataFrame containing price data
room_type_df : pd.DataFrame
DataFrame containing room type data
last_review_df : pd.DataFrame
DataFrame containing last review data
Returns:
--------
pd.DataFrame
Merged DataFrame containing columns from all three input DataFrames
Raises:
-------
ValueError
If 'listing_id' is not present in any DataFrame or if merges result in empty DataFrames
Exception
For other unexpected errors during merging
"""
try:
# Validate that 'listing_id' exists in all DataFrames
for df, name in [(price_df, 'price'), (room_type_df, 'room_type'), (last_review_df, 'last_review')]:
if 'listing_id' not in df.columns:
raise ValueError(f"'listing_id' column not found in {name} DataFrame")
# Log initial shapes
logger.info(f"Initial shapes - Price: {price_df.shape}, Room Type: {room_type_df.shape}, Last Review: {last_review_df.shape}")
# First merge: price and room_type
merged_df = pd.merge(
price_df,
room_type_df,
on='listing_id',
how='inner',
suffixes=('_price', '_room')
)
if merged_df.empty:
raise ValueError("First merge (price and room_type) resulted in an empty DataFrame")
logger.info(f"Shape after price and room_type merge: {merged_df.shape}")
# Second merge: result with last_review
final_df = pd.merge(
merged_df,
last_review_df,
on='listing_id',
how='inner',
suffixes=('_merged', '_last_review')
)
if final_df.empty:
raise ValueError("Second merge (with last_review) resulted in an empty DataFrame")
logger.info(f"Final merged DataFrame shape: {final_df.shape}")
return final_df
except ValueError as e:
logger.error(f"ValueError during merge: {str(e)}")
raise
except Exception as e:
logger.error(f"Unexpected error during merge: {str(e)}")
raise
# Example usage with the previously loaded DataFrames
if __name__ == "__main__":
try:
# Assume DataFrames are loaded using the previous load_airbnb_data function
# For demonstration, replace with your actual DataFrame loading
data_folder = "./data"
airbnb_data = load_airbnb_data(data_folder) # From previous code
price_df = airbnb_data['price']
room_type_df = airbnb_data['room_type']
last_review_df = airbnb_data['last_review']
# Merge the DataFrames
merged_df = merge_airbnb_data(price_df, room_type_df, last_review_df)
# Display results
print("\nMerged DataFrame Info:")
print("-" * 50)
print(merged_df.info())
print("\nFirst few rows of merged DataFrame:")
print(merged_df.head())
except Exception as e:
print(f"Failed to merge or display data: {str(e)}")merged_df.head()merged_df.info()merged_df['last_review'] = pd.to_datetime(merged_df['last_review'], format='%B %d %Y')merged_df.head()earliest_review_date = merged_df['last_review'].min()
earliest_review_date most_recent_review_date = merged_df['last_review'].max()
most_recent_review_date