Skip to content
Exploratory Data Analytics
Missing data
Dealing with missing values based on the 5% rule
# Show how many values are missing in every column before cleaning
print(planes.isna().sum())

# Five percent of the row count: the cut-off for "few enough to drop"
threshold = 0.05 * len(planes)

# Columns whose missing-value count is at or below the cut-off.
# Despite the name, these columns are kept; only the rows that are missing
# a value in one of them are removed. Columns with zero missing values
# also pass the `<=` test, which is harmless for dropna.
cols_to_drop = planes.columns[planes.isna().sum().le(threshold)]

# Remove, in place, every row that has a missing value in any selected column
planes.dropna(subset=cols_to_drop, inplace=True)

# Show the remaining missing-value counts after the drop
print(planes.isna().sum())
Data imputation
# Median ticket price per airline (a Series indexed by Airline)
airline_prices = planes.groupby("Airline")["Price"].agg("median")
print(airline_prices)

# Airline -> median price lookup table
prices_dict = airline_prices.to_dict()

# Fill each missing Price with the median of that row's Airline;
# prices that are already present are left untouched.
planes["Price"] = planes["Price"].fillna(
    planes["Airline"].map(prices_dict)
)

# Confirm how many missing values remain in each column
print(planes.isna().sum())
Finding the number of unique values
# Keep only the object-dtype (non-numeric) columns of the DataFrame
non_numeric = planes.select_dtypes("object")

# Report the number of distinct values held by each of those columns.
# The print call must be indented under the loop so it runs once per column.
for c in non_numeric.columns:
    print(f"Number of unique values in {c} column: ", non_numeric[c].nunique())
Categorical analysis
# Keep only the object-dtype (non-numeric) columns of the DataFrame
non_numeric = planes.select_dtypes("object")

# Report the number of distinct values held by each of those columns.
# The print call must be indented under the loop so it runs once per column.
for c in non_numeric.columns:
    print(f"Number of unique values in {c} column: ", non_numeric[c].nunique())
# Labels for the duration buckets, in the same order as `conditions` below
flight_categories = ["Short-haul", "Medium", "Long-haul"]

# Regex patterns for the hour count of each bucket.
# Every alternative is anchored with "^" so it only matches at the start of
# the string. Unanchored, "0h" also matches inside "10h" and "5h" inside
# "15h"; because np.select uses the FIRST condition that is True, every
# 10h-16h flight was being filed as Short-haul or Medium.
# NOTE(review): assumes Duration strings begin with the hour count
# (e.g. "2h 50m", "10h") - confirm against the data.
short_flights = "^0h|^1h|^2h|^3h|^4h"
medium_flights = "^5h|^6h|^7h|^8h|^9h"
long_flights = "^10h|^11h|^12h|^13h|^14h|^15h|^16h"

# One boolean mask per category, in the same order as flight_categories
conditions = [
    planes["Duration"].str.contains(short_flights),
    planes["Duration"].str.contains(medium_flights),
    planes["Duration"].str.contains(long_flights),
]

# Assign each row the label of the first matching condition; rows that match
# none of the patterns are labelled "Extreme duration".
planes["Duration_Category"] = np.select(
    conditions,
    flight_categories,
    default="Extreme duration",
)

# Plot the counts of each category
sns.countplot(data=planes, x="Duration_Category")
plt.show()