Skip to content
Boat Study Case
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Read the boat listings CSV into a DataFrame, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='boat_data.csv')
df.head(n=5)
Data Cleaning
# Give the weekly view-count column a shorter name (modifies df in place).
df.rename({'Number of views last 7 days': 'Views'}, axis='columns', inplace=True)
# Report the table size as (rows, columns), then the per-column summary.
print((len(df.index), len(df.columns)))
df.info()
# Count the missing entries in each column
df.isnull().sum()
Split Price into Amount (Price) and Currency
# Separate the raw price text into a currency label and an amount
df[['Currency', 'Price']] = df['Price'].str.split(expand=True)
# List the currency labels found (the pound symbol does not come through cleanly)
print(df['Currency'].value_counts())
# Any label other than EUR / CHF / DKK is taken to be the pound and relabelled GBP
df.loc[~df['Currency'].isin(['EUR', 'CHF', 'DKK']), 'Currency'] = 'GBP'
# The amount is still text after the split; convert it to integers
df['Price'] = df['Price'].astype(int)
df.head()
df['Currency'].value_counts()
Since EUR, CHF and GBP don't differ too much in value, we will only scale DKK prices to match the other currencies
# Convert DKK prices so they are comparable with the other currencies.
# NOTE: the 0.13 factor scales the amounts *down*; it is presumably an
# approximate DKK-to-EUR exchange rate -- confirm before reuse.
# The multiplication is vectorized and the result is truncated to whole units,
# matching int() applied to each converted value.
df.loc[df['Currency'] == 'DKK', 'Price'] = (
    df.loc[df['Currency'] == 'DKK', 'Price'] * 0.13
).astype(int)
df.head()
Update Location based on currency
# Overwrite Location with a single coarse label chosen by the row's currency
for currency_code, region_label in (
    ('EUR', 'Euro'),
    ('CHF', 'Switzerland'),
    ('GBP', 'United Kingdom'),
    ('DKK', 'Denmark'),
):
    df.loc[df['Currency'] == currency_code, 'Location'] = region_label
df.head()
Fill null value of Manufacturer
# Show the distinct values present in the Manufacturer column
df['Manufacturer'].unique()
As there are too many different manufacturers, we will put 'Unknown' for null values