Skip to content
Dirty Data Cleaning
DataFrameas
df
variable
SELECT * FROM 'dirty_data.csv';import pandas as pd
df_raw = pd.read_csv("dirty_data.csv", header=[0,1])
# Fix first two columns names explicitly
df_raw.rename(columns={df_raw.columns[0]: "Order ID", df_raw.columns[1]: "Order Date"}, inplace=True)
# Fix unnamed ship mode headers: replace any "Unnamed..." with the previous non-empty header
# This forward-fills the first-level header row
new_first_level = []
last_valid = None
for col in df_raw.columns:
if "Unnamed" in col[0]:
new_first_level.append(last_valid)
else:
new_first_level.append(col[0])
last_valid = col[0]
# Combine new first-level header with second-level to make new column names
new_cols = []
for first, second in zip(new_first_level, [c[1] for c in df_raw.columns]):
new_cols.append(f"{first}|{second}")
# Rename columns
df_raw.columns = new_cols
# Rename first two columns back to proper names
df_raw.rename(columns={new_cols[0]: "Order ID", new_cols[1]: "Order Date"}, inplace=True)
# Melt as before
id_vars = ["Order ID", "Order Date"]
value_vars = df_raw.columns.drop(id_vars)
df_clean = df_raw.melt(id_vars=id_vars,
value_vars=value_vars,
var_name="ShipMode_Segment",
value_name="Sales")
df_clean[['Ship Mode', 'Segment']] = df_clean['ShipMode_Segment'].str.split('|', expand=True)
df_clean.drop(columns=['ShipMode_Segment'], inplace=True)
df_clean = df_clean.dropna(subset=['Sales'])
# Ensure the column is datetime (this is the key step)
df_clean["Order Date"] = pd.to_datetime(df_clean["Order Date"], errors='coerce')
# Drop rows where Order Date is null
df_clean = df_clean.dropna(subset=["Order Date"])
# Remove time, keep only date
df_clean["Order Date"] = df_clean["Order Date"].dt.date
# Check for nulls again
print("Missing values per column after cleaning Order Date:")
print(df_clean.isnull().sum())
print(df_clean.head())print(df_clean.isnull().sum())
df_clean
print(df_clean["Order Date"].head())
print(df_clean["Order Date"].dtype)
df_clean.to_csv("cleaned_data.csv", index=False)