Skip to content
Just Data Cleaning
🕵️♂️ Movie Dataset Cleaning Report
Introduction This report documents the step-by-step process used to clean and prepare the movie dataset for analysis. The dataset contained columns such as MOVIES, YEAR, GENRE, RATING, ONE-LINE, STARS, VOTES, RunTime, and Gross. Several columns had missing or messy data that required cleaning.
Step 1: Data Viewing
DataFrameas
df
variable
SELECT * FROM 'movies.csv';Step 2: Data Loading
import pandas as pd
df = pd.read_csv("movies.csv") # or pd.read_excel("your_file.xlsx")
print(df)
Step 3: Checking the Information on the Data
print(df.info())
print(df.head())
Step 4: Check for Missing Values
import pandas as pd
# Load the data
df = pd.read_csv("movies.csv")
# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())
print("\nAny missing values in the DataFrame?:", df.isnull().values.any())
print("\nTotal number of missing values in the DataFrame:", df.isnull().sum().sum())
print("\nRows with any missing values:")
print(df[df.isnull().any(axis=1)])
Step 5: Cleaning of the Data
import pandas as pd
# Load data
df = pd.read_csv("movies.csv")
# Clean YEAR column (extract 4-digit year and convert to numeric)
df['YEAR'] = df['YEAR'].astype(str).str.extract(r'(\d{4})')
df['YEAR'] = pd.to_numeric(df['YEAR'], errors='coerce')
# Fill GENRE missing values with 'Unknown'
df['GENRE'] = df['GENRE'].fillna('Unknown')
# RATING: fill missing per genre median, fallback to overall median
df['RATING'] = pd.to_numeric(df['RATING'], errors='coerce')
df['RATING'] = df.groupby('GENRE')['RATING'].transform(lambda x: x.fillna(x.median()))
df['RATING'] = df['RATING'].fillna(df['RATING'].median())
# VOTES: clean commas/spaces, then convert, keep NaNs
df['VOTES'] = df['VOTES'].astype(str).str.replace(',', '').str.strip()
df['VOTES'] = pd.to_numeric(df['VOTES'], errors='coerce')
# No filling for VOTES, keep NaNs as is
df['VOTES'] = df.groupby('GENRE')['VOTES'].transform(lambda x: x.fillna(x.median()))
df['VOTES'] = df['VOTES'].fillna(df['VOTES'].median())
df['RunTime'] = pd.to_numeric(df['RunTime'], errors='coerce')
df['RunTime'] = df.groupby('GENRE')['RunTime'].transform(lambda x: x.fillna(x.median()))
df['RunTime'] = df['RunTime'].fillna(df['RunTime'].median()) # fallback overall median
# Fill missing STARS with most frequent per genre
df['STARS'] = df.groupby('GENRE')['STARS'].transform(
lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Unknown')
)
# --- CLEAN GROSS COLUMN SAFELY ---
def parse_gross(value):
if pd.isna(value) or str(value).strip() == '':
return pd.NA
value = str(value).strip().lower().replace(' ', '')
multiplier = 1
if value.endswith('m'):
multiplier = 1_000_000
value = value[:-1]
elif value.endswith('k'):
multiplier = 1_000
value = value[:-1]
value = value.replace('$', '')
try:
return float(value) * multiplier
except:
return pd.NA
# Convert Gross to numeric
df['Gross_num'] = df['Gross'].apply(parse_gross)
# Fill missing Gross_num using GENRE median first
df['Gross_num'] = df.groupby('GENRE')['Gross_num'].transform(lambda x: x.fillna(x.median()))
# Fill any remaining missing Gross_num with overall median
df['Gross_num'] = df['Gross_num'].fillna(df['Gross_num'].median())
# Format Gross back to string with dollar sign
df['Gross'] = df['Gross_num'].apply(lambda x: f"${x:,.0f}" if pd.notna(x) else "$0")
# Optional: Drop rows with missing MOVIES or YEAR
df = df.dropna(subset=['MOVIES', 'YEAR'])
# Final check
print(df.head())
print("\nRemaining missing values:\n", df.isnull().sum())
# Drop helper column
df.drop(columns=['Gross_num'], inplace=True)
Step 6: Check for Missing Values After cleaning
# Check for missing values after cleaning
print("Missing values per column after cleaning:")
print(df.isnull().sum())
print("\nAny missing values in the DataFrame?:", df.isnull().values.any())
print("\nTotal number of missing values in the DataFrame:", df.isnull().sum().sum())
Step 7: Confirm the New DataFrame
df