Netflix! What started in 1997 as a DVD rental service has since exploded into one of the largest entertainment and media companies.
Given the large number of movies and series available on the platform, it is a perfect opportunity to flex your exploratory data analysis skills and dive into the entertainment industry.
You work for a production company that specializes in nostalgic styles. You want to do some research on movies released in the 1990s. You'll delve into Netflix data and perform exploratory data analysis to better understand this awesome movie decade!
You have been supplied with the dataset netflix_data.csv, along with the following table detailing the column names and descriptions. Feel free to experiment further after submitting!
The data
netflix_data.csv
| Column | Description |
|---|---|
| `show_id` | The ID of the show |
| `type` | Type of show |
| `title` | Title of the show |
| `director` | Director of the show |
| `cast` | Cast of the show |
| `country` | Country of origin |
| `date_added` | Date added to Netflix |
| `release_year` | Year the show was originally released |
| `duration` | Duration of the show in minutes |
| `description` | Description of the show |
| `genre` | Show genre |
# Import pandas, matplotlib, seaborn, and Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
# Imports needed by the zero-shot genre classification section of this script.
# Each statement must sit on its own line: placing an import directly after
# the read_csv call on the same line is a SyntaxError.
from sklearn.model_selection import train_test_split
from transformers import pipeline

# Read in the Netflix CSV as a DataFrame
netflix_df = pd.read_csv("netflix_data.csv")

# Apply seaborn's "whitegrid" theme to the plots drawn after this point
sns.set_theme(style="whitegrid")
# Convert 'date_added' to datetime so the .dt accessor can be used on it
netflix_df['date_added'] = pd.to_datetime(netflix_df['date_added'])

# Convert 'duration' to numeric; values that cannot be parsed become NaN
netflix_df['duration'] = pd.to_numeric(netflix_df['duration'], errors='coerce')

# Filter for movies released from 1990 through 1999 that have a usable
# duration. The trailing .copy() makes movies_1990s an independent DataFrame,
# so adding columns to it later does not trigger SettingWithCopyWarning.
movies_1990s = (
    netflix_df[
        (netflix_df['release_year'] >= 1990)
        & (netflix_df['release_year'] < 2000)
        & (netflix_df['type'] == 'Movie')
    ]
    .dropna(subset=['duration'])
    .copy()
)
# Year in which each 1990s movie was added to Netflix, and how many movies
# were added per year (ordered chronologically by sort_index)
movies_1990s['year_added'] = movies_1990s['date_added'].dt.year
year_added_counts = movies_1990s['year_added'].value_counts().sort_index()

# Ten most frequent values of the 'country' column among 1990s movies
country_counts = movies_1990s['country'].value_counts().head(10)

# Five most frequent values of the 'director' column among 1990s movies
top_directors = movies_1990s['director'].value_counts().head(5)

# Five most frequent cast members: every non-null 'cast' cell is split on
# ', ' and each resulting name is tallied once per movie it appears in
actor_counts = Counter()
for cast_members in movies_1990s['cast'].dropna().str.split(', '):
    actor_counts.update(cast_members)
top_actors = pd.Series(actor_counts).nlargest(5)
# Plot: one box per genre showing the spread of 1990s movie durations
plt.figure(figsize=(10, 6))
duration_ax = sns.boxplot(x='genre', y='duration', data=movies_1990s)
duration_ax.set_title("Movie Duration by Genre (1990s)")
# Tilt the genre names so neighbouring labels do not overlap
plt.xticks(rotation=45)
plt.show()
# Plot: side-by-side bar charts of the top 5 directors and top 5 actors
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Each panel: (axes to draw on, counts to plot, palette, title, x-axis label)
people_panels = [
    (axes[0], top_directors, "viridis", "Top 5 Directors of the 1990s", "Director"),
    (axes[1], top_actors, "magma", "Top 5 Actors in 1990s Movies", "Actor"),
]
for panel_ax, panel_counts, panel_palette, panel_title, panel_xlabel in people_panels:
    sns.barplot(
        ax=panel_ax,
        x=panel_counts.index,
        y=panel_counts.values,
        palette=panel_palette,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel("Number of Movies")
    # Names are long, so rotate them to keep the labels readable
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()
# Plot: top 10 producing countries (left) next to the number of 1990s movies
# added to Netflix per year (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
countries_ax, timeline_ax = axes

# Left panel: bar chart of movie counts per country
sns.barplot(
    x=country_counts.index,
    y=country_counts.values,
    palette="coolwarm",
    ax=countries_ax,
)
countries_ax.set_title("Top 10 Countries Producing 1990s Movies")
countries_ax.set_xlabel("Country")
countries_ax.set_ylabel("Number of Movies")
countries_ax.tick_params(axis='x', rotation=45)

# Right panel: line chart of additions per year, one marker per year
sns.lineplot(
    x=year_added_counts.index,
    y=year_added_counts.values,
    marker="o",
    color="red",
    ax=timeline_ax,
)
timeline_ax.set_title("When 1990s Movies Were Added to Netflix")
timeline_ax.set_xlabel("Year Added to Netflix")
timeline_ax.set_ylabel("Number of Movies")

plt.tight_layout()
plt.show()
# Keep every movie that has both a duration and a release year. The .copy()
# makes `movies` an independent DataFrame so the column assignments below do
# not trigger SettingWithCopyWarning.
movies = (
    netflix_df[netflix_df['type'] == 'Movie']
    .dropna(subset=['duration', 'release_year'])
    .copy()
)

# Decade = release year rounded down to a multiple of 10 (1994 -> 1990).
# release_year has no missing values after the dropna above, so the integer
# cast is safe; it keeps decade labels as 1990 rather than 1990.0 in case the
# column was read as float.
movies['decade'] = ((movies['release_year'] // 10) * 10).astype(int)

# Number of movies and average duration per decade
movies_decade_counts = movies.groupby('decade').size().reset_index(name='count')
movies_avg_duration = movies.groupby('decade')['duration'].mean().reset_index()
# Plot: movie count per decade (left) and mean duration per decade (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
count_ax, duration_ax = axes

# Left panel: how many movies fall into each decade
sns.barplot(
    data=movies_decade_counts,
    x='decade',
    y='count',
    palette="Blues_d",
    ax=count_ax,
)
count_ax.set_title("Number of Movies per Decade")
count_ax.set_xlabel("Decade")
count_ax.set_ylabel("Number of Movies")

# Right panel: mean movie duration within each decade
sns.barplot(
    data=movies_avg_duration,
    x='decade',
    y='duration',
    palette="Oranges_d",
    ax=duration_ax,
)
duration_ax.set_title("Average Movie Duration per Decade")
duration_ax.set_xlabel("Decade")
duration_ax.set_ylabel("Average Duration (minutes)")

plt.tight_layout()
plt.show()
def _first_listed_genre(genre_text):
    """Return the part of *genre_text* before its first comma, or None if it is null."""
    if pd.notnull(genre_text):
        return genre_text.split(',')[0]
    return None


# Primary genre = first entry of the comma-separated 'genre' value
movies['primary_genre'] = movies['genre'].apply(_first_listed_genre)

# Count movies for every (decade, primary genre) pair
genre_decade = (
    movies.groupby(['decade', 'primary_genre'])
    .size()
    .reset_index(name='count')
)

# Reshape to one row per genre and one column per decade; pairs that never
# occur are filled with 0
pivot_genre = genre_decade.pivot(
    index='primary_genre',
    columns='decade',
    values='count',
).fillna(0)
# Plot: grouped bars, one group per primary genre and one bar per decade
genre_ax = pivot_genre.plot(kind='bar', figsize=(16, 8))
genre_ax.set_title("Primary Genre Distribution by Decade")
genre_ax.set_xlabel("Genre")
genre_ax.set_ylabel("Count")
# Tilt the genre names so neighbouring labels do not overlap
plt.xticks(rotation=45)
genre_ax.legend(title='Decade')
plt.tight_layout()
plt.show()
# Initialize the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Labels the classifier chooses between for each description
candidate_labels = [
    "Action", "Comedy", "Drama", "Documentary", "Horror",
    "Romance", "Thriller", "Sci-Fi", "Animation", "Adventure",
]

# Hold out 2% of the movies as the evaluation sample (fixed seed for
# reproducibility). train_movies is not used in this section.
train_movies, test_movies = train_test_split(movies, test_size=0.02, random_state=42)

# Keep only rows that have both a description to classify and a genre to
# compare against
test_movies = test_movies.dropna(subset=['description', 'primary_genre'])

# NOTE(review): accuracy below uses exact string equality between the
# predicted candidate label and 'primary_genre'. That is only meaningful if
# the dataset spells its genres exactly like candidate_labels - confirm
# against the data (e.g. "Dramas" would never match "Drama").
if test_movies.empty:
    # Nothing survived the dropna: skip the model call and avoid dividing by
    # zero when computing the accuracy
    zeroshot_preds = []
    predicted_labels = []
    accuracy = float('nan')
    print("Accuracy: n/a (no test rows with both a description and a genre)")
else:
    # Perform zero-shot classification on the test set
    zeroshot_preds = classifier(
        test_movies['description'].tolist(),
        candidate_labels=candidate_labels,
        multi_label=False,
    )

    # The first entry of 'labels' is taken as the predicted label
    predicted_labels = [pred['labels'][0] for pred in zeroshot_preds]

    # Fraction of test rows whose predicted label equals the primary genre
    correct = sum(
        pred == true
        for pred, true in zip(predicted_labels, test_movies['primary_genre'])
    )
    accuracy = correct / len(test_movies)
    print(f"Accuracy: {accuracy * 100:.2f}%")