Skip to content

Netflix! What started in 1997 as a DVD rental service has since exploded into one of the largest entertainment and media companies.

Given the large number of movies and series available on the platform, it is a perfect opportunity to flex your exploratory data analysis skills and dive into the entertainment industry.

You work for a production company that specializes in nostalgic styles. You want to do some research on movies released in the 1990s. You'll delve into Netflix data and perform exploratory data analysis to better understand this awesome movie decade!

You have been supplied with the dataset netflix_data.csv, along with the following table detailing the column names and descriptions. Feel free to experiment further after submitting!

The data

netflix_data.csv

| Column | Description |
|--------|-------------|
| show_id | The ID of the show |
| type | Type of show |
| title | Title of the show |
| director | Director of the show |
| cast | Cast of the show |
| country | Country of origin |
| date_added | Date added to Netflix |
| release_year | Year of Netflix release |
| duration | Duration of the show in minutes |
| description | Description of the show |
| genre | Show genre |
# Importing pandas and matplotlib
import pandas as pd
import matplotlib.pyplot as plt

# Load the Netflix catalogue from the CSV file into a DataFrame.
netflix_df = pd.read_csv("netflix_data.csv")

# 1. Inspect the data
# Preview the first rows to see what each column holds.
print(netflix_df.head())

# Report column dtypes and non-null counts. DataFrame.info() writes its
# report to stdout itself and returns None, so wrapping it in print() would
# only append a stray "None" line to the output.
netflix_df.info()

# Distinct values of the two columns the later filters rely on.
print(netflix_df['type'].unique())  # content types present in the data
print(netflix_df['release_year'].unique())  # release years present in the data
# 2. Filter the movies from the 1990s
# Restrict the catalogue to the rows typed as movies.
movies_df = netflix_df.loc[netflix_df["type"] == "Movie"]

# Narrow the movies down to release years 1990 through 1999 (both inclusive).
movies_1990s = movies_df.loc[
    (movies_df["release_year"] >= 1990) & (movies_df["release_year"] <= 1999)
]

# Preview the filtered rows.
print(movies_1990s.head())

# Report how many movies survived the filter.
print(f"Number of movies from the 1990s: {movies_1990s.shape[0]}")
# 3. Movies released each year
# Tally the 1990s movies per release year, ordered chronologically.
movies_by_year = movies_1990s["release_year"].value_counts().sort_index()

# Draw the yearly tallies as a bar chart.
plt.figure(figsize=(10, 6))
movies_by_year.plot.bar(color="skyblue")
plt.title("Number of Movies Released Each Year (1990s)", fontsize=16)
plt.xlabel("Release Year", fontsize=14)
plt.ylabel("Number of Movies", fontsize=14)
plt.xticks(rotation=45)
plt.show()
# 4. Explore genres
# List the distinct genre values among the 1990s movies.
print(movies_1990s["genre"].unique())

# Tally the 1990s movies per genre (most frequent first).
genre_counts = movies_1990s["genre"].value_counts()

# Draw the genre tallies as a bar chart.
plt.figure(figsize=(12, 6))
genre_counts.plot.bar(color="orange")
plt.title("Popular Genres of Movies (1990s)", fontsize=16)
plt.xlabel("Genre", fontsize=14)
plt.ylabel("Number of Movies", fontsize=14)
plt.xticks(rotation=45)
plt.show()
# 5. Explore runtime
# Count the 1990s movies whose duration is missing.
print(movies_1990s["duration"].isnull().sum())

# Histogram of the durations, split into 20 bins.
plt.figure(figsize=(10, 6))
movies_1990s["duration"].plot.hist(bins=20, color="purple", alpha=0.7)
plt.title("Distribution of Movie Runtimes (1990s)", fontsize=16)
plt.xlabel("Runtime (minutes)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.show()
#6. Full Analysis
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt

# Read the Netflix dataset
netflix_df = pd.read_csv("netflix_data.csv")

# Inspect the dataset: first rows, then dtypes and non-null counts.
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which would emit "None").
print(netflix_df.head())
netflix_df.info()

# Keep only movies, then only those released from 1990 through 1999.
movies_df = netflix_df[netflix_df['type'] == 'Movie']
movies_1990s = movies_df[(movies_df['release_year'] >= 1990) & (movies_df['release_year'] <= 1999)]

# Number of movies from the 1990s
print(f"Number of movies from the 1990s: {len(movies_1990s)}")

# Yearly release counts for the 1990s movies, shown as a bar chart.
movies_by_year = movies_1990s["release_year"].value_counts().sort_index()
plt.figure(figsize=(10, 6))
movies_by_year.plot.bar(color="skyblue")
plt.title("Number of Movies Released Each Year (1990s)", fontsize=16)
plt.xlabel("Release Year", fontsize=14)
plt.ylabel("Number of Movies", fontsize=14)
plt.xticks(rotation=45)
plt.show()

# Genre popularity among the 1990s movies; skipped when there is no genre column.
if "genre" in movies_1990s:
    genre_counts = movies_1990s["genre"].value_counts()
    plt.figure(figsize=(12, 6))
    genre_counts.plot.bar(color="orange")
    plt.title("Popular Genres of Movies (1990s)", fontsize=16)
    plt.xlabel("Genre", fontsize=14)
    plt.ylabel("Number of Movies", fontsize=14)
    plt.xticks(rotation=45)
    plt.show()

# Histogram of 1990s movie durations; skipped when there is no duration column.
if "duration" in movies_1990s:
    plt.figure(figsize=(10, 6))
    movies_1990s["duration"].plot.hist(bins=20, color="purple", alpha=0.7)
    plt.title("Distribution of Movie Runtimes (1990s)", fontsize=16)
    plt.xlabel("Runtime (minutes)", fontsize=14)
    plt.ylabel("Frequency", fontsize=14)
    plt.show()
# 7. Inspect columns for further analysis
# Show every column name available in the dataset.
print(netflix_df.columns)

# 8. Group by time period
# Parse the "date_added" column into datetimes, replacing the original values.
netflix_df["date_added"] = pd.to_datetime(netflix_df["date_added"])

# Derive the calendar year each title was added, for the analyses below.
netflix_df["year_added"] = netflix_df["date_added"].dt.year

# Bucket years into time periods (before 2000, 2000s, 2010s, and 2020 onwards)
def assign_time_period(year):
    """Return the label of the time period that ``year`` falls in.

    Args:
        year: The year as a number; may be missing (any value that
            ``pd.isna`` reports as NA).

    Returns:
        'Unknown' for a missing year, 'Before 2000' for years below 2000,
        '2000-2009' or '2010-2019' for the matching decade, and '2020+'
        for everything from 2020 onwards.
    """
    if pd.isna(year):
        return 'Unknown'

    # Upper bounds are exclusive and listed in ascending order, so the first
    # bound the year falls under decides the label.
    for upper_bound, label in (
        (2000, 'Before 2000'),
        (2010, '2000-2009'),
        (2020, '2010-2019'),
    ):
        if year < upper_bound:
            return label
    return '2020+'

# Label every title with the time period of the year it was added.
netflix_df["time_period"] = netflix_df["year_added"].apply(assign_time_period)

# Show how many titles fall into each time period.
print(netflix_df["time_period"].value_counts())
# 9. Growth over time
# Count the titles of each type added per year. Year/type combinations with
# no titles are filled with 0 (as done for the genre trends) so the lines show
# a zero count instead of a gap.
content_growth = netflix_df.groupby(['year_added', 'type']).size().unstack(fill_value=0)

# Plot the content growth over time. DataFrame.plot() opens its own figure
# when no `ax` is given, so the size is passed through `figsize`; a preceding
# plt.figure(figsize=...) call would only leave an empty extra figure behind
# and would not size this chart.
content_growth.plot(kind='line', marker='o', linewidth=2, figsize=(12, 6))
plt.title('Netflix Content Growth Over Time', fontsize=16)
plt.xlabel('Year Added to Netflix', fontsize=14)
plt.ylabel('Number of Titles', fontsize=14)
# Build the legend entries from the plotted columns (pluralized), so every
# label is guaranteed to match its line regardless of column order or count.
plt.legend(
    title='Content Type',
    labels=[f"{content_type}s" for content_type in content_growth.columns],
)
plt.grid(alpha=0.3)
plt.show()
# 10. Group movies by release decade
# Take an explicit copy of the movie rows: adding a column to a filtered
# slice of netflix_df is chained assignment, which makes pandas emit a
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
movies_df = netflix_df[netflix_df['type'] == 'Movie'].copy()

# Floor each release year to the start of its decade (e.g. 1994 -> 1990).
movies_df['release_decade'] = (movies_df['release_year'] // 10) * 10

# Count the number of movies per decade, in chronological order
movies_by_decade = movies_df['release_decade'].value_counts().sort_index()

# Plot the distribution of movies by release decade
plt.figure(figsize=(12, 6))
movies_by_decade.plot(kind='bar', color='skyblue')
plt.title('Number of Movies by Release Decade', fontsize=16)
plt.xlabel('Decade of Release', fontsize=14)
plt.ylabel('Number of Movies', fontsize=14)
plt.xticks(rotation=45)
plt.show()
# 11. Analyze the boom
# Movies tallied by the year they were released, in chronological order.
movies_by_release_year = movies_df["release_year"].value_counts().sort_index()

# Movies tallied by the year they were added to Netflix, in chronological order.
movies_added_by_year = movies_df["year_added"].value_counts().sort_index()

# Overlay both tallies on one set of axes to compare them.
plt.figure(figsize=(12, 6))
plt.plot(
    movies_by_release_year.index,
    movies_by_release_year.values,
    label="Movies by Release Year",
    color="orange",
    marker="o",
)
plt.plot(
    movies_added_by_year.index,
    movies_added_by_year.values,
    label="Movies Added to Netflix",
    color="blue",
    marker="o",
)
plt.title("Movies by Release Year vs. Movies Added to Netflix", fontsize=16)
plt.xlabel("Year", fontsize=14)
plt.ylabel("Number of Movies", fontsize=14)
plt.legend()
plt.grid(alpha=0.3)
plt.show()
# 12. Genre trends over time
# Runs only when the dataset has a genre column.
if "genre" in netflix_df:
    # Titles per (year added, genre) as a year-by-genre table, with 0 for
    # combinations that never occur.
    genre_trends = (
        netflix_df.groupby(["year_added", "genre"])
        .size()
        .unstack(fill_value=0)
    )

    # Pick the five genres with the highest overall totals and plot them.
    top_5_genres = (
        genre_trends.sum()
        .sort_values(ascending=False)
        .head(5)
        .index
    )
    genre_trends[top_5_genres].plot(figsize=(12, 6), linewidth=2)
    plt.title("Top 5 Genres Added to Netflix Over Time", fontsize=16)
    plt.xlabel("Year Added", fontsize=14)
    plt.ylabel("Number of Titles", fontsize=14)
    plt.legend(title="Genre")
    plt.grid(alpha=0.3)
    plt.show()