Skip to content
Netflix Movie Data
This dataset contains more than 8,500 Netflix movies and TV shows, including cast members, duration, and genre. It contains titles added as recently as late September 2021.
Analyze: Has Netflix invested more in certain genres (see listed_in) in recent years? What about certain age groups (see ratings)?
# Load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the Netflix catalogue; the first CSV column is used as the row index
Netflix_data = pd.read_csv(filepath_or_buffer="netflix_dataset.csv", index_col=0)
# Preview the first five rows
Netflix_data.head(n=5)
# Explore:
# How much variety exists in Netflix's offering? Base this on three variables: type, country, and listed_in.
def _split_tokens(column):
    """Return every comma-separated entry of *column* as its own cleaned row.

    Missing cells are dropped, each remaining cell is split on commas, and
    the pieces are stripped of surrounding whitespace. Empty pieces (left
    behind by a leading, trailing, or doubled comma) are discarded so they
    are never counted as a value.
    """
    pieces = (
        column.dropna()
        .astype(str)
        .str.split(',')
        .explode()
        .str.strip()
        # explode() repeats the source row label; start from a clean index
        .reset_index(drop=True)
    )
    return pieces[pieces != '']


# Split the multi-valued columns once and reuse the result for counts and lists
country_tokens = _split_tokens(Netflix_data['country'])
listed_in_tokens = _split_tokens(Netflix_data['listed_in'])

distinct_count_type = Netflix_data['type'].nunique()
distinct_countries = country_tokens.nunique()
distinct_listed_in = listed_in_tokens.nunique()
print('Distinct count of type:', distinct_count_type)
print('Distinct count of countries:', distinct_countries)
print('Distinct count of listed_in:', distinct_listed_in)
# pull type, country, listed in
type_nf = Netflix_data["type"].unique()
print(type_nf)
unique_countries = country_tokens.unique()
unique_countries_nf = unique_countries
# unique() already removes duplicates; sorting (instead of a round trip
# through set) keeps the printed order the same from run to run
distinct_countries_nf = sorted(unique_countries_nf)
print(distinct_countries_nf)
distinct_listed_in_nf = sorted(listed_in_tokens.unique())
print(distinct_listed_in_nf)
# Import the necessary libraries for WORDCLOUD
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Gather the non-missing descriptions and merge them into one text blob
description_texts = Netflix_data['description'].dropna()
descriptions = ' '.join(description_texts)
# Configure the cloud first, then feed it the combined text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(descriptions)
# Render the cloud on its own figure with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()
#Variables
import pandas as pd
# Summarise every column: its name, its number of distinct values as
# reported by nunique(), and the list of distinct values from unique().
# The rows are collected first and the frame is built once, rather than
# growing it one .loc[i] assignment at a time.
summary_rows = [
    [var, Netflix_data[var].nunique(), Netflix_data[var].unique().tolist()]
    for var in Netflix_data.columns
]
variables = pd.DataFrame(
    summary_rows,
    columns=['Variable', 'Number of unique values', 'Values'],
)
print(variables)
# Check each column's data type and non-null count to spot missing values
Netflix_data.info()
Movies and TV Shows on Netflix
# Split the catalogue by title type. The explicit .copy() gives each subset
# its own data, so the column assignment below does not write to a slice of
# Netflix_data (which raises pandas' SettingWithCopyWarning).
nf_movies = Netflix_data[Netflix_data['type'] == 'Movie'].copy()
print('Number of movies in the dataset : ', nf_movies.shape[0])
nf_tv = Netflix_data[Netflix_data['type'] == 'TV Show'].copy()
print('Number of TV Shows in the dataset : ', nf_tv.shape[0])
# Drop the ' min' suffix so the movie durations can be parsed as numbers;
# regex=False makes the literal match explicit across pandas versions
nf_movies['duration'] = nf_movies['duration'].str.replace(' min', '', regex=False)
duration = nf_movies['duration'].astype('float64')
# Summary statistics of the movie durations (displayed when run in a notebook)
duration.describe()
# Plotting the distribution of movie durations
import matplotlib.pyplot as plt
# Strip the ' min' suffix (if still present) and convert the durations to floats
duration = nf_movies['duration'].str.replace(' min', '').astype(float)
# Draw the histogram on an explicit Axes and label it in one call
fig, ax = plt.subplots()
ax.hist(duration, bins=100, edgecolor='black')
ax.set(
    xlabel='Duration (minutes)',
    ylabel='Frequency',
    title='Distribution of Movie Durations',
)
plt.show()
# Reduce values such as '1 Season' / '2 Seasons' to the bare number in a
# single pass; the optional trailing 's' is part of the pattern, so no other
# 's' character in the value is touched.
# assign() builds a new frame, so the filtered subset is never written to in
# place (avoids pandas' SettingWithCopyWarning).
nf_tv = nf_tv.assign(
    duration=nf_tv['duration'].str.replace(r' Seasons?', '', regex=True)
)
seasons = nf_tv['duration'].astype('float64')
# Summary statistics of the season counts (displayed when run in a notebook)
seasons.describe()
# Histogram of how many seasons each TV show has
# Convert the already-cleaned duration column to floats
seasons = nf_tv['duration'].astype(float)
# Draw on an explicit Axes and label it in one call
fig, ax = plt.subplots()
ax.hist(seasons, bins=30, edgecolor='black', color='red')
ax.set(
    xlabel='Number of Seasons',
    ylabel='Frequency',
    title='Distribution of TV Shows on Netflix',
)
plt.show()
Ratings