Skip to content
Netflix Movie Data
  • AI Chat
  • Code
  • Report
  • Netflix Movie Data

    This dataset contains more than 8,500 Netflix movies and TV shows, including cast members, duration, and genre. It contains titles added as recently as late September 2021.

    Analyze: Has Netflix invested more in certain genres (see listed_in) in recent years? What about certain age groups (see ratings)?

    # Load packages
    import pandas as pd
    import numpy as np 
    import matplotlib.pyplot as plt
    
    # Read the Netflix catalogue; the first CSV column is used as the row index.
    csv_path = "netflix_dataset.csv"
    Netflix_data = pd.read_csv(csv_path, index_col=0)
    # Preview the top rows (the value is only displayed when this is the last
    # expression of a notebook cell).
    Netflix_data.head()
    # Explore:
    # How much variety exists in Netflix's offering? Base this on three variables: type, country, and listed_in.

    def _distinct_tokens(column):
        """Return the distinct comma-separated entries of a text column.

        Each cell is split on commas and every entry is stripped of
        surrounding whitespace. Missing cells and empty entries (what a
        leading or trailing comma such as ", France" would produce) are
        discarded so they are not counted as a value of their own.
        """
        tokens = column.dropna().str.split(',').explode().str.strip()
        return tokens[tokens != ''].unique()

    # `type` holds a single label per title, so a plain nunique() is enough.
    distinct_count_type = Netflix_data['type'].nunique()
    # `country` and `listed_in` can hold several comma-separated entries per title.
    distinct_countries = len(_distinct_tokens(Netflix_data['country']))
    distinct_listed_in = len(_distinct_tokens(Netflix_data['listed_in']))

    print('Distinct count of type:', distinct_count_type)
    print('Distinct count of countries:', distinct_countries)
    print('Distinct count of listed_in:', distinct_listed_in)
    # Pull the distinct values of type, country and listed_in.
    type_nf = Netflix_data["type"].unique()
    print(type_nf)

    # Raw distinct country entries in order of first appearance; unique() keeps
    # a NaN entry if any title has no country recorded.
    unique_countries = Netflix_data['country'].str.split(',').explode().str.strip().unique()
    unique_countries_nf = unique_countries
    # unique() has already removed duplicates, so no set() round-trip is needed.
    # Keep only non-empty strings (drops NaN and blank entries) and sort, so the
    # printed list is the same on every run.
    distinct_countries_nf = sorted(
        name for name in unique_countries_nf if isinstance(name, str) and name
    )
    print(distinct_countries_nf)

    # Same treatment for the genre labels in `listed_in`.
    distinct_listed_in_nf = sorted(
        genre
        for genre in Netflix_data['listed_in'].str.split(',').explode().str.strip().unique()
        if isinstance(genre, str) and genre
    )
    print(distinct_listed_in_nf)
    # Import the necessary libraries for WORDCLOUD
    import pandas as pd
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    
    # Build one long text out of every non-missing title description
    description_texts = Netflix_data['description'].dropna()
    descriptions = ' '.join(description_texts)

    # Generate the word cloud on an 800x400 white canvas
    cloud_options = {'width': 800, 'height': 400, 'background_color': 'white'}
    wordcloud = WordCloud(**cloud_options).generate(descriptions)

    # Render the cloud as an image with the axes hidden
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    
    #Variables
    import pandas as pd
    
    # Summarise every column of Netflix_data: its name, its number of distinct
    # values as reported by nunique(), and the distinct values from unique().
    # All rows are collected first and the frame is built in a single call;
    # growing a DataFrame one row at a time with .loc re-allocates on every
    # insertion and leaves every column with object dtype.
    summary_columns = ['Variable', 'Number of unique values', 'Values']
    variables = pd.DataFrame(
        [
            {
                'Variable': var,
                'Number of unique values': Netflix_data[var].nunique(),
                'Values': Netflix_data[var].unique().tolist(),
            }
            for var in Netflix_data.columns
        ],
        # Passing the columns explicitly fixes their order and keeps the
        # headers even if Netflix_data has no columns.
        columns=summary_columns,
    )

    print(variables)
    #Checking data type and missing values
    Netflix_data.info()

    Movies and TV Shows on Netflix

    # Split the catalogue by title type. .copy() gives each subset its own
    # data: assigning a column on a boolean-filtered slice of Netflix_data
    # raises SettingWithCopyWarning and is not guaranteed to take effect.
    nf_movies = Netflix_data[Netflix_data['type'] == 'Movie'].copy()
    print('Number of movies in the dataset : ', nf_movies.shape[0])

    nf_tv = Netflix_data[Netflix_data['type'] == 'TV Show'].copy()
    print('Number of TV Shows in the dataset : ', nf_tv.shape[0])

    # Remove the literal " min" suffix from the movie durations; the column
    # stays text, and the numeric version is kept separately in `duration`.
    # regex=False makes the literal match explicit.
    nf_movies['duration'] = nf_movies['duration'].str.replace(' min', '', regex=False)
    duration = nf_movies['duration'].astype('float64')
    # Summary statistics of movie length (only displayed when this is the last
    # expression of a notebook cell).
    duration.describe()
    # Plotting the distribution of movie durations
    import matplotlib.pyplot as plt
    
    # Numeric movie durations: strip the " min" suffix, then cast to float
    minutes_text = nf_movies['duration'].str.replace(' min', '')
    duration = minutes_text.astype('float64')

    # Histogram of movie length, 100 bins with black bar outlines
    plt.hist(duration, bins=100, edgecolor='black')
    plt.title('Distribution of Movie Durations')
    plt.xlabel('Duration (minutes)')
    plt.ylabel('Frequency')
    plt.show()
    
    # Work on an independent copy: nf_tv comes from filtering Netflix_data, and
    # assigning a column on such a slice raises SettingWithCopyWarning.
    nf_tv = nf_tv.copy()
    # Strip the trailing " Season" / " Seasons" label in one anchored pass, so
    # no other "s" character in the value can ever be deleted. The column stays
    # text; the numeric version is kept in `seasons`.
    nf_tv['duration'] = nf_tv['duration'].str.replace(r'\s*Seasons?\s*$', '', regex=True)
    seasons = nf_tv['duration'].astype('float64')
    # Summary statistics of the season counts (only displayed when this is the
    # last expression of a notebook cell).
    seasons.describe()

    # Plotting the distribution of number of seasons
    plt.hist(seasons, bins=30, edgecolor='black', color = 'red')
    plt.xlabel('Number of Seasons')
    plt.ylabel('Frequency')
    plt.title('Distribution of TV Shows on Netflix')
    plt.show()
    

    Ratings