# Skip to content
#
# 1 hidden cell
# Importing pandas and matplotlib
import pandas as pd
import matplotlib.pyplot as plt

# Start coding!
# Read the raw Netflix catalogue from disk into netflix_df.
netflix_df = pd.read_csv("netflix_data.csv")

# netflix_subset: every row whose "type" is anything other than "TV Show".
netflix_subset = netflix_df.loc[netflix_df["type"].ne("TV Show")]

# netflix_movies: rows whose "type" is exactly "Movie", restricted to the
# columns used by the rest of the analysis. Note this is selected from
# netflix_df, not from netflix_subset.
selected_netflixmovie = ["title", "country", "genre", "release_year", "duration"]
netflix_movies = netflix_df.loc[
    netflix_df["type"].eq("Movie"),
    selected_netflixmovie,
]

# short_movies: movies with a duration strictly below 60 (minutes, per the
# axis label used when plotting); inspect it to look for contributing factors.
short_movies = netflix_movies.loc[netflix_movies["duration"].lt(60)]
# Possible factors: the movies are documentaries, children's movies, or talk shows
# Using a for loop and if/elif statements, iterate through the rows of netflix_movies and assign colors of your choice to four genre groups ("Children", "Documentaries", "Stand-Up", and "Other" for everything else). Save the results in a colors list. Initialize a matplotlib figure object called fig and create a scatter plot for movie duration by release year using the colors list to color the points and using the labels "Release year" for the x-axis, "Duration (min)" for the y-axis, and the title "Movie Duration by Year of Release".

# Assign one color per movie, in row order, according to its genre group:
# "Children" -> blue, "Documentaries" -> green, "Stand-Up" -> orange,
# and everything else (including missing genres) -> gray.
colors = []

# Iterate over the genre column directly: only this one field is needed, so
# building a full row Series per movie with iterrows() is unnecessary work.
for genre in netflix_movies['genre']:
    if genre == 'Children':
        colors.append('blue')
    elif genre == 'Documentaries':
        colors.append('green')
    elif genre == 'Stand-Up':
        colors.append('orange')
    else:
        colors.append('gray')

# Start a new 12x8 inch figure; the pyplot calls below draw onto it.
fig = plt.figure(figsize=(12, 8))

# One semi-transparent point per movie: release year on x, duration on y,
# colored by the genre group stored in the colors list.
plt.scatter(
    x=netflix_movies["release_year"],
    y=netflix_movies["duration"],
    c=colors,
    alpha=0.5,
)

# Axis labels and title.
plt.xlabel("Release year")
plt.ylabel("Duration (min)")
plt.title("Movie Duration by Year of Release")

# Render the figure.
plt.show()

# Are we certain that movies are getting shorter? No.
# The conclusion is stored as a plain string in `answer`.
answer = "No"