# Investigating Netflix Movies and Guest Stars in The Office

# Build the sample data: the years 2011 through 2020 (range's stop value is
# exclusive, hence 2021) and a list of durations, one per year.
years = list(range(2011, 2021))

durations = [103, 101, 99, 100, 100, 95, 95, 96, 93, 90]

# Pair the two lists under the keys 'years' and 'durations'
movie_dict = {'years': years, 'durations': durations}

print(movie_dict)

import pandas as pd 

# Turn the dictionary into a DataFrame; each key becomes a column
durations_df = pd.DataFrame.from_dict(movie_dict)

print(durations_df)

# Bring in matplotlib.pyplot under its usual alias
import matplotlib.pyplot as plt

# Open a new figure and grab its axes for the drawing calls
fig = plt.figure()
ax = fig.gca()

# Line plot of the 'durations' column against the 'years' column
ax.plot(durations_df['years'], durations_df['durations'])

# Chart title
ax.set_title('Netflix Movie Durations 2011-2020')

# Display the figure
plt.show()

# Load datasets/netflix_data.csv into a DataFrame
netflix_df = pd.read_csv('datasets/netflix_data.csv')

# Preview the first five rows (head() returns five rows by default)
print(netflix_df.head())

# Keep only the rows whose 'type' is "Movie"
is_movie = netflix_df['type'] == 'Movie'
netflix_df_movies_only = netflix_df[is_movie]

# Narrow the movies down to the columns of interest
columns_of_interest = ['title', 'country', 'genre', 'release_year', 'duration']
netflix_movies_col_subset = netflix_df_movies_only[columns_of_interest]

# Preview the first five rows of the narrowed DataFrame
print(netflix_movies_col_subset.head())

# Open a larger, 12-by-8 figure and grab its axes
fig = plt.figure(figsize=(12, 8))
ax = fig.gca()

# Scatter plot of duration against release year
ax.scatter(
    netflix_movies_col_subset['release_year'],
    netflix_movies_col_subset['duration'],
)

ax.set_title('Movie Duration by Year of Release')

plt.show()

# Keep only the movies whose duration is below 60
is_short = netflix_movies_col_subset['duration'] < 60
short_movies = netflix_movies_col_subset[is_short]

# Show the first 20 of them
print(short_movies.head(20))

# Colour for each highlighted genre; any other genre value falls back to
# 'black' in the lookup below.
genre_colors = {
    'Children': 'red',
    'Documentaries': 'blue',
    'Stand-Up': 'green',
}

# One colour per movie, in row order. Iterating the 'genre' column directly
# avoids building a full row Series per movie as iterrows() does.
colors = [
    genre_colors.get(genre, 'black')
    for genre in netflix_movies_col_subset['genre']
]

# Inspect the first 10 values in the list
print(colors[:10])

# Switch to the 'fivethirtyeight' style, then open a new 12-by-8 figure
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(12, 8))
ax = fig.gca()

# Scatter duration against release year, coloured by the `colors` list
ax.scatter(
    netflix_movies_col_subset['release_year'],
    netflix_movies_col_subset['duration'],
    color=colors,
)

# Axis labels and title
ax.set_xlabel('Release year')
ax.set_ylabel('Duration (min) ')
ax.set_title('Movie duration by year of release')

# Display the figure
plt.show()

# Conclusion: can we be sure that movies are getting shorter?
are_movies_getting_shorter = "We can not tell"

# Hidden output