Skip to content
Investigating Netflix Movies and Guest Stars in The Office
# Build the years list (2011 through 2020 inclusive) and one duration per year
years = list(range(2011, 2021))
durations = [103, 101, 99, 100, 100, 95, 95, 96, 93, 90]
# Pair the two equal-length lists under the keys "years" and "durations"
movie_dict = dict(years=years, durations=durations)
# Print the dictionary
movie_dict
2. Creating a DataFrame from a dictionary
To convert our dictionary movie_dict to a pandas DataFrame, we will first need to import the library under its usual alias. We'll also want to inspect our DataFrame to ensure it was created correctly. Let's perform these steps now.
# Import pandas under its usual alias
import pandas as pd
# Turn the dictionary into a DataFrame: each key becomes a column
durations_df = pd.DataFrame(data=movie_dict)
# Print the DataFrame to check that it was created correctly
print(durations_df)
# Import matplotlib.pyplot under its usual alias and create a figure
import matplotlib.pyplot as plt
fig = plt.figure()
# Draw a line plot with years on the x-axis and durations on the y-axis
plt.plot(
    durations_df['years'],
    durations_df['durations'],
)
# Label both axes (this cell sets no plot title)
plt.xlabel("release_years")
plt.ylabel("durations")
# Show the plot
plt.show()
# Read in the CSV as a DataFrame
# Load the Netflix dataset from its relative path into a DataFrame
netflix_df = pd.read_csv(filepath_or_buffer='datasets/netflix_data.csv')
# Preview the first five rows of the DataFrame
netflix_df.head(n=5)
# Subset the DataFrame for type "Movie"
# Keep only the rows whose "type" column equals "Movie"
netflix_df_movies_only = netflix_df[netflix_df['type'] == 'Movie']
netflix_df_movies_only.head()
# Narrow the movies down to the columns of interest
netflix_movies_col_subset = netflix_df_movies_only[
    ['title', 'country', 'genre', 'release_year', 'duration']
]
# Preview the first five rows of the new DataFrame
netflix_movies_col_subset.head()
# Create a figure and increase the figure size
fig = plt.figure(figsize=(12, 8))
# Scatter each movie's duration (y-axis) against its release year (x-axis)
plt.scatter(
    x=netflix_movies_col_subset['release_year'],
    y=netflix_movies_col_subset['duration'],
)
# Give the plot a title
plt.title("Netflix durations vs years")
# Render the figure
plt.show()
# Filter for durations shorter than 60 minutes
# Keep only the movies whose duration is below 60
short_movies = netflix_movies_col_subset[netflix_movies_col_subset['duration'] < 60]
# Show the first 20 rows of short_movies
print(short_movies.head(20))
# Assign a color to each movie based on its genre
# Genres that get their own marker color; every other genre is drawn black.
genre_colors = {
    'Children': 'red',
    'Documentaries': 'blue',
    'Stand-Up': 'green',
}
# Build one color per movie, in row order, by looking up the "genre" column
# directly. This avoids iterrows(), which constructs a Series for every row
# (and yields an index label that was never used) just to read one field.
colors = [
    genre_colors.get(genre, 'black')
    for genre in netflix_movies_col_subset['genre']
]
# Inspect the first 10 values in your list
print(colors[:10])
# Set the figure style and initialize a new figure
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(12,8))
# Create a scatter plot of duration versus release_year, one color per movie
plt.scatter(netflix_movies_col_subset['release_year'],netflix_movies_col_subset['duration'],color=colors)
# Create a title and axis labels
plt.xlabel('Release year')
# The y-axis carries the duration column, not the release year
plt.ylabel('Duration (min)')
plt.title('Movie duration by year of release')
# Show the plot
plt.show()
# Are we certain that movies are getting shorter?
# Recorded answer to the question above, stored as free text
are_movies_getting_shorter = 'we are not sure'