Skip to content
Netflix Top 10
Duration to Netflix Rank!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Read the global Top 10 table from "netflix_top10.csv".
netflix_top10 = pd.read_csv("netflix_top10.csv")

# The two series under comparison, plus a derived third coordinate:
# z = |runtime * weekly_rank|, used both as the z-axis and as marker colour.
x = netflix_top10['runtime']
y = netflix_top10['weekly_rank']
z = np.abs(x * y)

# Correlation between runtime and weekly rank (pandas' default method),
# computed on the same two series that are plotted below.
correlation_coefficient = x.corr(y)

# One figure holding a single 3D axes.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Colour map applied to z.
colormap = plt.cm.viridis

# Semi-transparent markers, coloured by z.
sc = ax.scatter(x, y, z, c=z, cmap=colormap, alpha=0.7, s=30)

# Title carries the coefficient rounded to two decimals.
plot_title = f'Correlation Visualization\nCorrelation Coefficient: {correlation_coefficient:.2f}'
ax.set_title(plot_title)
ax.set_xlabel('Runtime (minutes)')
ax.set_ylabel('Weekly Rank')
ax.set_zlabel('Metric')

# Colour bar documenting the z -> colour mapping.
cbar = plt.colorbar(sc)
cbar.set_label('Metric Value')

plt.show()
import pandas as pd
# Load the per-country Top 10 table into a pandas DataFrame
df = pd.read_csv('netflix_top10_country.csv')
# Order rows by 'cumulative_weeks_in_top_10', largest values first
df = df.sort_values(by='cumulative_weeks_in_top_10', ascending=False)
# Statistics over the whole table (not only the three leading rows)
mean_weeks = df['cumulative_weeks_in_top_10'].mean()
median_weeks = df['cumulative_weeks_in_top_10'].median()
# Keep the three leading rows as an independent copy: head() returns a slice
# of df, and assigning new columns to that slice raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
top_3 = df.head(3).copy()
# Attach the table-wide statistics as constant columns
top_3['mean_weeks'] = mean_weeks
top_3['median_weeks'] = median_weeks
# Print the resulting DataFrame with the new columns
print(top_3)
import pandas as pd
# Read 'netflix_top10.csv' into a DataFrame.
df = pd.read_csv('netflix_top10.csv')

# describe() builds the table of summary statistics; keep it in a variable
# so it stays available after being printed.
summary_stats = df.describe()
print(summary_stats)
import pandas as pd
# Load both Top 10 tables. For the country file, the first CSV column is
# consumed as the index while reading.
global_top_10 = pd.read_csv("netflix_top10.csv")
countries_top_10 = pd.read_csv("netflix_top10_country.csv", index_col=0)

# Give each frame a fresh positional index (the previous index is dropped),
# then place the two sets of columns side by side. Rows are paired purely by
# position, not by any shared key.
# NOTE(review): because of index_col=0 + reset_index(drop=True), the first
# column of the country CSV does not reach the result; and axis=1 pairs rows
# that are presumably unrelated. If stacking rows was intended, this needs
# axis=0 instead - confirm against the intended analysis.
_global_rows = global_top_10.reset_index(drop=True)
_country_rows = countries_top_10.reset_index(drop=True)
combined_top_10 = pd.concat([_global_rows, _country_rows], axis=1)
# combined_top_10 is now available for further exploration.
Current Type: Bar
Current X-axis: weekly_hours_viewed
Current Y-axis: category
Current Color: None
Hours Viewed by Category
import pandas as pd
# Load the CSV data
netflix_top10 = pd.read_csv("netflix_top10.csv")
# Select the relevant columns as an independent copy: adding a column to a
# plain column-subset of netflix_top10 raises SettingWithCopyWarning on
# pandas versions without copy-on-write.
data = netflix_top10[['show_title', 'weekly_rank']].copy()
# Title length in characters. .str.len() yields NaN for a missing title,
# whereas applying the builtin len() would raise TypeError on NaN.
data['title_length'] = data['show_title'].str.len()
# Correlation coefficient between title length and weekly rank
# (rows with a NaN in either column are ignored by Series.corr)
correlation_coefficient = data['title_length'].corr(data['weekly_rank'])
# Print the correlation coefficient
print(f"Correlation Coefficient: {correlation_coefficient}")
# Re-bind the name to the runtime vs. weekly-rank correlation. This happens
# after the print above, so the printed value is the title-length one.
correlation_coefficient = netflix_top10['runtime'].corr(netflix_top10['weekly_rank'])
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read 'netflix_top10.csv' into a DataFrame.
df = pd.read_csv('netflix_top10.csv')

# Columns that go into the pair plot.
numeric_columns = ['runtime', 'weekly_rank']

# Restrict the frame to those columns, draw the grid of pairwise plots,
# and display it.
_pairplot_input = df[numeric_columns]
sns.pairplot(_pairplot_input)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Read 'netflix_top10.csv' into a DataFrame.
df = pd.read_csv('netflix_top10.csv')

# Figure 1: histogram of the 'runtime' column, 20 bins, black bar outlines.
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
hist_ax.hist(df['runtime'], bins=20, edgecolor='k')
hist_ax.set_title('Histogram of Runtime')
hist_ax.set_xlabel('Runtime (minutes)')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(True)
plt.show()

# Figure 2: 'runtime' against 'weekly_rank', half-transparent markers so
# overlapping points remain distinguishable.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
scatter_ax.scatter(df['runtime'], df['weekly_rank'], alpha=0.5)
scatter_ax.set_title('Scatter Plot of Runtime vs. Weekly Rank')
scatter_ax.set_xlabel('Runtime (minutes)')
scatter_ax.set_ylabel('Weekly Rank')
scatter_ax.grid(True)
plt.show()
DataFrame as
df1
variable
-- Select every column of every row from 'netflix_top10_country.csv'.
-- NOTE(review): presumably the notebook's SQL engine reads the CSV directly
-- from the quoted path - confirm which engine runs this cell.
SELECT * FROM 'netflix_top10_country.csv'
DataFrame as
df2
variable
-- Average of runtime per category: one output row for each distinct
-- category value, with the mean exposed as average_runtime.
-- NOTE(review): netflix_top10 is presumably the DataFrame/table loaded from
-- netflix_top10.csv elsewhere in the notebook - confirm.
SELECT category, AVG(runtime) AS average_runtime
FROM netflix_top10
GROUP BY category;