Netflix Top 10 — DataLab

Runtime vs. Netflix Weekly Rank

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Read the global weekly Top 10 table.
netflix_top10 = pd.read_csv("netflix_top10.csv")

# The two plotted variables, plus a derived third axis: |runtime * weekly_rank|.
x = netflix_top10['runtime']
y = netflix_top10['weekly_rank']
z = (x * y).abs()  # drives both the point height and the point colour

# Correlation between runtime and weekly rank (Series.corr, default method).
correlation_coefficient = x.corr(y)

# One figure holding a single 3D axes.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# Viridis maps the combined metric onto point colour.
colormap = plt.cm.viridis
sc = ax.scatter(x, y, z, c=z, cmap=colormap, alpha=0.7, s=30)

# Label all three axes; the title reports the coefficient to two decimals.
ax.set(
    xlabel='Runtime (minutes)',
    ylabel='Weekly Rank',
    zlabel='Metric',
    title=f'Correlation Visualization\nCorrelation Coefficient: {correlation_coefficient:.2f}',
)

# Colour scale legend for the metric values.
cbar = plt.colorbar(sc)
cbar.set_label('Metric Value')

plt.show()

import pandas as pd

# Load the per-country Top 10 table.
df = pd.read_csv('netflix_top10_country.csv')

# Order rows so the entries with the most cumulative weeks come first.
df = df.sort_values(by='cumulative_weeks_in_top_10', ascending=False)

# Reference statistics computed over the whole table, not just the top 3.
mean_weeks = df['cumulative_weeks_in_top_10'].mean()
median_weeks = df['cumulative_weeks_in_top_10'].median()

# Take the first three rows and attach the table-wide statistics as constant
# columns. assign() returns a new DataFrame, so nothing is written onto a
# slice of df; assigning columns directly on df.head(3) raises
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
top_3 = df.head(3).assign(mean_weeks=mean_weeks, median_weeks=median_weeks)

# Show the three rows together with the two added columns.
print(top_3)

import pandas as pd

# Read the global Top 10 table from 'netflix_top10.csv'.
df = pd.read_csv('netflix_top10.csv')

# describe() produces the summary table; keep it in a variable for later use
# and echo it to the output.
summary_stats = df.describe()
print(summary_stats)

import pandas as pd

# Load both tables; the country file uses its first column as the index.
global_top_10 = pd.read_csv("netflix_top10.csv")
countries_top_10 = pd.read_csv("netflix_top10_country.csv", index_col=0)

# Place the two tables side by side (axis=1). Both indexes are reset to a
# fresh 0..n-1 range first, so rows are paired purely by position.
# NOTE(review): positional pairing does not match rows on week or title, and
# any column names shared by both files will appear twice -- confirm this is
# the intended way to combine them.
combined_top_10 = pd.concat(
    [frame.reset_index(drop=True) for frame in (global_top_10, countries_top_10)],
    axis=1,
)

# combined_top_10 is now available for further exploration.

DataFrame

Current Type: Bar

Type

Current X-axis: weekly_hours_viewed

X-axis

Current Y-axis: category

Y-axis

Current Color: None

Color

Hours Viewed by Category

import pandas as pd

# Load the global Top 10 table.
netflix_top10 = pd.read_csv("netflix_top10.csv")

# Work on an explicit copy of the two relevant columns so that adding a
# column below modifies an independent frame instead of a slice of
# netflix_top10 (which raises SettingWithCopyWarning on pandas versions
# without Copy-on-Write).
data = netflix_top10[['show_title', 'weekly_rank']].copy()

# Character count of each title. .str.len() yields NaN for a missing title,
# whereas apply(len) would raise TypeError on it; corr() ignores NaN pairs.
data['title_length'] = data['show_title'].str.len()

# Correlation coefficient between title length and weekly rank.
correlation_coefficient = data['title_length'].corr(data['weekly_rank'])

# Print the correlation coefficient
print(f"Correlation Coefficient: {correlation_coefficient}")

# NOTE: this rebinds correlation_coefficient to the runtime/rank correlation,
# replacing the title-length value printed above.
correlation_coefficient = netflix_top10['runtime'].corr(netflix_top10['weekly_rank'])

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the global Top 10 table from 'netflix_top10.csv'.
df = pd.read_csv('netflix_top10.csv')

# Columns to include in the pair plot.
numeric_columns = ['runtime', 'weekly_rank']

# Draw the grid of pairwise plots for just those columns, then display it.
sns.pairplot(data=df.loc[:, numeric_columns])
plt.show()

import pandas as pd
import matplotlib.pyplot as plt

# Read the global Top 10 table from 'netflix_top10.csv'.
df = pd.read_csv('netflix_top10.csv')

# Figure 1: distribution of 'runtime' in 20 bins with black bar edges.
plt.figure(figsize=(8, 5))
plt.hist(df['runtime'], bins=20, edgecolor='k')
plt.gca().set(
    title='Histogram of Runtime',
    xlabel='Runtime (minutes)',
    ylabel='Frequency',
)
plt.grid(True)
plt.show()

# Figure 2: 'runtime' against 'weekly_rank', half-transparent so that
# overlapping points remain visible.
plt.figure(figsize=(8, 5))
plt.scatter(df['runtime'], df['weekly_rank'], alpha=0.5)
plt.gca().set(
    title='Scatter Plot of Runtime vs. Weekly Rank',
    xlabel='Runtime (minutes)',
    ylabel='Weekly Rank',
)
plt.grid(True)
plt.show()

DataFrame as

df1

variable

-- Return every row and column of the country-level CSV, referenced directly
-- by its quoted file name.
-- NOTE(review): querying a quoted file path as a table presumably relies on
-- the notebook's SQL engine supporting it -- confirm.
SELECT * FROM 'netflix_top10_country.csv'

DataFrame as

df2

variable

-- Average runtime per category: one output row per distinct category, with
-- the mean exposed as average_runtime.
-- NOTE(review): netflix_top10 is presumably the DataFrame loaded in the
-- Python cells -- confirm the SQL engine can resolve it as a table.
SELECT category, AVG(runtime) AS average_runtime
FROM netflix_top10
GROUP BY category;

‌
‌
‌