Skip to content
# EDA of Sabine Hossenfelder's dataset (Part 2)
# Initial EDA of the data I cleaned in Part 1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session (keeps notebook output
# uncluttered). NOTE(review): this is a blanket filter, so deprecation
# notices from pandas/seaborn are hidden too.
warnings.filterwarnings('ignore')
# Load the per-video data from the CSV file.
content = pd.read_csv('SH_video_data.csv')
content.head()  # notebook preview; the value is discarded when run as a script

# Drop the 'Unnamed: 0' column (presumably the index written out by an
# earlier to_csv call -- TODO confirm).
content.drop(columns=['Unnamed: 0'], inplace=True)
content.describe()  # notebook preview of summary statistics

# Inspect rows whose duration is exactly 0 (result is only displayed).
content[content['duration'] == 0]

# Order the rows by release date, ascending.
timeseries_content = content.sort_values(['release_date'], ascending=True)
timeseries_content.head()  # notebook preview

# Sum of views per release date, as a one-column DataFrame indexed by
# release_date.
views = pd.DataFrame(timeseries_content.groupby('release_date')['views'].sum())
views.head()  # notebook preview
# TODO: fix the dates earlier than 2019-12-05 later
# Scatter plot of views against release date on a wide, short canvas;
# marker colour encodes the video duration.
plt.figure(figsize=(14, 4))
ax = sns.scatterplot(
    data=content,
    x='release_date',
    y='views',
    hue='duration',
    palette='plasma',
    alpha=0.7,
)
plt.show()
# TODO: fix the dates earlier than 2019-12-05 later
# Same scatter plot, but with a logarithmic y-axis; duration now drives
# both the marker colour and the marker size.
plt.figure(figsize=(14, 4))
plt.yscale("log")  # set on the current axes before the points are drawn
ax = sns.scatterplot(
    data=content,
    x='release_date',
    y='views',
    size='duration',
    hue='duration',
    palette='plasma',
    alpha=0.7,
)
plt.show()