Skip to content

# Initial exploratory data analysis (EDA) of the data I cleaned in Part 1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the video data set and take a first look at it.
content = pd.read_csv('SH_video_data.csv')
content.head()
# Discard the 'Unnamed: 0' column (presumably the index saved with the CSV
# in Part 1 -- TODO confirm).
content = content.drop(columns=['Unnamed: 0'])
content.describe()
# Rows whose duration is exactly zero.
content.loc[content['duration'].eq(0)]
# Same rows ordered ascending by release_date.
timeseries_content = content.sort_values('release_date')
timeseries_content.head()
# Sum of views for each release_date, kept as a one-column DataFrame.
views = timeseries_content.groupby('release_date')['views'].sum().to_frame()
views.head()
# TODO: fix the dates earlier than 2019-12-05 later
# Scatter of views against release_date on a linear y-axis; colour encodes duration.
# NOTE(review): release_date is read from CSV without date parsing, so it is
# presumably plotted as strings -- confirm whether a datetime axis is wanted.
ax = plt.figure(figsize=(14, 4)).add_subplot()
ax = sns.scatterplot(
    data=content,
    x='release_date',
    y='views',
    hue='duration',
    palette='plasma',
    alpha=0.7,
    ax=ax,
)

plt.show()
# TODO: fix the dates earlier than 2019-12-05 later
# Same scatter as above but with a logarithmic y-axis; duration drives both
# the colour and the marker size.
ax = plt.figure(figsize=(14, 4)).add_subplot()
# Switch the y-axis to log scale before drawing, as the original ordering does.
ax.set_yscale("log")
ax = sns.scatterplot(
    data=content,
    x='release_date',
    y='views',
    size='duration',
    hue='duration',
    palette='plasma',
    alpha=0.7,
    ax=ax,
)

plt.show()