Skip to content
1 hidden cell
2 hidden cells
Project: Analyzing YouTube Channel Engagement
pip install --upgrade google-api-python-client
Hidden output
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from googleapiclient.discovery import build
from IPython.display import JSON
# Read the API key from the environment so it never appears in the notebook.
youtube_key = os.getenv("YOUTUBE_API")
if not youtube_key:
    # Fail early with a clear message instead of an opaque API error later.
    raise RuntimeError("The YOUTUBE_API environment variable is not set.")

# Channel IDs to analyse; they are joined with commas for the API request.
channel = ['UCth3HsaePBfeeB6KKBp6fwA']

youtube = build('youtube', 'v3', developerKey=youtube_key)

# Fetch the raw channel resource once and print it to inspect its structure.
request = youtube.channels().list(
    part="snippet,contentDetails,statistics",
    id=','.join(channel),
)
response = request.execute()
print(response)
Hidden output
def channel_stats(youtube, channel):
    """Look up the uploads playlist ID of the first channel in the response.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel: Iterable of channel ID strings; joined with commas for the
            request.

    Returns:
        The ``uploads`` playlist ID of the first returned channel, or ``None``
        when the response has no ``items``.
    """
    response = (
        youtube.channels()
        .list(part="snippet,contentDetails,statistics", id=','.join(channel))
        .execute()
    )

    # getting the playlist ids
    items = response['items'] if 'items' in response else []
    if len(items) > 0:
        playlist_id = items[0]['contentDetails']['relatedPlaylists']['uploads']
    else:
        playlist_id = None

    print(f"The playlist ID is: {playlist_id}")
    return playlist_id
#getting video ids
def get_video_ids(youtube, playlist_id):
    """Return the IDs of all videos in a playlist, following pagination.

    The playlist is read 50 items at a time; as long as a response carries a
    ``nextPageToken`` the next page is requested, so playlists longer than one
    page are no longer cut off after the first 50 videos.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_id: ID of the playlist to read.

    Returns:
        A list of video ID strings in the order the API returned them.
    """
    video_ids = []
    page_token = None  # None requests the first page
    while True:
        response = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=page_token,
        ).execute()
        video_ids.extend(
            item['contentDetails']['videoId']
            for item in response.get('items', [])
        )
        page_token = response.get('nextPageToken')
        if not page_token:
            return video_ids
# Resolve the channel's uploads playlist, then list the videos it contains.
playlist_id = channel_stats(youtube, channel)
if playlist_id is None:
    # channel_stats returns None when the API response has no items; stop here
    # rather than sending a playlist request without a playlist ID.
    raise ValueError(f"No uploads playlist found for channel(s): {channel}")
video_ids = get_video_ids(youtube, playlist_id)
def get_video_details(youtube, video_ids):
    """Fetch snippet, statistics and content details for the given videos.

    IDs are requested in batches of 50 (the batch size the original code used
    for its single request), so lists longer than 50 are fully covered instead
    of being silently truncated.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: List of video ID strings.

    Returns:
        A DataFrame with one row per returned video and the columns
        ``video_id`` plus every field named in ``basic_stats``. A field that
        is absent from the response is stored as ``None``. The columns are
        present even when no videos are returned.

    Raises:
        ValueError: If ``video_ids`` is ``None``.
    """
    if video_ids is None:
        raise ValueError("video_ids is None. Please provide a valid list of video IDs.")

    # Response section -> fields copied from that section into each row.
    basic_stats = {
        'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
        'statistics': ['viewCount', 'likeCount', 'favoriteCount', 'commentCount'],
        'contentDetails': ['duration', 'definition', 'caption'],
    }
    columns = ['video_id'] + [
        field for fields in basic_stats.values() for field in fields
    ]
    batch_size = 50

    all_video_info = []
    for start in range(0, len(video_ids), batch_size):
        response = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[start:start + batch_size]),
        ).execute()
        for video in response.get('items', []):
            video_info = {'video_id': video['id']}
            for section, fields in basic_stats.items():
                section_data = video.get(section, {})
                for field in fields:
                    # Missing sections/fields (e.g. disabled comments) become None.
                    video_info[field] = section_data.get(field)
            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info, columns=columns)
# Build the per-video table and report how many videos were retrieved.
video_df = get_video_details(youtube, video_ids)
video_count = len(video_df)
print(video_count)
video_df  # bare expression: shown as the cell output when run in a notebook
def engagement_rate_analysis(video_df):
    """Rank videos by engagement rate: (likes + comments) / views.

    The count columns are converted to numbers here, so the function also
    works when they still hold the string values stored by
    ``get_video_details``. Values that cannot be parsed become NaN.

    Args:
        video_df: DataFrame with ``title``, ``likeCount``, ``commentCount``
            and ``viewCount`` columns. An ``engagement_rate`` column is added
            to it in place.

    Returns:
        A DataFrame with ``title`` and ``engagement_rate``, sorted from the
        highest rate to the lowest. Videos whose rate is undefined (zero views
        or a missing count) have NaN and sort last.
    """
    likes = pd.to_numeric(video_df['likeCount'], errors='coerce')
    comments = pd.to_numeric(video_df['commentCount'], errors='coerce')
    views = pd.to_numeric(video_df['viewCount'], errors='coerce')
    # Zero views would divide by zero; mask them so the rate becomes NaN.
    views = views.where(views != 0)

    video_df['engagement_rate'] = (likes + comments) / views
    return video_df[['title', 'engagement_rate']].sort_values(
        by='engagement_rate', ascending=False
    )
# Rank every video by engagement rate (highest first).
top_engagement_rates = engagement_rate_analysis(video_df)
# Bare expression: rendered as the cell's output when run in a notebook.
top_engagement_rates
1 hidden cell
# Count columns filled from the API response; convert them to numbers so they
# can be used in arithmetic and sorting. Unparseable values become NaN.
numeric_cols = ['viewCount', 'likeCount', 'favoriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors='coerce')
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Corpora/models needed below: the stop-word list and the tokenizer data used
# by word_tokenize. Recent NLTK releases load the tokenizer from 'punkt_tab',
# older ones from 'punkt', so both are fetched.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
def extract_keywords(comments_df, top_n=20):
    """Return the most frequent non-stop-word tokens across all comments.

    Args:
        comments_df: DataFrame with a ``text`` column holding comment bodies.
        top_n: Number of keywords to return (default 20).

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    stop_words = set(stopwords.words('english'))
    # Drop missing comment bodies and force str: str.join raises on NaN.
    all_words = ' '.join(comments_df['text'].dropna().astype(str)).lower()
    word_tokens = word_tokenize(all_words)
    # Keep alphanumeric tokens only (drops punctuation) and skip stop words.
    word_counts = Counter(
        w for w in word_tokens if w.isalnum() and w not in stop_words
    )
    return word_counts.most_common(top_n)
# Rank the most frequent comment keywords, then split the (word, count) pairs
# into two parallel sequences for plotting.
common_keywords = extract_keywords(comments_df)
keywords, counts = zip(*common_keywords)

# One distinct colour per bar.
palette = sns.color_palette("husl", len(keywords))

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=list(keywords), y=list(counts), palette=palette)
ax.set_title('Top 20 Most Common Keywords in YouTube Comments')
ax.set_xlabel('Keywords')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
def sam_vs_shaan(comments_df, names=('sam', 'shaan')):
    """Count how often each given name appears as a token in the comments.

    Args:
        comments_df: DataFrame with a ``text`` column holding comment bodies.
        names: Names to count. Matching is case-insensitive because the
            comment text is lower-cased before tokenizing. Defaults to
            ``('sam', 'shaan')``.

    Returns:
        A dict mapping each name to its number of occurrences (0 if absent).
    """
    # Drop missing comment bodies and force str: str.join raises on NaN.
    all_words = ' '.join(comments_df['text'].dropna().astype(str)).lower()
    word_tokens = word_tokenize(all_words)
    names_counts = Counter(w for w in word_tokens if w.isalnum())
    # Counter returns 0 for tokens that never occurred.
    return {name: names_counts[name.lower()] for name in names}
# Count mentions of each name and print the raw numbers.
sam_vs_shaan_counts = sam_vs_shaan(comments_df)
print(sam_vs_shaan_counts)

# Split the mapping into parallel name/count sequences for plotting.
names, counts = zip(*sam_vs_shaan_counts.items())

# One distinct colour per bar.
palette = sns.color_palette("husl", len(names))

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=list(names), y=list(counts), palette=palette)
ax.set_title('Sam vs Shaan')
ax.set_xlabel('Names')
ax.set_ylabel('Frequency')
plt.show()
def top_commenters(comments_df, top_n=10):
    """Return the most frequent comment authors with their comment counts.

    Args:
        comments_df: DataFrame with an ``author`` column.
        top_n: Maximum number of authors to return (default 10).

    Returns:
        A DataFrame with columns ``author`` and ``comment_count``, ordered
        from the most to the least active author.
    """
    return (
        comments_df['author']
        .value_counts()
        .head(top_n)
        .rename_axis('author')
        .reset_index(name='comment_count')
    )
# Ten most active commenters (10 is top_commenters' default for top_n).
top_commenters_df = top_commenters(comments_df, top_n=10)
print(top_commenters_df)
2 hidden cells
def content_performance_analysis(video_df):
    """Score each video by its views, likes and comments relative to the best video.

    Each metric is divided by the column maximum and the three scores are
    summed into ``performance_score``. Counts are parsed with
    ``pd.to_numeric`` so string or missing values do not raise. A metric that
    is missing for a video, or whose column maximum is zero or missing,
    contributes 0 instead of turning the whole score into NaN.

    Args:
        video_df: DataFrame with ``title``, ``viewCount``, ``likeCount`` and
            ``commentCount`` columns. It is modified in place: the count
            columns are converted to float, and ``view_score``,
            ``like_score``, ``comment_score`` and ``performance_score`` are
            added.

    Returns:
        A DataFrame with ``title``, the three counts and
        ``performance_score``, sorted from the highest score to the lowest.
    """
    # Count column -> name of the normalised score column derived from it.
    score_columns = {
        'viewCount': 'view_score',
        'likeCount': 'like_score',
        'commentCount': 'comment_score',
    }
    for count_col, score_col in score_columns.items():
        video_df[count_col] = pd.to_numeric(
            video_df[count_col], errors='coerce'
        ).astype(float)
        peak = video_df[count_col].max()
        if pd.notna(peak) and peak > 0:
            video_df[score_col] = (video_df[count_col] / peak).fillna(0.0)
        else:
            # No usable maximum (empty, all missing, or all zero): avoid 0/0.
            video_df[score_col] = 0.0

    video_df['performance_score'] = video_df[list(score_columns.values())].sum(axis=1)
    performance_metrics = video_df[
        ['title', 'viewCount', 'likeCount', 'commentCount', 'performance_score']
    ]
    return performance_metrics.sort_values(by='performance_score', ascending=False)
# Score all videos; the ranked table is kept for later inspection.
top_performers = content_performance_analysis(video_df)

# Bar chart of the nine most-viewed videos, with vertical title labels.
most_viewed = video_df.sort_values('viewCount', ascending=False).iloc[:9]
ax = sns.barplot(data=most_viewed, x='title', y='viewCount')
# Assigning the result keeps the list of label objects out of the cell output.
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)