Project: Analyzing YouTube Channel Engagement

pip install --upgrade google-api-python-client

Hidden output

import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from googleapiclient.discovery import build
from IPython.display import JSON
# The API key is read from the YOUTUBE_API environment variable; os.getenv
# returns None when that variable is not set.
youtube_key = os.getenv("YOUTUBE_API")

# Channel IDs to query, kept as a list so they can be comma-joined below.
channel = ['UCth3HsaePBfeeB6KKBp6fwA']
# Client for version 'v3' of the 'youtube' service, using the key above.
youtube = build('youtube', 'v3', developerKey=youtube_key)
# Ask for the snippet, contentDetails and statistics parts of the listed channels.
request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel)
    )
response = request.execute()
# Dump the raw response so its structure can be inspected.
print(response)

Hidden output

def channel_stats(youtube, channel):
    """Look up the uploads playlist ID for a channel.

    Args:
        youtube: API client exposing ``channels().list(...)``.
        channel: A single channel ID string, or an iterable of channel ID
            strings that is comma-joined into one request.

    Returns:
        The ``uploads`` playlist ID of the first channel in the response, or
        ``None`` when the response contains no items.
    """
    # A bare string would otherwise be joined character by character
    # ("U,C,..."), which is never a valid ID filter.
    if isinstance(channel, str):
        channel = [channel]

    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel)
    )
    response = request.execute()

    # Only the first returned channel is inspected, even when several IDs
    # were requested.
    items = response.get('items') or []
    playlist_id = None
    if items:
        playlist_id = items[0]['contentDetails']['relatedPlaylists']['uploads']

    print(f"The playlist ID is: {playlist_id}")
    return playlist_id

#getting video ids
def get_video_ids(youtube, playlist_id, max_videos=None):
    """Collect the video IDs contained in a playlist, following pagination.

    The playlist is read 50 items per request, and further pages are fetched
    for as long as the response carries a ``nextPageToken``.

    Args:
        youtube: API client exposing ``playlistItems().list(...)``.
        playlist_id: ID of the playlist to read.
        max_videos: Optional cap on the number of IDs returned. ``None``
            (the default) returns every video in the playlist.

    Returns:
        A list of video ID strings in the order the API returned them.

    Raises:
        ValueError: If ``playlist_id`` is ``None``.
    """
    if playlist_id is None:
        raise ValueError("playlist_id is None. Please provide a valid playlist ID.")

    video_ids = []
    page_token = None

    while True:
        params = {
            'part': "snippet,contentDetails",
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        # The first request carries no token; later ones continue from the
        # token returned by the previous page.
        if page_token is not None:
            params['pageToken'] = page_token

        response = youtube.playlistItems().list(**params).execute()
        video_ids.extend(
            item['contentDetails']['videoId']
            for item in response.get('items', [])
        )

        # Stop early once the caller's cap is reached to avoid extra requests.
        if max_videos is not None and len(video_ids) >= max_videos:
            return video_ids[:max_videos]

        page_token = response.get('nextPageToken')
        if page_token is None:
            return video_ids

# Resolve the channel's uploads playlist ID, then list the video IDs in it.
playlist_id = channel_stats(youtube, channel)
video_ids = get_video_ids(youtube, playlist_id)

def get_video_details(youtube, video_ids):
    """Fetch snippet, statistics and content details for videos as a DataFrame.

    IDs are sent 50 per request, so every ID in ``video_ids`` is looked up
    rather than only the first 50.

    Args:
        youtube: API client exposing ``videos().list(...)``.
        video_ids: List of video ID strings.

    Returns:
        A DataFrame with one row per returned video and the columns
        ``video_id``, ``channelTitle``, ``title``, ``description``, ``tags``,
        ``publishedAt``, ``viewCount``, ``likeCount``, ``favoriteCount``,
        ``commentCount``, ``duration``, ``definition`` and ``caption``.
        Fields absent from a video's response are stored as ``None``. The
        columns are present even when no videos are returned.

    Raises:
        ValueError: If ``video_ids`` is ``None``.
    """
    if video_ids is None:
        raise ValueError("video_ids is None. Please provide a valid list of video IDs.")

    # Response part -> fields copied from that part into each row.
    basic_stats = {
        'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
        'statistics': ['viewCount', 'likeCount', 'favoriteCount', 'commentCount'],
        'contentDetails': ['duration', 'definition', 'caption'],
    }
    columns = ['video_id'] + [
        field for fields in basic_stats.values() for field in fields
    ]
    batch_size = 50

    all_video_info = []
    for start in range(0, len(video_ids), batch_size):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[start:start + batch_size])
        )
        response = request.execute()

        for video in response.get('items', []):
            video_info = {'video_id': video['id']}
            for part, fields in basic_stats.items():
                part_data = video.get(part, {})
                for field in fields:
                    # A missing part or field yields None instead of raising.
                    video_info[field] = part_data.get(field)
            all_video_info.append(video_info)

    # Explicit columns keep the schema stable when no rows were collected.
    return pd.DataFrame(all_video_info, columns=columns)

# Build the per-video table and print how many rows it holds.
video_df = get_video_details(youtube, video_ids)     
video_count = len(video_df)
print(video_count)

# Bare expression; presumably rendered as the notebook cell's output.
video_df

Hidden output

def engagement_rate_analysis(video_df):
    """Rank videos by (likes + comments) / views.

    The counts are coerced to numbers first, so string-valued columns are
    handled; entries that cannot be parsed become NaN. Videos whose view
    count is zero or missing get a NaN rate rather than ``inf``, which keeps
    them from sorting to the top.

    Adds an ``engagement_rate`` column to ``video_df`` in place; the count
    columns themselves are left untouched.

    Args:
        video_df: DataFrame with ``title``, ``likeCount``, ``commentCount``
            and ``viewCount`` columns.

    Returns:
        A DataFrame of ``title`` and ``engagement_rate`` sorted by rate in
        descending order (NaN rates last).
    """
    likes = pd.to_numeric(video_df['likeCount'], errors='coerce')
    comments = pd.to_numeric(video_df['commentCount'], errors='coerce')
    views = pd.to_numeric(video_df['viewCount'], errors='coerce')
    # Mask non-positive view counts so the division cannot produce inf.
    views = views.where(views > 0)

    video_df['engagement_rate'] = (likes + comments) / views
    return video_df[['title', 'engagement_rate']].sort_values(by='engagement_rate', ascending=False)

# Rank videos by engagement rate; the bare name below is presumably shown as
# the notebook cell's output.
top_engagement_rates = engagement_rate_analysis(video_df)
top_engagement_rates

1 hidden cell


# Convert the count columns to numbers one column at a time (apply's default
# axis); values that cannot be parsed become NaN.
numeric_cols = ['viewCount', 'likeCount', 'favoriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

Hidden output

import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources used by the keyword analysis: the stopword lists and the Punkt
# tokenizer models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt models from the 'punkt_tab' package
# instead of 'punkt'; fetching both keeps word_tokenize working either way.
nltk.download('punkt_tab')

def extract_keywords(comments_df, top_n=20):
    """Return the most frequent non-stopword tokens across all comments.

    All comment texts are joined, lower-cased and tokenized; tokens that are
    not purely alphanumeric or that are English stopwords are discarded.

    Args:
        comments_df: DataFrame with a ``text`` column of comment bodies.
        top_n: Number of ``(word, count)`` pairs to return. Defaults to 20.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    stop_words = set(stopwords.words('english'))
    # Skip missing texts and cast the rest to str so that str.join cannot
    # fail on a NaN or other non-string entry.
    texts = comments_df['text'].dropna().astype(str)
    word_tokens = word_tokenize(' '.join(texts).lower())
    word_counts = Counter(
        w for w in word_tokens if w.isalnum() and w not in stop_words
    )

    return word_counts.most_common(top_n)

# (word, count) pairs for the most common keywords. comments_df is not defined
# in the visible cells - presumably it is created in a hidden cell.
common_keywords = extract_keywords(comments_df)

# Split the pairs into two parallel tuples: the words and their counts.
keywords, counts = zip(*common_keywords)

# Bar chart of keyword frequencies, with one husl palette color per keyword.
plt.figure(figsize=(12, 6))
palette = sns.color_palette("husl", len(keywords))
sns.barplot(x=list(keywords), y=list(counts), palette=palette)
plt.title('Top 20 Most Common Keywords in YouTube Comments')
plt.xlabel('Keywords')
plt.ylabel('Frequency')
# Tilt the x tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.show()

def sam_vs_shaan(comments_df, names=('sam', 'shaan')):
    """Count how often each name occurs as a token in the comments.

    All comment texts are joined, lower-cased and tokenized; only purely
    alphanumeric tokens are counted.

    Args:
        comments_df: DataFrame with a ``text`` column of comment bodies.
        names: Names to count. Matching is case-insensitive because the text
            is lower-cased. Defaults to ``('sam', 'shaan')``.

    Returns:
        A dict mapping each name to its number of occurrences (0 if absent).
    """
    # Skip missing texts and cast the rest to str so that str.join cannot
    # fail on a NaN or other non-string entry.
    texts = comments_df['text'].dropna().astype(str)
    word_tokens = word_tokenize(' '.join(texts).lower())
    token_counts = Counter(w for w in word_tokens if w.isalnum())

    # Counter yields 0 for tokens that never occurred.
    return {name: token_counts[name.lower()] for name in names}

# Mention counts per name, printed as a dict.
sam_vs_shaan_counts = sam_vs_shaan(comments_df)
print(sam_vs_shaan_counts)

# Split the dict into parallel tuples of names and counts for plotting.
names, counts = zip(*sam_vs_shaan_counts.items())

# Bar chart comparing the counts, with one husl palette color per name.
plt.figure(figsize=(12, 6))
palette = sns.color_palette("husl", len(names))
sns.barplot(x=list(names), y=list(counts), palette=palette)
plt.title('Sam vs Shaan')
plt.xlabel('Names')
plt.ylabel('Frequency')
plt.show()

def top_commenters(comments_df, top_n=10):
    """Return the authors with the most comments.

    Args:
        comments_df: DataFrame with an ``author`` column.
        top_n: Maximum number of authors to return. Defaults to 10.

    Returns:
        A DataFrame with columns ``author`` and ``comment_count``, ordered
        from the most to the least active commenter.
    """
    return (
        comments_df['author']
        .value_counts()
        .head(top_n)
        .rename_axis('author')
        .reset_index(name='comment_count')
    )

# Most active commenters (the function's default top_n of 10), printed as a table.
top_commenters_df = top_commenters(comments_df)
print(top_commenters_df)

2 hidden cells

def content_performance_analysis(video_df):
    """Rank videos by a combined, max-normalized views/likes/comments score.

    Each of ``viewCount``, ``likeCount`` and ``commentCount`` is cast to float
    and divided by its column maximum; the three ratios are summed into
    ``performance_score``. The casts and the ``view_score``, ``like_score``,
    ``comment_score`` and ``performance_score`` columns are written to
    ``video_df`` in place.

    Args:
        video_df: DataFrame with ``title``, ``viewCount``, ``likeCount`` and
            ``commentCount`` columns.

    Returns:
        A DataFrame of ``title``, the three counts and ``performance_score``,
        sorted by ``performance_score`` in descending order.
    """
    score_columns = {
        'viewCount': 'view_score',
        'likeCount': 'like_score',
        'commentCount': 'comment_score',
    }

    for metric, score in score_columns.items():
        video_df[metric] = video_df[metric].astype(float)
        peak = video_df[metric].max()
        if peak > 0:
            video_df[score] = video_df[metric] / peak
        else:
            # No positive value in this column: dividing by the maximum would
            # be 0/0 = NaN and would wipe out every performance_score.
            # Multiplying by 0.0 scores zeros as 0.0 and leaves NaN as NaN.
            video_df[score] = video_df[metric] * 0.0

    video_df['performance_score'] = (
        video_df['view_score'] + video_df['like_score'] + video_df['comment_score']
    )

    performance_metrics = video_df[['title', 'viewCount', 'likeCount', 'commentCount', 'performance_score']]
    return performance_metrics.sort_values(by='performance_score', ascending=False)

# Ranked performance table; the call also adds the score columns to video_df.
top_performers = content_performance_analysis(video_df)

# Bar chart of viewCount for the first nine rows after sorting by viewCount,
# highest first.
ax = sns.barplot(x = 'title', y = 'viewCount', data = video_df.sort_values('viewCount', ascending=False)[0:9])
# Rotate the title labels by 90 degrees. The return value is bound to a name,
# presumably to keep it out of the notebook cell's output.
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)