Skip to content

Introduction:

This study examines 439 Al Jazeera English news articles collected from RSS feeds and uses transformer-based NLP to analyze sentiment patterns in the news coverage.

Our dataset contains the following columns: title, pubDate, guid, link, description.

# pip install ydata-profiling

# pip install ipywidgets
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

from wordcloud import WordCloud
import plotly.graph_objects as go
from ipywidgets import interact, Dropdown, IntSlider, ColorPicker
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from ydata_profiling import ProfileReport
from transformers import pipeline

import torch

import ipywidgets as widgets

print("All imports successful!")
# Load the article table from the CSV file in the working directory.
data = pd.read_csv('aljazeera_news.csv')
# First look at the raw table: leading rows, column dtypes / non-null counts,
# and summary statistics.
data.head()
data.info()
data.describe()
# Work on a copy so later column additions do not modify `data`.
df = data.copy()
# Build a ydata-profiling report for the frame, write it to disk and embed it
# in the notebook output.
profile = ProfileReport(df, title="Data Summary")
profile.to_file("report.html")  # Save as HTML
profile.to_notebook_iframe()
Let's categorize the news articles.
import pandas as pd
from urllib.parse import urlparse

def extract_category(link):
    """
    Extracts the primary category from a given URL.

    The category is the first path segment of an Al Jazeera URL, e.g.
    ``https://www.aljazeera.com/news/2025/1/1/story`` -> ``"news"``.

    Args:
        link: The URL to inspect. Any value that is not a ``str`` (``None``,
            ``NaN``, numbers, ...) is treated as missing.

    Returns:
        The first path segment as a string, or ``None`` when the input is not
        a string, cannot be parsed, is not an Al Jazeera URL, has no path
        segment, or the first segment is the stray token ``"https:"``.
    """
    if not isinstance(link, str):
        return None

    try:
        parsed_url = urlparse(link)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unbalanced "[" in the
        # host). Treat them as "no category" so that one bad row cannot abort
        # a whole Series.apply().
        return None

    # Only process Al Jazeera URLs now
    if 'aljazeera.com' not in parsed_url.netloc.lower():
        return None

    # A bare domain ("https://www.aljazeera.com/") has an empty path and
    # "".split("/") yields [""]; map that empty segment to None instead of
    # returning "" as if it were a category.
    category = parsed_url.path.strip('/').split('/')[0]
    if not category or category == "https:":
        return None
    return category

# Derive a 'category' column by running extract_category on every guid value.
# NOTE(review): this parses 'guid' rather than 'link' — presumably guid holds
# the article URL; confirm against the CSV.
df['category'] = df['guid'].apply(extract_category)

# Notebook-style inspection: show the frame, then count the rows that
# duplicate an earlier row.
df
df.duplicated().sum()
# !pip install wordcloud matplotlib plotly
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objects as go
from ipywidgets import Dropdown, ColorPicker, IntSlider, interact

# Verified Kaggle font paths with fallback.
# Candidate TrueType font files; interactive_wordcloud tries them in list
# order and uses the first one WordCloud can load.
KAGGLE_FONTS = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
    '/usr/share/fonts/truetype/freefont/FreeSans.ttf'
]

def interactive_wordcloud(data, title, column='title'):
    """
    Show an interactive word cloud built from one text column of ``data``.

    Three ipywidgets controls (colormap, background colour and maximum word
    count) are wired to the plot with ``interact``; every change re-renders
    the cloud as a Plotly image.

    Args:
        data: Object indexable by column name that returns a pandas Series
            (a DataFrame in this notebook).
        title: Text shown in bold as the figure title.
        column: Name of the column whose values are joined into the corpus.
            Defaults to ``'title'``, the column this function originally
            hard-coded.

    Raises:
        ValueError: If ``column`` holds no text once missing values are
            dropped.
        RuntimeError: Raised inside the widget callback when none of the
            paths in ``KAGGLE_FONTS`` can be loaded.
    """
    # Drop missing values first: astype(str) would otherwise turn every NaN
    # into the literal word "nan" and pollute the cloud.
    text = ' '.join(data[column].dropna().astype(str))
    if not text.strip():
        raise ValueError(f"Column {column!r} contains no text to plot.")

    # Widget controls
    colormap_dropdown = Dropdown(
        options=['twilight', 'viridis', 'plasma', 'magma', 'inferno'],
        value='twilight',
        description='Colormap:'
    )

    bg_color_picker = ColorPicker(
        value='black',
        description='Background:'
    )

    max_words_slider = IntSlider(
        min=100,
        max=1500,
        value=1000,
        description='Max Words:'
    )

    def update(colormap='twilight', bg_color='black', max_words=1000):
        """Re-render the word cloud with the current widget values."""
        wc = None  # Stays None until one of the candidate fonts loads

        # Try available fonts in order; the first one that loads wins
        for font_path in KAGGLE_FONTS:
            try:
                wc = WordCloud(
                    font_path=font_path,
                    background_color=bg_color,
                    colormap=colormap,
                    max_words=max_words,
                    width=1200,
                    height=600
                ).generate(text)
                break
            except OSError:
                # Font file missing or unreadable: try the next candidate
                continue

        # Fail loudly if every candidate font failed
        if wc is None:
            raise RuntimeError("""
            No working fonts found! Try:
            1. Add font to Kaggle Notebook (Settings -> Add Data)
            2. Update KAGGLE_FONTS list with correct path
            """)

        # Create plotly figure from the rendered image array
        fig = go.Figure(go.Image(z=wc.to_array()))

        fig.update_layout(
            title=f'<b>{title}</b>',
            margin=dict(t=100, b=20),
            plot_bgcolor=bg_color,
            paper_bgcolor=bg_color
        )

        fig.show()

    # interact() displays the controls and calls update() on every change
    interact(update,
             colormap=colormap_dropdown,
             bg_color=bg_color_picker,
             max_words=max_words_slider)

# Generate the dashboard: interactive word cloud for the article titles.
interactive_wordcloud(df, 'Interactive WordCloud - Title')
# Verified Kaggle font paths with fallback.
# NOTE(review): this re-assignment is identical to the KAGGLE_FONTS list
# defined earlier in the file; it looks like a copied notebook cell.
KAGGLE_FONTS = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',
    '/usr/share/fonts/truetype/freefont/FreeSans.ttf'
]

def interactive_wordcloud(data, description, column='description'):
    """
    Show an interactive word cloud built from one text column of ``data``.

    Three ipywidgets controls (colormap, background colour and maximum word
    count) are wired to the plot with ``interact``; every change re-renders
    the cloud as a Plotly image.

    Args:
        data: Object indexable by column name that returns a pandas Series
            (a DataFrame in this notebook).
        description: Text shown in bold as the figure title. Despite its
            name, it is not the column name; the name is kept so existing
            callers keep working.
        column: Name of the column whose values are joined into the corpus.
            Defaults to ``'description'``, the column this function
            originally hard-coded.

    Raises:
        ValueError: If ``column`` holds no text once missing values are
            dropped.
        RuntimeError: Raised inside the widget callback when none of the
            paths in ``KAGGLE_FONTS`` can be loaded.
    """
    # Drop missing values first: astype(str) would otherwise turn every NaN
    # into the literal word "nan" and pollute the cloud.
    text = ' '.join(data[column].dropna().astype(str))
    if not text.strip():
        raise ValueError(f"Column {column!r} contains no text to plot.")

    # Widget controls
    colormap_dropdown = Dropdown(
        options=['twilight', 'viridis', 'plasma', 'magma', 'inferno'],
        value='twilight',
        description='Colormap:'
    )

    bg_color_picker = ColorPicker(
        value='black',
        description='Background:'
    )

    max_words_slider = IntSlider(
        min=100,
        max=1500,
        value=1000,
        description='Max Words:'
    )

    def update(colormap='twilight', bg_color='black', max_words=1000):
        """Re-render the word cloud with the current widget values."""
        wc = None  # Stays None until one of the candidate fonts loads

        # Try available fonts in order; the first one that loads wins
        for font_path in KAGGLE_FONTS:
            try:
                wc = WordCloud(
                    font_path=font_path,
                    background_color=bg_color,
                    colormap=colormap,
                    max_words=max_words,
                    width=1200,
                    height=600
                ).generate(text)
                break
            except OSError:
                # Font file missing or unreadable: try the next candidate
                continue

        # Fail loudly if every candidate font failed
        if wc is None:
            raise RuntimeError("""
            No working fonts found! Try:
            1. Add font to Kaggle Notebook (Settings -> Add Data)
            2. Update KAGGLE_FONTS list with correct path
            """)

        # Create plotly figure from the rendered image array
        fig = go.Figure(go.Image(z=wc.to_array()))

        fig.update_layout(
            title=f'<b>{description}</b>',
            margin=dict(t=100, b=20),
            plot_bgcolor=bg_color,
            paper_bgcolor=bg_color
        )

        fig.show()

    # interact() displays the controls and calls update() on every change
    interact(update,
             colormap=colormap_dropdown,
             bg_color=bg_color_picker,
             max_words=max_words_slider)

# Generate the dashboard: interactive word cloud for the article descriptions.
interactive_wordcloud(df, 'Interactive WordCloud - description')
โ€Œ
โ€Œ
โ€Œ