3 Fake News Classifiers: DistilBERT, XGBoost, spaCy, Pegasus
import pandas as pd
df = pd.read_csv('fakenews.csv')
df.head()
df['label'].value_counts(normalize=True) #{0 : 'real', 1 : 'fake'}
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer over bigrams only (ngram_range=(2, 2))
vectorizer = CountVectorizer(lowercase=True, ngram_range=(2,2))
# Generate the sparse bag-of-words matrix
bow_matrix = vectorizer.fit_transform(df['text'])
print(bow_matrix.shape)  # avoid .toarray() here: densifying a large corpus can exhaust memory
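As a quick sanity check on the vectorizer (standard scikit-learn API, using only objects the cell above already builds), the learned bigram vocabulary can be inspected directly:

# Peek at the size of the bigram vocabulary and a few example features
feature_names = vectorizer.get_feature_names_out()
print(len(feature_names), feature_names[:10])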
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(bow_matrix, df['label'], test_size=0.2, random_state=42, stratify=df['label'])
# Initialize the XGBoost classifier (use_label_encoder is deprecated and
# ignored/removed in recent xgboost releases, so it is omitted here)
xgb_classifier = XGBClassifier(eval_metric='logloss')
# Fit the model
xgb_classifier.fit(X_train, y_train)
# Predictions
predictions = xgb_classifier.predict(X_test)
# Calculate the accuracy
accuracy = accuracy_score(y_test, predictions)
accuracy
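Accuracy alone hides per-class behavior; a short addition (plain scikit-learn, with target names taken from the {0: 'real', 1: 'fake'} mapping noted earlier) reports precision, recall, and F1 per class, and the same call can be reused for the spaCy-vector model below:

from sklearn.metrics import classification_report
# Per-class precision/recall/F1; assumes the {0: 'real', 1: 'fake'} label mapping above
print(classification_report(y_test, predictions, target_names=['real', 'fake']))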
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Generating the normalized confusion matrix
cm_normalized = confusion_matrix(y_test, predictions, normalize='true')
# Plotting the normalized confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title('Normalized Confusion Matrix (bigram BoW + XGBoost)')
plt.show()
from transformers import AutoTokenizer, PegasusForConditionalGeneration
import torch
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
# Move the model to the GPU if one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def summtext(text, model, tokenizer):
    # Tokenize with truncation so long articles fit the Pegasus input limit
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to(device)
    # Generate the abstractive summary
    summary_ids = model.generate(inputs["input_ids"])
    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return summary
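Before summarizing the full frame, a one-row sanity check (the sample index is arbitrary) confirms the helper works end to end:

# Sanity-check the summarizer on a single article
print(summtext(df['text'].iloc[0], model=model, tokenizer=tokenizer))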
df['textSum'] = df['text'].apply(lambda x: summtext(x, model=model, tokenizer=tokenizer))
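The row-by-row apply above launches one generate() call per article, which is slow on GPU. A minimal batched sketch (batch_size=16 is an assumption; tune it to available GPU memory) pads each mini-batch together and decodes in one pass:

def summarize_batched(texts, model, tokenizer, batch_size=16):
    # One generate() call per padded mini-batch instead of per article
    summaries = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device)
        with torch.no_grad():
            summary_ids = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"])
        summaries.extend(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))
    return summaries

# df['textSum'] = summarize_batched(df['text'].tolist(), model, tokenizer)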
df.to_csv('fakenews_summarized.csv', index=False)
import spacy
from tqdm import tqdm
# Load the small English pipeline. Note: en_core_web_sm ships no static word
# vectors, so doc.vector falls back to averaged context-sensitive tensors;
# en_core_web_md / en_core_web_lg provide true word vectors.
nlp = spacy.load("en_core_web_sm")
vec_dict = dict()
for i in tqdm(range(len(df['textSum']))):
    doc = nlp(df['textSum'][i])
    vec_dict[i] = doc.vector
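Calling nlp() once per row is the slow path; spaCy's own nlp.pipe streams texts in batches, and components not needed for doc.vector can be disabled (the batch size here is an assumption):

# Faster alternative: stream texts through the pipeline in batches
docs = nlp.pipe(df['textSum'], batch_size=64, disable=["parser", "ner"])
vec_dict = {i: doc.vector for i, doc in enumerate(docs)}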
X = pd.DataFrame(vec_dict).T.values
y = df['label'].values
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Initialize the XGBoost classifier (use_label_encoder omitted, as above)
xgb_classifier = XGBClassifier(eval_metric='logloss')
# Fit the model
xgb_classifier.fit(X_train, y_train)
# Predictions
predictions = xgb_classifier.predict(X_test)
# Calculate the accuracy
accuracy = accuracy_score(y_test, predictions)
accuracy
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Generating the normalized confusion matrix
cm_normalized = confusion_matrix(y_test, predictions, normalize='true')
# Plotting the normalized confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title('Normalized Confusion Matrix (spaCy vectors + XGBoost)')
plt.show()
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Truncate each article to its first 500 characters
df['text'] = df['text'].str.slice(0, 500)
df = df[['label', 'text']]
# Splitting the dataset into training and testing sets
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)
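The notebook title names DistilBERT, and these Dataset objects are the usual input to a transformers fine-tune; a minimal sketch of the likely next step follows (the distilbert-base-uncased checkpoint and max_length=128 are assumptions, not taken from the notebook):

from transformers import AutoTokenizer

# Assumed checkpoint; named distinctly so the Pegasus tokenizer above is not clobbered
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    # 500-character articles are roughly ~100 tokens; truncation handles any overflow
    return distilbert_tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)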