3 Fake News Classifiers: DistilBERT, XGBoost, spaCy, Pegasus
import pandas as pd
df = pd.read_csv('fakenews.csv')
df.head()
df['label'].value_counts(normalize=True) #{0 : 'real', 1 : 'fake'}
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer over bigrams only (ngram_range=(2, 2))
vectorizer = CountVectorizer(lowercase=True, ngram_range=(2,2))
# Generate the sparse bag-of-words matrix
bow_matrix = vectorizer.fit_transform(df['text'])
print(bow_matrix.shape)  # avoid .toarray() here: densifying a large corpus can exhaust memory
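As a quick sanity check on the vectorizer (standard scikit-learn API, using only objects the cell above already builds), the learned bigram vocabulary can be inspected directly:

# Peek at the size of the bigram vocabulary and a few example features
feature_names = vectorizer.get_feature_names_out()
print(len(feature_names), feature_names[:10])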
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(bow_matrix, df['label'], test_size=0.2, random_state=42, stratify=df['label'])
# Initialize the XGBoost classifier (use_label_encoder is deprecated and
# ignored/removed in recent xgboost releases, so it is omitted here)
xgb_classifier = XGBClassifier(eval_metric='logloss')
# Fit the model
xgb_classifier.fit(X_train, y_train)
# Predictions
predictions = xgb_classifier.predict(X_test)
# Calculate the accuracy
accuracy = accuracy_score(y_test, predictions)
accuracy
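Accuracy alone hides per-class behavior; a short addition (plain scikit-learn, with target names taken from the {0: 'real', 1: 'fake'} mapping noted earlier) reports precision, recall, and F1 per class, and the same call can be reused for the spaCy-vector model below:

from sklearn.metrics import classification_report
# Per-class precision/recall/F1; assumes the {0: 'real', 1: 'fake'} label mapping above
print(classification_report(y_test, predictions, target_names=['real', 'fake']))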
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Generating the normalized confusion matrix
cm_normalized = confusion_matrix(y_test, predictions, normalize='true')
# Plotting the normalized confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title('Normalized Confusion Matrix (bigram BoW + XGBoost)')
plt.show()
from transformers import AutoTokenizer, PegasusForConditionalGeneration
import torch
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
# Move the model to the GPU if one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def summtext(text, model, tokenizer):
    # Tokenize with truncation so long articles fit the Pegasus input limit
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to(device)
    # Generate the abstractive summary
    summary_ids = model.generate(inputs["input_ids"])
    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return summary
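Before summarizing the full frame, a one-row sanity check (the sample index is arbitrary) confirms the helper works end to end:

# Sanity-check the summarizer on a single article
print(summtext(df['text'].iloc[0], model=model, tokenizer=tokenizer))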
df['textSum'] = df['text'].apply(lambda x: summtext(x, model=model, tokenizer=tokenizer))
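The row-by-row apply above launches one generate() call per article, which is slow on GPU. A minimal batched sketch (batch_size=16 is an assumption; tune it to available GPU memory) pads each mini-batch together and decodes in one pass:

def summarize_batched(texts, model, tokenizer, batch_size=16):
    # One generate() call per padded mini-batch instead of per article
    summaries = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device)
        with torch.no_grad():
            summary_ids = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"])
        summaries.extend(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))
    return summaries

# df['textSum'] = summarize_batched(df['text'].tolist(), model, tokenizer)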
df.to_csv('fakenews_summarized.csv', index=False)
import spacy
from tqdm import tqdm
# Load the small English pipeline. Note: en_core_web_sm ships no static word
# vectors, so doc.vector falls back to averaged context-sensitive tensors;
# en_core_web_md / en_core_web_lg provide true word vectors.
nlp = spacy.load("en_core_web_sm")
vec_dict = dict()
for i in tqdm(range(len(df['textSum']))):
    doc = nlp(df['textSum'][i])
    vec_dict[i] = doc.vector
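Calling nlp() once per row is the slow path; spaCy's own nlp.pipe streams texts in batches, and components not needed for doc.vector can be disabled (the batch size here is an assumption):

# Faster alternative: stream texts through the pipeline in batches
docs = nlp.pipe(df['textSum'], batch_size=64, disable=["parser", "ner"])
vec_dict = {i: doc.vector for i, doc in enumerate(docs)}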
X = pd.DataFrame(vec_dict).T.values
y = df['label'].values
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Initialize the XGBoost classifier (use_label_encoder omitted, as above)
xgb_classifier = XGBClassifier(eval_metric='logloss')
# Fit the model
xgb_classifier.fit(X_train, y_train)
# Predictions
predictions = xgb_classifier.predict(X_test)
# Calculate the accuracy
accuracy = accuracy_score(y_test, predictions)
accuracy
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Generating the normalized confusion matrix
cm_normalized = confusion_matrix(y_test, predictions, normalize='true')
# Plotting the normalized confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title('Normalized Confusion Matrix (spaCy vectors + XGBoost)')
plt.show()
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Truncate each article to its first 500 characters
df['text'] = df['text'].str.slice(0, 500)
df = df[['label', 'text']]
# Splitting the dataset into training and testing sets
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)
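The notebook title names DistilBERT, and these Dataset objects are the usual input to a transformers fine-tune; a minimal sketch of the likely next step follows (the distilbert-base-uncased checkpoint and max_length=128 are assumptions, not taken from the notebook):

from transformers import AutoTokenizer

# Assumed checkpoint; named distinctly so the Pegasus tokenizer above is not clobbered
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    # 500-character articles are roughly ~100 tokens; truncation handles any overflow
    return distilbert_tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)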