Skip to content

NLP with Financial News Analysis

import pandas as pd

# Raw GitHub location of the dataset CSV.
url = 'https://raw.githubusercontent.com/SayelAbualigah/SFT_Dataset/main/dataset.csv'

# Fetch the CSV and parse it into a DataFrame.
data = pd.read_csv(filepath_or_buffer=url)

# Preview the first two rows.
data.head(n=2)

Extract content related to biology, health, medicine, and agriculture

from sklearn.feature_extraction.text import TfidfVectorizer

# One document per row of `data`. Missing articles become empty strings so the
# vectorizer does not reject NaN and row positions still line up with `data`.
corpus = data['Content'].fillna('').to_list()

# Create a Tfidf matrix
vectorizer = TfidfVectorizer(stop_words='english',     # drop English stop words
                             ngram_range=(1, 1),       # unigrams only
                             max_df=.6,                # skip terms in more than 60% of documents
                             min_df=.01)               # skip terms in fewer than 1% of documents

X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()
# toarray() gives the dense matrix directly, avoiding a round trip through
# nested Python lists.
df = pd.DataFrame(X.toarray(), columns=feature_names)
df.iloc[:2, :5]

# Find the top words in each document
top_n = 30
top_dict = {}
dt = df.transpose()  # rows: terms, columns: document positions

for i in range(dt.shape[1]):
    weights = dt.loc[:, i]
    # Keep only terms that actually occur in the document. Without this, a
    # document with fewer than `top_n` distinct terms would be padded with
    # arbitrary zero-weight terms, which could falsely match the keywords below.
    top = weights[weights > 0].sort_values(ascending=False).head(top_n)
    top_dict[i] = top.index

# Columns are document positions, rows are ranks. Documents with fewer than
# `top_n` terms are padded with NaN, because a dict of Series is aligned on
# its index.
df_res = pd.DataFrame(
    {i: pd.Series(terms, dtype=object) for i, terms in top_dict.items()}
)

# Keep the documents whose top terms mention health, agriculture, biology or
# medicine (prefix match on the TF-IDF vocabulary).
keyword_pattern = 'health|agri|biolo|medic'
col = [
    i for i in range(dt.shape[1])
    if df_res.iloc[:, i].str.contains(keyword_pattern, na=False).any()
]

# Copy so that columns added to `bio` later never write into a view of `data`.
bio = data.iloc[col, :].copy()
bio.head()

Word cloud

%%capture
!pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Merge every selected article into one text blob for the cloud.
all_speeches = ' '.join(bio['Content'])

# Build the 800x400 cloud on a white background.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_speeches)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Common Words in Content')
plt.show()

Sentiment analysis

%%capture
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')
import seaborn as sns
sid = SentimentIntensityAnalyzer()

# Score every article once; each entry is the dict returned by
# polarity_scores, read below through its 'compound', 'pos', 'neu' and 'neg' keys.
bio['SentimentScores'] = bio['Content'].apply(sid.polarity_scores)

# Spread the score dict into one numeric column per component.
score_columns = {
    'Compound': 'compound',
    'Positive': 'pos',
    'Neutral': 'neu',
    'Negative': 'neg',
}
for column, key in score_columns.items():
    # `key=key` binds the current key; a plain closure would be late-binding.
    bio[column] = bio['SentimentScores'].apply(
        lambda scores, key=key: scores[key]
    )

# Long format: one row per (article, score type) for plotting.
sent = bio[['Date', 'Subject', 'Compound', 'Positive', 'Negative', 'Neutral']]
sent = sent.melt(
    id_vars=['Date', 'Subject'],
    value_vars=['Compound', 'Positive', 'Negative', 'Neutral'],
)

# Apply the seaborn theme before creating the figure: the theme only affects
# axes created afterwards, so setting it after plt.subplots() leaves this
# figure unstyled.
sns.set()
fig, ax = plt.subplots()
# Draw on `ax` explicitly rather than relying on the implicit current axes.
sns.boxplot(data=sent, x='variable', y='value', hue='variable', ax=ax)
ax.yaxis.grid(True)
ax.xaxis.grid(True)

Summarizing the articles

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the pretrained T5 checkpoint and its matching tokenizer.
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# One generated summary per row of `bio`, in row order.
summaries = []

for text in bio['Content'].to_list():
    # Prefix the article with the summarisation task prompt and truncate the
    # encoded input to at most 500 tokens.
    input_ids = tokenizer.encode(
        'summarize: ' + text,
        return_tensors='pt', max_length=500, truncation=True
    )

    # Generate a short summary (at most 20 tokens).
    summary_ids = model.generate(input_ids, max_length=20)

    # The keyword is `skip_special_tokens` (plural). The misspelt singular
    # form is not recognised, which left markers such as <pad> and </s> in
    # the summaries.
    summaries.append(tokenizer.decode(
        summary_ids[0], skip_special_tokens=True
    ))

bio['Summary_model'] = summaries
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Merge all generated summaries into one text blob for the cloud.
all_speeches = ' '.join(bio['Summary_model'])

# Build the 800x400 cloud on a white background.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_speeches)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Common Words in Content summary')
plt.show()