Skip to content
Financial News Analysis (NLP)
NLP with Financial News Analysis
import pandas as pd
# URL of the raw CSV file on GitHub
url = 'https://raw.githubusercontent.com/SayelAbualigah/SFT_Dataset/main/dataset.csv'

# Read the CSV file into a DataFrame (downloads it over the network)
data = pd.read_csv(url)

# Display the first two rows of the DataFrame
data.head(2)

# Extract content related to biology
from sklearn.feature_extraction.text import TfidfVectorizer
# One document per row of the 'Content' column.
corpus = data['Content'].to_list()

# Create a Tfidf matrix
vectorizer = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    ngram_range=(1, 1),    # single words only
    max_df=.6,             # ignore terms found in more than 60% of documents
    min_df=.01,            # ignore terms found in fewer than 1% of documents
)
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

# Densify the sparse matrix so it can be wrapped in a DataFrame with one
# row per document and one column per term.
dense = X.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df.iloc[:2, :5]

# Find the top words in each document
# Collect, for every document, its 30 highest-weighted terms.
# After the transpose, `dt` has one row per term and one column per document,
# so column i holds the weights of document i.
top_dict = {}
dt = df.transpose()
for i in range(dt.shape[1]):
    # Sort the document's weights in descending order and keep the term
    # labels (the index) of the first 30.
    top = dt.loc[:, i].sort_values(ascending=False).head(30)
    top_dict[i] = top.index

# One column per document, 30 rows of term strings each.
df_res = pd.DataFrame(top_dict)
# Looking for health, agriculture, biology and medicine: keep the position of
# every document that has at least one top term matching the pattern.
col = [
    i
    for i in range(df_res.shape[1])
    if df_res.iloc[:, i].str.contains('health|agri|biolo|medic').any()
]

# Take an explicit copy so that columns added to `bio` later are written to an
# independent frame rather than to a slice of `data` (which would trigger
# pandas' SettingWithCopyWarning).
bio = data.iloc[col, :].copy()
bio.head()

# Cloud of words
%%capture
!pip install wordcloudfrom wordcloud import WordCloud
import matplotlib.pyplot as plt
# Join the text of every selected article into one string and render its most
# frequent words as a word cloud.
all_speeches = ' '.join(bio['Content'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_speeches)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the cloud is an image, so axes and ticks carry no meaning
plt.title('Most Common Words in Content')
plt.show()

# Sentiment analysis
%%capture
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')import seaborn as sns
# Score every selected article with VADER. Each score is a dict from which the
# 'compound', 'pos', 'neu' and 'neg' entries are unpacked into columns.
sid = SentimentIntensityAnalyzer()
sentiment_scores = bio['Content'].apply(sid.polarity_scores)

# `assign` returns a new DataFrame, so the columns are never written into a
# slice of `data` (which would trigger pandas' SettingWithCopyWarning).
bio = bio.assign(
    SentimentScores=sentiment_scores,
    Compound=sentiment_scores.apply(lambda scores: scores['compound']),
    Positive=sentiment_scores.apply(lambda scores: scores['pos']),
    Neutral=sentiment_scores.apply(lambda scores: scores['neu']),
    Negative=sentiment_scores.apply(lambda scores: scores['neg']),
)

# Reshape to long form: one row per (article, score type) pair, with the
# score type in 'variable' and its number in 'value'.
sent = bio[['Date', 'Subject', 'Compound', 'Positive', 'Negative', 'Neutral']]
sent = sent.melt(
    id_vars=['Date', 'Subject'],
    value_vars=['Compound', 'Positive', 'Negative', 'Neutral'],
)

# Apply the seaborn theme before the figure is created so that it styles
# this figure's axes, then draw one box per score type on those axes.
sns.set()
fig, ax = plt.subplots()
sns.boxplot(data=sent, x='variable', y='value', hue='variable', ax=ax)
ax.yaxis.grid(True)
ax.xaxis.grid(True)
Summarizing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Generate a short summary of every selected article with the pretrained
# 't5-small' sequence-to-sequence checkpoint.
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

summaries = []
for article in bio['Content'].to_list():
    # Prefix the text with the 'summarize: ' task prompt and cut the encoded
    # input off at 500 tokens.
    inputs_ids = tokenizer.encode(
        'summarize: ' + article,
        return_tensors='pt', max_length=500, truncation=True,
    )
    # Cap each generated summary at 20 tokens.
    summary_ids = model.generate(inputs_ids, max_length=20)
    # The keyword is `skip_special_tokens` (plural); it strips markers such as
    # '<pad>' and '</s>' from the decoded text.
    summaries.append(tokenizer.decode(
        summary_ids[0], skip_special_tokens=True
    ))

# `assign` returns a new DataFrame, avoiding a write into a slice of `data`.
bio = bio.assign(Summary_model=summaries)

from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Word cloud of the generated summaries: merge them into one string, build the
# cloud, then display it as an image without axes.
all_speeches = ' '.join(bio['Summary_model'])
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_speeches)

plt.figure(figsize=(10, 6))
plt.title('Most Common Words in Content summary')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()