import re

import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  # available for lemmatization; not used below
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, LSTM, Dropout

# NLTK corpora: stopwords for cleaning, wordnet for the (optional) lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
# The CSV has no header row; column 1 holds the tweet text, column 7 the sentiment label
df = pd.read_csv('bitcointweets.csv', header=None)[[1, 7]]
df.columns = ['tweet', 'label']
# Strip surrounding punctuation/brackets from the raw label, keeping just the first word
df['label'] = df.label.str.extract(r'(\w+)')
df.head()
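
# (Added check) Inspect the class balance before modelling; a heavy skew here
# would argue for class weights or resampling later on.
df.label.value_counts()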
# English stopword list from NLTK (a set gives O(1) membership tests)
stpwds_en = set(stopwords.words('english'))
# Normalise the tweets: lowercase, then strip handles, URLs, digits and punctuation
df['clean'] = df.tweet.str.lower()
df['clean'] = df.clean.apply(lambda x: re.sub(r'@\w+', '', x))     # @mentions
df['clean'] = df.clean.apply(lambda x: re.sub(r'http\S+', '', x))  # URLs
df['clean'] = df.clean.apply(lambda x: re.sub(r'\d+', '', x))      # digits
df['clean'] = df.clean.apply(lambda x: re.sub(r'[^\w\s]', '', x))  # punctuation
df['clean'] = df.clean.apply(lambda x: re.sub(r'\brt\b', '', x))   # retweet marker only, not "rt" inside words
df['clean'] = df.clean.apply(lambda x: re.sub('dum b a ss', 'dumbass', x))  # dataset-specific artefact
df['clean'] = df.clean.apply(lambda x: ' '.join(w for w in x.split() if w not in stpwds_en))
df.head()
# Keep only tweets that still have more than six tokens after cleaning
df['length'] = df.clean.apply(lambda x: len(x.split()))
df = df[df.length > 6]
# Retweets leave many identical cleaned texts; keep one copy of each
nodub = df.drop_duplicates(subset='clean')
nodub.head()
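
# (Added check) How many duplicates were dropped
print(f'{len(df) - len(nodub)} duplicate tweets removed, {len(nodub)} remain')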
# One-hot encode the three sentiment classes and hold out a stratified 20% test split
y = pd.get_dummies(nodub['label']).values
xtrain, xtest, ytrain, ytest = train_test_split(nodub.clean, y, stratify=y, random_state=23, test_size=0.2)
# Fit the tokenizer on the training texts only, capping the vocabulary
# at the 10,000 most frequent words
max_features = 10000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(xtrain))
X_train = tokenizer.texts_to_sequences(xtrain)
X_test = tokenizer.texts_to_sequences(xtest)
# Pad/truncate every sequence to a fixed length of 10 tokens
max_len = 10
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
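
# (Added check) Confirm both splits share the fixed sequence length
print(X_train.shape, X_test.shape)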
# Optional tf.data pipelines (not used below; model.fit consumes the NumPy arrays directly)
train_set = tf.data.Dataset.from_tensor_slices((X_train, ytrain))
test_set = tf.data.Dataset.from_tensor_slices((X_test, ytest))
# CNN + LSTM classifier: embed tokens, extract local n-gram features with a
# 1D convolution, summarise them with an LSTM, and classify into the three labels
model = Sequential()
model.add(Embedding(max_features, 50, input_length=X_train.shape[1]))
model.add(Conv1D(filters=18, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(LSTM(25, dropout=0.5, recurrent_dropout=0.4))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adamax(learning_rate=0.0005, epsilon=1e-06),
              metrics=['accuracy'])
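
# (Added check) Review layer shapes and parameter counts before training
model.summary()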
# Train with early stopping on validation loss, rolling back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_history = model.fit(X_train, ytrain,
                          validation_data=(X_test, ytest),
                          epochs=20, batch_size=23,
                          callbacks=[early_stop])
# Plot training vs. validation loss and accuracy per epoch
metrics = pd.DataFrame(model_history.history)
fig, ax = plt.subplots(1, 2, figsize=(25, 6))
metrics[['loss', 'val_loss']].plot(ax=ax[0], title='Loss Curve', xlabel='Epoch', ylabel='Loss')
metrics[['accuracy', 'val_accuracy']].plot(ax=ax[1], title='Accuracy Curve', xlabel='Epoch', ylabel='Accuracy')
plt.show()
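
# (Added evaluation sketch) The sklearn metrics imported above are never used in
# the original script; this closes the loop. Class names come from the one-hot
# column order produced by pd.get_dummies.
class_names = pd.get_dummies(nodub['label']).columns
ypred = model.predict(X_test).argmax(axis=1)  # predicted class indices
ytrue = ytest.argmax(axis=1)                  # true class indices

print('Accuracy:', accuracy_score(ytrue, ypred))
print(classification_report(ytrue, ypred, target_names=class_names))

# Confusion matrix as a heatmap (seaborn is imported above but otherwise unused)
sns.heatmap(confusion_matrix(ytrue, ypred), annot=True, fmt='d',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()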