Bitcoin
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# One-time download of the NLTK English stopword list.
nltk.download('stopwords')
# Load the raw CSV (no header row); column 1 is the tweet text, column 7 the label.
df = pd.read_csv('bitcointweets.csv', header=None)[[1, 7]]
df.columns = ['tweet', 'label']
# Keep only the leading word of the raw label string (strips surrounding punctuation).
df['label'] = df.label.str.extract(r'(\w+)')
df.head()
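# Optional sanity check: the labels are expected to form three sentiment
# classes, matching the 3-unit softmax head built further down.
print(df['label'].value_counts())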
# A set gives O(1) membership tests during the stopword removal below.
stpwds_en = set(stopwords.words('english'))
    
# Normalise and clean the tweet text.
df['clean'] = df.tweet.str.lower()
df['clean'] = df.clean.apply(lambda x: re.sub(r'@\w+', '', x))     # drop @mentions
df['clean'] = df.clean.apply(lambda x: re.sub(r'http\S+', '', x))  # drop URLs
df['clean'] = df.clean.apply(lambda x: re.sub(r'\d+', '', x))      # drop digits
df['clean'] = df.clean.apply(lambda x: re.sub(r'[^\w\s]', '', x))  # drop punctuation
df['clean'] = df.clean.apply(lambda x: re.sub(r'\brt\b', '', x))   # drop the retweet marker (whole word only, so words like "support" survive)
df['clean'] = df.clean.apply(lambda x: re.sub('dum b a ss', 'dumbass', x))  # rejoin one token this dataset splits oddly
df['clean'] = df.clean.apply(lambda x: ' '.join(word for word in x.split() if word not in stpwds_en))
    df.head()
# Keep only tweets with more than six tokens after cleaning.
df['length'] = df.clean.apply(lambda x: len(x.split()))
df = df[df.length > 6]
# Drop tweets that became identical after cleaning.
nodub = df.drop_duplicates(subset='clean')
nodub.head()
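# Optional: how many exact duplicates were dropped.
print(f'{len(df) - len(nodub)} duplicate tweets removed')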
# One-hot encode the labels and make a stratified 80/20 train/test split.
y = pd.get_dummies(nodub['label']).values
xtrain, xtest, ytrain, ytest = train_test_split(nodub.clean, y, stratify=y,
                                                random_state=23, test_size=0.2)
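# Optional shape check on the split.
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)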
# Keep the 10,000 most frequent words; fit the tokenizer on training text only
# so the test set cannot leak into the vocabulary.
max_features = 10000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(xtrain))
X_train = tokenizer.texts_to_sequences(xtrain)
X_test = tokenizer.texts_to_sequences(xtest)
# Pad/truncate every sequence to a fixed length of 10 tokens.
max_words = 10
X_train = pad_sequences(X_train, maxlen=max_words)
X_test = pad_sequences(X_test, maxlen=max_words)
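# Optional: inspect how one cleaned tweet maps to a fixed-length index sequence,
# and how much vocabulary the 10,000-word cap is trimming.
print(xtrain.iloc[0], '->', X_train[0])
print('distinct words seen in training:', len(tokenizer.word_index))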
# Optional tf.data pipelines, batched so they can be passed to fit() directly;
# note the fit() call below consumes the NumPy arrays instead.
train_set = tf.data.Dataset.from_tensor_slices((X_train, ytrain)).batch(23)
test_set = tf.data.Dataset.from_tensor_slices((X_test, ytest)).batch(23)
# Embedding -> 1D convolution -> LSTM -> 3-way softmax classifier.
model = Sequential()
model.add(Embedding(max_features, 50, input_length=X_train.shape[1]))
model.add(Conv1D(filters=18, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(LSTM(25, dropout=0.5, recurrent_dropout=0.4))
model.add(Dense(3, activation='softmax'))  # one unit per sentiment class
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adamax(learning_rate=0.0005, epsilon=1e-06),
              metrics=['accuracy'])
# Stop once validation loss has not improved for 3 epochs and keep the best weights.
model_history = model.fit(X_train, ytrain,
                          validation_data=(X_test, ytest),
                          epochs=20, batch_size=23,
                          callbacks=[EarlyStopping(monitor='val_loss', patience=3,
                                                   restore_best_weights=True)])
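# Held-out evaluation with the sklearn metrics imported above; argmax maps
# one-hot rows and softmax outputs back to class indices. get_dummies returns
# columns in sorted label order, so the names line up with those indices.
y_pred = model.predict(X_test).argmax(axis=1)
y_true = ytest.argmax(axis=1)
print('test accuracy:', accuracy_score(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred,
                            target_names=list(pd.get_dummies(nodub['label']).columns)))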
# Plot training vs. validation loss and accuracy per epoch.
metrics = pd.DataFrame(model_history.history)

fig, ax = plt.subplots(1, 2, figsize=(25, 6))
metrics[['loss', 'val_loss']].plot(ax=ax[0], title='Loss Curve', xlabel='Epoch', ylabel='Loss')
metrics[['accuracy', 'val_accuracy']].plot(ax=ax[1], title='Accuracy Curve', xlabel='Epoch', ylabel='Accuracy')
plt.show()
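# Minimal inference sketch on a made-up tweet; in practice the same cleaning
# steps applied above should be run on any new text first.
sample = ['bitcoin looking strong after the latest rally']
seq = pad_sequences(tokenizer.texts_to_sequences(sample), maxlen=max_words)
probs = model.predict(seq)[0]
print(dict(zip(pd.get_dummies(nodub['label']).columns, probs.round(3))))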