!pip install torchtext
!pip install torchmetrics
!pip install sentencepiece
#import libraries
from torchtext.data.utils import get_tokenizer
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

text = "In the city of Dataville, a data analyst named Alex explores hidden insights within vast data. With determination, Alex uncovers patterns, cleanses the data, and unlocks innovation. Join this adventure to unleash the power of data-driven decisions."

# Initialize the tokenizer and tokenize the text
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(text)

threshold = 1
# Remove rare words and print common tokens
freq_dist = FreqDist(tokens)
common_tokens = [token for token in tokens if freq_dist[token] > threshold]
print(common_tokens)
text1 = 'Dear Parents and Carers,Thank you to all those who attended the West Yorkshire Combined Authority Demystifying Apprenticeships webinar on 10 July and taking the time to learn more about apprenticeships.We’d like to offer our thanks to our fantastic panellists from Jet2, Forvis Mazars, ITV, ASDA, and our host, Chris from Impellam Group, for sharing their valuable insights and experiences.We have attached the slides from the presentation for your reference, with links to further sources of information.The webinar was recorded and is available to view here. We are also creating a resource that will answer some of the questions raised during the webinar and provide further information on the topics covered. Please do share your feedback by completing the event evaluation survey here as this will be used to inform future sessions.Thank you again for your interest in apprenticeships. We hope the webinar was informative and helped you to understand the opportunities available. Kind regards, West Yorkshire Careers Hub'

# Initialize and tokenize the text
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(text1)

# Remove any stopwords
stop_words = set(stopwords.words("english"))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# Perform stemming on the filtered tokens
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print(stemmed_tokens)
import torch

genres = ['Fiction','Non-fiction','Biography', 'Children','Mystery']

# Define the size of the vocabulary
vocab_size = len(genres)

# Create one-hot vectors
one_hot_vectors = torch.eye(vocab_size)

# Create a dictionary mapping genres to their one-hot vectors
one_hot_dict = {genre: one_hot_vectors[i] for i, genre in enumerate(genres)}

for genre, vector in one_hot_dict.items():
    print(f'{genre}: {vector.numpy()}')
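
# Illustrative sketch (not part of the original exercise): once the dictionary exists,
# a list of labels can be encoded as a single tensor by stacking the corresponding
# one-hot rows. The book_genres list below is a made-up example.
book_genres = ['Mystery', 'Fiction', 'Mystery']
encoded_genres = torch.stack([one_hot_dict[genre] for genre in book_genres])
print(encoded_genres)  # shape (3, 5): one row per book, one column per genre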
    
# Import from sklearn
from sklearn.feature_extraction.text import CountVectorizer

titles = ['The Great Gatsby','To Kill a Mockingbird','1984','The Catcher in the Rye','The Hobbit', 'Great Expectations']

# Initialize Bag-of-words with the list of book titles
vectorizer = CountVectorizer()
bow_encoded_titles = vectorizer.fit_transform(titles)

# Extract and print the first five features
print("\n", vectorizer.get_feature_names_out()[:5])
print(bow_encoded_titles.toarray()[0, :5])

descriptions =['A portrait of the Jazz Age in all of its decadence and excess.', 'A gripping, heart-wrenching, and wholly remarkable tale of coming-of-age in a South poisoned by virulent prejudice.', 'A startling and haunting vision of the world.', 'A story of lost innocence.', 'A timeless adventure story.']

# Importing TF-IDF from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF encoding vectorizer
vectorizer = TfidfVectorizer()
tfidf_encoded_descriptions = vectorizer.fit_transform(descriptions)

# Extract and print the first five features
print("\n", vectorizer.get_feature_names_out()[:5])
print(tfidf_encoded_descriptions.toarray()[0, :5])
#import libraries
from torch.utils.data import Dataset, DataLoader
import regex as re

# Create a custom Dataset class
class TextDataset(Dataset):     # TextDataset serves as the data container
    def __init__(self, text):   # __init__ initialises the dataset with the input text data
        self.text = text
    def __len__(self):          # __len__ returns the total number of samples in the dataset
        return len(self.text)
    def __getitem__(self, idx): # __getitem__ returns the sample at a given index
        return self.text[idx]
    
def preprocess_sentences(sentences):
    processed_sentences=[]
    for sentence in sentences:
        sentence = sentence.lower()
        tokens = tokenizer(sentence)
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [stemmer.stem(token) for token in tokens]
        freq_dist = FreqDist(tokens)
        threshold = 2
        tokens = [token for token in tokens if freq_dist[token] > threshold]
        processed_sentences.append(' '.join(tokens))
    return processed_sentences   

def encode_sentences(sentences):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    encoded_sentences = X.toarray()
    return encoded_sentences, vectorizer

def extract_sentences(data):
    sentences = re.findall(r'[A-Z][^.!?]*[.!?]', data)
    return sentences

def text_processing_pipeline(sentences):
    processed_sentences = preprocess_sentences(sentences)
    encoded_sentences, vectorizer = encode_sentences(processed_sentences)
    dataset = TextDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader, vectorizer
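
# Illustrative sketch (not part of the original exercise): run encode_sentences,
# TextDataset and DataLoader directly on a few made-up sentences. preprocess_sentences
# is skipped here because its per-sentence frequency threshold of 2 would drop every
# token in sentences this short.
toy_sentences = ["data drives decisions", "alex cleans the data", "patterns hide in data"]
toy_encoded, toy_vectorizer = encode_sentences(toy_sentences)
toy_loader = DataLoader(TextDataset(toy_encoded), batch_size=2, shuffle=False)
for batch in toy_loader:
    print(batch.shape)  # batches of bag-of-words rows, e.g. (2, n_features) then (1, n_features)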

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from torchtext.data.utils import get_tokenizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

email = "Dear Parents and Carers,Thank you to all those who attended the West Yorkshire Combined Authority Demystifying Apprenticeships webinar on 10 July and taking the time to learn more about apprenticeships.We’d like to offer our thanks to our fantastic panellists from Jet2, Forvis Mazars, ITV, ASDA, and our host, Chris from Impellam Group, for sharing their valuable insights and experiences.We have attached the slides from the presentation for your reference, with links to further sources of information.The webinar was recorded and is available to view here. We are also creating a resource that will answer some of the questions raised during the webinar and provide further information on the topics covered. Please do share your feedback by completing the event evaluation survey here as this will be used to inform future sessions.Thank you again for your interest in apprenticeships. We hope the webinar was informative and helped you to understand the opportunities available. Kind regards, West Yorkshire Careers Hub"

# Split the email into sentences
text2 = sent_tokenize(email)

# Create a list of stopwords
stop_words = set(stopwords.words("english"))

# Initialize the tokenizer and stemmer
tokenizer = get_tokenizer("basic_english")
stemmer = PorterStemmer() 

# Complete the function to preprocess sentences
def preprocess_sentences(sentences):
    processed_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokens = tokenizer(sentence)
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [stemmer.stem(token) for token in tokens]
        processed_sentences.append(' '.join(tokens))
    return processed_sentences

processed_text2 = preprocess_sentences(text2)
print(processed_text2[:5])
# Define your Dataset class
class EmailDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

# Complete the encoding function
def encode_sentences(sentences):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    return X.toarray(), vectorizer
    
# Complete the text processing pipeline
def text_processing_pipeline(sentences):
    processed_sentences = preprocess_sentences(sentences)
    encoded_sentences, vectorizer = encode_sentences(processed_sentences)
    dataset = EmailDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader, vectorizer

dataloader, vectorizer = text_processing_pipeline(processed_text2)

# Print the vectorizer's feature names and the first 10 components of the first item
print(vectorizer.get_feature_names_out()[:10]) 
print(next(iter(dataloader))[0, :10])

TEXT CLASSIFICATION


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#no pooling layer as dataset is small

class SentimentAnalysisCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()                      #super() initialises the base class nn.Module
        self.embedding = nn.Embedding(vocab_size, embed_dim)  #embedding creates dense word vectors
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=1, padding=1) #conv1d for 1 dimensional data
        self.fc = nn.Linear(embed_dim,2)
    def forward(self,text):
        embedded = self.embedding(text).permute(0, 2, 1) # convert token indices to embeddings, then permute to (batch, embed_dim, seq_len), the input shape Conv1d expects
        conved = F.relu(self.conv(embedded))
        conved = conved.mean(dim=2)
        return self.fc(conved)
    
# To prepare the data we use a word-to-index mapping; one-hot and TF-IDF encodings are less efficient here and do not capture word relationships
vocab = ['i','love','this','book','do','not','like','shallow','page']
word_to_idx = {word:i for i, word in enumerate(vocab)}
vocab_size=len(word_to_idx)
embed_dim=10
book_reviews = ["the story was captivating and kept me hooked till the end.".split(),
                "i found the characters shallow and the plot predictable.".split()]

# Initialise the model and loss
model = SentimentAnalysisCNN(vocab_size, embed_dim)
criterion = nn.CrossEntropyLoss()
# Optimizer for training
optimizer = optim.SGD(model.parameters(), lr=0.1)

#training data
data=[(['I', 'love', 'this', 'book'], 1),
 (['story','was','captivating','was','hooked'], 1),     
 (['This', 'is', 'an', 'amazing', 'novel'], 1),
 (['I', 'really', 'like', 'this', 'story'], 1),
 (['I', 'do', 'not', 'like', 'this', 'book'], 0),
 (['I', 'hate', 'this', 'novel'], 0),
 (['characters','shallow','plot','was','predictable'] ,0),     
 (['This', 'is', 'a', 'terrible', 'story'], 0)]

# Train the model
for epoch in range(10):
    for sentence, label in data:
        model.zero_grad()
        # Unknown words fall back to index 0
        sentence = torch.LongTensor([word_to_idx.get(w.lower(), 0) for w in sentence]).unsqueeze(0)
        outputs = model(sentence)
        label = torch.LongTensor([int(label)])
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
print('Training complete!')
        
for review in book_reviews:
    # Convert the review words into tensor form (out-of-vocabulary words map to index 0)
    input_tensor = torch.tensor([word_to_idx.get(w.lower(), 0) for w in review], dtype=torch.long).unsqueeze(0)
    # Get the model's output
    outputs = model(input_tensor)
    # Find the index of the most likely sentiment category
    _, predicted_label = torch.max(outputs.data, 1)
    # Convert the predicted label into a sentiment string
    sentiment = "Positive" if predicted_label.item() == 1 else "Negative"
    print(f"Book Review: {' '.join(review)}")
    print(f"Sentiment: {sentiment}\n")
#CNN model for text classification

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Define the word_to_ix dictionary
word_to_ix = {
    "I": 0,
    "love": 1,
    "this": 2,
    "book": 3,
    "do": 4,
    "not": 5,
    "like": 6
}

class TextClassificationCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(TextClassificationCNN, self).__init__()
        # Initialize the embedding layer 
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(embed_dim, 2)
    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)
        # Pass the embedded text through the convolutional layer and apply a ReLU
        conved = F.relu(self.conv(embedded))
        conved = conved.mean(dim=2) 
        return self.fc(conved)

# Initialize the model, criterion, and optimizer
vocab_size = len(word_to_ix)
embed_dim = 10
model = TextClassificationCNN(vocab_size, embed_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Dummy data for training
data = [
    (["I", "love", "this", "book"], 1),
    (["I", "do", "not", "like", "this", "book"], 0)
]

for epoch in range(10):
    for sentence, label in data:     
        # Clear the gradients
        model.zero_grad()
        sentence = torch.LongTensor([word_to_ix.get(w, 0) for w in sentence]).unsqueeze(0) 
        label = torch.LongTensor([int(label)])
        outputs = model(sentence)
        loss = criterion(outputs, label)
        loss.backward()
        # Update the parameters
        optimizer.step()
print('Training complete!')    

book_reviews = [
    "I love this book".split(),
    "I do not like this book".split()
]
for review in book_reviews:
    # Convert the review words into tensor form
    input_tensor = torch.tensor([word_to_ix[w] for w in review], dtype=torch.long).unsqueeze(0) 
    # Get the model's output
    outputs = model(input_tensor)
    # Find the index of the most likely sentiment category
    _, predicted_label = torch.max(outputs.data, 1)
    # Convert the predicted label into a sentiment string
    sentiment = "Positive" if predicted_label.item() == 1 else "Negative"
    print(f"Book Review: {' '.join(review)}")
    print(f"Sentiment: {sentiment}\n")
#RNN for text classification

class TextDataset(Dataset):
    def __init__(self, text):
        self.text = text
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        return self.text[idx]
    
sample_tweet = ["This", "movie", "had", "a", "great", "plot", "and", "amazing", "action"]

# Map each word in the tweet to an index and convert the tweet to a tensor of indices
word_to_idx = {word:i for i, word in enumerate(sample_tweet)}

inputs=torch.LongTensor([word_to_idx[w] for w in sample_tweet])

print(word_to_idx)
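
# Illustrative sketch (not part of the original exercise): the index tensor can be passed
# through an embedding layer and batched into the (batch_size, sequence_length, input_size)
# shape that the recurrent models below expect. The embedding_dim of 8 is arbitrary.
embedding = nn.Embedding(num_embeddings=len(word_to_idx), embedding_dim=8)
embedded_tweet = embedding(inputs).unsqueeze(0)
print(embedded_tweet.shape)  # torch.Size([1, 9, 8])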
import torch
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)  # Set the random seed for reproducibility

input_size = 100
hidden_size = 32
num_layers = 2
num_classes = 3

# Dummy data for X_train_seq and y_train_seq
X_train_seq = torch.randn(10, 5, input_size)  # (batch_size, sequence_length, input_size)
y_train_seq = torch.randint(0, num_classes, (10,))  # (batch_size,)

# Complete the RNN class
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)        
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :] 
        out = self.fc(out)
        return out

# Initialize the model
rnn_model = RNNModel(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn_model.parameters(), lr=0.01)

# Train the model for ten epochs and zero the gradients
for epoch in range(10): 
    optimizer.zero_grad()
    outputs = rnn_model(X_train_seq)
    loss = criterion(outputs, y_train_seq)
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

print("end of RNN training epochs\n")

# Initialize the LSTM and the output layer with parameters
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)        
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = out[:, -1, :] 
        out = self.fc(out)
        return out

# Initialize model with required parameters
lstm_model = LSTMModel(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.01)

# Train the model by passing the correct parameters and zeroing the gradient
for epoch in range(10): 
    optimizer.zero_grad()
    outputs = lstm_model(X_train_seq)
    loss = criterion(outputs, y_train_seq)
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')  

print("end of LSTM training epochs \n")   

# Complete the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)       
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = out[:, -1, :] 
        out = self.fc(out)
        return out

# Initialize the model
gru_model = GRUModel(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(gru_model.parameters(), lr=0.01)

# Train the model and backpropagate the loss after initialization
for epoch in range(15): 
    optimizer.zero_grad()
    outputs = gru_model(X_train_seq)
    loss = criterion(outputs, y_train_seq)
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')  

print("end of GRU training eppochs \n")
    
# Dummy data for X_test
X_test = torch.randn(10, 5, input_size)  # (batch_size, sequence_length, input_size)
y_test = torch.randint(0, num_classes, (10,))  # (batch_size,)

# Generate test-set predictions with the trained LSTM model
outputs = lstm_model(X_test)
_, y_pred_lstm = torch.max(outputs, 1)

# Generate test-set predictions with the trained GRU model
outputs = gru_model(X_test)
_, y_pred_gru = torch.max(outputs, 1)
from torchmetrics import Accuracy, Precision, Recall, F1Score

# Example ground-truth and predicted labels (used in the binary-metrics sketch below)
actual = torch.tensor([0, 1, 1, 0, 1, 0])
predicted = torch.tensor([0, 0, 1, 0, 1, 1])

# Create an instance of the metrics
accuracy = Accuracy(task="multiclass", num_classes=3)
precision = Precision(task="multiclass", num_classes=3)
recall = Recall(task="multiclass", num_classes=3)
f1 = F1Score(task="multiclass", num_classes=3)

# Calculate metrics for the LSTM model
accuracy_1 = accuracy(y_pred_lstm, y_test)
precision_1 = precision(y_pred_lstm, y_test)
recall_1 = recall(y_pred_lstm, y_test)
f1_1 = f1(y_pred_lstm, y_test)
print("LSTM Model - Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}".format(accuracy_1, precision_1, recall_1, f1_1))

# Calculate metrics for the GRU model
accuracy_2 = accuracy(y_pred_gru, y_test)
precision_2 = precision(y_pred_gru, y_test)
recall_2 = recall(y_pred_gru, y_test)
f1_2 = f1(y_pred_gru, y_test)
print("GRU Model - Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}".format(accuracy_2, precision_2, recall_2, f1_2))

INTRO TO TEXT GENERATION