Project: Service Desk Ticket Classification with Deep Learning
CleverSupport is a company at the forefront of AI innovation, specializing in the development of AI-driven solutions to enhance customer support services. Their latest endeavor is to engineer a text classification system that can automatically categorize customer complaints.
Your role as a data scientist is to build a machine learning model that accurately assigns each complaint to a specific category, such as mortgage, credit card, money transfers, or debt collection.
!pip install torchmetrics
from collections import Counter
import nltk, json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy, Precision, Recall

nltk.download('punkt')
# Import the vocabulary, tokenized texts, and labels
with open("words.json", 'r') as f1:
    words = json.load(f1)
with open("text.json", 'r') as f2:
    text = json.load(f2)
labels = np.load('labels.npy')

# Dictionaries to store the word-to-index mappings and vice versa
word2idx = {o: i for i, o in enumerate(words)}
idx2word = {i: o for i, o in enumerate(words)}

# Replace each word with its index from the mapping dictionary; unknown words map to 0
for i, sentence in enumerate(text):
    text[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]
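To make the lookup concrete, here is a small illustrative sketch with a made-up vocabulary and sentence (none of these tokens come from the real dataset); any out-of-vocabulary word falls back to index 0, just as in the loop above.

toy_vocab = ["credit", "card", "payment", "late"]
toy_word2idx = {w: i for i, w in enumerate(toy_vocab)}
toy_sentence = ["late", "credit", "card", "fee"]              # "fee" is out of vocabulary
toy_encoded = [toy_word2idx.get(w, 0) for w in toy_sentence]
print(toy_encoded)  # [3, 0, 1, 0] -- the unknown word "fee" maps to 0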
# Define a function that truncates long sentences and left-pads short ones with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features
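As a quick, purely illustrative check of pad_input (the index lists below are arbitrary): a short sequence is left-padded with zeros, and a long one is truncated to its first seq_len entries.

demo = [[5, 9, 2], [7, 7, 7, 7, 7, 7, 7]]
print(pad_input(demo, 5))
# [[0 0 5 9 2]
#  [7 7 7 7 7]]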
# Pad or truncate every complaint to 50 tokens
text = pad_input(text, 50)

# Split the dataset into train and test sets
train_text, test_text, train_label, test_label = train_test_split(text, labels, test_size=0.2, random_state=42)
train_data = TensorDataset(torch.from_numpy(train_text), torch.from_numpy(train_label).long())
test_data = TensorDataset(torch.from_numpy(test_text), torch.from_numpy(test_label).long())
class TicketClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_labels):
        super(TicketClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # kernel_size=3 with padding=1 keeps the sequence length unchanged after the convolution
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1)
        self.fc = nn.Linear(embed_dim, n_labels)

    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)  # (batch, embed_dim, seq_len) for Conv1d
        conved = F.relu(self.conv(embedded))
        conved = conved.mean(dim=2)                       # average-pool over the sequence dimension
        return self.fc(conved)

num_classes = len(set(labels))
model = TicketClassifier(len(words), 10, num_classes)
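As a sanity check of the architecture (illustrative only, using random token indices), a dummy batch can be passed through the untrained model to confirm that the output has shape (batch_size, num_classes):

dummy_batch = torch.randint(0, len(words), (4, 50))   # 4 fake tickets of 50 token indices each
with torch.no_grad():
    print(model(dummy_batch).shape)                    # expected: torch.Size([4, num_classes])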
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train for three epochs, one sample at a time
model.train()
for epoch in range(3):
    for sentence, label in train_data:
        model.zero_grad()
        sentence = sentence.unsqueeze(0)        # add a batch dimension of 1
        label = torch.LongTensor([int(label)])
        outputs = model(sentence)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')
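The loop above feeds one ticket at a time. Since DataLoader is already imported, mini-batch training is a natural variant; the sketch below assumes a batch size of 64, which is an arbitrary choice rather than part of the original setup.

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
model.train()
for epoch in range(3):
    for sentences, batch_labels in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(sentences), batch_labels)
        loss.backward()
        optimizer.step()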
# Evaluate on the held-out test set
predicted = []
model.eval()
with torch.no_grad():
    for sentence, label in test_data:
        sentence = sentence.unsqueeze(0)
        outputs = model(sentence)
        _, predicted_label = torch.max(outputs.data, 1)
        predicted.append(predicted_label.item())

torch_test_labels = torch.from_numpy(test_label).long()
torch_predicted_labels = torch.tensor(predicted)

accuracy_metric = Accuracy(task='multiclass', num_classes=num_classes)
precision_metric = Precision(task='multiclass', num_classes=num_classes, average='none')
recall_metric = Recall(task='multiclass', num_classes=num_classes, average='none')

accuracy = accuracy_metric(torch_predicted_labels, torch_test_labels)
precision = precision_metric(torch_predicted_labels, torch_test_labels).tolist()
recall = recall_metric(torch_predicted_labels, torch_test_labels).tolist()

print("CNN Model - Accuracy: {}, Precision: {}, Recall: {}".format(accuracy, precision, recall))
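Per-class precision and recall show which categories the model handles poorly; a confusion matrix additionally reveals which categories get mixed up with each other. A minimal sketch using torchmetrics' ConfusionMatrix, which is not part of the original notebook:

from torchmetrics import ConfusionMatrix
confmat_metric = ConfusionMatrix(task='multiclass', num_classes=num_classes)
confmat = confmat_metric(torch_predicted_labels, torch_test_labels)
print(confmat)  # rows are true classes, columns are predicted classes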