Project: Developing Multi-Input Models For OCR

DigiNsure Inc. is an innovative insurance company focused on enhancing the efficiency of processing claims and customer service interactions. Their newest initiative is digitizing all historical insurance claim documents, which includes improving the labeling of some IDs scanned from paper documents and identifying them as primary or secondary IDs.

To help them in their effort, you'll be using multi-modal learning to train an Optical Character Recognition (OCR) model. To improve the classification, the model will take two inputs: the image of the scanned document and its insurance type (home, life, auto, health, or other). Integrating different data modalities (such as image and text) enables the model to perform better in complex scenarios by capturing more nuanced information. For each image-insurance type pair, the model will be trained to predict one of two labels: primary ID or secondary ID.

! pip install torchvision

Hidden output

# Import the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
from project_utils import ProjectDataset
import pickle 
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Load the data
# NOTE: unpickling can execute arbitrary code, so only load dataset files
# that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('ocr_insurance_dataset.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

# Define a function to visualize codes with their corresponding types and labels 
def show_dataset_images(dataset, num_images=5):
    """Plot a random sample of dataset images with their type and label.

    Draws up to ``num_images`` distinct, randomly chosen samples side by side
    in a single row and titles each one with its insurance type and label
    name. Does nothing when there is nothing to draw.

    Args:
        dataset: Indexable collection exposing ``type_mapping`` and
            ``label_mapping`` dictionaries. Each item must unpack as
            ``(img, lbl)``, where ``img[0]`` is the image tensor and
            ``img[1]`` encodes the insurance type (presumably one-hot --
            the position of the first value equal to 1 is used).
        num_images: Maximum number of samples to display.
    """
    count = min(num_images, len(dataset))
    if count <= 0:
        # Empty dataset or non-positive request: matplotlib rejects a grid
        # with zero columns, so there is simply nothing to show.
        return

    # Build the lookup lists once instead of once per plotted image.
    type_names = list(dataset.type_mapping.keys())
    label_names = list(dataset.label_mapping.keys())
    label_values = list(dataset.label_mapping.values())

    # squeeze=False keeps `axes` a 2-D array even for a single subplot;
    # otherwise a bare Axes object is returned and cannot be iterated.
    fig, axes = plt.subplots(1, count, figsize=(20, 4), squeeze=False)
    chosen = np.random.choice(len(dataset), count, replace=False)

    for ax, idx in zip(axes.ravel(), chosen):
        img, lbl = dataset[idx]
        # Scale the image values by 255 and view them as a 64x64 grayscale map.
        pixels = (img[0].numpy() * 255).astype(np.uint8).reshape(64, 64)
        ax.imshow(pixels, cmap='gray')
        ax.axis('off')
        type_name = type_names[img[1].tolist().index(1)]
        # Reverse lookup: find the label name whose mapped value equals lbl.
        label_name = label_names[label_values.index(lbl)]
        ax.set_title(f"Type: {type_name}\nLabel: {label_name}")
    plt.show()

# Preview five randomly chosen code images together with their type and label
show_dataset_images(dataset, num_images=5)

Model Architecture:

Image Processing Branch: The image is passed through two convolutional layers with ReLU activations and max-pooling, followed by a flattening operation.

Insurance Type Processing Branch: The insurance type is passed through an embedding layer, followed by a fully connected layer with ReLU activation.

Combined Features: The outputs from both branches are concatenated and passed through additional fully connected layers to produce the final output.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define the OCRModel
class OCRModel(nn.Module):
    """Two-branch classifier combining a scanned image with its insurance type.

    The image branch applies two 3x3 convolutions (16 then 32 channels), each
    followed by ReLU and 2x2 max-pooling, and flattens the result. The type
    branch embeds the insurance-type index into 16 dimensions and projects it
    to 32 features. Both feature vectors are concatenated and classified by a
    two-layer fully connected head.

    Args:
        num_insurance_types: Number of distinct insurance types (size of the
            embedding table).
        num_classes: Number of output classes (size of the final layer).
        image_size: Spatial size of the input images, either a single int for
            square images or a ``(height, width)`` pair. Defaults to 64,
            which reproduces the original fixed 64x64 architecture.

    Raises:
        ValueError: If ``image_size`` is too small to survive the two
            pooling stages (either side smaller than 4 pixels).
    """

    def __init__(self, num_insurance_types, num_classes, image_size=64):
        super().__init__()

        if isinstance(image_size, int):
            height = width = image_size
        else:
            height, width = image_size

        # The padded 3x3 convolutions keep the spatial size; each stride-2
        # pooling halves it (rounding down), so the map shrinks by 4 overall.
        pooled_height, pooled_width = height // 4, width // 4
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                f"image_size {image_size!r} is too small: each side must be "
                "at least 4 pixels"
            )
        image_feature_dim = 32 * pooled_height * pooled_width

        # Image processing layers
        self.image_layer = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Halves height and width
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Halves height and width again
            nn.Flatten(),  # (batch, 32, H/4, W/4) -> (batch, 32 * H/4 * W/4)
        )

        # Insurance type processing layers
        self.type_layer = nn.Sequential(
            nn.Embedding(num_embeddings=num_insurance_types, embedding_dim=16),
            nn.Flatten(),  # No-op for (batch, 16); kept so layer indices stay stable
            nn.Linear(16, 32),
            nn.ReLU(),
        )

        # Fully connected layers on the concatenated image + type features
        self.fc = nn.Sequential(
            nn.Linear(image_feature_dim + 32, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),  # Raw class scores (logits)
        )

    def forward(self, image, insurance_type):
        """Return class logits for a batch of images and insurance types.

        Args:
            image: Float tensor of shape ``(batch, 1, height, width)`` whose
                spatial size matches the ``image_size`` given at construction.
            insurance_type: Integer tensor of shape ``(batch,)`` holding
                insurance-type indices (not one-hot vectors).

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised scores.
        """
        image_features = self.image_layer(image)
        type_features = self.type_layer(insurance_type)  # (batch, 32)

        # Join both modalities along the feature dimension before classifying.
        combined = torch.cat((image_features, type_features), dim=1)
        return self.fc(combined)

# Load the dataset
# NOTE: unpickling can execute arbitrary code, so only load dataset files
# that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('ocr_insurance_dataset.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

# Create DataLoader (mini-batches of 32 samples, reshuffled every epoch)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, loss function, and optimizer
# The embedding table and the output layer are sized from the dataset's mappings.
num_insurance_types = len(dataset.type_mapping)
num_classes = len(dataset.label_mapping)
model = OCRModel(num_insurance_types, num_classes)
criterion = nn.CrossEntropyLoss()  # Applied to the model's raw outputs
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
model.train()  # Explicitly select training mode (matters if the cell is re-run after evaluation)
for epoch in range(num_epochs):
    # Accumulate a sample-weighted loss so the epoch report reflects every
    # batch rather than only whichever shuffled batch happened to come last.
    running_loss = 0.0
    samples_seen = 0

    for images, labels in dataloader:
        # Extract image and insurance type
        image_data = images[0]  # Presumably (batch_size, 1, 64, 64) -- confirm against the dataset
        insurance_type = images[1].argmax(dim=1)  # Convert one-hot to indices

        # Forward pass
        outputs = model(image_data, insurance_type)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The criterion averages over the batch; weight by batch size so a
        # smaller final batch does not skew the epoch mean.
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    # An empty loader yields no batches; report NaN instead of failing.
    epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

print("Training complete.")

Training:

The model is trained using the CrossEntropyLoss function, which is suitable for classification tasks.

The Adam optimizer is used for optimization.

The training loop runs for ten epochs, with the loss printed at the end of each epoch.