DigiNsure Inc. is an innovative insurance company focused on enhancing the efficiency of processing claims and customer service interactions. Their newest initiative is digitizing all historical insurance claim documents, which includes improving the labeling of some IDs scanned from paper documents and identifying them as primary or secondary IDs.
To help them in this effort, you'll use multi-modal learning to train an Optical Character Recognition (OCR) model. To improve classification, the model will take two inputs: the image of the scanned document and its insurance type (home, life, auto, health, or other). Integrating different data modalities (such as image and text) lets the model capture more nuanced information and perform better in complex scenarios. For each image-insurance type pair, the model will be trained to predict one of two labels: primary ID or secondary ID.
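To make the two modalities concrete before diving into the code, here is a minimal, self-contained sketch (hypothetical tensors, not taken from the real dataset) of how one sample pairs a grayscale image with a one-hot insurance-type vector and a binary ID label; the exact label encoding (0 = primary, 1 = secondary) is an assumption for illustration only.
import torch
sketch_image = torch.rand(1, 64, 64)              # one grayscale 64x64 scan
sketch_type = torch.tensor([0., 0., 1., 0., 0.])  # one-hot insurance type (e.g. "auto")
sketch_label = torch.tensor(1)                    # assumed encoding: 0 = primary, 1 = secondary ID
sketch_sample = ((sketch_image, sketch_type), sketch_label)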
# Import the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
from project_utils import ProjectDataset
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# Load the data
dataset = pickle.load(open('ocr_insurance_dataset.pkl', 'rb'))
# Define a function to visualize images with their corresponding insurance types and labels
def show_dataset_images(dataset, num_images=5):
    fig, axes = plt.subplots(1, min(num_images, len(dataset)), figsize=(20, 4))
    for ax, idx in zip(axes, np.random.choice(len(dataset), min(num_images, len(dataset)), False)):
        img, lbl = dataset[idx]
        ax.imshow((img[0].numpy() * 255).astype(np.uint8).reshape(64, 64), cmap='gray')
        ax.axis('off')
        ax.set_title(f"Type: {list(dataset.type_mapping.keys())[img[1].tolist().index(1)]}\n"
                     f"Label: {list(dataset.label_mapping.keys())[list(dataset.label_mapping.values()).index(lbl)]}")
    plt.show()

# Inspect 5 images from the dataset
show_dataset_images(dataset)

# Start coding here
class OCRModel(nn.Module):
    def __init__(self):
        super(OCRModel, self).__init__()
        # Branch that processes the image
        self.image_layer = nn.Sequential(
            # Convolutional layer: 1 input channel (grayscale) → 16 channels
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 64x64 → 32x32
            # Second convolutional layer
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 32x32 → 16x16
            # Flatten for the linear layer
            nn.Flatten(),  # 32 * 16 * 16 = 8192
            nn.Linear(32 * 16 * 16, 128),
            nn.ReLU()
        )
        # Branch that processes the insurance type (5 one-hot-encoded types)
        self.type_layer = nn.Sequential(
            nn.Linear(5, 16),
            nn.ReLU()
        )
        # Final classifier (combines image + type → 2 classes)
        self.classifier = nn.Sequential(
            nn.Linear(128 + 16, 64),  # 128 from the image + 16 from the type
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 2)  # 2 classes: primary or secondary ID
        )

    def forward(self, image, insurance_type):
        # Process the image
        x_image = self.image_layer(image)
        # Process the insurance type
        x_type = self.type_layer(insurance_type)
        # Concatenate the two representations
        x_combined = torch.cat([x_image, x_type], dim=1)
        # Final classification
        output = self.classifier(x_combined)
        return output
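# Optional sanity check (a sketch, not part of the project template): pass random
# tensors through an untrained OCRModel to confirm the expected shapes, assuming a
# batch of 4 grayscale 64x64 images and 5-way one-hot insurance types.
sanity_model = OCRModel()
sanity_images = torch.randn(4, 1, 64, 64)
sanity_types = torch.zeros(4, 5)
sanity_types[:, 0] = 1.0  # arbitrary one-hot type, for the check only
with torch.no_grad():
    sanity_out = sanity_model(sanity_images, sanity_types)
print(sanity_out.shape)  # expected: torch.Size([4, 2]), one logit per class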
# Create a DataLoader for training
# Adjust batch_size to your available memory
dataloader_train = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True
)
# Check the structure of the data
for images, labels in dataloader_train:
    print(f"Image batch: {images[0].shape}")      # Images
    print(f"Insurance type: {images[1].shape}")   # One-hot encoding
    print(f"Labels: {labels.shape}")
    break

# Instantiate the model
model = OCRModel()
# Define the loss function (cross-entropy for classification)
criterion = nn.CrossEntropyLoss()
# Define the optimizer (Adam is a good choice)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
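# Quick aside (illustrative sketch only): nn.CrossEntropyLoss expects raw logits of
# shape (batch, num_classes) and integer class indices of shape (batch,), which is
# why the model ends in a plain Linear(64, 2) with no softmax. The numbers below are
# made up purely to show the call signature.
example_logits = torch.tensor([[2.0, -1.0], [0.3, 0.8]])  # raw scores for 2 samples
example_labels = torch.tensor([0, 1])                     # class indices, not one-hot
print(criterion(example_logits, example_labels))          # a single scalar loss tensor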
# Check whether a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(f"Training on: {device}")

# Number of epochs
num_epochs = 10
# Lists to store metrics
train_losses = []
train_accuracies = []
print("Starting training...")
print("=" * 50)
for epoch in range(num_epochs):
    model.train()  # Training mode
    running_loss = 0.0
    correct = 0
    total = 0
    for batch_idx, (inputs, labels) in enumerate(dataloader_train):
        # Separate the image and the insurance type
        images = inputs[0].to(device)           # Images
        insurance_types = inputs[1].to(device)  # Insurance types
        labels = labels.to(device)
        # Clear gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(images, insurance_types)
        # Compute the loss
        loss = criterion(outputs, labels)
        # Backward pass
        loss.backward()
        # Update the weights
        optimizer.step()
        # Statistics
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    # Compute epoch metrics
    epoch_loss = running_loss / len(dataloader_train)
    epoch_accuracy = 100 * correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_accuracy)
    print(f"Epoch [{epoch+1}/{num_epochs}] - "
          f"Loss: {epoch_loss:.4f} - "
          f"Accuracy: {epoch_accuracy:.2f}%")
print("=" * 50)
print("Training complete!")

# Plot the loss and accuracy
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# Loss plot
ax1.plot(range(1, num_epochs + 1), train_losses, 'b-', marker='o')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_title('Training Loss')
ax1.grid(True)
# Accuracy plot
ax2.plot(range(1, num_epochs + 1), train_accuracies, 'g-', marker='o')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy (%)')
ax2.set_title('Training Accuracy')
ax2.grid(True)
plt.tight_layout()
plt.show()

def evaluate_model(model, dataloader):
    model.eval()  # Evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            images = inputs[0].to(device)
            insurance_types = inputs[1].to(device)
            labels = labels.to(device)
            outputs = model(images, insurance_types)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f"Final model accuracy: {accuracy:.2f}%")
    return accuracy

# Evaluate
final_accuracy = evaluate_model(model, dataloader_train)

# Save the trained model
torch.save({
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': train_losses[-1],
    'accuracy': train_accuracies[-1]
}, 'ocr_model_digiNsure.pth')
print("Model saved successfully!")
# ---
# 🎓 Key Project Concepts
# Multi-Modal Learning
# - Combines the image (visual) and the insurance type (categorical)
# - Each modality is processed separately and then fused
# - Improves performance by capturing complementary information
# Architecture
# Image (64x64) → image_layer → 128 features
#   ↓ concatenate
# Type (5 classes) → type_layer → 16 features
#   ↓
# 144 features → classifier → 2 classes
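# Optional verification sketch (assumes the trained `model` and `device` from above are
# still in scope): pass one dummy batch through each branch to confirm the feature sizes
# in the diagram: 128 (image) + 16 (type) = 144 features entering the classifier.
with torch.no_grad():
    img_feat = model.image_layer(torch.randn(1, 1, 64, 64).to(device))
    type_feat = model.type_layer(torch.eye(5)[:1].to(device))
print(img_feat.shape)                                 # torch.Size([1, 128])
print(type_feat.shape)                                # torch.Size([1, 16])
print(torch.cat([img_feat, type_feat], dim=1).shape)  # torch.Size([1, 144])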