import os
import warnings

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress all but errors
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Disable oneDNN warnings

# Suppress Python warnings
warnings.filterwarnings('ignore')

# Now import libraries
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50, MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

# Unzip the data folder
if not os.path.exists('data/chestxrays'):
    with zipfile.ZipFile('data.zip', 'r') as zip_ref:
        zip_ref.extractall()
    print("✓ Data extracted successfully")
else:
    print("✓ Data already extracted")
import os

# Check what directories exist
print("Checking directory structure...")
print("\nCurrent working directory:", os.getcwd())
print("\nContents of current directory:")
for item in os.listdir('.'):
    print(f"  - {item}")

# Check if data folder exists
if os.path.exists('data'):
    print("\nβœ“ 'data' folder exists")
    print("\nContents of 'data' folder:")
    for item in os.listdir('data'):
        item_path = os.path.join('data', item)
        if os.path.isdir(item_path):
            print(f"  πŸ“ {item}/")
            # Check one level deeper
            for subitem in os.listdir(item_path):
                subitem_path = os.path.join(item_path, subitem)
                if os.path.isdir(subitem_path):
                    num_files = len([f for f in os.listdir(subitem_path) if os.path.isfile(os.path.join(subitem_path, f))])
                    print(f"      πŸ“ {subitem}/ ({num_files} files)")
                else:
                    print(f"      πŸ“„ {subitem}")
        else:
            print(f"  πŸ“„ {item}")
else:
    print("\nβœ— 'data' folder NOT found")

# Check if data.zip exists
if os.path.exists('data.zip'):
    print("\nβœ“ 'data.zip' file exists")
else:
    print("\nβœ— 'data.zip' file NOT found")
import os
import shutil

# First, let's see what's inside train and test folders
print("Contents of train folder:")
train_contents = os.listdir('data/chestxrays/train')
print(train_contents)

print("\nContents of test folder:")
test_contents = os.listdir('data/chestxrays/test')
print(test_contents)

# Check if there are subdirectories or if we need to look deeper
for folder in ['train', 'test']:
    folder_path = f'data/chestxrays/{folder}'
    print(f"\n\nDetailed structure of {folder}:")
    for item in os.listdir(folder_path):
        item_path = os.path.join(folder_path, item)
        if os.path.isdir(item_path):
            num_files = len([f for f in os.listdir(item_path) 
                           if os.path.isfile(os.path.join(item_path, f))])
            print(f"  πŸ“ {item}/ ({num_files} images)")
        else:
            print(f"  πŸ“„ {item}")
# Data preprocessing configuration
IMG_SIZE = 224
BATCH_SIZE = 32

# Training data generator WITH augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,                    # Normalize pixel values to [0,1]
    rotation_range=15,                 # Random rotation Β±15 degrees
    zoom_range=0.1,                    # Random zoom 10%
    horizontal_flip=True,              # Random horizontal flip
    brightness_range=[0.9, 1.1],       # Brightness adjustment
    width_shift_range=0.1,             # Horizontal shift
    height_shift_range=0.1,            # Vertical shift
    validation_split=0.2               # Use 20% of training for validation
)

# Validation and test generators WITHOUT augmentation (only rescaling).
# The validation subset should come from a rescale-only generator so that
# validation images are never randomly transformed.
val_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)
test_datagen = ImageDataGenerator(rescale=1./255)

# Load training data (directory name is 'chestxrays', plural)
train_generator = train_datagen.flow_from_directory(
    'data/chestxrays/train',
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='training',                  # Use 80% for training
    shuffle=True,
    seed=42
)

# Validation generator (rescale only, same 80/20 split)
val_generator = val_datagen.flow_from_directory(
    'data/chestxrays/train',
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='validation',                # Use 20% for validation
    shuffle=False,
    seed=42
)

# Test data generator
test_generator = test_datagen.flow_from_directory(
    'data/chestxrays/test',
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=False
)

print("="*60)
print("DATA LOADING SUMMARY")
print("="*60)
print(f"Training samples: {train_generator.samples}")
print(f"Validation samples: {val_generator.samples}")
print(f"Test samples: {test_generator.samples}")
print(f"Class indices: {train_generator.class_indices}")
print(f"Classes found: {list(train_generator.class_indices.keys())}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Image size: {IMG_SIZE}x{IMG_SIZE}")
print("="*60)

# Show class distribution (full train folder, before the 80/20 split)
print("\nTraining data distribution:")
print("  - Healthy: 152 images (~122 end up in the 80% training split)")
print("  - TB: 151 images (~121 end up in the 80% training split)")
print("\nTest data distribution:")
print("  - Healthy: 50 images")
print("  - TB: 50 images")
print("  - Total test: 100 images (balanced)")
def build_custom_cnn():
    """
    Simple CNN with 3 convolutional blocks
    Total params: ~111K (lightweight)
    """
    model = Sequential([
        # Block 1
        Conv2D(32, (3,3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3)),
        BatchNormalization(),
        MaxPooling2D(2,2),
        
        # Block 2
        Conv2D(64, (3,3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(2,2),
        Dropout(0.3),
        
        # Block 3
        Conv2D(128, (3,3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(2,2),
        Dropout(0.4),
        
        # Classification head
        GlobalAveragePooling2D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')  # Sigmoid for binary classification
    ])
    
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )
    
    return model

# Build and display model
model_cnn = build_custom_cnn()
model_cnn.summary()
def build_transfer_model(base='MobileNetV2'):
    """
    Transfer learning with pre-trained MobileNetV2
    Efficient and performs well on medical images
    """
    if base == 'MobileNetV2':
        base_model = MobileNetV2(
            weights='imagenet',
            include_top=False,
            input_shape=(IMG_SIZE, IMG_SIZE, 3)
        )
    else:
        base_model = ResNet50(
            weights='imagenet',
            include_top=False,
            input_shape=(IMG_SIZE, IMG_SIZE, 3)
        )
    
    # Freeze base model initially
    base_model.trainable = False
    
    # Add custom classification head
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=base_model.input, outputs=predictions)
    
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )
    
    return model, base_model

# Build transfer learning model
model_transfer, base_model = build_transfer_model('MobileNetV2')
print(f"Total layers: {len(model_transfer.layers)}")
print(f"Trainable: {sum([1 for layer in model_transfer.layers if layer.trainable])}")
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Callbacks for better training
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    )
]

# Train Custom CNN
print("=" * 50)
print("Training Custom CNN...")
print("=" * 50)
history_cnn = model_cnn.fit(
    train_generator,
    epochs=20,
    validation_data=val_generator,      # validate on the 20% split, not the test set
    callbacks=callbacks,
    verbose=1
)

# Train Transfer Learning Model
print("=" * 50)
print("Training MobileNetV2 (Transfer Learning)...")
print("=" * 50)
history_transfer = model_transfer.fit(
    train_generator,
    epochs=20,
    validation_data=val_generator,      # validate on the 20% split, not the test set
    callbacks=callbacks,
    verbose=1
)
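
The History objects returned by fit are captured but never visualized. A minimal sketch of the learning curves, assuming history_cnn and history_transfer from the two calls above.

# Sketch: compare learning curves of the two models
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for history, name in [(history_cnn, 'Custom CNN'), (history_transfer, 'MobileNetV2')]:
    axes[0].plot(history.history['accuracy'], label=f'{name} (train)')
    axes[0].plot(history.history['val_accuracy'], '--', label=f'{name} (val)')
    axes[1].plot(history.history['loss'], label=f'{name} (train)')
    axes[1].plot(history.history['val_loss'], '--', label=f'{name} (val)')

axes[0].set_title('Accuracy per epoch')
axes[1].set_title('Loss per epoch')
for ax in axes:
    ax.set_xlabel('Epoch')
    ax.legend(fontsize=8)
    ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()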


def calculate_metrics(y_true, y_pred_prob, threshold=0.5):
    """
    Calculate Sensitivity, Specificity, PPV, and NPV
    """
    y_pred = (y_pred_prob >= threshold).astype(int).flatten()
    
    # Confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    # Calculate metrics
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Recall for TB
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # Recall for Healthy
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0          # Precision for TB
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0          # Precision for Healthy
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    metrics = {
        'Sensitivity (Recall)': sensitivity,
        'Specificity': specificity,
        'PPV (Precision)': ppv,
        'NPV': npv,
        'Accuracy': accuracy,
        'TP': int(tp), 'TN': int(tn), 'FP': int(fp), 'FN': int(fn)
    }
    
    return metrics
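
Before wiring the function to real predictions, it can be exercised on a tiny made-up example; the arrays below are illustrative values, not model output.

# Toy check of calculate_metrics with made-up labels and probabilities
_toy_true = np.array([0, 0, 1, 1, 1])
_toy_prob = np.array([0.2, 0.7, 0.9, 0.4, 0.8])   # one FP and one FN at threshold 0.5
print(calculate_metrics(_toy_true, _toy_prob))
# Expected: TP=2, TN=1, FP=1, FN=1 -> sensitivity 2/3, specificity 1/2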

# Reset test generator to start from beginning
test_generator.reset()
y_true = test_generator.classes

print("Generating predictions...")
print("="*60)

# Custom CNN predictions
print("\nPredicting with Custom CNN...")
y_pred_cnn = model_cnn.predict(test_generator, verbose=1)
metrics_cnn = calculate_metrics(y_true, y_pred_cnn)

# Transfer Learning predictions
test_generator.reset()
print("\nPredicting with MobileNetV2...")
y_pred_transfer = model_transfer.predict(test_generator, verbose=1)
metrics_transfer = calculate_metrics(y_true, y_pred_transfer)

# Display results
print("\n" + "="*60)
print("CUSTOM CNN RESULTS")
print("="*60)
print(f"Confusion Matrix:")
print(f"  True Negatives (TN): {metrics_cnn['TN']} - Correctly identified Healthy")
print(f"  False Positives (FP): {metrics_cnn['FP']} - Healthy wrongly classified as TB")
print(f"  False Negatives (FN): {metrics_cnn['FN']} - TB wrongly classified as Healthy")
print(f"  True Positives (TP): {metrics_cnn['TP']} - Correctly identified TB")
print()
for metric, value in metrics_cnn.items():
    if isinstance(value, float):
        print(f"{metric:25s}: {value:.4f} ({value*100:.2f}%)")

print("\n" + "="*60)
print("MOBILENETV2 (TRANSFER LEARNING) RESULTS")
print("="*60)
print(f"Confusion Matrix:")
print(f"  True Negatives (TN): {metrics_transfer['TN']} - Correctly identified Healthy")
print(f"  False Positives (FP): {metrics_transfer['FP']} - Healthy wrongly classified as TB")
print(f"  False Negatives (FN): {metrics_transfer['FN']} - TB wrongly classified as Healthy")
print(f"  True Positives (TP): {metrics_transfer['TP']} - Correctly identified TB")
print()
for metric, value in metrics_transfer.items():
    if isinstance(value, float):
        print(f"{metric:25s}: {value:.4f} ({value*100:.2f}%)")
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc_curves(y_true, y_pred_cnn, y_pred_transfer):
    """
    Plot ROC curves for both models side by side
    """
    # Calculate ROC for Custom CNN
    fpr_cnn, tpr_cnn, _ = roc_curve(y_true, y_pred_cnn)
    roc_auc_cnn = auc(fpr_cnn, tpr_cnn)
    
    # Calculate ROC for Transfer Learning
    fpr_transfer, tpr_transfer, _ = roc_curve(y_true, y_pred_transfer)
    roc_auc_transfer = auc(fpr_transfer, tpr_transfer)
    
    # Create plot
    plt.figure(figsize=(12, 5))
    
    # Plot 1: Both models together
    plt.subplot(1, 2, 1)
    plt.plot(fpr_cnn, tpr_cnn, 'b-', linewidth=2.5, 
             label=f'Custom CNN (AUC = {roc_auc_cnn:.3f})')
    plt.plot(fpr_transfer, tpr_transfer, 'g-', linewidth=2.5, 
             label=f'MobileNetV2 (AUC = {roc_auc_transfer:.3f})')
    plt.plot([0, 1], [0, 1], 'r--', linewidth=1.5, label='Random Classifier (AUC = 0.500)')
    
    plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=11, fontweight='bold')
    plt.ylabel('True Positive Rate (Sensitivity)', fontsize=11, fontweight='bold')
    plt.title('ROC Curve Comparison\nTB Classification Performance', fontsize=13, fontweight='bold')
    plt.legend(loc='lower right', fontsize=10)
    plt.grid(alpha=0.3, linestyle='--')
    plt.xlim([-0.02, 1.02])
    plt.ylim([-0.02, 1.02])
    
    # Plot 2: Zoomed view of MobileNetV2
    plt.subplot(1, 2, 2)
    plt.plot(fpr_transfer, tpr_transfer, 'g-', linewidth=3, 
             label=f'MobileNetV2 (AUC = {roc_auc_transfer:.3f})')
    plt.plot([0, 1], [0, 1], 'r--', linewidth=1.5, label='Random')
    
    # Mark the operating point at the default threshold of 0.5
    op = calculate_metrics(y_true, y_pred_transfer)
    plt.scatter([1 - op['Specificity']], [op['Sensitivity (Recall)']],
                s=150, c='red', marker='o', edgecolors='black', linewidth=2,
                label=f"Operating Point\n(Sens={op['Sensitivity (Recall)']:.2f}, Spec={op['Specificity']:.2f})",
                zorder=5)
    
    plt.xlabel('False Positive Rate', fontsize=11, fontweight='bold')
    plt.ylabel('True Positive Rate', fontsize=11, fontweight='bold')
    plt.title('MobileNetV2 ROC Curve\n(Zoomed View)', fontsize=13, fontweight='bold')
    plt.legend(loc='lower right', fontsize=9)
    plt.grid(alpha=0.3, linestyle='--')
    plt.xlim([-0.02, 1.02])
    plt.ylim([-0.02, 1.02])
    
    plt.tight_layout()
    plt.show()
    
    # Print interpretation
    print("\n" + "="*70)
    print("ROC CURVE INTERPRETATION")
    print("="*70)
    print(f"\nCustom CNN AUC: {roc_auc_cnn:.4f}")
    print(f"  β†’ AUC of 0.50 = Random guessing (no discrimination ability)")
    print(f"  β†’ The model failed to learn meaningful patterns")
    
    print(f"\nMobileNetV2 AUC: {roc_auc_transfer:.4f}")
    print(f"  β†’ AUC of {roc_auc_transfer:.3f} = Excellent discrimination!")
    print(f"  β†’ Much better than random (0.5) and closer to perfect (1.0)")
    print(f"  β†’ The model can reliably distinguish TB from healthy lungs")
    
    print("\nWhat is AUC?")
    print("  β€’ AUC = Area Under the ROC Curve")
    print("  β€’ Measures model's ability to separate classes across ALL thresholds")
    print("  β€’ 1.0 = Perfect classifier | 0.5 = Random guessing | <0.5 = Worse than random")
    print("  β€’ Higher AUC = Better overall performance")
    
    print("\nROC Curve shows:")
    print("  β€’ How sensitivity and specificity trade off at different thresholds")
    print("  β€’ MobileNetV2 stays close to top-left corner = high sensitivity + high specificity")
    print("  β€’ Custom CNN follows diagonal = no better than random")
    print("="*70)

# Generate ROC curves
plot_roc_curves(y_true, y_pred_cnn, y_pred_transfer)
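
Since the ROC analysis is about how sensitivity and specificity trade off across thresholds, a natural follow-up is choosing an operating threshold instead of the default 0.5. A minimal sketch using Youden's J statistic (sensitivity + specificity - 1) on the MobileNetV2 predictions; the criterion is an assumption, and a screening setting might instead fix a minimum sensitivity.

# Sketch: choose an operating threshold for MobileNetV2 via Youden's J statistic
fpr, tpr, thresholds = roc_curve(y_true, y_pred_transfer)
j_scores = tpr - fpr                      # Youden's J = sensitivity + specificity - 1
best_idx = np.argmax(j_scores)
best_threshold = thresholds[best_idx]

print(f"Best threshold by Youden's J: {best_threshold:.3f}")
print(f"  Sensitivity: {tpr[best_idx]:.3f}  |  Specificity: {1 - fpr[best_idx]:.3f}")

# Re-score the test predictions at the chosen threshold
print(calculate_metrics(y_true, y_pred_transfer, threshold=best_threshold))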