
You are a member of an elite group of data scientists specialising in advanced facial recognition technology. The firm is dedicated to identifying and safeguarding prominent individuals from various spheres, ranging from entertainment and sports to politics and philanthropy. The team's mission is to deploy AI-driven solutions that can accurately distinguish between images of notable personalities and the general populace, enhancing the personal security of these high-profile individuals. Your focus is Arnold Schwarzenegger, a figure whose accomplishments span from bodybuilding champion to Hollywood icon, and from philanthropist to Governor of California.

The Data

The data/lfw_arnie_nonarnie.csv dataset contains processed facial image data derived from the "Labeled Faces in the Wild" (LFW) dataset, focusing specifically on images of Arnold Schwarzenegger and other individuals not identified as him. This dataset has been prepared to aid in the development and evaluation of facial recognition models. There are 40 images of Arnold Schwarzenegger and 150 of other people.

Column Name          Description
PC1, PC2, ... PCN    Principal components from PCA, capturing key image features.
Label                Binary indicator: 1 for Arnold Schwarzenegger, 0 for others.
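
As a quick sanity check of the description above, the class split can be verified directly from the CSV. A minimal sketch, assuming the file path and Label column named above:

# Quick check of the 40-vs-150 class split described above
import pandas as pd
counts = pd.read_csv("data/lfw_arnie_nonarnie.csv")["Label"].value_counts()
print(counts)  # expected: 150 rows with label 0, 40 with label 1
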
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV file 
df = pd.read_csv("data/lfw_arnie_nonarnie.csv")

# Separate the predictors and the class label
X = df.drop('Label', axis=1)
y = df['Label'] 

# Split the data into training and testing sets, stratifying to preserve the class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
print(df.shape)  # overall dataset size (rows, columns)

# Visualizing the class imbalance in the data
sns.countplot(x='Label', data=df)
plt.title("Class Distribution: 1 = Arnold, 0 = Others")
plt.show()
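Because the classes are imbalanced (40 Arnold images against 150 others), plain accuracy can look flattering. One common mitigation, shown here only as a hedged sketch and not used in the baseline models below, is to weight the minority class via scikit-learn's class_weight parameter:

# Illustrative only: weight the minority (Arnold) class to counter the imbalance
from sklearn.linear_model import LogisticRegression

weighted_model = LogisticRegression(class_weight='balanced', max_iter=1000)
weighted_model.fit(X_train, y_train)
print(classification_report(y_test, weighted_model.predict(X_test)))
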
# First Model to be used is the Logistic Regression. It is the baseline model for this project.
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred_logist = logistic_model.predict(X_test)

print("Logistic Regression:")
print(classification_report(y_test, y_pred_logist))
# Second Model to be used is the Random Forest Classifier.
from sklearn.ensemble import RandomForestClassifier

forest_model = RandomForestClassifier(n_estimators=100, random_state=21)
forest_model.fit(X_train, y_train)
y_pred_forest = forest_model.predict(X_test)

print("Random Forest:")
print(classification_report(y_test, y_pred_forest))
# Third Model to be used is the MLP Classifier (Neural Network).
from sklearn.neural_network import MLPClassifier

MLP_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=21)
MLP_model.fit(X_train, y_train)
y_pred_MLP = MLP_model.predict(X_test)

print("Neural Network (MLP):")
print(classification_report(y_test, y_pred_MLP))
# Visual comparison of the models using confusion matrices to identify the best performer.
def models_confusion(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()

models_confusion(y_test, y_pred_logist, 'Logistic Regression')
models_confusion(y_test, y_pred_forest, 'Random Forest')
models_confusion(y_test, y_pred_MLP, 'Neural Network (MLP)')
# Compare test accuracy across the three models
results = pd.DataFrame({
    'models': ['Logistic Regression', 'Random Forest', 'Neural Network (MLP)'],
    'accuracy': [accuracy_score(y_test, y_pred_logist),
                 accuracy_score(y_test, y_pred_forest),
                 accuracy_score(y_test, y_pred_MLP)]
})

sns.catplot(x='models', y='accuracy', data=results, kind='bar')
plt.title('Models Accuracy Comparison')
plt.show()
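Since accuracy can overstate performance on a 150-vs-40 split, the same comparison can be repeated on F1 for the Arnold class, reusing the predictions already computed above (a minimal sketch):

# F1 comparison for label 1 (Arnold); stricter than accuracy on imbalanced data
f1_scores = {'Logistic Regression': f1_score(y_test, y_pred_logist),
             'Random Forest': f1_score(y_test, y_pred_forest),
             'Neural Network (MLP)': f1_score(y_test, y_pred_MLP)}
sns.barplot(x=list(f1_scores.keys()), y=list(f1_scores.values()))
plt.title('Models F1 Comparison (label 1 = Arnold)')
plt.show()
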
#1 Define KFold cross-validator
kfold = KFold(n_splits=5, shuffle=True, random_state=21)

#2 Create models dictionary
models = {
    'logistic': LogisticRegression(max_iter=1000),
    'forest': RandomForestClassifier(random_state=21),
    'mlp': MLPClassifier(max_iter=1000, random_state=21)
}

#3 Create parameter grids (keys use the 'model__' prefix to target the pipeline's 'model' step)
params = {
    'logistic': {
        'model__C': [0.1, 1, 10],
        'model__penalty': ['l2'],
        'model__solver': ['liblinear', 'lbfgs']
    },
    'forest': {
        'model__n_estimators': [50, 100],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5]
    },
    'mlp': {
        'model__hidden_layer_sizes': [(64,), (128, 64)],
        'model__activation': ['relu', 'tanh'],
        'model__alpha': [0.0001, 0.001]
    }
}

#4 Loop through models and apply GridSearchCV
best_models = {}

for name, model in models.items():
    print(f"Running GridSearchCV for: {name}")
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    
    grid = GridSearchCV(
        pipeline,
        params[name],
        cv=kfold,
        scoring='accuracy',
        n_jobs=-1
    )
    
    grid.fit(X_train, y_train)
    
    print(f"Best Score for {name}: {grid.best_score_:.4f}")
    print(f"Best Parameters: {grid.best_params_}\n")
    
    best_models[name] = grid
# Find the best model overall by cross-validated accuracy
best_model_name = max(best_models, key=lambda k: best_models[k].best_score_)
best_model = best_models[best_model_name]
best_model_info = best_model.best_params_
best_model_cv_score = best_model.best_score_

y_pred = best_model.predict(X_test)

# Metrics on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Best overall model: {best_model_name}")
print(f"Best cross-validated accuracy: {best_model_cv_score:.4f}")
print(f"Best parameters: {best_model_info}")
print(f"Test set -> accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, F1: {f1:.4f}")

plt.figure(figsize=(4, 6))
plt.bar(['Model Accuracy'], [accuracy * 100], color='skyblue', label=f"{accuracy*100:.2f}%")
plt.axhline(100, color='gray', linestyle='--', label='Perfect Accuracy')

# Add text and labels
plt.ylim(0, 110)
plt.ylabel('Accuracy (%)')
plt.title('Model Accuracy vs. Perfect Accuracy')
plt.legend()
plt.tight_layout()
plt.show()
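
For completeness, the tuned best model's errors on the held-out test set can be inspected with the models_confusion helper defined earlier:

# Confusion matrix of the tuned best model on the test set (reuses the helper above)
models_confusion(y_test, y_pred, f'Best Model: {best_model_name}')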

Visualizing the images in the dataset

import numpy as np
import matplotlib.pyplot as plt

# The columns here are PCA components rather than raw pixels, so any 2D reshape
# (such as 15 x 10) is purely illustrative. The code below assumes 150 features per image.
n_features = 150
print("Total values per image:", n_features)
factors = [(i, n_features // i) for i in range(1, n_features + 1) if n_features % i == 0]
print("Possible reshape dimensions:", factors)

# Try reshaping to 15x10
h, w = 15, 10  # 15 * 10 = 150 values

# Example data standing in for the real arrays (random values, same shape),
# kept under separate names so the real X and y are not overwritten
X_demo = np.random.rand(5, n_features)   # 5 images, each with 150 values
y_demo = np.random.randint(0, 2, 5)      # 5 labels, either 0 or 1

# Plot 5 example faces
fig, axes = plt.subplots(1, 5, figsize=(12, 6))
for i, ax in enumerate(axes):
    image = X_demo[i].reshape((h, w))
    ax.imshow(image, cmap='gray')
    ax.set_title("Arnie" if y_demo[i] == 1 else "Not Arnie")
    ax.axis("off")

plt.suptitle("Sample Images from Dataset")
plt.tight_layout()
plt.show()