You are a member of an elite group of data scientists specialising in advanced facial recognition technology. The firm is dedicated to identifying and safeguarding prominent individuals from various spheres—ranging from entertainment and sports to politics and philanthropy. The team's mission is to deploy AI-driven solutions that can accurately distinguish between images of notable personalities and the general populace, enhancing the personal security of such high-profile individuals. You are to focus on Arnold Schwarzenegger, a figure whose accomplishments span from bodybuilding champion to Hollywood icon, and from philanthropist to Governor of California.
The Data
The `data/lfw_arnie_nonarnie.csv` dataset contains processed facial image data derived from the "Labeled Faces in the Wild" (LFW) dataset, focusing specifically on images of Arnold Schwarzenegger and other individuals not identified as him. This dataset has been prepared to aid in the development and evaluation of facial recognition models. There are 40 images of Arnold Schwarzenegger and 150 of other people.
| Column Name | Description |
|---|---|
| PC1, PC2, ... PCN | Principal components from PCA, capturing key image features. |
| Label | Binary indicator: 1 for Arnold Schwarzenegger, 0 for others. |
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Load the PCA-processed face data into a DataFrame
df = pd.read_csv("data/lfw_arnie_nonarnie.csv")

# Separate the class label from the predictor columns
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows for testing; stratify=y keeps the class
# proportions the same in the training and test splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

# Shuffled 5-fold splitter shared by the grid searches below
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Start coding here
# Preview the first rows (only rendered when run as a notebook cell)
df.head()
Model 1 - Logistic Regression
# Model 1: logistic regression tuned with a cross-validated grid search
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline: standardise the features, then fit the classifier
lr_steps = [
    ('Scaler', StandardScaler()),
    ('lr', LogisticRegression()),
]
lr_pipeline = Pipeline(lr_steps)

# Candidate values for LogisticRegression's C parameter
lr_param_grid = {'lr__C': [0.01, 0.1, 1, 10]}

# Shuffled 5-fold cross-validation, scored on accuracy
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lr_grid_search = GridSearchCV(
    lr_pipeline,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=kf,
)
lr_grid_search.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameters that produced it
lr_best_params = lr_grid_search.best_params_
lr_accuracy = lr_grid_search.best_score_
print(lr_accuracy, lr_best_params)

# Key metrics on the held-out test set.
# NOTE: lr_accuracy is rebound here from the CV score to the test accuracy.
lr_pred = lr_grid_search.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_precision = precision_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
print(lr_accuracy, lr_precision, lr_f1, sep="\n")
Model 2 - KNeighborsClassifier
# Model 2: k-nearest neighbours tuned with a cross-validated grid search
from sklearn.neighbors import KNeighborsClassifier

# Pipeline: standardise the features, then fit the classifier
knn_steps = [
    ('Scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(knn_steps)

# Try every neighbourhood size from 1 to 9
knn_param_grid = {'knn__n_neighbors': range(1, 10)}

# Shuffled 5-fold cross-validation, scored on accuracy
kf = KFold(n_splits=5, shuffle=True, random_state=42)
knn_grid_search = GridSearchCV(
    knn_pipeline,
    param_grid=knn_param_grid,
    scoring='accuracy',
    cv=kf,
)
knn_grid_search.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameters that produced it
knn_best_params = knn_grid_search.best_params_
knn_accuracy = knn_grid_search.best_score_
print(knn_accuracy, knn_best_params)
Model 3 - DecisionTree Classifier
# Model 3: decision tree tuned with a cross-validated grid search
from sklearn.tree import DecisionTreeClassifier

# Pipeline: standardise the features, then fit the classifier
dt_steps = [('Scaler', StandardScaler()), ('dt', DecisionTreeClassifier())]
dt_pipeline = Pipeline(dt_steps)

# Search over tree depth and minimum split size; random_state is pinned
# through the grid so every candidate tree is built reproducibly
dt_param_grid = {
    "dt__max_depth": [2, 5, 10],
    "dt__min_samples_split": [2, 5, 10, 20],
    "dt__random_state": [42],
}

# Shuffled 5-fold cross-validation, scored on accuracy
kf = KFold(n_splits=5, shuffle=True, random_state=42)
dt_grid_search = GridSearchCV(dt_pipeline, param_grid=dt_param_grid, cv=kf, scoring='accuracy')

# Performing Grid Search
dt_grid_search.fit(X_train, y_train)

# Read the results from the decision-tree search itself. The previous code
# read them from knn_grid_search, which reported the KNN score and
# parameters under the decision-tree names.
dt_accuracy = dt_grid_search.best_score_
dt_best_params = dt_grid_search.best_params_
print(dt_accuracy, dt_best_params)
Combine all 3 Models
# Store initialized models in a dictionary, keyed by the name that is also
# used as the pipeline step name for that model
models = {
    "LogisticRegression": LogisticRegression(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
}

# Per-model search grids. Each parameter is prefixed with "<step name>__"
# so GridSearchCV routes it to the model step inside the pipeline.
param_grid = {
    "LogisticRegression": {"LogisticRegression__C": [0.01, 0.1, 1, 10]},
    "KNeighborsClassifier": {"KNeighborsClassifier__n_neighbors": range(1, 10)},
    "DecisionTreeClassifier": {
        "DecisionTreeClassifier__max_depth": [2, 5, 10],
        "DecisionTreeClassifier__min_samples_split": [2, 5, 10, 20],
        "DecisionTreeClassifier__random_state": [42],
    },
}

# Shuffled 5-fold splitter; the same folds are used for every model so the
# cross-validation scores are comparable
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Results collected per model name
pipe_accuracies = {}  # best mean cross-validated accuracy
pipe_params = {}      # parameters that achieved that accuracy
pipelines = {}        # the fitted GridSearchCV object

# Build one scaler + model pipeline per model and grid-search it.
# Scaling lives inside the pipeline so it is re-fitted on each training fold,
# which prevents data leakage from the validation fold.
# The loop body is indented here; in the original it had lost its
# indentation, which made the loop a syntax error.
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        (name, model),
    ])
    # Create GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid[name], cv=kf, scoring='accuracy')
    # Perform grid search, fit the model and store the results
    grid_search.fit(X_train, y_train)
    pipe_accuracies[name] = grid_search.best_score_
    pipe_params[name] = grid_search.best_params_
    pipelines[name] = grid_search
# Select the best model based on the best cross-validation score.
# max() over a dict compares its keys, so the previous max(pipe_accuracies)
# returned the alphabetically greatest model name rather than the
# best-scoring one; key=pipe_accuracies.get ranks the names by their score.
best_model_name = max(pipe_accuracies, key=pipe_accuracies.get)
best_model_cv_score = pipe_accuracies[best_model_name]
best_model_info = pipe_params[best_model_name]

# Print the best model name, parameters, and CV score
print(f"Best Model: {best_model_name}")
print(f"Best Model Parameters: {best_model_info}")
print(f"Best Model CV Score: {best_model_cv_score}")

# Compute and print key performance metrics on the held-out test set,
# predicting with the fitted grid search stored for the selected model
y_pred = pipelines[best_model_name].predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Dump the per-model results for comparison
print(pipe_accuracies)
print(pipe_params)
print(pipelines)