You are a member of an elite group of data scientists specialising in advanced facial recognition technology. The firm is dedicated to identifying and safeguarding prominent individuals from various spheres—ranging from entertainment and sports to politics and philanthropy. The team's mission is to deploy AI-driven solutions that can accurately distinguish between images of notable personalities and the general populace, enhancing the personal security of such high-profile individuals. Your focus is Arnold Schwarzenegger, a figure whose accomplishments span from bodybuilding champion to Hollywood icon, and from philanthropist to Governor of California.
The Data
The data/lfw_arnie_nonarnie.csv dataset contains processed facial image data derived from the "Labeled Faces in the Wild" (LFW) dataset, focusing specifically on images of Arnold Schwarzenegger and other individuals not identified as him. This dataset has been prepared to aid in the development and evaluation of facial recognition models. There are 40 images of Arnold Schwarzenegger and 150 of other people.
| Column Name | Description |
|---|---|
| PC1, PC2, ... PCN | Principal components from PCA, capturing key image features. |
| Label | Binary indicator: 1 for Arnold Schwarzenegger, 0 for others. |
# Import required libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
# Load the PCA-processed face data described above
df = pd.read_csv("data/lfw_arnie_nonarnie.csv")

# Separate the class label from the predictors
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)
# Define the classification models, keyed by the name used when reporting.
# Insertion order is kept as-is because it decides which model wins a tie.

# Negative-to-positive ratio of the training labels, used to weight XGBoost.
# NOTE(review): this ratio is measured on y_train before any resampling, while
# each model is later wrapped in a SMOTE pipeline; confirm that applying both
# imbalance corrections together is intended.
class_ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

models = {}

models['Logistic Regression'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=21)),
])

# The random forest is the only candidate without a scaling step
models['Random Forest'] = Pipeline(steps=[
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=21)),
])

models['SVM'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=21)),
])

models['XGBoost'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', XGBClassifier(random_state=21, scale_pos_weight=class_ratio)),
])
# Estimate each candidate's accuracy with 3-fold cross-validation and keep the
# best one. The folds are stratified so that every fold preserves the class
# ratio of the training set, mirroring the stratified train/test split. The
# splitter is built once because it does not depend on the model being scored;
# with a fixed integer random_state it yields the same folds for every model.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=21)

# Dictionary to store cross-validation scores
cv_scores = {}

# Perform cross-validation and find the best model
for model_name, model in models.items():
    # SMOTE is a step of the pipeline, so the oversampler is refit on the
    # training part of every fold and the held-out part is never resampled.
    smote_model = make_pipeline(SMOTE(random_state=21), model)
    scores = cross_val_score(smote_model, X_train, y_train, cv=cv, scoring='accuracy')
    cv_scores[model_name] = scores.mean()

# Highest mean accuracy wins; on a tie the model defined first is kept
best_model_name = max(cv_scores, key=cv_scores.get)
best_model_cv_score = cv_scores[best_model_name]
best_model_info = models[best_model_name].get_params()
print(best_model_name)
# Train the best model using SMOTE on the entire training data
# Refit on the whole training split. SMOTE only resamples inside fit(), so the
# test predictions below are made on untouched, real samples.
best_model = make_pipeline(SMOTE(random_state=21), models[best_model_name])
best_model.fit(X_train, y_train)

# Evaluate on the test set
y_pred = best_model.predict(X_test)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the selection result followed by the held-out metrics, one per line
print(
    f"Best Model: {best_model_name}",
    f"Cross-Validation Score: {best_model_cv_score:.4f}",
    f"Test Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)
# Optional: Confusion matrix visualization
# Rows of the matrix are the actual labels and columns the predicted labels,
# which is what the axis titles below state.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title(f'Confusion Matrix - {best_model_name}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()