Course Notes: Dimensionality Reduction in Python
Dimensionality Reduction
t-SNE
from sklearn.manifold import TSNE

# Non-numerical columns in the dataset
non_numeric = ['Branch', 'Gender', 'Component']
# Drop the non-numerical columns from df
df_numeric = df.drop(non_numeric, axis=1)
# Create a t-SNE model with learning rate 50
m = TSNE(learning_rate=50)
# Fit and transform the t-SNE model on the numeric dataset
tsne_features = m.fit_transform(df_numeric)
print(tsne_features.shape)
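The two t-SNE features are usually inspected visually. A minimal sketch of that follow-up, assuming seaborn and matplotlib.pyplot are imported as sns and plt and that df still holds the categorical columns dropped above:
# Color the 2D t-SNE map by one of the dropped categorical columns
sns.scatterplot(x=tsne_features[:, 0], y=tsne_features[:, 1], hue=df['Gender'])
plt.show()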
Curse of Dimensionality
Models tend to overfit badly on high-dimensional data: with many features and comparatively few observations, a model can memorize noise in the training set that does not generalize, as the sketch below illustrates.
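A minimal sketch of the effect on a hypothetical synthetic dataset (the dataset, classifier, and sizes are illustrative, not from the course):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Few observations, many mostly-noise features
X_demo, y_demo = make_classification(n_samples=60, n_features=300, n_informative=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.3, random_state=0)
svc = SVC().fit(X_tr, y_tr)
print(f"{svc.score(X_tr, y_tr):.1%} train accuracy vs. {svc.score(X_te, y_te):.1%} test accuracy")
One simple countermeasure, used below, is to drop features whose variance is (near) zero with VarianceThreshold.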
from sklearn.feature_selection import VarianceThreshold
# Create a VarianceThreshold feature selector
sel = VarianceThreshold(threshold=0.001)
# Fit the selector to head_df normalized by its column means, so variances are comparable across features
sel.fit(head_df / head_df.mean())
# Create a boolean mask
mask = sel.get_support()
# Apply the mask to create a reduced DataFrame
reduced_df = head_df.loc[:, mask]
print(f"Dimensionality reduced from {head_df.shape[1]} to {reduced_df.shape[1]}.")
Correlation Matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Create the correlation matrix
corr = ansur_df.corr()
# Generate a boolean mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Diverging colormap centered at 0
cmap = sns.diverging_palette(h_neg=240, h_pos=10, as_cmap=True)
# Pass the mask to the heatmap so only the lower triangle is drawn
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, linewidths=1, annot=True, fmt=".2f")
plt.show()
# Calculate the correlation matrix and take the absolute value
corr_df = ansur_df.corr().abs()
# Create a True/False mask and apply it
mask = np.triu(np.ones_like(corr_df, dtype=bool))
tri_df = corr_df.mask(mask)
# List column names of highly correlated features (r > 0.95)
to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.95)]
# Drop the features in the to_drop list
reduced_df = ansur_df.drop(to_drop, axis=1)
print(f"The reduced_df DataFrame has {reduced_df.shape[1]} columns.")
Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the RFE with a LogisticRegression estimator and 3 features to select
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)
# Fit the eliminator to the training data
rfe.fit(X_train, y_train)
# Print the features and their ranking (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))
# Print the features that are not eliminated
print(X.columns[rfe.support_])
# Calculate the test set accuracy
acc = accuracy_score(y_test, rfe.predict(X_test))
print(f"{acc:.1%} accuracy on test set.")
Combine Feature Selection from Multiple Models
from sklearn.linear_model import LassoCV
# Create and fit the LassoCV model on the training set
lcv = LassoCV()
lcv.fit(X_train, y_train)
print(f'Optimal alpha = {lcv.alpha_:.3f}')
# Calculate R squared on the test set
r_squared = lcv.score(X_test, y_test)
print(f'The model explains {r_squared:.1%} of the test set variance')
# Create a mask for coefficients not equal to zero
lcv_mask = lcv.coef_ != 0
print(f'{sum(lcv_mask)} features out of {len(lcv_mask)} selected')
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor
# Select 10 features with RFE on a GradientBoostingRegressor, dropping 3 features at each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(),
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)
# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')
# Assign the support array to gb_mask
gb_mask = rfe_gb.support_
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
# Select 10 features with RFE on a RandomForestRegressor, dropping 3 features at each step
rfe_rf = RFE(estimator=RandomForestRegressor(),
             n_features_to_select=10, step=3, verbose=1)
rfe_rf.fit(X_train, y_train)
# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')
# Assign the support array to rf_mask
rf_mask = rfe_rf.support_
# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)
# Create a mask for features selected by all 3 models
meta_mask = votes == 3
# Apply the dimensionality reduction on X
X_reduced = X.loc[:, meta_mask]
# Plug the reduced dataset into a linear regression pipeline
# (lm is a LinearRegression and scaler a StandardScaler created earlier)
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
lm.fit(scaler.fit_transform(X_train), y_train)
r_squared = lm.score(scaler.transform(X_test), y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set using {len(lm.coef_)} features.')
PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Create the scaler
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)
# Create the PCA instance and fit and transform the data with pca
pca = PCA()
pc = pca.fit_transform(ansur_std)
pc_df = pd.DataFrame(pc, columns=['PC 1', 'PC 2', 'PC 3', 'PC 4'])
# Create a pairplot of the principal component DataFrame
sns.pairplot(pc_df)
plt.show()
# Print the cumulative explained variance ratio per component
print(pca.explained_variance_ratio_.cumsum())
# Extract the component vectors
vectors = pca.components_.round(2)
# Print feature effects
print('PC 1 effects = ' + str(dict(zip(ansur_df.columns, vectors[0]))))
print('PC 2 effects = ' + str(dict(zip(ansur_df.columns, vectors[1]))))
from sklearn.pipeline import Pipeline

# Build a scaling + PCA pipeline that keeps 2 components
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=2))])
# Fit the pipeline to poke_df and transform the data
pc = pipe.fit_transform(poke_df)
# Add the 2 components to poke_cat_df
poke_cat_df['PC 1'] = pc[:, 0]
poke_cat_df['PC 2'] = pc[:, 1]
# Use the Legendary feature to color the PC 1 vs. PC 2 scatterplot
sns.scatterplot(data=poke_cat_df,
                x='PC 1', y='PC 2', hue='Legendary')
plt.show()
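Related to the cumulative explained variance printed above: n_components can also be given as a fraction between 0 and 1, in which case PCA keeps just enough components to reach that share of variance. A minimal sketch (the 0.9 threshold is illustrative, not from the course):
# Keep enough components to explain at least 90% of the variance
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=0.9))])
pipe.fit(poke_df)
# Number of components that were actually kept
print(pipe['reducer'].n_components_)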