"""
|
|
============================================
|
|
Model-based and sequential feature selection
|
|
============================================
|
|
|
|
This example illustrates and compares two approaches for feature selection:
|
|
:class:`~sklearn.feature_selection.SelectFromModel` which is based on feature
|
|
importance, and
|
|
:class:`~sklearn.feature_selection.SequentialFeatureSelector` which relies
|
|
on a greedy approach.
|
|
|
|
We use the Diabetes dataset, which consists of 10 features collected from 442
|
|
diabetes patients.
|
|
|
|
Authors: `Manoj Kumar <mks542@nyu.edu>`_,
|
|
`Maria Telenczuk <https://github.com/maikia>`_, Nicolas Hug.
|
|
|
|
License: BSD 3 clause
|
|
|
|
"""
|
|
|
|
# %%
# Loading the data
# ----------------
#
# We first load the diabetes dataset which is available from within
# scikit-learn, and print its description:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
print(diabetes.DESCR)
# %%
# Feature importance from coefficients
# ------------------------------------
#
# To get an idea of the importance of the features, we are going to use the
# :class:`~sklearn.linear_model.RidgeCV` estimator. The features with the
# highest absolute `coef_` value are considered the most important.
# We can observe the coefficients directly without needing to scale them (or
# scale the data) because from the description above, we know that the features
# were already standardized.
# For a more complete example on the interpretations of the coefficients of
# linear models, you may refer to
# :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py`.  # noqa: E501
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import RidgeCV

ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X, y)
importance = np.abs(ridge.coef_)
feature_names = np.array(diabetes.feature_names)
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via coefficients")
plt.show()
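# %%
# As a quick check of the claim above (a small sketch added for illustration,
# not part of the original example): each column of the diabetes data is
# mean-centered and scaled so that its sum of squares is 1, which is why the
# coefficients can be compared directly without rescaling.
print(f"Column means (expected ~0): {X.mean(axis=0).round(8)}")
print(f"Column norms (expected ~1): {np.linalg.norm(X, axis=0).round(8)}")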
# %%
# Selecting features based on importance
# --------------------------------------
#
# Now we want to select the two features which are the most important according
# to the coefficients. The :class:`~sklearn.feature_selection.SelectFromModel`
# is meant just for that. :class:`~sklearn.feature_selection.SelectFromModel`
# accepts a `threshold` parameter and will select the features whose importance
# (defined by the coefficients) is above this threshold.
#
# Since we want to select only 2 features, we will set this threshold slightly
# above the coefficient of the third most important feature.
from time import time

from sklearn.feature_selection import SelectFromModel

threshold = np.sort(importance)[-3] + 0.01

tic = time()
sfm = SelectFromModel(ridge, threshold=threshold).fit(X, y)
toc = time()
print(f"Features selected by SelectFromModel: {feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic:.3f}s")
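# %%
# As a small usage note (not part of the original example), the fitted selector
# can also reduce the data to the selected columns:
X_selected = sfm.transform(X)
print(f"Shape of the data reduced by SelectFromModel: {X_selected.shape}")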
# %%
# Selecting features with Sequential Feature Selection
# ----------------------------------------------------
#
# Another way of selecting features is to use
# :class:`~sklearn.feature_selection.SequentialFeatureSelector`
# (SFS). SFS is a greedy procedure where, at each iteration, we choose the best
# new feature to add to our selected features based on a cross-validation
# score. That is, we start with zero features and choose the single feature
# that maximizes this score. The procedure is repeated until we reach the
# desired number of selected features.
#
# We can also go in the reverse direction (backward SFS), *i.e.* start with all
# the features and greedily choose features to remove one by one. We illustrate
# both approaches here.

from sklearn.feature_selection import SequentialFeatureSelector

tic_fwd = time()
sfs_forward = SequentialFeatureSelector(
    ridge, n_features_to_select=2, direction="forward"
).fit(X, y)
toc_fwd = time()

tic_bwd = time()
sfs_backward = SequentialFeatureSelector(
    ridge, n_features_to_select=2, direction="backward"
).fit(X, y)
toc_bwd = time()

print(
    "Features selected by forward sequential selection: "
    f"{feature_names[sfs_forward.get_support()]}"
)
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print(
    "Features selected by backward sequential selection: "
    f"{feature_names[sfs_backward.get_support()]}"
)
print(f"Done in {toc_bwd - tic_bwd:.3f}s")
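# %%
# As a small sketch (added for illustration, not part of the original example),
# we can compare the cross-validated score of `ridge` restricted to the feature
# pairs chosen by the two approaches:
from sklearn.model_selection import cross_val_score

for name, selector in [("SelectFromModel", sfm), ("forward SFS", sfs_forward)]:
    mean_score = cross_val_score(ridge, selector.transform(X), y).mean()
    print(f"Mean CV R^2 with features selected by {name}: {mean_score:.3f}")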
# %%
# Interestingly, forward and backward selection have selected the same set of
# features. In general, this isn't the case and the two methods would lead to
# different results.
#
# We also note that the features selected by SFS differ from those selected by
# feature importance: SFS selects `bmi` instead of `s1`. This does sound
# reasonable though, since `bmi` corresponds to the third most important
# feature according to the coefficients. It is quite remarkable considering
# that SFS makes no use of the coefficients at all.
#
# Finally, we should note that
# :class:`~sklearn.feature_selection.SelectFromModel` is significantly faster
# than SFS. Indeed, :class:`~sklearn.feature_selection.SelectFromModel` only
# needs to fit a model once, while SFS needs to cross-validate many different
# models for each of the iterations. SFS however works with any model (as
# illustrated below), while
# :class:`~sklearn.feature_selection.SelectFromModel` requires the underlying
# estimator to expose a `coef_` attribute or a `feature_importances_`
# attribute. The forward SFS is faster than the backward SFS because it only
# needs to perform `n_features_to_select = 2` iterations, while the backward
# SFS needs to perform `n_features - n_features_to_select = 8` iterations.
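# %%
# As a minimal sketch of the point above (added for illustration;
# :class:`~sklearn.neighbors.KNeighborsRegressor` is not part of the original
# example), SFS also works with an estimator that exposes neither `coef_` nor
# `feature_importances_`:
from sklearn.neighbors import KNeighborsRegressor

sfs_knn = SequentialFeatureSelector(
    KNeighborsRegressor(), n_features_to_select=2, direction="forward"
).fit(X, y)
print(
    "Features selected by forward SFS with KNeighborsRegressor: "
    f"{feature_names[sfs_knn.get_support()]}"
)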
# %%
# Using negative tolerance values
# -------------------------------
#
# :class:`~sklearn.feature_selection.SequentialFeatureSelector` can be used
# to remove features present in the dataset and return a
# smaller subset of the original features with `direction="backward"`
# and a negative value of `tol`.
#
# We begin by loading the Breast Cancer dataset, consisting of 30 different
# features and 569 samples.
import numpy as np

from sklearn.datasets import load_breast_cancer

breast_cancer_data = load_breast_cancer()
X, y = breast_cancer_data.data, breast_cancer_data.target
feature_names = np.array(breast_cancer_data.feature_names)
print(breast_cancer_data.DESCR)
# %%
# We will make use of the :class:`~sklearn.linear_model.LogisticRegression`
# estimator with :class:`~sklearn.feature_selection.SequentialFeatureSelector`
# to perform the feature selection.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

for tol in [-1e-2, -1e-3, -1e-4]:
    start = time()
    feature_selector = SequentialFeatureSelector(
        LogisticRegression(),
        n_features_to_select="auto",
        direction="backward",
        scoring="roc_auc",
        tol=tol,
        n_jobs=2,
    )
    model = make_pipeline(StandardScaler(), feature_selector, LogisticRegression())
    model.fit(X, y)
    end = time()
    print(f"\ntol: {tol}")
    print(f"Features selected: {feature_names[model[1].get_support()]}")
    print(f"ROC AUC score: {roc_auc_score(y, model.predict_proba(X)[:, 1]):.3f}")
    print(f"Done in {end - start:.3f}s")
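# %%
# A small sketch (not part of the original example): slicing the last fitted
# pipeline keeps only the scaler and the selector, which can then be used to
# obtain the reduced feature matrix for the final value of `tol` above.
X_reduced = model[:-1].transform(X)
print(f"Shape of the reduced data for the last `tol`: {X_reduced.shape}")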
# %%
# We can see that the number of features selected tends to increase as negative
# values of `tol` approach zero. The time taken for feature selection also
# decreases as the values of `tol` come closer to zero.