sklearn/examples/cluster/plot_feature_agglomeration_...

"""
==============================================
Feature agglomeration vs. univariate selection
==============================================

This example compares 2 dimensionality reduction strategies:

- univariate feature selection with Anova

- feature agglomeration with Ward hierarchical clustering

Both methods are compared in a regression problem using
a BayesianRidge as supervised estimator.

"""

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD 3 clause

# %%
import shutil
import tempfile

import matplotlib.pyplot as plt
import numpy as np
from joblib import Memory
from scipy import linalg, ndimage

from sklearn import feature_selection
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline

# %%
# Set parameters
n_samples = 200
size = 40  # image size
roi_size = 15
snr = 5.0
np.random.seed(0)

# %%
# Generate data
coef = np.zeros((size, size))
coef[0:roi_size, 0:roi_size] = -1.0
coef[-roi_size:, -roi_size:] = 1.0

X = np.random.randn(n_samples, size**2)
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())

# %%
# add noise
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.0)) / linalg.norm(noise, 2)
y += noise_coef * noise

# %%
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(location=cachedir, verbose=1)

# %%
# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem)
clf = Pipeline([("ward", ward), ("ridge", ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {"ward__n_clusters": [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# %%
# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([("anova", anova), ("ridge", ridge)])
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, {"anova__percentile": [5, 10, 20]}, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
coef_selection_ = coef_.reshape(size, size)

# %%
# Inverse the transformation to plot the results on an image
plt.close("all")
plt.figure(figsize=(7.3, 2.7))
plt.subplot(1, 3, 1)
plt.imshow(coef, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("True weights")
plt.subplot(1, 3, 2)
plt.imshow(coef_selection_, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("Feature Selection")
plt.subplot(1, 3, 3)
plt.imshow(coef_agglomeration_, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("Feature Agglomeration")
plt.subplots_adjust(0.04, 0.0, 0.98, 0.94, 0.16, 0.26)
plt.show()

# %%
# Attempt to remove the temporary cachedir, but don't worry if it fails
shutil.rmtree(cachedir, ignore_errors=True)
first commit 2024-08-05 09:32:03 +02:00			`"""`
			`==============================================`
			`Feature agglomeration vs. univariate selection`
			`==============================================`

			`This example compares 2 dimensionality reduction strategies:`

			`- univariate feature selection with Anova`

			`- feature agglomeration with Ward hierarchical clustering`

			`Both methods are compared in a regression problem using`
			`a BayesianRidge as supervised estimator.`

			`"""`

			`# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>`
			`# License: BSD 3 clause`

			`# %%`
			`import shutil`
			`import tempfile`

			`import matplotlib.pyplot as plt`
			`import numpy as np`
			`from joblib import Memory`
			`from scipy import linalg, ndimage`

			`from sklearn import feature_selection`
			`from sklearn.cluster import FeatureAgglomeration`
			`from sklearn.feature_extraction.image import grid_to_graph`
			`from sklearn.linear_model import BayesianRidge`
			`from sklearn.model_selection import GridSearchCV, KFold`
			`from sklearn.pipeline import Pipeline`

			`# %%`
			`# Set parameters`
			`n_samples = 200`
			`size = 40 # image size`
			`roi_size = 15`
			`snr = 5.0`
			`np.random.seed(0)`

			`# %%`
			`# Generate data`
			`coef = np.zeros((size, size))`
			`coef[0:roi_size, 0:roi_size] = -1.0`
			`coef[-roi_size:, -roi_size:] = 1.0`

			`X = np.random.randn(n_samples, size**2)`
			`for x in X: # smooth data`
			`x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()`
			`X -= X.mean(axis=0)`
			`X /= X.std(axis=0)`

			`y = np.dot(X, coef.ravel())`

			`# %%`
			`# add noise`
			`noise = np.random.randn(y.shape[0])`
			`noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.0)) / linalg.norm(noise, 2)`
			`y += noise_coef * noise`

			`# %%`
			`# Compute the coefs of a Bayesian Ridge with GridSearch`
			`cv = KFold(2) # cross-validation generator for model selection`
			`ridge = BayesianRidge()`
			`cachedir = tempfile.mkdtemp()`
			`mem = Memory(location=cachedir, verbose=1)`

			`# %%`
			`# Ward agglomeration followed by BayesianRidge`
			`connectivity = grid_to_graph(n_x=size, n_y=size)`
			`ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem)`
			`clf = Pipeline([("ward", ward), ("ridge", ridge)])`
			`# Select the optimal number of parcels with grid search`
			`clf = GridSearchCV(clf, {"ward__n_clusters": [10, 20, 30]}, n_jobs=1, cv=cv)`
			`clf.fit(X, y) # set the best parameters`
			`coef_ = clf.best_estimator_.steps[-1][1].coef_`
			`coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)`
			`coef_agglomeration_ = coef_.reshape(size, size)`

			`# %%`
			`# Anova univariate feature selection followed by BayesianRidge`
			`f_regression = mem.cache(feature_selection.f_regression) # caching function`
			`anova = feature_selection.SelectPercentile(f_regression)`
			`clf = Pipeline([("anova", anova), ("ridge", ridge)])`
			`# Select the optimal percentage of features with grid search`
			`clf = GridSearchCV(clf, {"anova__percentile": [5, 10, 20]}, cv=cv)`
			`clf.fit(X, y) # set the best parameters`
			`coef_ = clf.best_estimator_.steps[-1][1].coef_`
			`coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))`
			`coef_selection_ = coef_.reshape(size, size)`

			`# %%`
			`# Inverse the transformation to plot the results on an image`
			`plt.close("all")`
			`plt.figure(figsize=(7.3, 2.7))`
			`plt.subplot(1, 3, 1)`
			`plt.imshow(coef, interpolation="nearest", cmap=plt.cm.RdBu_r)`
			`plt.title("True weights")`
			`plt.subplot(1, 3, 2)`
			`plt.imshow(coef_selection_, interpolation="nearest", cmap=plt.cm.RdBu_r)`
			`plt.title("Feature Selection")`
			`plt.subplot(1, 3, 3)`
			`plt.imshow(coef_agglomeration_, interpolation="nearest", cmap=plt.cm.RdBu_r)`
			`plt.title("Feature Agglomeration")`
			`plt.subplots_adjust(0.04, 0.0, 0.98, 0.94, 0.16, 0.26)`
			`plt.show()`

			`# %%`
			`# Attempt to remove the temporary cachedir, but don't worry if it fails`
			`shutil.rmtree(cachedir, ignore_errors=True)`