"""
==========
Kernel PCA
==========

This example shows the difference between the Principal Components Analysis
(:class:`~sklearn.decomposition.PCA`) and its kernelized version
(:class:`~sklearn.decomposition.KernelPCA`).

First, we show that :class:`~sklearn.decomposition.KernelPCA` is able to find
a projection of the data that makes them linearly separable, which is not the
case with :class:`~sklearn.decomposition.PCA`.

Then, we show that inverting this projection is only an approximation with
:class:`~sklearn.decomposition.KernelPCA`, while it is exact with
:class:`~sklearn.decomposition.PCA`.
"""

# Authors: Mathieu Blondel
#          Andreas Mueller
#          Guillaume Lemaitre
# License: BSD 3 clause

# %%
# Projecting data: `PCA` vs. `KernelPCA`
# --------------------------------------
#
# In this section, we show the advantages of using a kernel when
# projecting data using a Principal Component Analysis (PCA). We create a
# dataset made of two nested circles.
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split

X, y = make_circles(n_samples=1_000, factor=0.3, noise=0.05, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# %%
# Let's have a quick first look at the generated dataset.
import matplotlib.pyplot as plt

_, (train_ax, test_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4))

train_ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
train_ax.set_ylabel("Feature #1")
train_ax.set_xlabel("Feature #0")
train_ax.set_title("Training data")

test_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
test_ax.set_xlabel("Feature #0")
_ = test_ax.set_title("Testing data")

# %%
# The samples from each class cannot be linearly separated: there is no
# straight line that can split the samples of the inner set from the outer
# set.
#
# Now, we will use PCA with and without a kernel to see the effect of
# using such a kernel. The kernel used here is a radial basis function (RBF)
# kernel.
from sklearn.decomposition import PCA, KernelPCA

pca = PCA(n_components=2)
kernel_pca = KernelPCA(
    n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=0.1
)

X_test_pca = pca.fit(X_train).transform(X_test)
X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test)

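# %%
# As a quick side check (not part of the original example), we can make explicit
# what the RBF kernel used above computes, namely
# :math:`k(x, y) = \exp(-\gamma \|x - y\|^2)` with `gamma=10`, by comparing the
# built-in :func:`~sklearn.metrics.pairwise.rbf_kernel` with the explicit formula.
import numpy as np

from sklearn.metrics.pairwise import euclidean_distances, rbf_kernel

K_builtin = rbf_kernel(X_train[:5], gamma=10)
K_manual = np.exp(-10 * euclidean_distances(X_train[:5], squared=True))
print(f"RBF kernel matches the explicit formula: {np.allclose(K_builtin, K_manual)}")
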
# %%
fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(
    ncols=3, figsize=(14, 4)
)

orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
orig_data_ax.set_ylabel("Feature #1")
orig_data_ax.set_xlabel("Feature #0")
orig_data_ax.set_title("Testing data")

pca_proj_ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test)
pca_proj_ax.set_ylabel("Principal component #1")
pca_proj_ax.set_xlabel("Principal component #0")
pca_proj_ax.set_title("Projection of testing data\n using PCA")

kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test)
kernel_pca_proj_ax.set_ylabel("Principal component #1")
kernel_pca_proj_ax.set_xlabel("Principal component #0")
_ = kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA")

# %%
# We recall that PCA transforms the data linearly. Intuitively, it means that
# the coordinate system will be centered, rescaled on each component
# with respect to its variance and finally rotated.
# The data obtained from this transformation is isotropic and can now be
# projected onto its *principal components*.
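# Note that with the default `whiten=False` used here, no per-component
# rescaling is applied: a minimal check (not part of the original example) shows
# that the projection reduces to centering followed by multiplication by the
# orthonormal matrix of principal axes stored in `pca.components_`.
import numpy as np

print(np.allclose(pca.transform(X_test), (X_test - pca.mean_) @ pca.components_.T))

# %%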
#
# Thus, looking at the projection made using PCA (i.e. the middle figure), we
# see that there is no change regarding the scaling; indeed, the data being two
# concentric circles centered at zero, the original data is already isotropic.
# However, we can see that the data have been rotated. As a
# conclusion, we see that such a projection would not help if we were to define
# a linear classifier to distinguish samples from both classes.
#
# Using a kernel allows making a non-linear projection. Here, by using an RBF
# kernel, we expect that the projection will unfold the dataset while
# approximately preserving the relative distances of pairs of data points that
# are close to one another in the original space.
#
# We observe such behaviour in the figure on the right: the samples of a given
# class are closer to each other than the samples from the opposite class,
# untangling both sample sets. Now, we can use a linear classifier to separate
# the samples from the two classes.
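# To make this last point concrete, here is a short check that is not part of
# the original example: we fit a :class:`~sklearn.linear_model.LogisticRegression`
# (an arbitrary choice of linear classifier) on each representation and compare
# the resulting test accuracies.
from sklearn.linear_model import LogisticRegression

for name, features_train, features_test in [
    ("original features", X_train, X_test),
    ("PCA projection", pca.transform(X_train), X_test_pca),
    ("KernelPCA projection", kernel_pca.transform(X_train), X_test_kernel_pca),
]:
    clf = LogisticRegression(max_iter=1_000).fit(features_train, y_train)
    print(f"Accuracy on the {name}: {clf.score(features_test, y_test):.2f}")

# %%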
#
# Projecting into the original feature space
# ------------------------------------------
#
# One particularity to keep in mind when using
# :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction
# (i.e. the back projection in the original feature space). With
# :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if
# `n_components` is the same as the number of original features.
# This is the case in this example.
#
# We can investigate if we get the original dataset when back projecting with
# :class:`~sklearn.decomposition.KernelPCA`.
X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test))
X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test))

# %%
fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(
    ncols=3, sharex=True, sharey=True, figsize=(13, 4)
)

orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
orig_data_ax.set_ylabel("Feature #1")
orig_data_ax.set_xlabel("Feature #0")
orig_data_ax.set_title("Original test data")

pca_back_proj_ax.scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test)
pca_back_proj_ax.set_xlabel("Feature #0")
pca_back_proj_ax.set_title("Reconstruction via PCA")

kernel_pca_back_proj_ax.scatter(
    X_reconstructed_kernel_pca[:, 0], X_reconstructed_kernel_pca[:, 1], c=y_test
)
kernel_pca_back_proj_ax.set_xlabel("Feature #0")
_ = kernel_pca_back_proj_ax.set_title("Reconstruction via KernelPCA")

# %%
# While we see a perfect reconstruction with
# :class:`~sklearn.decomposition.PCA`, we observe a different result for
# :class:`~sklearn.decomposition.KernelPCA`.
#
# Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot
# rely on an analytical back-projection and thus cannot provide an exact
# reconstruction. Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is
# internally trained to learn a mapping from the kernelized PCA basis to the
# original feature space. This method therefore comes with an approximation
# that introduces small differences when back projecting in the original
# feature space.
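# As a rough quantification of this difference (not part of the original
# example), we can compare the reconstruction errors on the test set.
import numpy as np

print(f"PCA reconstruction is exact: {np.allclose(X_reconstructed_pca, X_test)}")
kernel_pca_mse = np.mean((X_reconstructed_kernel_pca - X_test) ** 2)
print(f"Mean squared reconstruction error with KernelPCA: {kernel_pca_mse:.2e}")

# %%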
#
# To improve the reconstruction using
# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune
# `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term
# that controls the reliance on the training data when learning the mapping.
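# A minimal sketch of such tuning (not part of the original example): refit
# :class:`~sklearn.decomposition.KernelPCA` for a few candidate values of
# `alpha` and compare the mean squared reconstruction errors on the test set.
import numpy as np

for alpha in [1e-3, 1e-2, 1e-1, 1.0]:
    tuned_kernel_pca = KernelPCA(
        n_components=None,
        kernel="rbf",
        gamma=10,
        fit_inverse_transform=True,
        alpha=alpha,
    ).fit(X_train)
    X_reconstructed = tuned_kernel_pca.inverse_transform(
        tuned_kernel_pca.transform(X_test)
    )
    error = np.mean((X_reconstructed - X_test) ** 2)
    print(f"alpha={alpha:g}: mean squared reconstruction error = {error:.2e}")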