"""
==========
Kernel PCA
==========

This example shows the difference between the Principal Components Analysis
(:class:`~sklearn.decomposition.PCA`) and its kernelized version
(:class:`~sklearn.decomposition.KernelPCA`).

First, we show that :class:`~sklearn.decomposition.KernelPCA` is able to find
a projection of the data that makes them linearly separable, which is not the
case with :class:`~sklearn.decomposition.PCA`.

Then, we show that inverting this projection is only an approximation with
:class:`~sklearn.decomposition.KernelPCA`, while it is exact with
:class:`~sklearn.decomposition.PCA`.
"""

# Authors: Mathieu Blondel
#          Andreas Mueller
#          Guillaume Lemaitre
# License: BSD 3 clause

# %%
# Projecting data: `PCA` vs. `KernelPCA`
# --------------------------------------
#
# In this section, we show the advantages of using a kernel when
# projecting data using a Principal Component Analysis (PCA). We create a
# dataset made of two nested circles.
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split

X, y = make_circles(n_samples=1_000, factor=0.3, noise=0.05, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# %%
# Let's have a quick first look at the generated dataset.
import matplotlib.pyplot as plt

_, (train_ax, test_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4))

train_ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
train_ax.set_ylabel("Feature #1")
train_ax.set_xlabel("Feature #0")
train_ax.set_title("Training data")

test_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
test_ax.set_xlabel("Feature #0")
_ = test_ax.set_title("Testing data")

# %%
# The samples from each class cannot be linearly separated: there is no
# straight line that can split the samples of the inner set from the outer
# set.
#
# Now, we will use PCA with and without a kernel to see the effect of
# using such a kernel. The kernel used here is a radial basis function (RBF)
# kernel.
from sklearn.decomposition import PCA, KernelPCA

pca = PCA(n_components=2)
kernel_pca = KernelPCA(
    n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=0.1
)

X_test_pca = pca.fit(X_train).transform(X_test)
X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test)

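# %%
# As a quick side check (not part of the original example), we can make explicit
# what the RBF kernel used above computes, namely
# :math:`k(x, y) = \exp(-\gamma \|x - y\|^2)` with `gamma=10`, by comparing the
# built-in :func:`~sklearn.metrics.pairwise.rbf_kernel` with the explicit formula.
import numpy as np

from sklearn.metrics.pairwise import euclidean_distances, rbf_kernel

K_builtin = rbf_kernel(X_train[:5], gamma=10)
K_manual = np.exp(-10 * euclidean_distances(X_train[:5], squared=True))
print(f"RBF kernel matches the explicit formula: {np.allclose(K_builtin, K_manual)}")
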
# %%
fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(
    ncols=3, figsize=(14, 4)
)

orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
orig_data_ax.set_ylabel("Feature #1")
orig_data_ax.set_xlabel("Feature #0")
orig_data_ax.set_title("Testing data")

pca_proj_ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test)
pca_proj_ax.set_ylabel("Principal component #1")
pca_proj_ax.set_xlabel("Principal component #0")
pca_proj_ax.set_title("Projection of testing data\n using PCA")

kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test)
kernel_pca_proj_ax.set_ylabel("Principal component #1")
kernel_pca_proj_ax.set_xlabel("Principal component #0")
_ = kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA")

# %%
# We recall that PCA transforms the data linearly. Intuitively, it means that
# the coordinate system will be centered, rescaled on each component
# with respect to its variance and finally rotated.
# The data obtained from this transformation is isotropic and can now be
# projected onto its *principal components*.
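# Note that with the default `whiten=False` used here, no per-component
# rescaling is applied: a minimal check (not part of the original example) shows
# that the projection reduces to centering followed by multiplication by the
# orthonormal matrix of principal axes stored in `pca.components_`.
import numpy as np

print(np.allclose(pca.transform(X_test), (X_test - pca.mean_) @ pca.components_.T))

# %%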
#
# Thus, looking at the projection made using PCA (i.e. the middle figure), we
# see that there is no change regarding the scaling; indeed, the data being two
# concentric circles centered at zero, the original data is already isotropic.
# However, we can see that the data have been rotated. As a
# conclusion, we see that such a projection would not help if we were to define
# a linear classifier to distinguish samples from both classes.
#
# Using a kernel allows making a non-linear projection. Here, by using an RBF
# kernel, we expect that the projection will unfold the dataset while
# approximately preserving the relative distances of pairs of data points that
# are close to one another in the original space.
#
# We observe such behaviour in the figure on the right: the samples of a given
# class are closer to each other than the samples from the opposite class,
# untangling both sample sets. Now, we can use a linear classifier to separate
# the samples from the two classes.
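# To make this last point concrete, here is a short check that is not part of
# the original example: we fit a :class:`~sklearn.linear_model.LogisticRegression`
# (an arbitrary choice of linear classifier) on each representation and compare
# the resulting test accuracies.
from sklearn.linear_model import LogisticRegression

for name, features_train, features_test in [
    ("original features", X_train, X_test),
    ("PCA projection", pca.transform(X_train), X_test_pca),
    ("KernelPCA projection", kernel_pca.transform(X_train), X_test_kernel_pca),
]:
    clf = LogisticRegression(max_iter=1_000).fit(features_train, y_train)
    print(f"Accuracy on the {name}: {clf.score(features_test, y_test):.2f}")

# %%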
#
# Projecting into the original feature space
# ------------------------------------------
#
# One particularity to keep in mind when using
# :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction
# (i.e. the back projection in the original feature space). With
# :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if
# `n_components` is the same as the number of original features.
# This is the case in this example.
#
# We can investigate if we get the original dataset when back projecting with
# :class:`~sklearn.decomposition.KernelPCA`.
X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test))
X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test))

# %%
fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(
    ncols=3, sharex=True, sharey=True, figsize=(13, 4)
)

orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
orig_data_ax.set_ylabel("Feature #1")
orig_data_ax.set_xlabel("Feature #0")
orig_data_ax.set_title("Original test data")

pca_back_proj_ax.scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test)
pca_back_proj_ax.set_xlabel("Feature #0")
pca_back_proj_ax.set_title("Reconstruction via PCA")

kernel_pca_back_proj_ax.scatter(
    X_reconstructed_kernel_pca[:, 0], X_reconstructed_kernel_pca[:, 1], c=y_test
)
kernel_pca_back_proj_ax.set_xlabel("Feature #0")
_ = kernel_pca_back_proj_ax.set_title("Reconstruction via KernelPCA")

# %%
# While we see a perfect reconstruction with
# :class:`~sklearn.decomposition.PCA`, we observe a different result for
# :class:`~sklearn.decomposition.KernelPCA`.
#
# Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot
# rely on an analytical back-projection and thus cannot provide an exact
# reconstruction. Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is
# internally trained to learn a mapping from the kernelized PCA basis to the
# original feature space. This method therefore comes with an approximation
# that introduces small differences when back projecting in the original
# feature space.
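# As a rough quantification of this difference (not part of the original
# example), we can compare the reconstruction errors on the test set.
import numpy as np

print(f"PCA reconstruction is exact: {np.allclose(X_reconstructed_pca, X_test)}")
kernel_pca_mse = np.mean((X_reconstructed_kernel_pca - X_test) ** 2)
print(f"Mean squared reconstruction error with KernelPCA: {kernel_pca_mse:.2e}")

# %%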
#
# To improve the reconstruction using
# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune
# `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term
# that controls the reliance on the training data when learning the mapping.
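# A minimal sketch of such tuning (not part of the original example): refit
# :class:`~sklearn.decomposition.KernelPCA` for a few candidate values of
# `alpha` and compare the mean squared reconstruction errors on the test set.
import numpy as np

for alpha in [1e-3, 1e-2, 1e-1, 1.0]:
    tuned_kernel_pca = KernelPCA(
        n_components=None,
        kernel="rbf",
        gamma=10,
        fit_inverse_transform=True,
        alpha=alpha,
    ).fit(X_train)
    X_reconstructed = tuned_kernel_pca.inverse_transform(
        tuned_kernel_pca.transform(X_test)
    )
    error = np.mean((X_reconstructed - X_test) ** 2)
    print(f"alpha={alpha:g}: mean squared reconstruction error = {error:.2e}")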