"""
=============================================================================
Manifold learning on handwritten digits: Locally Linear Embedding, Isomap...
=============================================================================
We illustrate various embedding techniques on the digits dataset.
"""
# Authors: Fabian Pedregosa <fabian.pedregosa@inria.fr>
#          Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Gael Varoquaux
#          Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD 3 clause (C) INRIA 2011

# %%
# Load digits dataset
# -------------------
# We will load the digits dataset and only use the first six of the ten available classes.
from sklearn.datasets import load_digits
digits = load_digits(n_class=6)
X, y = digits.data, digits.target
n_samples, n_features = X.shape
n_neighbors = 30
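# %%
# As a quick sanity check (an addition to this example, not part of the original
# pipeline), we can confirm the shape of the data and which classes were kept:
import numpy as np

print(f"{n_samples} samples with {n_features} features")
print(f"classes kept: {np.unique(y)}")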
# %%
# We can plot the first hundred digits from this data set.
import matplotlib.pyplot as plt
fig, axs = plt.subplots(nrows=10, ncols=10, figsize=(6, 6))
for idx, ax in enumerate(axs.ravel()):
    ax.imshow(X[idx].reshape((8, 8)), cmap=plt.cm.binary)
    ax.axis("off")
_ = fig.suptitle("A selection from the 64-dimensional digits dataset", fontsize=16)
# %%
# Helper function to plot embedding
# ---------------------------------
# Below, we will use different techniques to embed the digits dataset. We will plot
# the projection of the original data onto each embedding. This will allow us to
# check whether digits are grouped together in the embedding space, or
# scattered across it.
import numpy as np
from matplotlib import offsetbox
from sklearn.preprocessing import MinMaxScaler
def plot_embedding(X, title):
    _, ax = plt.subplots()
    X = MinMaxScaler().fit_transform(X)

    for digit in digits.target_names:
        ax.scatter(
            *X[y == digit].T,
            marker=f"${digit}$",
            s=60,
            color=plt.cm.Dark2(digit),
            alpha=0.425,
            zorder=2,
        )

    shown_images = np.array([[1.0, 1.0]])  # just something big
    for i in range(X.shape[0]):
        # plot every digit on the embedding
        # show an annotation box for a group of digits
        dist = np.sum((X[i] - shown_images) ** 2, 1)
        if np.min(dist) < 4e-3:
            # don't show points that are too close
            continue
        shown_images = np.concatenate([shown_images, [X[i]]], axis=0)
        imagebox = offsetbox.AnnotationBbox(
            offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), X[i]
        )
        imagebox.set(zorder=1)
        ax.add_artist(imagebox)

    ax.set_title(title)
    ax.axis("off")
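# %%
# A minimal sketch of the thinning rule used above (an illustration added here,
# not part of the original example): after ``MinMaxScaler`` both coordinates lie
# in ``[0, 1]``, so the squared-distance threshold of ``4e-3`` suppresses any
# thumbnail closer than roughly 0.063 to one that is already shown.
points = np.array([[0.10, 0.10], [0.11, 0.11], [0.50, 0.50]])
shown = np.array([[1.0, 1.0]])  # sentinel far away from every candidate
for point in points:
    if np.min(np.sum((point - shown) ** 2, axis=1)) < 4e-3:
        continue  # too close to an already-shown thumbnail: skip it
    shown = np.concatenate([shown, [point]], axis=0)
print(shown)  # the second point is skipped; the first and third are kept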
# %%
# Embedding techniques comparison
# -------------------------------
#
# Below, we compare different techniques. However, there are a couple of things
# to note:
#
# * the :class:`~sklearn.ensemble.RandomTreesEmbedding` is not
#   technically a manifold embedding method, as it learns a high-dimensional
#   representation to which we apply a dimensionality reduction method.
#   However, it is often useful to cast a dataset into a representation in
#   which the classes are linearly separable (see the sketch after this list).
# * the :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis` and
#   the :class:`~sklearn.neighbors.NeighborhoodComponentsAnalysis` are supervised
#   dimensionality reduction methods, i.e. they make use of the provided labels,
#   contrary to the other methods.
# * the :class:`~sklearn.manifold.TSNE` is initialized with the embedding that is
#   generated by PCA in this example. This ensures global stability of the
#   embedding, i.e., the embedding does not depend on random initialization.
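# %%
# As an aside (a sketch added to illustrate the first point above, not part of
# the comparison itself), we can inspect the intermediate high-dimensional
# representation learned by :class:`~sklearn.ensemble.RandomTreesEmbedding`
# before it is reduced to two components:
from sklearn.ensemble import RandomTreesEmbedding

hasher = RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0)
X_hashed = hasher.fit_transform(X)
# One binary column per leaf across all trees, hence far more than 64 columns:
print(X_hashed.shape)

# %%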
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.manifold import (
    MDS,
    TSNE,
    Isomap,
    LocallyLinearEmbedding,
    SpectralEmbedding,
)
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.random_projection import SparseRandomProjection
embeddings = {
    "Random projection embedding": SparseRandomProjection(
        n_components=2, random_state=42
    ),
    "Truncated SVD embedding": TruncatedSVD(n_components=2),
    "Linear Discriminant Analysis embedding": LinearDiscriminantAnalysis(
        n_components=2
    ),
    "Isomap embedding": Isomap(n_neighbors=n_neighbors, n_components=2),
    "Standard LLE embedding": LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=2, method="standard"
    ),
    "Modified LLE embedding": LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=2, method="modified"
    ),
    "Hessian LLE embedding": LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=2, method="hessian"
    ),
    "LTSA LLE embedding": LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=2, method="ltsa"
    ),
    "MDS embedding": MDS(n_components=2, n_init=1, max_iter=120, n_jobs=2),
    "Random Trees embedding": make_pipeline(
        RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0),
        TruncatedSVD(n_components=2),
    ),
    "Spectral embedding": SpectralEmbedding(
        n_components=2, random_state=0, eigen_solver="arpack"
    ),
    "t-SNE embedding": TSNE(
        n_components=2,
        max_iter=500,
        n_iter_without_progress=150,
        n_jobs=2,
        random_state=0,
    ),
    "NCA embedding": NeighborhoodComponentsAnalysis(
        n_components=2, init="pca", random_state=0
    ),
}
# %%
# Once we have declared all the methods of interest, we can run each of them and
# project the original data. We will store the projected data as well as the
# computational time needed to perform each projection.
from time import time
projections, timing = {}, {}
for name, transformer in embeddings.items():
    if name.startswith("Linear Discriminant Analysis"):
        data = X.copy()
        data.flat[:: X.shape[1] + 1] += 0.01  # Make X invertible
    else:
        data = X

    print(f"Computing {name}...")
    start_time = time()
    projections[name] = transformer.fit_transform(data, y)
    timing[name] = time() - start_time
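# %%
# As a quick check (an addition to the example), each stored projection is an
# ``(n_samples, 2)`` array, and ``timing`` maps each method name to its runtime
# in seconds:
print(projections["Isomap embedding"].shape)
print(f"Isomap took {timing['Isomap embedding']:.3f}s")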
# %%
# Finally, we can plot the resulting projection given by each method.
for name in timing:
    title = f"{name} (time {timing[name]:.3f}s)"
    plot_embedding(projections[name], title)

plt.show()