"""
=====================================
Approximate nearest neighbors in TSNE
=====================================
This example presents how to chain KNeighborsTransformer and TSNE in a pipeline.
It also shows how to wrap the packages `nmslib` and `pynndescent` to replace
KNeighborsTransformer and perform approximate nearest neighbors. These packages
can be installed with `pip install nmslib pynndescent`.
Note: In KNeighborsTransformer we use the definition which includes each
training point as its own neighbor in the count of `n_neighbors`, and for
compatibility reasons, one extra neighbor is computed when `mode == 'distance'`.
Please note that we do the same in the proposed `nmslib` wrapper.
"""
# Author: Tom Dupre la Tour
# License: BSD 3 clause
# %%
# First we try to import the packages and warn the user in case they are
# missing.
import sys

try:
    import nmslib
except ImportError:
    print("The package 'nmslib' is required to run this example.")
    sys.exit()

try:
    from pynndescent import PyNNDescentTransformer
except ImportError:
    print("The package 'pynndescent' is required to run this example.")
    sys.exit()
# %%
# We define a wrapper class implementing the scikit-learn API for `nmslib`,
# as well as a loading function.
import joblib
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle


class NMSlibTransformer(TransformerMixin, BaseEstimator):
    """Wrapper for using nmslib as sklearn's KNeighborsTransformer"""

    def __init__(
        self, n_neighbors=5, metric="euclidean", method="sw-graph", n_jobs=-1
    ):
        self.n_neighbors = n_neighbors
        self.method = method
        self.metric = metric
        self.n_jobs = n_jobs

    def fit(self, X):
        self.n_samples_fit_ = X.shape[0]

        # see more metrics in the manual
        # https://github.com/nmslib/nmslib/tree/master/manual
        space = {
            "euclidean": "l2",
            "cosine": "cosinesimil",
            "l1": "l1",
            "l2": "l2",
        }[self.metric]

        self.nmslib_ = nmslib.init(method=self.method, space=space)
        self.nmslib_.addDataPointBatch(X.copy())
        self.nmslib_.createIndex()
        return self

    def transform(self, X):
        n_samples_transform = X.shape[0]

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        n_neighbors = self.n_neighbors + 1

        if self.n_jobs < 0:
            # Same handling as done in joblib for negative values of n_jobs:
            # in particular, `n_jobs == -1` means "as many threads as CPUs".
            num_threads = joblib.cpu_count() + self.n_jobs + 1
        else:
            num_threads = self.n_jobs

        results = self.nmslib_.knnQueryBatch(
            X.copy(), k=n_neighbors, num_threads=num_threads
        )
        indices, distances = zip(*results)
        indices, distances = np.vstack(indices), np.vstack(distances)

        # Build the sparse kneighbors graph in CSR format: every row stores
        # exactly `n_neighbors` entries, so `indptr` is a simple arithmetic
        # progression.
        indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors)
        kneighbors_graph = csr_matrix(
            (distances.ravel(), indices.ravel(), indptr),
            shape=(n_samples_transform, self.n_samples_fit_),
        )

        return kneighbors_graph


def load_mnist(n_samples):
    """Load MNIST, shuffle the data, and return only n_samples."""
    mnist = fetch_openml("mnist_784", as_frame=False)
    X, y = shuffle(mnist.data, mnist.target, random_state=2)
    return X[:n_samples] / 255, y[:n_samples]
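

# %%
# As a quick sanity check (a minimal sketch, not part of the original
# example), we can run the wrapper on a tiny random dataset: the resulting
# sparse graph has one row per query sample, each row stores
# `n_neighbors + 1` distances, and each sample should (approximately) be
# found as its own nearest neighbor at distance zero.
rng = np.random.RandomState(0)
X_demo = rng.randn(20, 5)
demo_graph = NMSlibTransformer(n_neighbors=3).fit(X_demo).transform(X_demo)
print(demo_graph.shape)  # (20, 20): one row per query, one column per fit sample
print(demo_graph[0].nnz)  # 4 == n_neighbors + 1 stored entries in the first row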
# %%
# We benchmark the different exact/approximate nearest neighbors transformers.
import time
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline
datasets = [
    ("MNIST_10000", load_mnist(n_samples=10_000)),
    ("MNIST_20000", load_mnist(n_samples=20_000)),
]
n_iter = 500
perplexity = 30
metric = "euclidean"
# TSNE requires a certain number of neighbors which depends on the
# perplexity parameter.
# Add one since we include each sample as its own neighbor.
n_neighbors = int(3.0 * perplexity + 1) + 1
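
# A minimal sketch (not part of the benchmark) of the neighbor-counting
# convention mentioned in the introduction: with `mode="distance"`,
# :class:`~sklearn.neighbors.KNeighborsTransformer` computes one extra
# neighbor, so each row of the sparse graph stores `n_neighbors + 1` entries,
# the first one being the sample itself at distance zero.
X_check = np.random.RandomState(0).randn(10, 3)
graph_check = KNeighborsTransformer(n_neighbors=2, mode="distance").fit_transform(
    X_check
)
print(graph_check[0].nnz)  # 3 == n_neighbors + 1 entries in the first row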
tsne_params = dict(
    init="random",  # pca not supported for sparse matrices
    perplexity=perplexity,
    method="barnes_hut",
    random_state=42,
    n_iter=n_iter,
    learning_rate="auto",
)

transformers = [
    (
        "KNeighborsTransformer",
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance", metric=metric),
    ),
    (
        "NMSlibTransformer",
        NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
    ),
    (
        "PyNNDescentTransformer",
        PyNNDescentTransformer(
            n_neighbors=n_neighbors, metric=metric, parallel_batch_queries=True
        ),
    ),
]

longest = np.max([len(name) for name, model in transformers])

for dataset_name, (X, y) in datasets:
    msg = f"Benchmarking on {dataset_name}:"
    print(f"\n{msg}\n" + "-" * len(msg))

    for transformer_name, transformer in transformers:
        start = time.time()
        transformer.fit(X)
        fit_duration = time.time() - start
        print(f"{transformer_name:<{longest}} {fit_duration:.3f} sec (fit)")

        start = time.time()
        Xt = transformer.transform(X)
        transform_duration = time.time() - start
        print(f"{transformer_name:<{longest}} {transform_duration:.3f} sec (transform)")

        if transformer_name == "PyNNDescentTransformer":
            # Transform a second time: the numba-compiled code is now cached,
            # so this call measures the steady-state query time.
            start = time.time()
            Xt = transformer.transform(X)
            transform_duration = time.time() - start
            print(
                f"{transformer_name:<{longest}} {transform_duration:.3f} sec"
                " (transform)"
            )
# %%
# Sample output::
#
# Benchmarking on MNIST_10000:
# ----------------------------
# KNeighborsTransformer 0.007 sec (fit)
# KNeighborsTransformer 1.139 sec (transform)
# NMSlibTransformer 0.208 sec (fit)
# NMSlibTransformer 0.315 sec (transform)
# PyNNDescentTransformer 4.823 sec (fit)
# PyNNDescentTransformer 4.884 sec (transform)
# PyNNDescentTransformer 0.744 sec (transform)
#
# Benchmarking on MNIST_20000:
# ----------------------------
# KNeighborsTransformer 0.011 sec (fit)
# KNeighborsTransformer 5.769 sec (transform)
# NMSlibTransformer 0.733 sec (fit)
# NMSlibTransformer 1.077 sec (transform)
# PyNNDescentTransformer 14.448 sec (fit)
# PyNNDescentTransformer 7.103 sec (transform)
# PyNNDescentTransformer 1.759 sec (transform)
#
# Note that `PyNNDescentTransformer` takes more time during the first `fit`
# and the first `transform` because of the overhead of the numba just-in-time
# compiler. After the first call, the compiled code is kept in a cache by
# numba, so subsequent calls do not suffer from this initial overhead. Both
# :class:`~sklearn.neighbors.KNeighborsTransformer` and `NMSlibTransformer`
# are only run once here, since their `fit` and `transform` times are stable
# from the first call (they do not have the cold-start problem of
# `PyNNDescentTransformer`).
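# %%
# As an aside (a minimal sketch, not part of the original benchmark), the
# numba compilation cost can be paid upfront by "warming up" the transformer
# once on a small dummy dataset before any timing, since the compiled code is
# cached for the rest of the process:
_ = PyNNDescentTransformer(n_neighbors=5, metric=metric).fit_transform(
    np.random.RandomState(42).randn(500, 784)
)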
# %%
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
transformers = [
    ("TSNE with internal NearestNeighbors", TSNE(metric=metric, **tsne_params)),
    (
        "TSNE with KNeighborsTransformer",
        make_pipeline(
            KNeighborsTransformer(
                n_neighbors=n_neighbors, mode="distance", metric=metric
            ),
            TSNE(metric="precomputed", **tsne_params),
        ),
    ),
    (
        "TSNE with NMSlibTransformer",
        make_pipeline(
            NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
            TSNE(metric="precomputed", **tsne_params),
        ),
    ),
]
# init the plot
nrows = len(datasets)
ncols = np.sum([1 for name, model in transformers if "TSNE" in name])
fig, axes = plt.subplots(
    nrows=nrows, ncols=ncols, squeeze=False, figsize=(5 * ncols, 4 * nrows)
)
axes = axes.ravel()
i_ax = 0
longest = np.max([len(name) for name, model in transformers])

for dataset_name, (X, y) in datasets:
    msg = f"Benchmarking on {dataset_name}:"
    print(f"\n{msg}\n" + "-" * len(msg))

    for transformer_name, transformer in transformers:
        start = time.time()
        Xt = transformer.fit_transform(X)
        transform_duration = time.time() - start
        print(
            f"{transformer_name:<{longest}} {transform_duration:.3f} sec"
            " (fit_transform)"
        )

        # plot TSNE embedding which should be very similar across methods
        axes[i_ax].set_title(transformer_name + "\non " + dataset_name)
        axes[i_ax].scatter(
            Xt[:, 0],
            Xt[:, 1],
            c=y.astype(np.int32),
            alpha=0.2,
            cmap=plt.cm.viridis,
        )
        axes[i_ax].xaxis.set_major_formatter(NullFormatter())
        axes[i_ax].yaxis.set_major_formatter(NullFormatter())
        axes[i_ax].axis("tight")
        i_ax += 1

fig.tight_layout()
plt.show()
# %%
# Sample output::
#
# Benchmarking on MNIST_10000:
# ----------------------------
# TSNE with internal NearestNeighbors 24.828 sec (fit_transform)
# TSNE with KNeighborsTransformer 20.111 sec (fit_transform)
# TSNE with NMSlibTransformer 21.757 sec (fit_transform)
#
# Benchmarking on MNIST_20000:
# ----------------------------
# TSNE with internal NearestNeighbors 51.955 sec (fit_transform)
# TSNE with KNeighborsTransformer 50.994 sec (fit_transform)
# TSNE with NMSlibTransformer 43.536 sec (fit_transform)
#
# We can observe that the default :class:`~sklearn.manifold.TSNE` estimator with
# its internal :class:`~sklearn.neighbors.NearestNeighbors` implementation is
# roughly equivalent to the pipeline with :class:`~sklearn.manifold.TSNE` and
# :class:`~sklearn.neighbors.KNeighborsTransformer` in terms of performance.
# This is expected because both pipelines rely internally on the same
# :class:`~sklearn.neighbors.NearestNeighbors` implementation that performs
# an exact neighbors search. The approximate `NMSlibTransformer` is already
# slightly faster than the exact search on the smallest dataset, but this speed
# difference is expected to become more significant on datasets with a larger
# number of samples.
#
# Notice however that not all approximate search methods are guaranteed to
# improve the speed of the default exact search method: indeed, the exact
# search implementation has improved significantly since scikit-learn 1.1.
# Furthermore, the brute-force exact search method does not require building an
# index at `fit` time. So, to get an overall performance improvement in the
# context of the :class:`~sklearn.manifold.TSNE` pipeline, the gains of the
# approximate search at `transform` time need to be larger than the extra time
# spent to build the approximate search index at `fit` time.
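#
# For instance, with the sample timings on MNIST_20000 above, the exact
# `KNeighborsTransformer` costs about 0.011 + 5.769 ≈ 5.8 sec in total, while
# the approximate `NMSlibTransformer` costs about 0.733 + 1.077 ≈ 1.8 sec:
# here the cheaper queries more than compensate for the more expensive index
# construction.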
#
# Finally, the TSNE algorithm itself is computationally intensive,
# irrespective of the nearest neighbors search. So speeding up the nearest
# neighbors search step by a factor of 5 would not result in a factor of 5
# speedup of the overall pipeline.