"""
=============================================
Comparison of kernel ridge regression and SVR
=============================================

Both kernel ridge regression (KRR) and SVR learn a non-linear function by
employing the kernel trick, i.e., they learn a linear function in the space
induced by the respective kernel, which corresponds to a non-linear function in
the original space. They differ in the loss functions (ridge versus
epsilon-insensitive loss). In contrast to SVR, fitting a KRR model can be done
in closed form and is typically faster for medium-sized datasets. On the other
hand, the learned model is non-sparse and thus slower than SVR at
prediction time.

This example illustrates both methods on an artificial dataset, which
consists of a sinusoidal target function and strong noise added to every fifth
datapoint.

"""

# %%
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause

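# %%
# Illustrative sketch (an addition to the original example) of the two loss
# functions named in the introduction, evaluated on a toy array of residuals:
# KRR penalizes every residual quadratically, whereas SVR ignores residuals
# smaller than ``epsilon`` and penalizes the rest linearly. The value
# ``epsilon = 0.5`` is arbitrary and only used for this illustration.
import numpy as np

residuals = np.linspace(-2, 2, 9)
epsilon = 0.5

squared_loss = residuals**2  # ridge (squared) loss used by KRR
epsilon_insensitive_loss = np.maximum(0.0, np.abs(residuals) - epsilon)  # SVR loss

for r, sq, ei in zip(residuals, squared_loss, epsilon_insensitive_loss):
    print(f"residual={r:+.1f}  squared={sq:.2f}  epsilon-insensitive={ei:.2f}")
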
# %%
# Generate sample data
# --------------------
import numpy as np

rng = np.random.RandomState(42)

X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()

# Add noise to targets
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

X_plot = np.linspace(0, 5, 100000)[:, None]

# %%
# Construct the kernel-based regression models
# --------------------------------------------

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

train_size = 100

svr = GridSearchCV(
    SVR(kernel="rbf", gamma=0.1),
    param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)},
)

kr = GridSearchCV(
    KernelRidge(kernel="rbf", gamma=0.1),
    param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)},
)

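# %%
# Side note (an addition to the original example): the closed-form KRR fit
# mentioned in the introduction amounts to solving a single linear system in
# the kernel matrix K(X, X). A minimal sketch, assuming an RBF kernel and
# illustrative hyperparameters alpha=0.1, gamma=0.1 (not the grid-searched
# values used below):
from sklearn.metrics.pairwise import rbf_kernel

alpha_demo, gamma_demo = 0.1, 0.1
X_train, y_train = X[:train_size], y[:train_size]

K = rbf_kernel(X_train, X_train, gamma=gamma_demo)
dual_coef = np.linalg.solve(K + alpha_demo * np.eye(train_size), y_train)
manual_pred = rbf_kernel(X_plot[:5], X_train, gamma=gamma_demo) @ dual_coef

krr_manual_check = KernelRidge(kernel="rbf", alpha=alpha_demo, gamma=gamma_demo)
krr_manual_check.fit(X_train, y_train)
print(
    "Max abs. difference to KernelRidge predictions:",
    np.abs(manual_pred - krr_manual_check.predict(X_plot[:5])).max(),
)
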
# %%
# Compare times of SVR and Kernel Ridge Regression
# ------------------------------------------------

import time

t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print(f"Best SVR with params: {svr.best_params_} and R2 score: {svr.best_score_:.3f}")
print("SVR complexity and bandwidth selected and model fitted in %.3f s" % svr_fit)

t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
print(f"Best KRR with params: {kr.best_params_} and R2 score: {kr.best_score_:.3f}")
print("KRR complexity and bandwidth selected and model fitted in %.3f s" % kr_fit)

sv_ratio = svr.best_estimator_.support_.shape[0] / train_size
print("Support vector ratio: %.3f" % sv_ratio)

t0 = time.time()
y_svr = svr.predict(X_plot)
svr_predict = time.time() - t0
print("SVR prediction for %d inputs in %.3f s" % (X_plot.shape[0], svr_predict))

t0 = time.time()
y_kr = kr.predict(X_plot)
kr_predict = time.time() - t0
print("KRR prediction for %d inputs in %.3f s" % (X_plot.shape[0], kr_predict))

# %%
# Look at the results
# -------------------

import matplotlib.pyplot as plt

sv_ind = svr.best_estimator_.support_
plt.scatter(
    X[sv_ind],
    y[sv_ind],
    c="r",
    s=50,
    label="SVR support vectors",
    zorder=2,
    edgecolors=(0, 0, 0),
)
plt.scatter(X[:100], y[:100], c="k", label="data", zorder=1, edgecolors=(0, 0, 0))
plt.plot(
    X_plot,
    y_svr,
    c="r",
    label="SVR (fit: %.3fs, predict: %.3fs)" % (svr_fit, svr_predict),
)
plt.plot(
    X_plot, y_kr, c="g", label="KRR (fit: %.3fs, predict: %.3fs)" % (kr_fit, kr_predict)
)
plt.xlabel("data")
plt.ylabel("target")
plt.title("SVR versus Kernel Ridge")
_ = plt.legend()

# %%
# The previous figure compares the learned model of KRR and SVR when both
# complexity/regularization and bandwidth of the RBF kernel are optimized using
# grid-search. The learned functions are very similar; however, fitting KRR is
# approximately 3-4 times faster than fitting SVR (both with grid-search).
#
# In theory, predicting 100000 target values could be approximately three
# times faster with SVR, since it has learned a sparse model that uses only
# approximately 1/3 of the training datapoints as support vectors. In
# practice, however, this is not necessarily the case: implementation details
# in the way the kernel function is computed for each model can make the KRR
# model as fast or even faster, despite it computing more arithmetic
# operations.

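# %%
# Rough sketch (an addition to the original example) of where the factor of
# three above comes from: at prediction time, both models evaluate the kernel
# between each query point and a set of stored training points -- all
# ``train_size`` points for the non-sparse KRR model, but only the support
# vectors for SVR.
n_sv = svr.best_estimator_.support_.shape[0]
print(
    "Kernel evaluations per query point: KRR=%d, SVR=%d (ratio %.1f)"
    % (train_size, n_sv, train_size / n_sv)
)
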
# %%
# Visualize training and prediction times
# ---------------------------------------

plt.figure()

sizes = np.logspace(1, 3.8, 7).astype(int)
for name, estimator in {
    "KRR": KernelRidge(kernel="rbf", alpha=0.01, gamma=10),
    "SVR": SVR(kernel="rbf", C=1e2, gamma=10),
}.items():
    train_time = []
    test_time = []
    for train_test_size in sizes:
        t0 = time.time()
        estimator.fit(X[:train_test_size], y[:train_test_size])
        train_time.append(time.time() - t0)

        t0 = time.time()
        estimator.predict(X_plot[:1000])
        test_time.append(time.time() - t0)

    plt.plot(
        sizes,
        train_time,
        "o-",
        color="r" if name == "SVR" else "g",
        label="%s (train)" % name,
    )
    plt.plot(
        sizes,
        test_time,
        "o--",
        color="r" if name == "SVR" else "g",
        label="%s (test)" % name,
    )

plt.xscale("log")
plt.yscale("log")
plt.xlabel("Train size")
plt.ylabel("Time (seconds)")
plt.title("Execution Time")
_ = plt.legend(loc="best")

# %%
# This figure compares the time for fitting and prediction of KRR and SVR for
# different sizes of the training set. Fitting KRR is faster than SVR for
# medium-sized training sets (less than a few thousand samples); however, for
# larger training sets SVR scales better. With regard to prediction time, SVR
# should be faster than KRR for all sizes of the training set because of the
# learned sparse solution; in practice, however, this is not necessarily the
# case because of implementation details. Note that the degree of sparsity,
# and thus the prediction time, depends on the parameters epsilon and C of
# the SVR.

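# %%
# Quick check (an addition to the original example) of the remark above:
# widening the epsilon-insensitive tube leaves more training points inside it,
# so fewer of them become support vectors and the model gets sparser. The
# epsilon values below are arbitrary illustrative choices.
for eps in [0.01, 0.1, 0.5, 1.0]:
    svr_eps = SVR(kernel="rbf", C=1e1, gamma=0.1, epsilon=eps)
    svr_eps.fit(X[:100], y[:100])
    print("epsilon=%.2f -> %d support vectors" % (eps, svr_eps.support_.shape[0]))
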
# %%
# Visualize the learning curves
# -----------------------------
from sklearn.model_selection import LearningCurveDisplay

_, ax = plt.subplots()

svr = SVR(kernel="rbf", C=1e1, gamma=0.1)
kr = KernelRidge(kernel="rbf", alpha=0.1, gamma=0.1)

common_params = {
    "X": X[:100],
    "y": y[:100],
    "train_sizes": np.linspace(0.1, 1, 10),
    "scoring": "neg_mean_squared_error",
    "negate_score": True,
    "score_name": "Mean Squared Error",
    "score_type": "test",
    "std_display_style": None,
    "ax": ax,
}

LearningCurveDisplay.from_estimator(svr, **common_params)
LearningCurveDisplay.from_estimator(kr, **common_params)
ax.set_title("Learning curves")
ax.legend(handles=ax.get_legend_handles_labels()[0], labels=["SVR", "KRR"])

plt.show()