"""
|
|
=============================================
|
|
Comparison of kernel ridge regression and SVR
|
|
=============================================
|
|
|
|
Both kernel ridge regression (KRR) and SVR learn a non-linear function by
|
|
employing the kernel trick, i.e., they learn a linear function in the space
|
|
induced by the respective kernel which corresponds to a non-linear function in
|
|
the original space. They differ in the loss functions (ridge versus
|
|
epsilon-insensitive loss). In contrast to SVR, fitting a KRR can be done in
|
|
closed-form and is typically faster for medium-sized datasets. On the other
|
|
hand, the learned model is non-sparse and thus slower than SVR at
|
|
prediction-time.
|
|
|
|
This example illustrates both methods on an artificial dataset, which
|
|
consists of a sinusoidal target function and strong noise added to every fifth
|
|
datapoint.
|
|
|
|
"""

# %%
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause
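
# %%
# The introduction above contrasts the squared (ridge) loss used by KRR with
# the epsilon-insensitive loss used by SVR. The cell below is a minimal sketch
# added for illustration (not part of the original example); the residual
# values and the epsilon value are arbitrary choices.
import numpy as np

residuals = np.array([-0.3, -0.05, 0.0, 0.05, 0.3])
epsilon = 0.1
ridge_loss = residuals**2  # squared error, used by KRR
epsilon_insensitive_loss = np.maximum(np.abs(residuals) - epsilon, 0.0)  # used by SVR
print("ridge loss:              ", ridge_loss)
print("epsilon-insensitive loss:", epsilon_insensitive_loss)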

# %%
# Generate sample data
# --------------------
import numpy as np

rng = np.random.RandomState(42)

X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()

# Add noise to targets
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

X_plot = np.linspace(0, 5, 100000)[:, None]

# %%
# Construct the kernel-based regression models
# --------------------------------------------

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

train_size = 100

svr = GridSearchCV(
    SVR(kernel="rbf", gamma=0.1),
    param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)},
)

kr = GridSearchCV(
    KernelRidge(kernel="rbf", gamma=0.1),
    param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)},
)

# %%
# Compare times of SVR and Kernel Ridge Regression
# ------------------------------------------------

import time

t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print(f"Best SVR with params: {svr.best_params_} and R2 score: {svr.best_score_:.3f}")
print("SVR complexity and bandwidth selected and model fitted in %.3f s" % svr_fit)

t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
print(f"Best KRR with params: {kr.best_params_} and R2 score: {kr.best_score_:.3f}")
print("KRR complexity and bandwidth selected and model fitted in %.3f s" % kr_fit)

sv_ratio = svr.best_estimator_.support_.shape[0] / train_size
print("Support vector ratio: %.3f" % sv_ratio)

t0 = time.time()
y_svr = svr.predict(X_plot)
svr_predict = time.time() - t0
print("SVR prediction for %d inputs in %.3f s" % (X_plot.shape[0], svr_predict))

t0 = time.time()
y_kr = kr.predict(X_plot)
kr_predict = time.time() - t0
print("KRR prediction for %d inputs in %.3f s" % (X_plot.shape[0], kr_predict))

# %%
# Look at the results
# -------------------

import matplotlib.pyplot as plt

sv_ind = svr.best_estimator_.support_
plt.scatter(
    X[sv_ind],
    y[sv_ind],
    c="r",
    s=50,
    label="SVR support vectors",
    zorder=2,
    edgecolors=(0, 0, 0),
)
plt.scatter(X[:100], y[:100], c="k", label="data", zorder=1, edgecolors=(0, 0, 0))
plt.plot(
    X_plot,
    y_svr,
    c="r",
    label="SVR (fit: %.3fs, predict: %.3fs)" % (svr_fit, svr_predict),
)
plt.plot(
    X_plot, y_kr, c="g", label="KRR (fit: %.3fs, predict: %.3fs)" % (kr_fit, kr_predict)
)
plt.xlabel("data")
plt.ylabel("target")
plt.title("SVR versus Kernel Ridge")
_ = plt.legend()

# %%
# The previous figure compares the learned models of KRR and SVR when both
# complexity/regularization and bandwidth of the RBF kernel are optimized using
# grid-search. The learned functions are very similar; however, fitting KRR is
# approximately 3-4 times faster than fitting SVR (both with grid-search).
#
# In theory, predicting 100000 target values could be approximately three
# times faster with SVR, since it has learned a sparse model that uses only
# about one third of the training datapoints as support vectors. In practice,
# however, this is not necessarily the case: implementation details in the way
# the kernel function is computed for each model can make the KRR model as
# fast or even faster, despite it performing more arithmetic operations.
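
# %%
# A rough way to see where the theoretical factor of three comes from (an
# illustrative sketch added here, not part of the original example): at
# prediction time, both models evaluate the kernel between every query point
# and their active training points, so the work per query scales with that
# count. KRR keeps all training points, while SVR keeps only its support
# vectors.
n_sv = svr.best_estimator_.support_.shape[0]
print(f"Kernel evaluations per query: KRR uses {train_size}, SVR uses {n_sv}")
print(f"Theoretical SVR prediction speed-up: {train_size / n_sv:.1f}x")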

# %%
# Visualize training and prediction times
# ---------------------------------------

plt.figure()

sizes = np.logspace(1, 3.8, 7).astype(int)
for name, estimator in {
    "KRR": KernelRidge(kernel="rbf", alpha=0.01, gamma=10),
    "SVR": SVR(kernel="rbf", C=1e2, gamma=10),
}.items():
    train_time = []
    test_time = []
    for train_test_size in sizes:
        t0 = time.time()
        estimator.fit(X[:train_test_size], y[:train_test_size])
        train_time.append(time.time() - t0)

        t0 = time.time()
        estimator.predict(X_plot[:1000])
        test_time.append(time.time() - t0)

    plt.plot(
        sizes,
        train_time,
        "o-",
        color="r" if name == "SVR" else "g",
        label="%s (train)" % name,
    )
    plt.plot(
        sizes,
        test_time,
        "o--",
        color="r" if name == "SVR" else "g",
        label="%s (test)" % name,
    )

plt.xscale("log")
plt.yscale("log")
plt.xlabel("Train size")
plt.ylabel("Time (seconds)")
plt.title("Execution Time")
_ = plt.legend(loc="best")

# %%
# This figure compares the time required for fitting and prediction of KRR and
# SVR for different sizes of the training set. Fitting KRR is faster than SVR
# for medium-sized training sets (fewer than a few thousand samples); however,
# for larger training sets SVR scales better. With regard to prediction time,
# SVR should be faster than KRR for all training set sizes because of the
# learned sparse solution; however, this is not necessarily the case in
# practice because of implementation details. Note that the degree of sparsity,
# and thus the prediction time, depends on the parameters epsilon and C of the
# SVR.
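
# %%
# As a quick check of that last point, the sketch below (added for
# illustration, not part of the original example) refits SVR with a few
# epsilon values and reports the resulting number of support vectors; the
# epsilon values themselves are arbitrary choices.
for eps in [0.05, 0.1, 0.5, 1.0]:
    svr_eps = SVR(kernel="rbf", C=1e1, gamma=0.1, epsilon=eps)
    svr_eps.fit(X[:100], y[:100])
    print(f"epsilon={eps:<4}: {svr_eps.support_.shape[0]} support vectors")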

# %%
# Visualize the learning curves
# -----------------------------
from sklearn.model_selection import LearningCurveDisplay

_, ax = plt.subplots()

svr = SVR(kernel="rbf", C=1e1, gamma=0.1)
kr = KernelRidge(kernel="rbf", alpha=0.1, gamma=0.1)

common_params = {
    "X": X[:100],
    "y": y[:100],
    "train_sizes": np.linspace(0.1, 1, 10),
    "scoring": "neg_mean_squared_error",
    "negate_score": True,
    "score_name": "Mean Squared Error",
    "score_type": "test",
    "std_display_style": None,
    "ax": ax,
}

LearningCurveDisplay.from_estimator(svr, **common_params)
LearningCurveDisplay.from_estimator(kr, **common_params)
ax.set_title("Learning curves")
ax.legend(handles=ax.get_legend_handles_labels()[0], labels=["SVR", "KRR"])

plt.show()