"""
=====================================================
Prediction Intervals for Gradient Boosting Regression
=====================================================

This example shows how quantile regression can be used to create prediction
intervals. See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`
for an example showcasing some other features of
:class:`~ensemble.HistGradientBoostingRegressor`.

"""

# %%
# Generate some data for a synthetic regression problem by applying the
# function f to uniformly sampled random inputs.
import numpy as np

from sklearn.model_selection import train_test_split


def f(x):
    """The function to predict."""
    return x * np.sin(x)


rng = np.random.RandomState(42)
X = np.atleast_2d(rng.uniform(0, 10.0, size=1000)).T
expected_y = f(X).ravel()

# %%
# To make the problem interesting, we generate observations of the target y as
# the sum of a deterministic term computed by the function f and a random noise
# term that follows a centered `log-normal
# <https://en.wikipedia.org/wiki/Log-normal_distribution>`_ distribution. To
# make this even more interesting, we consider the case where the amplitude of
# the noise depends on the input variable x (heteroscedastic noise).
#
# The log-normal distribution is non-symmetric and long-tailed: observing large
# positive outliers is likely, but observing equally extreme negative outliers
# is impossible.
sigma = 0.5 + X.ravel() / 10
noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)
y = expected_y + noise
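
# %%
# Note that ``np.exp(sigma**2 / 2)`` is the mean of a log-normal variable whose
# underlying normal has mean 0 and standard deviation ``sigma``, so subtracting
# it centers the noise around zero. As a quick, purely illustrative sanity
# check (the exact value fluctuates with the random seed and the heavy tail of
# the distribution), the empirical mean of the noise should be close to zero:
print(f"Empirical mean of the noise term: {noise.mean():.3f}")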

# %%
# Split the data into train and test datasets:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# %%
# Fitting non-linear quantile and least squares regressors
# --------------------------------------------------------
#
# Fit gradient boosting models trained with the quantile loss and
# alpha=0.05, 0.5, 0.95.
#
# The models obtained for alpha=0.05 and alpha=0.95 produce a 90% confidence
# interval (95% - 5% = 90%).
#
# The model trained with alpha=0.5 produces a regression of the median: on
# average, there should be the same number of target observations above and
# below the predicted values.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_pinball_loss, mean_squared_error

all_models = {}
common_params = dict(
    learning_rate=0.05,
    n_estimators=200,
    max_depth=2,
    min_samples_leaf=9,
    min_samples_split=9,
)
for alpha in [0.05, 0.5, 0.95]:
    gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, **common_params)
    all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)
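
# %%
# As a reminder, the quantile loss (also known as the pinball loss) penalizes
# under- and over-predictions asymmetrically according to alpha. The snippet
# below is only an illustrative sketch (the helper ``pinball_loss`` is not part
# of scikit-learn); it should agree with
# :func:`~sklearn.metrics.mean_pinball_loss` up to floating point rounding.


def pinball_loss(y_true, y_pred, alpha):
    """Average pinball loss: alpha-weighted penalty on under-/over-prediction."""
    diff = y_true - y_pred
    return np.mean(np.maximum(alpha * diff, (alpha - 1) * diff))


y_pred_median = all_models["q 0.50"].predict(X_train)
print(pinball_loss(y_train, y_pred_median, alpha=0.5))
print(mean_pinball_loss(y_train, y_pred_median, alpha=0.5))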

# %%
# Notice that :class:`~sklearn.ensemble.HistGradientBoostingRegressor` is much
# faster than :class:`~sklearn.ensemble.GradientBoostingRegressor` starting
# with intermediate datasets (`n_samples >= 10_000`), which is not the case for
# the present example (a minimal sketch of the histogram-based variant is shown
# after the baseline fit below).
#
# For the sake of comparison, we also fit a baseline model trained with the
# usual (mean) squared error (MSE).
gbr_ls = GradientBoostingRegressor(loss="squared_error", **common_params)
all_models["mse"] = gbr_ls.fit(X_train, y_train)
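
# %%
# As a side note on the remark above, equivalent quantile models can also be
# fit with the histogram-based estimator, assuming a scikit-learn version where
# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` supports
# ``loss="quantile"`` together with a ``quantile`` parameter. This is only a
# sketch and is not used in the rest of this example:
from sklearn.ensemble import HistGradientBoostingRegressor

hgbr_median = HistGradientBoostingRegressor(
    loss="quantile", quantile=0.5, max_iter=200, random_state=0
).fit(X_train, y_train)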

# %%
# Create an evenly spaced evaluation set of input values spanning the [0, 10]
# range.
xx = np.atleast_2d(np.linspace(0, 10, 1000)).T

# %%
# Plot the true conditional mean function f, the predictions of the conditional
# mean (loss equals squared error), the conditional median and the conditional
# 90% interval (from 5th to 95th conditional percentiles).
import matplotlib.pyplot as plt

y_pred = all_models["mse"].predict(xx)
y_lower = all_models["q 0.05"].predict(xx)
y_upper = all_models["q 0.95"].predict(xx)
y_med = all_models["q 0.50"].predict(xx)

fig = plt.figure(figsize=(10, 10))
plt.plot(xx, f(xx), "g:", linewidth=3, label=r"$f(x) = x\,\sin(x)$")
plt.plot(X_test, y_test, "b.", markersize=10, label="Test observations")
plt.plot(xx, y_med, "r-", label="Predicted median")
# dashed line so the mean is distinguishable from the median prediction
plt.plot(xx, y_pred, "r--", label="Predicted mean")
plt.plot(xx, y_upper, "k-")
plt.plot(xx, y_lower, "k-")
plt.fill_between(
    xx.ravel(), y_lower, y_upper, alpha=0.4, label="Predicted 90% interval"
)
plt.xlabel("$x$")
plt.ylabel("$f(x)$")
plt.ylim(-10, 25)
plt.legend(loc="upper left")
plt.show()

# %%
# Comparing the predicted median with the predicted mean, we note that the
# median is on average below the mean as the noise is skewed towards high
# values (large outliers). The median estimate also seems to be smoother
# because of its natural robustness to outliers.
#
# Also observe that the inductive bias of gradient boosting trees is
# unfortunately preventing our 0.05 quantile from fully capturing the
# sinusoidal shape of the signal, in particular around x=8. Tuning
# hyper-parameters can reduce this effect, as shown in the last part of this
# notebook.
#
# Analysis of the error metrics
# -----------------------------
#
# Measure the models with :func:`~sklearn.metrics.mean_squared_error` and
# :func:`~sklearn.metrics.mean_pinball_loss` metrics on the training dataset.
import pandas as pd


def highlight_min(x):
    x_min = x.min()
    return ["font-weight: bold" if v == x_min else "" for v in x]


results = []
for name, gbr in sorted(all_models.items()):
    metrics = {"model": name}
    y_pred = gbr.predict(X_train)
    for alpha in [0.05, 0.5, 0.95]:
        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(y_train, y_pred, alpha=alpha)
    metrics["MSE"] = mean_squared_error(y_train, y_pred)
    results.append(metrics)

pd.DataFrame(results).set_index("model").style.apply(highlight_min)

# %%
# Each column shows all models evaluated with the same metric. The minimum
# number in a column should be obtained when the model is trained and measured
# with that same metric. This should always be the case on the training set if
# the training converged.
#
# Note that because the target distribution is asymmetric, the expected
# conditional mean and conditional median are significantly different and
# therefore one cannot use the squared error model to get a good estimation of
# the conditional median, nor the converse.
#
# If the target distribution were symmetric and had no outliers (e.g. with
# Gaussian noise), then the median estimator and the least squares estimator
# would have yielded similar predictions.
#
# We then do the same on the test set.
results = []
for name, gbr in sorted(all_models.items()):
    metrics = {"model": name}
    y_pred = gbr.predict(X_test)
    for alpha in [0.05, 0.5, 0.95]:
        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(y_test, y_pred, alpha=alpha)
    metrics["MSE"] = mean_squared_error(y_test, y_pred)
    results.append(metrics)

pd.DataFrame(results).set_index("model").style.apply(highlight_min)

# %%
# Errors are higher, meaning the models slightly overfitted the data. It still
# shows that the best test metric is obtained when the model is trained by
# minimizing that same metric.
#
# Note that the conditional median estimator is competitive with the squared
# error estimator in terms of MSE on the test set: this can be explained by
# the fact that the squared error estimator is very sensitive to large
# outliers, which can cause significant overfitting. This can be seen on the
# right-hand side of the previous plot. The conditional median estimator is
# biased (underestimation for this asymmetric noise) but is also naturally
# robust to outliers and overfits less.
#
# .. _calibration-section:
#
# Calibration of the confidence interval
# --------------------------------------
#
# We can also evaluate the ability of the two extreme quantile estimators to
# produce a well-calibrated conditional 90%-confidence interval.
#
# To do this, we can compute the fraction of observations that fall between
# the predictions:
def coverage_fraction(y, y_low, y_high):
    return np.mean(np.logical_and(y >= y_low, y <= y_high))


coverage_fraction(
    y_train,
    all_models["q 0.05"].predict(X_train),
    all_models["q 0.95"].predict(X_train),
)

# %%
# On the training set, the calibration is very close to the expected coverage
# value for a 90% confidence interval.
coverage_fraction(
    y_test, all_models["q 0.05"].predict(X_test), all_models["q 0.95"].predict(X_test)
)

# %%
# On the test set, the estimated confidence interval is slightly too narrow.
# Note, however, that we would need to wrap those metrics in a cross-validation
# loop to assess their variability under data resampling.
#
# Tuning the hyper-parameters of the quantile regressors
# ------------------------------------------------------
#
# In the plot above, we observed that the 5th percentile regressor seems to
# underfit and could not adapt to the sinusoidal shape of the signal.
#
# The hyper-parameters of the model were approximately hand-tuned for the
# median regressor and there is no reason that the same hyper-parameters are
# suitable for the 5th percentile regressor.
#
# To confirm this hypothesis, we tune the hyper-parameters of a new regressor
# of the 5th percentile by selecting the best model parameters by
# cross-validation on the pinball loss with alpha=0.05:

# %%
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.metrics import make_scorer
from pprint import pprint

param_grid = dict(
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[2, 5, 10],
    min_samples_leaf=[1, 5, 10, 20],
    min_samples_split=[5, 10, 20, 30, 50],
)
alpha = 0.05
neg_mean_pinball_loss_05p_scorer = make_scorer(
    mean_pinball_loss,
    alpha=alpha,
    greater_is_better=False,  # maximize the negative loss
)
gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, random_state=0)
search_05p = HalvingRandomSearchCV(
    gbr,
    param_grid,
    resource="n_estimators",
    max_resources=250,
    min_resources=50,
    scoring=neg_mean_pinball_loss_05p_scorer,
    n_jobs=2,
    random_state=0,
).fit(X_train, y_train)
pprint(search_05p.best_params_)

# %%
# We observe that the hyper-parameters that were hand-tuned for the median
# regressor are in the same range as the hyper-parameters suitable for the 5th
# percentile regressor.
#
# Let's now tune the hyper-parameters for the 95th percentile regressor. We
# need to redefine the `scoring` metric used to select the best model, along
# with adjusting the alpha parameter of the inner gradient boosting estimator
# itself:
from sklearn.base import clone

alpha = 0.95
neg_mean_pinball_loss_95p_scorer = make_scorer(
    mean_pinball_loss,
    alpha=alpha,
    greater_is_better=False,  # maximize the negative loss
)
search_95p = clone(search_05p).set_params(
    estimator__alpha=alpha,
    scoring=neg_mean_pinball_loss_95p_scorer,
)
search_95p.fit(X_train, y_train)
pprint(search_95p.best_params_)

# %%
# The result shows that the hyper-parameters for the 95th percentile regressor
# identified by the search procedure are roughly in the same range as the
# hand-tuned hyper-parameters for the median regressor and the
# hyper-parameters identified by the search procedure for the 5th percentile
# regressor. However, the hyper-parameter searches did lead to an improved 90%
# confidence interval formed by the predictions of those two tuned quantile
# regressors. Note that the prediction of the upper 95th percentile has a much
# coarser shape than the prediction of the lower 5th percentile because of the
# outliers:
y_lower = search_05p.predict(xx)
y_upper = search_95p.predict(xx)

fig = plt.figure(figsize=(10, 10))
plt.plot(xx, f(xx), "g:", linewidth=3, label=r"$f(x) = x\,\sin(x)$")
plt.plot(X_test, y_test, "b.", markersize=10, label="Test observations")
plt.plot(xx, y_upper, "k-")
plt.plot(xx, y_lower, "k-")
plt.fill_between(
    xx.ravel(), y_lower, y_upper, alpha=0.4, label="Predicted 90% interval"
)
plt.xlabel("$x$")
plt.ylabel("$f(x)$")
plt.ylim(-10, 25)
plt.legend(loc="upper left")
plt.title("Prediction with tuned hyper-parameters")
plt.show()

# %%
# The plot looks qualitatively better than for the untuned models, especially
# for the shape of the lower quantile.
#
# We now quantitatively evaluate the joint calibration of the pair of
# estimators:
coverage_fraction(y_train, search_05p.predict(X_train), search_95p.predict(X_train))

# %%
coverage_fraction(y_test, search_05p.predict(X_test), search_95p.predict(X_test))

# %%
# The calibration of the tuned pair is sadly not better on the test set: the
# width of the estimated confidence interval is still too narrow.
#
# Again, we would need to wrap this study in a cross-validation loop to
# better assess the variability of those estimates.
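
# %%
# For illustration only, such a loop could look like the following sketch. For
# brevity it merely refits the two tuned estimators on each fold (a full study
# would also re-run the hyper-parameter search within each fold), and the
# fold-to-fold spread it reports is only indicative:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=0)
coverages = []
for train_idx, test_idx in cv.split(X):
    gbr_05 = clone(search_05p.best_estimator_).fit(X[train_idx], y[train_idx])
    gbr_95 = clone(search_95p.best_estimator_).fit(X[train_idx], y[train_idx])
    coverages.append(
        coverage_fraction(
            y[test_idx], gbr_05.predict(X[test_idx]), gbr_95.predict(X[test_idx])
        )
    )
print(f"Coverage across folds: {np.mean(coverages):.3f} +/- {np.std(coverages):.3f}")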