sklearn/examples/linear_model/plot_quantile_regression.py

"""
===================
Quantile regression
===================

This example illustrates how quantile regression can predict non-trivial
conditional quantiles.

The left figure shows the case when the error distribution is normal,
but has non-constant variance, i.e. with heteroscedasticity.

The right figure shows an example of an asymmetric error distribution,
namely the Pareto distribution.

"""

# Authors: David Dale <dale.david@mail.ru>
#          Christian Lorentzen <lorentzen.ch@gmail.com>
#          Guillaume Lemaitre <glemaitre58@gmail.com>
# License: BSD 3 clause

# %%
# Dataset generation
# ------------------
#
# To illustrate the behaviour of quantile regression, we will generate two
# synthetic datasets. The true generative random processes for both datasets
# will be composed by the same expected value with a linear relationship with a
# single feature `x`.
import numpy as np

rng = np.random.RandomState(42)
x = np.linspace(start=0, stop=10, num=100)
X = x[:, np.newaxis]
y_true_mean = 10 + 0.5 * x

# %%
# We will create two subsequent problems by changing the distribution of the
# target `y` while keeping the same expected value:
#
# - in the first case, a heteroscedastic Normal noise is added;
# - in the second case, an asymmetric Pareto noise is added.
y_normal = y_true_mean + rng.normal(loc=0, scale=0.5 + 0.5 * x, size=x.shape[0])
a = 5
y_pareto = y_true_mean + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1))

# %%
# Let's first visualize the datasets as well as the distribution of the
# residuals `y - mean(y)`.
import matplotlib.pyplot as plt

_, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 11), sharex="row", sharey="row")

axs[0, 0].plot(x, y_true_mean, label="True mean")
axs[0, 0].scatter(x, y_normal, color="black", alpha=0.5, label="Observations")
axs[1, 0].hist(y_true_mean - y_normal, edgecolor="black")


axs[0, 1].plot(x, y_true_mean, label="True mean")
axs[0, 1].scatter(x, y_pareto, color="black", alpha=0.5, label="Observations")
axs[1, 1].hist(y_true_mean - y_pareto, edgecolor="black")

axs[0, 0].set_title("Dataset with heteroscedastic Normal distributed targets")
axs[0, 1].set_title("Dataset with asymmetric Pareto distributed target")
axs[1, 0].set_title(
    "Residuals distribution for heteroscedastic Normal distributed targets"
)
axs[1, 1].set_title("Residuals distribution for asymmetric Pareto distributed target")
axs[0, 0].legend()
axs[0, 1].legend()
axs[0, 0].set_ylabel("y")
axs[1, 0].set_ylabel("Counts")
axs[0, 1].set_xlabel("x")
axs[0, 0].set_xlabel("x")
axs[1, 0].set_xlabel("Residuals")
_ = axs[1, 1].set_xlabel("Residuals")

# %%
# With the heteroscedastic Normal distributed target, we observe that the
# variance of the noise is increasing when the value of the feature `x` is
# increasing.
#
# With the asymmetric Pareto distributed target, we observe that the positive
# residuals are bounded.
#
# These types of noisy targets make the estimation via
# :class:`~sklearn.linear_model.LinearRegression` less efficient, i.e. we need
# more data to get stable results and, in addition, large outliers can have a
# huge impact on the fitted coefficients. (Stated otherwise: in a setting with
# constant variance, ordinary least squares estimators converge much faster to
# the *true* coefficients with increasing sample size.)
#
# In this asymmetric setting, the median or different quantiles give additional
# insights. On top of that, median estimation is much more robust to outliers
# and heavy tailed distributions. But note that extreme quantiles are estimated
# by very few data points. 95% quantile are more or less estimated by the 5%
# largest values and thus also a bit sensitive outliers.
#
# In the remainder of this tutorial, we will show how
# :class:`~sklearn.linear_model.QuantileRegressor` can be used in practice and
# give the intuition into the properties of the fitted models. Finally,
# we will compare the both :class:`~sklearn.linear_model.QuantileRegressor`
# and :class:`~sklearn.linear_model.LinearRegression`.
#
# Fitting a `QuantileRegressor`
# -----------------------------
#
# In this section, we want to estimate the conditional median as well as
# a low and high quantile fixed at 5% and 95%, respectively. Thus, we will get
# three linear models, one for each quantile.
#
# We will use the quantiles at 5% and 95% to find the outliers in the training
# sample beyond the central 90% interval.
from sklearn.utils.fixes import parse_version, sp_version

# This is line is to avoid incompatibility if older SciPy version.
# You should use `solver="highs"` with recent version of SciPy.
solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point"

# %%
from sklearn.linear_model import QuantileRegressor

quantiles = [0.05, 0.5, 0.95]
predictions = {}
out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_)
for quantile in quantiles:
    qr = QuantileRegressor(quantile=quantile, alpha=0, solver=solver)
    y_pred = qr.fit(X, y_normal).predict(X)
    predictions[quantile] = y_pred

    if quantile == min(quantiles):
        out_bounds_predictions = np.logical_or(
            out_bounds_predictions, y_pred >= y_normal
        )
    elif quantile == max(quantiles):
        out_bounds_predictions = np.logical_or(
            out_bounds_predictions, y_pred <= y_normal
        )

# %%
# Now, we can plot the three linear models and the distinguished samples that
# are within the central 90% interval from samples that are outside this
# interval.
plt.plot(X, y_true_mean, color="black", linestyle="dashed", label="True mean")

for quantile, y_pred in predictions.items():
    plt.plot(X, y_pred, label=f"Quantile: {quantile}")

plt.scatter(
    x[out_bounds_predictions],
    y_normal[out_bounds_predictions],
    color="black",
    marker="+",
    alpha=0.5,
    label="Outside interval",
)
plt.scatter(
    x[~out_bounds_predictions],
    y_normal[~out_bounds_predictions],
    color="black",
    alpha=0.5,
    label="Inside interval",
)

plt.legend()
plt.xlabel("x")
plt.ylabel("y")
_ = plt.title("Quantiles of heteroscedastic Normal distributed target")

# %%
# Since the noise is still Normally distributed, in particular is symmetric,
# the true conditional mean and the true conditional median coincide. Indeed,
# we see that the estimated median almost hits the true mean. We observe the
# effect of having an increasing noise variance on the 5% and 95% quantiles:
# the slopes of those quantiles are very different and the interval between
# them becomes wider with increasing `x`.
#
# To get an additional intuition regarding the meaning of the 5% and 95%
# quantiles estimators, one can count the number of samples above and below the
# predicted quantiles (represented by a cross on the above plot), considering
# that we have a total of 100 samples.
#
# We can repeat the same experiment using the asymmetric Pareto distributed
# target.
quantiles = [0.05, 0.5, 0.95]
predictions = {}
out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_)
for quantile in quantiles:
    qr = QuantileRegressor(quantile=quantile, alpha=0, solver=solver)
    y_pred = qr.fit(X, y_pareto).predict(X)
    predictions[quantile] = y_pred

    if quantile == min(quantiles):
        out_bounds_predictions = np.logical_or(
            out_bounds_predictions, y_pred >= y_pareto
        )
    elif quantile == max(quantiles):
        out_bounds_predictions = np.logical_or(
            out_bounds_predictions, y_pred <= y_pareto
        )

# %%
plt.plot(X, y_true_mean, color="black", linestyle="dashed", label="True mean")

for quantile, y_pred in predictions.items():
    plt.plot(X, y_pred, label=f"Quantile: {quantile}")

plt.scatter(
    x[out_bounds_predictions],
    y_pareto[out_bounds_predictions],
    color="black",
    marker="+",
    alpha=0.5,
    label="Outside interval",
)
plt.scatter(
    x[~out_bounds_predictions],
    y_pareto[~out_bounds_predictions],
    color="black",
    alpha=0.5,
    label="Inside interval",
)

plt.legend()
plt.xlabel("x")
plt.ylabel("y")
_ = plt.title("Quantiles of asymmetric Pareto distributed target")


# %%
# Due to the asymmetry of the distribution of the noise, we observe that the
# true mean and estimated conditional median are different. We also observe
# that each quantile model has different parameters to better fit the desired
# quantile. Note that ideally, all quantiles would be parallel in this case,
# which would become more visible with more data points or less extreme
# quantiles, e.g. 10% and 90%.
#
# Comparing `QuantileRegressor` and `LinearRegression`
# ----------------------------------------------------
#
# In this section, we will linger on the difference regarding the error that
# :class:`~sklearn.linear_model.QuantileRegressor` and
# :class:`~sklearn.linear_model.LinearRegression` are minimizing.
#
# Indeed, :class:`~sklearn.linear_model.LinearRegression` is a least squares
# approach minimizing the mean squared error (MSE) between the training and
# predicted targets. In contrast,
# :class:`~sklearn.linear_model.QuantileRegressor` with `quantile=0.5`
# minimizes the mean absolute error (MAE) instead.
#
# Let's first compute the training errors of such models in terms of mean
# squared error and mean absolute error. We will use the asymmetric Pareto
# distributed target to make it more interesting as mean and median are not
# equal.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

linear_regression = LinearRegression()
quantile_regression = QuantileRegressor(quantile=0.5, alpha=0, solver=solver)

y_pred_lr = linear_regression.fit(X, y_pareto).predict(X)
y_pred_qr = quantile_regression.fit(X, y_pareto).predict(X)

print(
    f"""Training error (in-sample performance)
    {linear_regression.__class__.__name__}:
    MAE = {mean_absolute_error(y_pareto, y_pred_lr):.3f}
    MSE = {mean_squared_error(y_pareto, y_pred_lr):.3f}
    {quantile_regression.__class__.__name__}:
    MAE = {mean_absolute_error(y_pareto, y_pred_qr):.3f}
    MSE = {mean_squared_error(y_pareto, y_pred_qr):.3f}
    """
)

# %%
# On the training set, we see that MAE is lower for
# :class:`~sklearn.linear_model.QuantileRegressor` than
# :class:`~sklearn.linear_model.LinearRegression`. In contrast to that, MSE is
# lower for :class:`~sklearn.linear_model.LinearRegression` than
# :class:`~sklearn.linear_model.QuantileRegressor`. These results confirms that
# MAE is the loss minimized by :class:`~sklearn.linear_model.QuantileRegressor`
# while MSE is the loss minimized
# :class:`~sklearn.linear_model.LinearRegression`.
#
# We can make a similar evaluation by looking at the test error obtained by
# cross-validation.
from sklearn.model_selection import cross_validate

cv_results_lr = cross_validate(
    linear_regression,
    X,
    y_pareto,
    cv=3,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
)
cv_results_qr = cross_validate(
    quantile_regression,
    X,
    y_pareto,
    cv=3,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
)
print(
    f"""Test error (cross-validated performance)
    {linear_regression.__class__.__name__}:
    MAE = {-cv_results_lr["test_neg_mean_absolute_error"].mean():.3f}
    MSE = {-cv_results_lr["test_neg_mean_squared_error"].mean():.3f}
    {quantile_regression.__class__.__name__}:
    MAE = {-cv_results_qr["test_neg_mean_absolute_error"].mean():.3f}
    MSE = {-cv_results_qr["test_neg_mean_squared_error"].mean():.3f}
    """
)

# %%
# We reach similar conclusions on the out-of-sample evaluation.