sklearn/examples/linear_model/plot_ard.py

"""
====================================
Comparing Linear Bayesian Regressors
====================================

This example compares two different bayesian regressors:

 - a :ref:`automatic_relevance_determination`
 - a :ref:`bayesian_ridge_regression`

In the first part, we use an :ref:`ordinary_least_squares` (OLS) model as a
baseline for comparing the models' coefficients with respect to the true
coefficients. Thereafter, we show that the estimation of such models is done by
iteratively maximizing the marginal log-likelihood of the observations.

In the last section we plot predictions and uncertainties for the ARD and the
Bayesian Ridge regressions using a polynomial feature expansion to fit a
non-linear relationship between `X` and `y`.

"""

# Author: Arturo Amor <david-arturo.amor-quiroz@inria.fr>

# %%
# Models robustness to recover the ground truth weights
# =====================================================
#
# Generate synthetic dataset
# --------------------------
#
# We generate a dataset where `X` and `y` are linearly linked: 10 of the
# features of `X` will be used to generate `y`. The other features are not
# useful at predicting `y`. In addition, we generate a dataset where `n_samples
# == n_features`. Such a setting is challenging for an OLS model and leads
# potentially to arbitrary large weights. Having a prior on the weights and a
# penalty alleviates the problem. Finally, gaussian noise is added.

from sklearn.datasets import make_regression

X, y, true_weights = make_regression(
    n_samples=100,
    n_features=100,
    n_informative=10,
    noise=8,
    coef=True,
    random_state=42,
)

# %%
# Fit the regressors
# ------------------
#
# We now fit both Bayesian models and the OLS to later compare the models'
# coefficients.

import pandas as pd

from sklearn.linear_model import ARDRegression, BayesianRidge, LinearRegression

olr = LinearRegression().fit(X, y)
brr = BayesianRidge(compute_score=True, max_iter=30).fit(X, y)
ard = ARDRegression(compute_score=True, max_iter=30).fit(X, y)
df = pd.DataFrame(
    {
        "Weights of true generative process": true_weights,
        "ARDRegression": ard.coef_,
        "BayesianRidge": brr.coef_,
        "LinearRegression": olr.coef_,
    }
)

# %%
# Plot the true and estimated coefficients
# ----------------------------------------
#
# Now we compare the coefficients of each model with the weights of
# the true generative model.
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import SymLogNorm

plt.figure(figsize=(10, 6))
ax = sns.heatmap(
    df.T,
    norm=SymLogNorm(linthresh=10e-4, vmin=-80, vmax=80),
    cbar_kws={"label": "coefficients' values"},
    cmap="seismic_r",
)
plt.ylabel("linear model")
plt.xlabel("coefficients")
plt.tight_layout(rect=(0, 0, 1, 0.95))
_ = plt.title("Models' coefficients")

# %%
# Due to the added noise, none of the models recover the true weights. Indeed,
# all models always have more than 10 non-zero coefficients. Compared to the OLS
# estimator, the coefficients using a Bayesian Ridge regression are slightly
# shifted toward zero, which stabilises them. The ARD regression provides a
# sparser solution: some of the non-informative coefficients are set exactly to
# zero, while shifting others closer to zero. Some non-informative coefficients
# are still present and retain large values.

# %%
# Plot the marginal log-likelihood
# --------------------------------
import numpy as np

ard_scores = -np.array(ard.scores_)
brr_scores = -np.array(brr.scores_)
plt.plot(ard_scores, color="navy", label="ARD")
plt.plot(brr_scores, color="red", label="BayesianRidge")
plt.ylabel("Log-likelihood")
plt.xlabel("Iterations")
plt.xlim(1, 30)
plt.legend()
_ = plt.title("Models log-likelihood")

# %%
# Indeed, both models minimize the log-likelihood up to an arbitrary cutoff
# defined by the `max_iter` parameter.
#
# Bayesian regressions with polynomial feature expansion
# ======================================================
# Generate synthetic dataset
# --------------------------
# We create a target that is a non-linear function of the input feature.
# Noise following a standard uniform distribution is added.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

rng = np.random.RandomState(0)
n_samples = 110

# sort the data to make plotting easier later
X = np.sort(-10 * rng.rand(n_samples) + 10)
noise = rng.normal(0, 1, n_samples) * 1.35
y = np.sqrt(X) * np.sin(X) + noise
full_data = pd.DataFrame({"input_feature": X, "target": y})
X = X.reshape((-1, 1))

# extrapolation
X_plot = np.linspace(10, 10.4, 10)
y_plot = np.sqrt(X_plot) * np.sin(X_plot)
X_plot = np.concatenate((X, X_plot.reshape((-1, 1))))
y_plot = np.concatenate((y - noise, y_plot))

# %%
# Fit the regressors
# ------------------
#
# Here we try a degree 10 polynomial to potentially overfit, though the bayesian
# linear models regularize the size of the polynomial coefficients. As
# `fit_intercept=True` by default for
# :class:`~sklearn.linear_model.ARDRegression` and
# :class:`~sklearn.linear_model.BayesianRidge`, then
# :class:`~sklearn.preprocessing.PolynomialFeatures` should not introduce an
# additional bias feature. By setting `return_std=True`, the bayesian regressors
# return the standard deviation of the posterior distribution for the model
# parameters.

ard_poly = make_pipeline(
    PolynomialFeatures(degree=10, include_bias=False),
    StandardScaler(),
    ARDRegression(),
).fit(X, y)
brr_poly = make_pipeline(
    PolynomialFeatures(degree=10, include_bias=False),
    StandardScaler(),
    BayesianRidge(),
).fit(X, y)

y_ard, y_ard_std = ard_poly.predict(X_plot, return_std=True)
y_brr, y_brr_std = brr_poly.predict(X_plot, return_std=True)

# %%
# Plotting polynomial regressions with std errors of the scores
# -------------------------------------------------------------

ax = sns.scatterplot(
    data=full_data, x="input_feature", y="target", color="black", alpha=0.75
)
ax.plot(X_plot, y_plot, color="black", label="Ground Truth")
ax.plot(X_plot, y_brr, color="red", label="BayesianRidge with polynomial features")
ax.plot(X_plot, y_ard, color="navy", label="ARD with polynomial features")
ax.fill_between(
    X_plot.ravel(),
    y_ard - y_ard_std,
    y_ard + y_ard_std,
    color="navy",
    alpha=0.3,
)
ax.fill_between(
    X_plot.ravel(),
    y_brr - y_brr_std,
    y_brr + y_brr_std,
    color="red",
    alpha=0.3,
)
ax.legend()
_ = ax.set_title("Polynomial fit of a non-linear feature")

# %%
# The error bars represent one standard deviation of the predicted gaussian
# distribution of the query points. Notice that the ARD regression captures the
# ground truth the best when using the default parameters in both models, but
# further reducing the `lambda_init` hyperparameter of the Bayesian Ridge can
# reduce its bias (see example
# :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`).
# Finally, due to the intrinsic limitations of a polynomial regression, both
# models fail when extrapolating.
first commit 2024-08-05 09:32:03 +02:00			`"""`
			`====================================`
			`Comparing Linear Bayesian Regressors`
			`====================================`

			`This example compares two different bayesian regressors:`

			- a :ref:`automatic_relevance_determination`
			- a :ref:`bayesian_ridge_regression`

			In the first part, we use an :ref:`ordinary_least_squares` (OLS) model as a
			`baseline for comparing the models' coefficients with respect to the true`
			`coefficients. Thereafter, we show that the estimation of such models is done by`
			`iteratively maximizing the marginal log-likelihood of the observations.`

			`In the last section we plot predictions and uncertainties for the ARD and the`
			`Bayesian Ridge regressions using a polynomial feature expansion to fit a`
			non-linear relationship between `X` and `y`.

			`"""`

			`# Author: Arturo Amor <david-arturo.amor-quiroz@inria.fr>`

			`# %%`
			`# Models robustness to recover the ground truth weights`
			`# =====================================================`
			`#`
			`# Generate synthetic dataset`
			`# --------------------------`
			`#`
			# We generate a dataset where `X` and `y` are linearly linked: 10 of the
			# features of `X` will be used to generate `y`. The other features are not
			# useful at predicting `y`. In addition, we generate a dataset where `n_samples
			# == n_features`. Such a setting is challenging for an OLS model and leads
			`# potentially to arbitrary large weights. Having a prior on the weights and a`
			`# penalty alleviates the problem. Finally, gaussian noise is added.`

			`from sklearn.datasets import make_regression`

			`X, y, true_weights = make_regression(`
			`n_samples=100,`
			`n_features=100,`
			`n_informative=10,`
			`noise=8,`
			`coef=True,`
			`random_state=42,`
			`)`

			`# %%`
			`# Fit the regressors`
			`# ------------------`
			`#`
			`# We now fit both Bayesian models and the OLS to later compare the models'`
			`# coefficients.`

			`import pandas as pd`

			`from sklearn.linear_model import ARDRegression, BayesianRidge, LinearRegression`

			`olr = LinearRegression().fit(X, y)`
			`brr = BayesianRidge(compute_score=True, max_iter=30).fit(X, y)`
			`ard = ARDRegression(compute_score=True, max_iter=30).fit(X, y)`
			`df = pd.DataFrame(`
			`{`
			`"Weights of true generative process": true_weights,`
			`"ARDRegression": ard.coef_,`
			`"BayesianRidge": brr.coef_,`
			`"LinearRegression": olr.coef_,`
			`}`
			`)`

			`# %%`
			`# Plot the true and estimated coefficients`
			`# ----------------------------------------`
			`#`
			`# Now we compare the coefficients of each model with the weights of`
			`# the true generative model.`
			`import matplotlib.pyplot as plt`
			`import seaborn as sns`
			`from matplotlib.colors import SymLogNorm`

			`plt.figure(figsize=(10, 6))`
			`ax = sns.heatmap(`
			`df.T,`
			`norm=SymLogNorm(linthresh=10e-4, vmin=-80, vmax=80),`
			`cbar_kws={"label": "coefficients' values"},`
			`cmap="seismic_r",`
			`)`
			`plt.ylabel("linear model")`
			`plt.xlabel("coefficients")`
			`plt.tight_layout(rect=(0, 0, 1, 0.95))`
			`_ = plt.title("Models' coefficients")`

			`# %%`
			`# Due to the added noise, none of the models recover the true weights. Indeed,`
			`# all models always have more than 10 non-zero coefficients. Compared to the OLS`
			`# estimator, the coefficients using a Bayesian Ridge regression are slightly`
			`# shifted toward zero, which stabilises them. The ARD regression provides a`
			`# sparser solution: some of the non-informative coefficients are set exactly to`
			`# zero, while shifting others closer to zero. Some non-informative coefficients`
			`# are still present and retain large values.`

			`# %%`
			`# Plot the marginal log-likelihood`
			`# --------------------------------`
			`import numpy as np`

			`ard_scores = -np.array(ard.scores_)`
			`brr_scores = -np.array(brr.scores_)`
			`plt.plot(ard_scores, color="navy", label="ARD")`
			`plt.plot(brr_scores, color="red", label="BayesianRidge")`
			`plt.ylabel("Log-likelihood")`
			`plt.xlabel("Iterations")`
			`plt.xlim(1, 30)`
			`plt.legend()`
			`_ = plt.title("Models log-likelihood")`

			`# %%`
			`# Indeed, both models minimize the log-likelihood up to an arbitrary cutoff`
			# defined by the `max_iter` parameter.
			`#`
			`# Bayesian regressions with polynomial feature expansion`
			`# ======================================================`
			`# Generate synthetic dataset`
			`# --------------------------`
			`# We create a target that is a non-linear function of the input feature.`
			`# Noise following a standard uniform distribution is added.`

			`from sklearn.pipeline import make_pipeline`
			`from sklearn.preprocessing import PolynomialFeatures, StandardScaler`

			`rng = np.random.RandomState(0)`
			`n_samples = 110`

			`# sort the data to make plotting easier later`
			`X = np.sort(-10 * rng.rand(n_samples) + 10)`
			`noise = rng.normal(0, 1, n_samples) * 1.35`
			`y = np.sqrt(X) * np.sin(X) + noise`
			`full_data = pd.DataFrame({"input_feature": X, "target": y})`
			`X = X.reshape((-1, 1))`

			`# extrapolation`
			`X_plot = np.linspace(10, 10.4, 10)`
			`y_plot = np.sqrt(X_plot) * np.sin(X_plot)`
			`X_plot = np.concatenate((X, X_plot.reshape((-1, 1))))`
			`y_plot = np.concatenate((y - noise, y_plot))`

			`# %%`
			`# Fit the regressors`
			`# ------------------`
			`#`
			`# Here we try a degree 10 polynomial to potentially overfit, though the bayesian`
			`# linear models regularize the size of the polynomial coefficients. As`
			# `fit_intercept=True` by default for
			# :class:`~sklearn.linear_model.ARDRegression` and
			# :class:`~sklearn.linear_model.BayesianRidge`, then
			# :class:`~sklearn.preprocessing.PolynomialFeatures` should not introduce an
			# additional bias feature. By setting `return_std=True`, the bayesian regressors
			`# return the standard deviation of the posterior distribution for the model`
			`# parameters.`

			`ard_poly = make_pipeline(`
			`PolynomialFeatures(degree=10, include_bias=False),`
			`StandardScaler(),`
			`ARDRegression(),`
			`).fit(X, y)`
			`brr_poly = make_pipeline(`
			`PolynomialFeatures(degree=10, include_bias=False),`
			`StandardScaler(),`
			`BayesianRidge(),`
			`).fit(X, y)`

			`y_ard, y_ard_std = ard_poly.predict(X_plot, return_std=True)`
			`y_brr, y_brr_std = brr_poly.predict(X_plot, return_std=True)`

			`# %%`
			`# Plotting polynomial regressions with std errors of the scores`
			`# -------------------------------------------------------------`

			`ax = sns.scatterplot(`
			`data=full_data, x="input_feature", y="target", color="black", alpha=0.75`
			`)`
			`ax.plot(X_plot, y_plot, color="black", label="Ground Truth")`
			`ax.plot(X_plot, y_brr, color="red", label="BayesianRidge with polynomial features")`
			`ax.plot(X_plot, y_ard, color="navy", label="ARD with polynomial features")`
			`ax.fill_between(`
			`X_plot.ravel(),`
			`y_ard - y_ard_std,`
			`y_ard + y_ard_std,`
			`color="navy",`
			`alpha=0.3,`
			`)`
			`ax.fill_between(`
			`X_plot.ravel(),`
			`y_brr - y_brr_std,`
			`y_brr + y_brr_std,`
			`color="red",`
			`alpha=0.3,`
			`)`
			`ax.legend()`
			`_ = ax.set_title("Polynomial fit of a non-linear feature")`

			`# %%`
			`# The error bars represent one standard deviation of the predicted gaussian`
			`# distribution of the query points. Notice that the ARD regression captures the`
			`# ground truth the best when using the default parameters in both models, but`
			# further reducing the `lambda_init` hyperparameter of the Bayesian Ridge can
			`# reduce its bias (see example`
			# :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`).
			`# Finally, due to the intrinsic limitations of a polynomial regression, both`
			`# models fail when extrapolating.`