sklearn/examples/calibration/plot_compare_calibration.py

"""
========================================
Comparison of Calibration of Classifiers
========================================

Well calibrated classifiers are probabilistic classifiers for which the output
of :term:`predict_proba` can be directly interpreted as a confidence level.
For instance, a well calibrated (binary) classifier should classify the samples
such that for the samples to which it gave a :term:`predict_proba` value close
to 0.8, approximately 80% actually belong to the positive class.

In this example we will compare the calibration of four different
models: :ref:`Logistic_regression`, :ref:`gaussian_naive_bayes`,
:ref:`Random Forest Classifier <forest>` and :ref:`Linear SVM
<svm_classification>`.

"""

# %%
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause.
#
# Dataset
# -------
#
# We will use a synthetic binary classification dataset with 100,000 samples
# and 20 features. Of the 20 features, only 2 are informative, 2 are
# redundant (random combinations of the informative features) and the
# remaining 16 are uninformative (random numbers).
#
# Of the 100,000 samples, 100 will be used for model fitting and the remaining
# for testing. Note that this split is quite unusual: the goal is to obtain
# stable calibration curve estimates for models that are potentially prone to
# overfitting. In practice, one should rather use cross-validation with more
# balanced splits but this would make the code of this example more complicated
# to follow.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_samples=100_000, n_features=20, n_informative=2, n_redundant=2, random_state=42
)

train_samples = 100  # Samples used for training the models
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=100_000 - train_samples,
)

# %%
# Calibration curves
# ------------------
#
# Below, we train each of the four models with the small training dataset, then
# plot calibration curves (also known as reliability diagrams) using
# predicted probabilities of the test dataset. Calibration curves are created
# by binning predicted probabilities, then plotting the mean predicted
# probability in each bin against the observed frequency ('fraction of
# positives'). Below the calibration curve, we plot a histogram showing
# the distribution of the predicted probabilities or more specifically,
# the number of samples in each predicted probability bin.

import numpy as np

from sklearn.svm import LinearSVC


class NaivelyCalibratedLinearSVC(LinearSVC):
    """LinearSVC with `predict_proba` method that naively scales
    `decision_function` output."""

    def fit(self, X, y):
        super().fit(X, y)
        df = self.decision_function(X)
        self.df_min_ = df.min()
        self.df_max_ = df.max()

    def predict_proba(self, X):
        """Min-max scale output of `decision_function` to [0,1]."""
        df = self.decision_function(X)
        calibrated_df = (df - self.df_min_) / (self.df_max_ - self.df_min_)
        proba_pos_class = np.clip(calibrated_df, 0, 1)
        proba_neg_class = 1 - proba_pos_class
        proba = np.c_[proba_neg_class, proba_pos_class]
        return proba


# %%

from sklearn.calibration import CalibrationDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB

# Define the classifiers to be compared in the study.
#
# Note that we use a variant of the logistic regression model that can
# automatically tune its regularization parameter.
#
# For a fair comparison, we should run a hyper-parameter search for all the
# classifiers but we don't do it here for the sake of keeping the example code
# concise and fast to execute.
lr = LogisticRegressionCV(
    Cs=np.logspace(-6, 6, 101), cv=10, scoring="neg_log_loss", max_iter=1_000
)
gnb = GaussianNB()
svc = NaivelyCalibratedLinearSVC(C=1.0)
rfc = RandomForestClassifier(random_state=42)

clf_list = [
    (lr, "Logistic Regression"),
    (gnb, "Naive Bayes"),
    (svc, "SVC"),
    (rfc, "Random forest"),
]

# %%

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

fig = plt.figure(figsize=(10, 10))
gs = GridSpec(4, 2)
colors = plt.get_cmap("Dark2")

ax_calibration_curve = fig.add_subplot(gs[:2, :2])
calibration_displays = {}
markers = ["^", "v", "s", "o"]
for i, (clf, name) in enumerate(clf_list):
    clf.fit(X_train, y_train)
    display = CalibrationDisplay.from_estimator(
        clf,
        X_test,
        y_test,
        n_bins=10,
        name=name,
        ax=ax_calibration_curve,
        color=colors(i),
        marker=markers[i],
    )
    calibration_displays[name] = display

ax_calibration_curve.grid()
ax_calibration_curve.set_title("Calibration plots")

# Add histogram
grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
for i, (_, name) in enumerate(clf_list):
    row, col = grid_positions[i]
    ax = fig.add_subplot(gs[row, col])

    ax.hist(
        calibration_displays[name].y_prob,
        range=(0, 1),
        bins=10,
        label=name,
        color=colors(i),
    )
    ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count")

plt.tight_layout()
plt.show()

# %%
#
# Analysis of the results
# -----------------------
#
# :class:`~sklearn.linear_model.LogisticRegressionCV` returns reasonably well
# calibrated predictions despite the small training set size: its reliability
# curve is the closest to the diagonal among the four models.
#
# Logistic regression is trained by minimizing the log-loss which is a strictly
# proper scoring rule: in the limit of infinite training data, strictly proper
# scoring rules are minimized by the model that predicts the true conditional
# probabilities. That (hypothetical) model would therefore be perfectly
# calibrated. However, using a proper scoring rule as training objective is not
# sufficient to guarantee a well-calibrated model by itself: even with a very
# large training set, logistic regression could still be poorly calibrated, if
# it was too strongly regularized or if the choice and preprocessing of input
# features made this model mis-specified (e.g. if the true decision boundary of
# the dataset is a highly non-linear function of the input features).
#
# In this example the training set was intentionally kept very small. In this
# setting, optimizing the log-loss can still lead to poorly calibrated models
# because of overfitting. To mitigate this, the
# :class:`~sklearn.linear_model.LogisticRegressionCV` class was configured to
# tune the `C` regularization parameter to also minimize the log-loss via inner
# cross-validation so as to find the best compromise for this model in the
# small training set setting.
#
# Because of the finite training set size and the lack of guarantee for
# well-specification, we observe that the calibration curve of the logistic
# regression model is close but not perfectly on the diagonal. The shape of the
# calibration curve of this model can be interpreted as slightly
# under-confident: the predicted probabilities are a bit too close to 0.5
# compared to the true fraction of positive samples.
#
# The other methods all output less well calibrated probabilities:
#
# * :class:`~sklearn.naive_bayes.GaussianNB` tends to push probabilities to 0
#   or 1 (see histogram) on this particular dataset (over-confidence). This is
#   mainly because the naive Bayes equation only provides correct estimate of
#   probabilities when the assumption that features are conditionally
#   independent holds [2]_. However, features can be correlated and this is the case
#   with this dataset, which contains 2 features generated as random linear
#   combinations of the informative features. These correlated features are
#   effectively being 'counted twice', resulting in pushing the predicted
#   probabilities towards 0 and 1 [3]_. Note, however, that changing the seed
#   used to generate the dataset can lead to widely varying results for the
#   naive Bayes estimator.
#
# * :class:`~sklearn.svm.LinearSVC` is not a natural probabilistic classifier.
#   In order to interpret its prediction as such, we naively scaled the output
#   of the :term:`decision_function` into [0, 1] by applying min-max scaling in
#   the `NaivelyCalibratedLinearSVC` wrapper class defined above. This
#   estimator shows a typical sigmoid-shaped calibration curve on this data:
#   predictions larger than 0.5 correspond to samples with an even larger
#   effective positive class fraction (above the diagonal), while predictions
#   below 0.5 corresponds to even lower positive class fractions (below the
#   diagonal). This under-confident predictions are typical for maximum-margin
#   methods [1]_.
#
# * :class:`~sklearn.ensemble.RandomForestClassifier`'s prediction histogram
#   shows peaks at approx. 0.2 and 0.9 probability, while probabilities close to
#   0 or 1 are very rare. An explanation for this is given by [1]_:
#   "Methods such as bagging and random forests that average
#   predictions from a base set of models can have difficulty making
#   predictions near 0 and 1 because variance in the underlying base models
#   will bias predictions that should be near zero or one away from these
#   values. Because predictions are restricted to the interval [0, 1], errors
#   caused by variance tend to be one-sided near zero and one. For example, if
#   a model should predict p = 0 for a case, the only way bagging can achieve
#   this is if all bagged trees predict zero. If we add noise to the trees that
#   bagging is averaging over, this noise will cause some trees to predict
#   values larger than 0 for this case, thus moving the average prediction of
#   the bagged ensemble away from 0. We observe this effect most strongly with
#   random forests because the base-level trees trained with random forests
#   have relatively high variance due to feature subsetting." This effect can
#   make random forests under-confident. Despite this possible bias, note that
#   the trees themselves are fit by minimizing either the Gini or Entropy
#   criterion, both of which lead to splits that minimize proper scoring rules:
#   the Brier score or the log-loss respectively. See :ref:`the user guide
#   <tree_mathematical_formulation>` for more details. This can explain why
#   this model shows a good enough calibration curve on this particular example
#   dataset. Indeed the Random Forest model is not significantly more
#   under-confident than the Logistic Regression model.
#
# Feel free to re-run this example with different random seeds and other
# dataset generation parameters to see how different the calibration plots can
# look. In general, Logistic Regression and Random Forest will tend to be the
# best calibrated classifiers, while SVC will often display the typical
# under-confident miscalibration. The naive Bayes model is also often poorly
# calibrated but the general shape of its calibration curve can vary widely
# depending on the dataset.
#
# Finally, note that for some dataset seeds, all models are poorly calibrated,
# even when tuning the regularization parameter as above. This is bound to
# happen when the training size is too small or when the model is severely
# misspecified.
#
# References
# ----------
#
# .. [1] `Predicting Good Probabilities with Supervised Learning
#        <https://dl.acm.org/doi/pdf/10.1145/1102351.1102430>`_, A.
#        Niculescu-Mizil & R. Caruana, ICML 2005
#
# .. [2] `Beyond independence: Conditions for the optimality of the simple
#        bayesian classifier
#        <https://www.ics.uci.edu/~pazzani/Publications/mlc96-pedro.pdf>`_
#        Domingos, P., & Pazzani, M., Proc. 13th Intl. Conf. Machine Learning.
#        1996.
#
# .. [3] `Obtaining calibrated probability estimates from decision trees and
#        naive Bayesian classifiers
#        <https://citeseerx.ist.psu.edu/doc_view/pid/4f67a122ec3723f08ad5cbefecad119b432b3304>`_
#        Zadrozny, Bianca, and Charles Elkan. Icml. Vol. 1. 2001.