"""
|
||
==============================================
|
||
Features in Histogram Gradient Boosting Trees
|
||
==============================================
|
||
|
||
:ref:`histogram_based_gradient_boosting` (HGBT) models may be one of the most
|
||
useful supervised learning models in scikit-learn. They are based on a modern
|
||
gradient boosting implementation comparable to LightGBM and XGBoost. As such,
|
||
HGBT models are more feature rich than and often outperform alternative models
|
||
like random forests, especially when the number of samples is larger than some
|
||
ten thousands (see
|
||
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`).
|
||
|
||
The top usability features of HGBT models are:
|
||
|
||
1. Several available loss functions for mean and quantile regression tasks, see
|
||
:ref:`Quantile loss <quantile_support_hgbdt>`.
|
||
2. :ref:`categorical_support_gbdt`, see
|
||
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`.
|
||
3. Early stopping.
|
||
4. :ref:`nan_support_hgbt`, which avoids the need for an imputer.
|
||
5. :ref:`monotonic_cst_gbdt`.
|
||
6. :ref:`interaction_cst_hgbt`.
|
||
|
||
This example aims at showcasing all points except 2 and 6 in a real life
|
||
setting.
|
||
"""
|
||
|
||
# Author: Arturo Amor <david-arturo.amor-quiroz@inria.fr>
|
||
# License: BSD 3 clause
|
||
|
||
# %%
# Preparing the data
# ==================
# The `electricity dataset <http://www.openml.org/d/151>`_ consists of data
# collected from the Australian New South Wales Electricity Market. In this
# market, prices are not fixed and are affected by supply and demand. They are
# set every five minutes. Electricity transfers to/from the neighboring state of
# Victoria were done to alleviate fluctuations.
#
# The dataset, originally named ELEC2, contains 45,312 instances dated from 7
# May 1996 to 5 December 1998. Each sample of the dataset refers to a period of
# 30 minutes, i.e. there are 48 instances for each time period of one day. Each
# sample in the dataset has 7 columns:
#
# - date: between 7 May 1996 and 5 December 1998. Normalized between 0 and 1;
# - day: day of the week (1-7);
# - period: half hour intervals over 24 hours. Normalized between 0 and 1;
# - nswprice/nswdemand: electricity price/demand of New South Wales;
# - vicprice/vicdemand: electricity price/demand of Victoria.
#
# Originally, it is a classification task, but here we use it for the regression
# task to predict the scheduled electricity transfer between states.

from sklearn.datasets import fetch_openml

electricity = fetch_openml(
    name="electricity", version=1, as_frame=True, parser="pandas"
)
df = electricity.frame

# %%
# This particular dataset has a stepwise constant target for the first 17,760
# samples:

df["transfer"][:17_760].unique()

# %%
# Let us drop those entries and explore the hourly electricity transfer over
# different days of the week:

import matplotlib.pyplot as plt
import seaborn as sns

df = electricity.frame.iloc[17_760:]
X = df.drop(columns=["transfer", "class"])
y = df["transfer"]

fig, ax = plt.subplots(figsize=(15, 10))
pointplot = sns.lineplot(x=df["period"], y=df["transfer"], hue=df["day"], ax=ax)
handles, labels = ax.get_legend_handles_labels()
ax.set(
    title="Hourly energy transfer for different days of the week",
    xlabel="Normalized time of the day",
    ylabel="Normalized energy transfer",
)
_ = ax.legend(handles, ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])

# %%
# Notice that energy transfer increases systematically during weekends.
#
# Effect of number of trees and early stopping
# ============================================
# For the sake of illustrating the effect of the (maximum) number of trees, we
# train a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` over the
# daily electricity transfer using the whole dataset. Then we visualize its
# predictions depending on the `max_iter` parameter. Here we don't try to
# evaluate the performance of the model and its capacity to generalize but
# rather its capability to learn from the training data.

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)

print(f"Training sample size: {X_train.shape[0]}")
print(f"Test sample size: {X_test.shape[0]}")
print(f"Number of features: {X_train.shape[1]}")

# %%
max_iter_list = [5, 50]
average_week_demand = (
    df.loc[X_test.index].groupby(["day", "period"], observed=False)["transfer"].mean()
)
colors = sns.color_palette("colorblind")
fig, ax = plt.subplots(figsize=(10, 5))
average_week_demand.plot(color=colors[0], label="recorded average", linewidth=2, ax=ax)

for idx, max_iter in enumerate(max_iter_list):
    hgbt = HistGradientBoostingRegressor(
        max_iter=max_iter, categorical_features=None, random_state=42
    )
    hgbt.fit(X_train, y_train)

    y_pred = hgbt.predict(X_test)
    prediction_df = df.loc[X_test.index].copy()
    prediction_df["y_pred"] = y_pred
    average_pred = prediction_df.groupby(["day", "period"], observed=False)[
        "y_pred"
    ].mean()
    average_pred.plot(
        color=colors[idx + 1], label=f"max_iter={max_iter}", linewidth=2, ax=ax
    )

ax.set(
    title="Predicted average energy transfer during the week",
    xticks=[(i + 0.2) * 48 for i in range(7)],
    xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
    xlabel="Time of the week",
    ylabel="Normalized energy transfer",
)
_ = ax.legend()

# %%
# With just a few iterations, HGBT models can achieve convergence (see
# :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`),
# meaning that adding more trees does not improve the model anymore. In the
# figure above, 5 iterations are not enough to get good predictions. With 50
# iterations, we are already able to do a good job.
#
# Setting `max_iter` too high might degrade the prediction quality and cost a
# lot of avoidable computing resources. Therefore, the HGBT implementation in
# scikit-learn provides an automatic **early stopping** strategy. With it, the
# model uses a fraction of the training data as an internal validation set
# (`validation_fraction`) and stops training if the validation score does not
# improve (or degrades) after `n_iter_no_change` iterations up to a certain
# tolerance (`tol`).
#
# Notice that there is a trade-off between `learning_rate` and `max_iter`:
# generally, smaller learning rates are preferable but require more iterations
# to converge to the minimum loss, while larger learning rates converge faster
# (fewer iterations/trees needed) but at the cost of a larger minimum loss.
#
# Because of this high correlation between the learning rate and the number of
# iterations, a good practice is to tune the learning rate along with all
# (important) other hyperparameters, fit the HGBT on the training set with a
# large enough value for `max_iter` and determine the best `max_iter` via early
# stopping and some explicit `validation_fraction`.
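#
# As a hedged aside (a minimal sketch that is not part of the original
# example), we can make this trade-off visible by letting early stopping pick
# the number of iterations for two different learning rates. The two values
# below are arbitrary choices for illustration; we would typically expect the
# smaller learning rate to need more boosting iterations.

for lr in (0.05, 0.3):
    model = HistGradientBoostingRegressor(
        learning_rate=lr,
        max_iter=1_000,
        early_stopping=True,
        validation_fraction=0.2,
        categorical_features=None,
        random_state=42,
    )
    model.fit(X_train, y_train)
    print(f"learning_rate={lr}: early stopping after {model.n_iter_} iterations")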

# %%
common_params = {
    "max_iter": 1_000,
    "learning_rate": 0.3,
    "validation_fraction": 0.2,
    "random_state": 42,
    "categorical_features": None,
    "scoring": "neg_root_mean_squared_error",
}

hgbt = HistGradientBoostingRegressor(early_stopping=True, **common_params)
hgbt.fit(X_train, y_train)

_, ax = plt.subplots()
plt.plot(-hgbt.validation_score_)
_ = ax.set(
    xlabel="number of iterations",
    ylabel="root mean squared error",
    title=f"Loss of hgbt with early stopping (n_iter={hgbt.n_iter_})",
)

# %%
# We can then set `max_iter` to a reasonable value and avoid the extra
# computational cost of the inner validation. Rounding up the number of
# iterations may account for variability of the training set:

import math

common_params["max_iter"] = math.ceil(hgbt.n_iter_ / 100) * 100
common_params["early_stopping"] = False
hgbt = HistGradientBoostingRegressor(**common_params)

# %%
# .. note:: The inner validation done during early stopping is not optimal for
#    time series.
#
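# The following is a minimal sketch (not part of the original example) of one
# way to respect the temporal ordering instead: hold out the chronologically
# last 20% of the training period as a validation set and compare a few
# candidate values of `max_iter` on it. Both the holdout fraction and the
# candidate values are arbitrary choices made for illustration.

from sklearn.metrics import root_mean_squared_error

n_holdout = int(0.2 * X_train.shape[0])  # chronologically last 20% of the training set
X_fit, X_val = X_train[:-n_holdout], X_train[-n_holdout:]
y_fit, y_val = y_train[:-n_holdout], y_train[-n_holdout:]

for candidate_max_iter in (50, 100, 200):
    model = HistGradientBoostingRegressor(
        max_iter=candidate_max_iter,
        learning_rate=common_params["learning_rate"],
        categorical_features=None,
        random_state=42,
    )
    model.fit(X_fit, y_fit)
    rmse_val = root_mean_squared_error(y_val, model.predict(X_val))
    print(f"max_iter={candidate_max_iter}: time-ordered validation RMSE={rmse_val:.3f}")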

# %%
# Support for missing values
# ==========================
# HGBT models have native support for missing values. During training, the tree
# grower decides where samples with missing values should go (left or right
# child) at each split, based on the potential gain. When predicting, these
# samples are sent to the learnt child accordingly. If a feature had no missing
# values during training, then for prediction, samples with missing values for
# that feature are sent to the child with the most samples (as seen during fit).
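#
# As a quick, hedged illustration (not part of the original example), a model
# fitted on complete data can directly score rows where a feature is missing,
# so no imputation step is needed at prediction time. Blanking out `nswprice`
# below is an arbitrary choice made for demonstration purposes.

import numpy as np

hgbt.fit(X_train, y_train)
X_head_with_nan = X_test.head().copy()
X_head_with_nan["nswprice"] = np.nan  # pretend this feature was not recorded
print(hgbt.predict(X_head_with_nan))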

# %%
# The present example shows how HGBT regressors deal with values missing
# completely at random (MCAR), i.e. the missingness does not depend on the
# observed data or the unobserved data. We can simulate such a scenario by
# randomly replacing values from randomly selected features with `nan` values.

import numpy as np

from sklearn.metrics import root_mean_squared_error

rng = np.random.RandomState(42)
first_week = slice(0, 336)  # first week in the test set as 7 * 48 = 336
missing_fraction_list = [0, 0.01, 0.03]


def generate_missing_values(X, missing_fraction):
    total_cells = X.shape[0] * X.shape[1]
    num_missing_cells = int(total_cells * missing_fraction)
    row_indices = rng.choice(X.shape[0], num_missing_cells, replace=True)
    col_indices = rng.choice(X.shape[1], num_missing_cells, replace=True)
    # build a boolean mask of the sampled (row, column) pairs; note that
    # `X.iloc[row_indices, col_indices] = np.nan` would instead blank the whole
    # cross product of the sampled rows and columns, i.e. far more cells than
    # the requested fraction
    mask = np.zeros(X.shape, dtype=bool)
    mask[row_indices, col_indices] = True
    return X.mask(mask)


fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values[first_week], label="Actual transfer")

for missing_fraction in missing_fraction_list:
    X_train_missing = generate_missing_values(X_train, missing_fraction)
    X_test_missing = generate_missing_values(X_test, missing_fraction)
    hgbt.fit(X_train_missing, y_train)
    y_pred = hgbt.predict(X_test_missing[first_week])
    rmse = root_mean_squared_error(y_test[first_week], y_pred)
    ax.plot(
        y_pred,  # already restricted to the first week
        label=f"missing_fraction={missing_fraction}, RMSE={rmse:.3f}",
        alpha=0.5,
    )
ax.set(
    title="Daily energy transfer predictions on data with MCAR values",
    xticks=[(i + 0.2) * 48 for i in range(7)],
    xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
    xlabel="Time of the week",
    ylabel="Normalized energy transfer",
)
_ = ax.legend(loc="lower right")

# %%
# As expected, the model degrades as the proportion of missing values increases.
#
# Support for quantile loss
# =========================
#
# The quantile loss in regression enables a view of the variability or
# uncertainty of the target variable. For instance, predicting the 5th and 95th
# percentiles can provide a 90% prediction interval, i.e. the range within which
# we expect a new observed value to fall with 90% probability.

from sklearn.metrics import mean_pinball_loss

quantiles = [0.95, 0.05]
predictions = []

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values[first_week], label="Actual transfer")

for quantile in quantiles:
    hgbt_quantile = HistGradientBoostingRegressor(
        loss="quantile", quantile=quantile, **common_params
    )
    hgbt_quantile.fit(X_train, y_train)
    y_pred = hgbt_quantile.predict(X_test[first_week])

    predictions.append(y_pred)
    score = mean_pinball_loss(y_test[first_week], y_pred, alpha=quantile)
    ax.plot(
        y_pred,  # already restricted to the first week
        label=f"quantile={quantile}, pinball loss={score:.2f}",
        alpha=0.5,
    )

ax.fill_between(
    range(len(predictions[0])),
    predictions[0],
    predictions[1],
    color=colors[0],
    alpha=0.1,
)
ax.set(
    title="Daily energy transfer predictions with quantile loss",
    xticks=[(i + 0.2) * 48 for i in range(7)],
    xticklabels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
    xlabel="Time of the week",
    ylabel="Normalized energy transfer",
)
_ = ax.legend(loc="lower right")

# %%
# We observe a tendency to over-estimate the energy transfer. This could be
# quantitatively confirmed by computing empirical coverage numbers as done in
# the :ref:`calibration of confidence intervals section <calibration-section>`;
# a rough version of such a check is sketched after the list below. Keep in
# mind that those predicted percentiles are just estimations from a model. One
# can still improve the quality of such estimations by:
#
# - collecting more data points;
# - better tuning of the model hyperparameters, see
#   :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`;
# - engineering more predictive features from the same data, see
#   :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`.
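#
# As a rough, hedged check (a minimal sketch rather than the full analysis of
# the referenced example), we can estimate the empirical coverage of the
# interval spanned by the two predicted quantiles over the plotted week:

# `predictions[0]` holds the 0.95-quantile predictions and `predictions[1]`
# the 0.05-quantile predictions for the first week of the test set
y_low, y_high = predictions[1], predictions[0]
actual = y_test.values[first_week]
coverage = np.mean((actual >= y_low) & (actual <= y_high))
print(f"Empirical coverage of the [0.05, 0.95] interval: {coverage:.1%}")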

# %%
# Monotonic constraints
# =====================
#
# Given specific domain knowledge that requires the relationship between a
# feature and the target to be monotonically increasing or decreasing, one can
# enforce such behaviour in the predictions of a HGBT model using monotonic
# constraints. This makes the model more interpretable and can reduce its
# variance (and potentially mitigate overfitting) at the risk of increasing
# bias. Monotonic constraints can also be used to enforce specific regulatory
# requirements, ensure compliance and align with ethical considerations.
#
# In the present example, the policy of transferring energy from Victoria to New
# South Wales is meant to alleviate price fluctuations, meaning that the model
# predictions have to enforce that goal, i.e. transfer should increase with
# price and demand in New South Wales, but also decrease with price and demand
# in Victoria, in order to benefit both populations.
#
# If the training data has feature names, it's possible to specify the monotonic
# constraints by passing a dictionary with the convention:
#
# - 1: monotonic increase
# - 0: no constraint
# - -1: monotonic decrease
#
# Alternatively, one can pass an array-like object encoding the above convention
# by position, as in the sketch right below.
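#
# For illustration only, here is a hedged sketch of the positional form. It
# assumes the column order of `X` is date, day, period, nswprice, nswdemand,
# vicprice, vicdemand, and is meant to be equivalent to the dictionary used in
# the next code block.

# one entry per column of X, in column order (the order is an assumption of
# this sketch; double-check it against the printed `X.columns`)
monotonic_cst_by_position = [0, 0, 0, 1, 1, -1, -1]
hgbt_cst_positional = HistGradientBoostingRegressor(
    monotonic_cst=monotonic_cst_by_position, categorical_features=None, random_state=42
)
print(X.columns.tolist())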

# %%
from sklearn.inspection import PartialDependenceDisplay

monotonic_cst = {
    "date": 0,
    "day": 0,
    "period": 0,
    "nswdemand": 1,
    "nswprice": 1,
    "vicdemand": -1,
    "vicprice": -1,
}
hgbt_no_cst = HistGradientBoostingRegressor(
    categorical_features=None, random_state=42
).fit(X, y)
hgbt_cst = HistGradientBoostingRegressor(
    monotonic_cst=monotonic_cst, categorical_features=None, random_state=42
).fit(X, y)

fig, ax = plt.subplots(nrows=2, figsize=(15, 10))
disp = PartialDependenceDisplay.from_estimator(
    hgbt_no_cst,
    X,
    features=["nswdemand", "nswprice"],
    line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
    ax=ax[0],
)
PartialDependenceDisplay.from_estimator(
    hgbt_cst,
    X,
    features=["nswdemand", "nswprice"],
    line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
    ax=disp.axes_,
)
disp = PartialDependenceDisplay.from_estimator(
    hgbt_no_cst,
    X,
    features=["vicdemand", "vicprice"],
    line_kw={"linewidth": 2, "label": "unconstrained", "color": "tab:blue"},
    ax=ax[1],
)
PartialDependenceDisplay.from_estimator(
    hgbt_cst,
    X,
    features=["vicdemand", "vicprice"],
    line_kw={"linewidth": 2, "label": "constrained", "color": "tab:orange"},
    ax=disp.axes_,
)
_ = plt.legend()

# %%
# Observe that `nswdemand` and `vicdemand` already seem monotonic without any
# constraint, so in this case the monotonicity constraints are somewhat
# "overconstraining" the model.
#
# Additionally, we can verify that the predictive quality of the model is not
# significantly degraded by introducing the monotonic constraints. For that
# purpose we use :class:`~sklearn.model_selection.TimeSeriesSplit`
# cross-validation to estimate the variance of the test score. By doing so we
# guarantee that the training data never comes chronologically after the test
# data, which is crucial when dealing with data that have a temporal
# relationship.

from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, cross_validate

ts_cv = TimeSeriesSplit(n_splits=5, gap=48, test_size=336)  # a week has 336 samples
scorer = make_scorer(root_mean_squared_error)

cv_results = cross_validate(hgbt_no_cst, X, y, cv=ts_cv, scoring=scorer)
rmse = cv_results["test_score"]
print(f"RMSE without constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")

cv_results = cross_validate(hgbt_cst, X, y, cv=ts_cv, scoring=scorer)
rmse = cv_results["test_score"]
print(f"RMSE with constraints = {rmse.mean():.3f} +/- {rmse.std():.3f}")

# %%
# That being said, notice the comparison is between two different models that
# may be optimized by a different combination of hyperparameters. That is the
# reason why we do not use the `common_params` in this section as we did before.