# ruff: noqa
"""
=======================================
Release Highlights for scikit-learn 1.1
=======================================

.. currentmodule:: sklearn

We are pleased to announce the release of scikit-learn 1.1! Many bug fixes
and improvements were added, as well as some new key features. We detail
below a few of the major features of this release. **For an exhaustive list of
all the changes**, please refer to the :ref:`release notes <release_notes_1_1>`.

To install the latest version (with pip)::

    pip install --upgrade scikit-learn

or with conda::

    conda install -c conda-forge scikit-learn

"""

# %%
# .. _quantile_support_hgbdt:
#
# Quantile loss in :class:`~ensemble.HistGradientBoostingRegressor`
# -----------------------------------------------------------------
# :class:`~ensemble.HistGradientBoostingRegressor` can model quantiles with
# `loss="quantile"` and the new parameter `quantile`.
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np
import matplotlib.pyplot as plt

# Simple regression function for X * cos(X)
rng = np.random.RandomState(42)
X_1d = np.linspace(0, 10, num=2000)
X = X_1d.reshape(-1, 1)
y = X_1d * np.cos(X_1d) + rng.normal(scale=X_1d / 3)

quantiles = [0.95, 0.5, 0.05]
parameters = dict(loss="quantile", max_bins=32, max_iter=50)
hist_quantiles = {
    f"quantile={quantile:.2f}": HistGradientBoostingRegressor(
        **parameters, quantile=quantile
    ).fit(X, y)
    for quantile in quantiles
}

fig, ax = plt.subplots()
ax.plot(X_1d, y, "o", alpha=0.5, markersize=1)
for quantile, hist in hist_quantiles.items():
    ax.plot(X_1d, hist.predict(X), label=quantile)
_ = ax.legend(loc="lower left")

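# %%
# As a quick sanity check, the empirical coverage of each model on the
# training data should sit close to its requested quantile, since the pinball
# loss is minimized by the corresponding quantile of the target distribution:
for quantile, hist in hist_quantiles.items():
    fraction_below = np.mean(y <= hist.predict(X))
    print(f"{quantile}: {fraction_below:.3f} of samples at or below prediction")
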
# %%
# For a use case example, see
# :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`.

# %%
# `get_feature_names_out` Available in all Transformers
# -----------------------------------------------------
# :term:`get_feature_names_out` is now available in all Transformers. This enables
# :class:`~pipeline.Pipeline` to construct the output feature names for more complex
# pipelines:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression

X, y = fetch_openml(
    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
numeric_features = ["age", "fare"]
numeric_transformer = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
categorical_features = ["embarked", "pclass"]

preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=False,
)
log_reg = make_pipeline(preprocessor, SelectKBest(k=7), LogisticRegression())
log_reg.fit(X, y)

# %%
# Here we slice the pipeline to include all the steps but the last one. The output
# feature names of this pipeline slice are the features put into logistic
# regression. These names correspond directly to the coefficients in the logistic
# regression:
import pandas as pd

log_reg_input_features = log_reg[:-1].get_feature_names_out()
pd.Series(log_reg[-1].coef_.ravel(), index=log_reg_input_features).plot.bar()
plt.tight_layout()

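# %%
# Slicing at other points works the same way. For instance, `log_reg[0]` is
# the fitted preprocessor, so its output names are the candidate features that
# `SelectKBest` then narrows down to the seven used above:
print(log_reg[0].get_feature_names_out())
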
# %%
# Grouping infrequent categories in :class:`~preprocessing.OneHotEncoder`
# -----------------------------------------------------------------------
# :class:`~preprocessing.OneHotEncoder` supports aggregating infrequent
# categories into a single output for each feature. The parameters to enable
# the gathering of infrequent categories are `min_frequency` and
# `max_categories`. See the :ref:`User Guide <encoder_infrequent_categories>`
# for more details.
from sklearn.preprocessing import OneHotEncoder
import numpy as np

X = np.array(
    [["dog"] * 5 + ["cat"] * 20 + ["rabbit"] * 10 + ["snake"] * 3], dtype=object
).T
enc = OneHotEncoder(min_frequency=6, sparse_output=False).fit(X)
enc.infrequent_categories_

# %%
# Since dog and snake are infrequent categories, they are grouped together when
# transformed:
encoded = enc.transform(np.array([["dog"], ["snake"], ["cat"], ["rabbit"]]))
pd.DataFrame(encoded, columns=enc.get_feature_names_out())

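# %%
# Alternatively, `max_categories` caps the number of output features per input
# feature, counting the infrequent bucket itself. As a small sketch on the
# same toy data, `max_categories=3` keeps a column for each of the two most
# frequent categories and groups the rest:
enc_max = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
enc_max.get_feature_names_out()
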
# %%
# Performance improvements
# ------------------------
# Reductions on pairwise distances for dense float64 datasets have been refactored
# to better take advantage of non-blocking thread parallelism. For example,
# :meth:`neighbors.NearestNeighbors.kneighbors` and
# :meth:`neighbors.NearestNeighbors.radius_neighbors` can respectively be up to
# ×20 and ×5 faster than previously (a rough timing sketch follows at the end of
# this section). In summary, the following functions and estimators now benefit
# from improved performance:
#
# - :func:`metrics.pairwise_distances_argmin`
# - :func:`metrics.pairwise_distances_argmin_min`
# - :class:`cluster.AffinityPropagation`
# - :class:`cluster.Birch`
# - :class:`cluster.MeanShift`
# - :class:`cluster.OPTICS`
# - :class:`cluster.SpectralClustering`
# - :func:`feature_selection.mutual_info_regression`
# - :class:`neighbors.KNeighborsClassifier`
# - :class:`neighbors.KNeighborsRegressor`
# - :class:`neighbors.RadiusNeighborsClassifier`
# - :class:`neighbors.RadiusNeighborsRegressor`
# - :class:`neighbors.LocalOutlierFactor`
# - :class:`neighbors.NearestNeighbors`
# - :class:`manifold.Isomap`
# - :class:`manifold.LocallyLinearEmbedding`
# - :class:`manifold.TSNE`
# - :func:`manifold.trustworthiness`
# - :class:`semi_supervised.LabelPropagation`
# - :class:`semi_supervised.LabelSpreading`
#
# To learn more about the technical details of this work, you can read
# `this suite of blog posts <https://blog.scikit-learn.org/technical/performances/>`_.
#
# Moreover, the computation of loss functions has been refactored using
# Cython, resulting in performance improvements for the following estimators:
#
# - :class:`linear_model.LogisticRegression`
# - :class:`linear_model.GammaRegressor`
# - :class:`linear_model.PoissonRegressor`
# - :class:`linear_model.TweedieRegressor`

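# %%
# A minimal timing sketch for the pairwise-distance improvements (illustrative
# only: absolute numbers and speed-ups depend on hardware, dataset shape, and
# the number of available threads):
from time import perf_counter
from sklearn.neighbors import NearestNeighbors

X_dense = np.random.RandomState(0).uniform(size=(20_000, 50))
nn = NearestNeighbors(n_neighbors=10).fit(X_dense)
tic = perf_counter()
nn.kneighbors(X_dense)
print(f"kneighbors on (20000, 50) float64 data: {perf_counter() - tic:.2f} s")
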
# %%
# :class:`~decomposition.MiniBatchNMF`: an online version of NMF
# --------------------------------------------------------------
# The new class :class:`~decomposition.MiniBatchNMF` implements a faster but
# less accurate version of non-negative matrix factorization
# (:class:`~decomposition.NMF`). :class:`~decomposition.MiniBatchNMF` divides the
# data into mini-batches and optimizes the NMF model in an online manner by
# cycling over the mini-batches, making it better suited for large datasets. In
# particular, it implements `partial_fit`, which can be used for online
# learning when the data is not readily available from the start, or when the
# data does not fit into memory.
import numpy as np
from sklearn.decomposition import MiniBatchNMF

rng = np.random.RandomState(0)
n_samples, n_features, n_components = 10, 10, 5
true_W = rng.uniform(size=(n_samples, n_components))
true_H = rng.uniform(size=(n_components, n_features))
X = true_W @ true_H

nmf = MiniBatchNMF(n_components=n_components, random_state=0)

for _ in range(10):
    nmf.partial_fit(X)

W = nmf.transform(X)
H = nmf.components_
X_reconstructed = W @ H

print(
    "relative reconstruction error: ",
    f"{np.sum((X - X_reconstructed) ** 2) / np.sum(X**2):.5f}",
)

# %%
# :class:`~cluster.BisectingKMeans`: divide and cluster
# -----------------------------------------------------
# The new class :class:`~cluster.BisectingKMeans` is a variant of
# :class:`~cluster.KMeans`, using divisive hierarchical clustering. Instead of
# creating all centroids at once, centroids are picked progressively based on a
# previous clustering: a cluster is split into two new clusters repeatedly
# until the target number of clusters is reached, giving a hierarchical
# structure to the clustering.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, BisectingKMeans
import matplotlib.pyplot as plt

X, _ = make_blobs(n_samples=1000, centers=2, random_state=0)

km = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(X)
bisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].scatter(X[:, 0], X[:, 1], s=10, c=km.labels_)
ax[0].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], s=20, c="r")
ax[0].set_title("KMeans")

ax[1].scatter(X[:, 0], X[:, 1], s=10, c=bisect_km.labels_)
ax[1].scatter(
    bisect_km.cluster_centers_[:, 0], bisect_km.cluster_centers_[:, 1], s=20, c="r"
)
_ = ax[1].set_title("BisectingKMeans")