235 lines
8.6 KiB
Python
235 lines
8.6 KiB
Python
# ruff: noqa
|
||
"""
|
||
=======================================
|
||
Release Highlights for scikit-learn 1.1
|
||
=======================================
|
||
|
||
.. currentmodule:: sklearn
|
||
|
||
We are pleased to announce the release of scikit-learn 1.1! Many bug fixes
|
||
and improvements were added, as well as some new key features. We detail
|
||
below a few of the major features of this release. **For an exhaustive list of
|
||
all the changes**, please refer to the :ref:`release notes <release_notes_1_1>`.
|
||
|
||
To install the latest version (with pip)::
|
||
|
||
pip install --upgrade scikit-learn
|
||
|
||
or with conda::
|
||
|
||
conda install -c conda-forge scikit-learn
|
||
|
||
"""
|
||
|
||
# %%
|
||
# .. _quantile_support_hgbdt:
|
||
#
|
||
# Quantile loss in :class:`~ensemble.HistGradientBoostingRegressor`
|
||
# -----------------------------------------------------------------
|
||
# :class:`~ensemble.HistGradientBoostingRegressor` can model quantiles with
|
||
# `loss="quantile"` and the new parameter `quantile`.
|
||
from sklearn.ensemble import HistGradientBoostingRegressor
|
||
import numpy as np
|
||
import matplotlib.pyplot as plt
|
||
|
||
# Noisy samples of f(x) = x * cos(x); the noise scale grows with x (x / 3).
rng = np.random.RandomState(42)
X_1d = np.linspace(0, 10, num=2000)
X = X_1d.reshape(-1, 1)
y = X_1d * np.cos(X_1d) + rng.normal(scale=X_1d / 3)

# Fit one model per requested quantile, keyed by the label that is later
# used in the plot legend.
quantiles = [0.95, 0.5, 0.05]
parameters = dict(loss="quantile", max_bins=32, max_iter=50)
hist_quantiles = {}
for q in quantiles:
    model = HistGradientBoostingRegressor(quantile=q, **parameters)
    hist_quantiles[f"quantile={q:.2f}"] = model.fit(X, y)

# Draw the raw samples, then overlay the prediction of each fitted model.
fig, ax = plt.subplots()
ax.plot(X_1d, y, "o", alpha=0.5, markersize=1)
for label, model in hist_quantiles.items():
    ax.plot(X_1d, model.predict(X), label=label)
_ = ax.legend(loc="lower left")
|
||
|
||
# %%
|
||
# For a use case example, see
|
||
# :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py`
|
||
|
||
# %%
|
||
# `get_feature_names_out` Available in all Transformers
|
||
# -----------------------------------------------------
|
||
# :term:`get_feature_names_out` is now available in all Transformers. This enables
|
||
# :class:`~pipeline.Pipeline` to construct the output feature names for more complex
|
||
# pipelines:
|
||
from sklearn.compose import ColumnTransformer
|
||
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
||
from sklearn.pipeline import make_pipeline
|
||
from sklearn.impute import SimpleImputer
|
||
from sklearn.feature_selection import SelectKBest
|
||
from sklearn.datasets import fetch_openml
|
||
from sklearn.linear_model import LogisticRegression
|
||
|
||
# Fetch version 1 of the "titanic" dataset from OpenML as (X, y).
X, y = fetch_openml(
    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)

# Numeric columns: median imputation followed by standard scaling.
numeric_features = ["age", "fare"]
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: one-hot encoding with dense output.
categorical_features = ["embarked", "pclass"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Route each group of columns to its own transformer.
column_transformers = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    verbose_feature_names_out=False,
)

# Preprocess, keep the 7 best-scoring features, then fit the classifier.
log_reg = make_pipeline(preprocessor, SelectKBest(k=7), LogisticRegression())
log_reg.fit(X, y)
|
||
|
||
|
||
# %%
|
||
# Here we slice the pipeline to include all the steps but the last one. The output
|
||
# feature names of this pipeline slice are the features put into logistic
|
||
# regression. These names correspond directly to the coefficients in the logistic
|
||
# regression:
|
||
import pandas as pd
|
||
|
||
# Names of the features produced by every step except the final estimator.
log_reg_input_features = log_reg[:-1].get_feature_names_out()

# Pair each flattened coefficient of the final estimator with its feature
# name and show them as a bar chart.
coefficients = pd.Series(log_reg[-1].coef_.ravel(), index=log_reg_input_features)
coefficients.plot.bar()
plt.tight_layout()
|
||
|
||
|
||
# %%
|
||
# Grouping infrequent categories in :class:`~preprocessing.OneHotEncoder`
|
||
# -----------------------------------------------------------------------
|
||
# :class:`~preprocessing.OneHotEncoder` supports aggregating infrequent
|
||
# categories into a single output for each feature. The parameters to enable
|
||
# the gathering of infrequent categories are `min_frequency` and
|
||
# `max_categories`. See the :ref:`User Guide <encoder_infrequent_categories>`
|
||
# for more details.
|
||
from sklearn.preprocessing import OneHotEncoder
|
||
import numpy as np
|
||
|
||
# Single-column object array: 5 dogs, 20 cats, 10 rabbits and 3 snakes.
animals = ["dog"] * 5 + ["cat"] * 20 + ["rabbit"] * 10 + ["snake"] * 3
X = np.array(animals, dtype=object).reshape(-1, 1)

encoder = OneHotEncoder(min_frequency=6, sparse_output=False)
enc = encoder.fit(X)
enc.infrequent_categories_
|
||
|
||
# %%
|
||
# Since dog and snake are infrequent categories, they are grouped together when
|
||
# transformed:
|
||
# One sample per category, shaped as a single column like the training data.
samples = np.array(["dog", "snake", "cat", "rabbit"]).reshape(-1, 1)
encoded = enc.transform(samples)
pd.DataFrame(encoded, columns=enc.get_feature_names_out())
|
||
|
||
# %%
|
||
# Performance improvements
|
||
# ------------------------
|
||
# Reductions on pairwise distances for dense float64 datasets have been refactored
|
||
# to better take advantage of non-blocking thread parallelism. For example,
|
||
# :meth:`neighbors.NearestNeighbors.kneighbors` and
|
||
# :meth:`neighbors.NearestNeighbors.radius_neighbors` can respectively be up to ×20 and
|
||
# ×5 faster than previously. In summary, the following functions and estimators
|
||
# now benefit from improved performance:
|
||
#
|
||
# - :func:`metrics.pairwise_distances_argmin`
|
||
# - :func:`metrics.pairwise_distances_argmin_min`
|
||
# - :class:`cluster.AffinityPropagation`
|
||
# - :class:`cluster.Birch`
|
||
# - :class:`cluster.MeanShift`
|
||
# - :class:`cluster.OPTICS`
|
||
# - :class:`cluster.SpectralClustering`
|
||
# - :func:`feature_selection.mutual_info_regression`
|
||
# - :class:`neighbors.KNeighborsClassifier`
|
||
# - :class:`neighbors.KNeighborsRegressor`
|
||
# - :class:`neighbors.RadiusNeighborsClassifier`
|
||
# - :class:`neighbors.RadiusNeighborsRegressor`
|
||
# - :class:`neighbors.LocalOutlierFactor`
|
||
# - :class:`neighbors.NearestNeighbors`
|
||
# - :class:`manifold.Isomap`
|
||
# - :class:`manifold.LocallyLinearEmbedding`
|
||
# - :class:`manifold.TSNE`
|
||
# - :func:`manifold.trustworthiness`
|
||
# - :class:`semi_supervised.LabelPropagation`
|
||
# - :class:`semi_supervised.LabelSpreading`
|
||
#
|
||
# To learn more about the technical details of this work, you can read
|
||
# `this suite of blog posts <https://blog.scikit-learn.org/technical/performances/>`_.
|
||
#
|
||
# Moreover, the computation of loss functions has been refactored using
|
||
# Cython resulting in performance improvements for the following estimators:
|
||
#
|
||
# - :class:`linear_model.LogisticRegression`
|
||
# - :class:`linear_model.GammaRegressor`
|
||
# - :class:`linear_model.PoissonRegressor`
|
||
# - :class:`linear_model.TweedieRegressor`
|
||
|
||
# %%
|
||
# :class:`~decomposition.MiniBatchNMF`: an online version of NMF
|
||
# --------------------------------------------------------------
|
||
# The new class :class:`~decomposition.MiniBatchNMF` implements a faster but
|
||
# less accurate version of non-negative matrix factorization
|
||
# (:class:`~decomposition.NMF`). :class:`~decomposition.MiniBatchNMF` divides the
|
||
# data into mini-batches and optimizes the NMF model in an online manner by
|
||
# cycling over the mini-batches, making it better suited for large datasets. In
|
||
# particular, it implements `partial_fit`, which can be used for online
|
||
# learning when the data is not readily available from the start, or when the
|
||
# data does not fit into memory.
|
||
import numpy as np
|
||
from sklearn.decomposition import MiniBatchNMF
|
||
|
||
# Build a non-negative matrix X as the product of two random non-negative
# factors, so that it has an exact factorization with `n_components`
# components.
rng = np.random.RandomState(0)
n_samples, n_features, n_components = 10, 10, 5
true_W = rng.uniform(size=(n_samples, n_components))
true_H = rng.uniform(size=(n_components, n_features))
X = true_W @ true_H

nmf = MiniBatchNMF(n_components=n_components, random_state=0)

# Update the model incrementally: each call feeds the same data once more,
# the way successive batches would be fed in an online setting.
for _ in range(10):
    nmf.partial_fit(X)

W = nmf.transform(X)
H = nmf.components_
X_reconstructed = W @ H

# Sum of squared reconstruction errors relative to the sum of squares of X.
# The message is a single string: the label has no placeholders, so it does
# not need an f-prefix, and passing it as a separate `print` argument would
# insert a second space after the colon.
print(
    "relative reconstruction error: "
    f"{np.sum((X - X_reconstructed) ** 2) / np.sum(X**2):.5f}"
)
|
||
|
||
# %%
|
||
# :class:`~cluster.BisectingKMeans`: divide and cluster
|
||
# -----------------------------------------------------
|
||
# The new class :class:`~cluster.BisectingKMeans` is a variant of
|
||
# :class:`~cluster.KMeans`, using divisive hierarchical clustering. Instead of
|
||
# creating all centroids at once, centroids are picked progressively based on a
|
||
# previous clustering: a cluster is split into two new clusters repeatedly
|
||
# until the target number of clusters is reached, giving a hierarchical
|
||
# structure to the clustering.
|
||
from sklearn.datasets import make_blobs
|
||
from sklearn.cluster import KMeans, BisectingKMeans
|
||
import matplotlib.pyplot as plt
|
||
|
||
X, _ = make_blobs(n_samples=1000, centers=2, random_state=0)

km = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(X)
bisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)

# One panel per estimator: samples colored by assigned label, with the
# cluster centers overlaid in red.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
fitted_models = [("KMeans", km), ("BisectingKMeans", bisect_km)]
for panel, (title, model) in zip(ax, fitted_models):
    centers = model.cluster_centers_
    panel.scatter(X[:, 0], X[:, 1], s=10, c=model.labels_)
    panel.scatter(centers[:, 0], centers[:, 1], s=20, c="r")
    _ = panel.set_title(title)
|