"""
================================
Introducing the `set_output` API
================================

.. currentmodule:: sklearn

This example will demonstrate the `set_output` API to configure transformers to
output pandas DataFrames. `set_output` can be configured per estimator by calling
the `set_output` method or globally by setting `set_config(transform_output="pandas")`.
For details, see
`SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
"""  # noqa

# %%
# We begin by loading the iris dataset as a pandas DataFrame; it is the data
# used to demonstrate the `set_output` API.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y
)
X_train.head()

# %%
# Calling `set_output` on an estimator such as
# :class:`preprocessing.StandardScaler` configures it to return DataFrames.
# pandas must be installed for this feature to work.

from sklearn.preprocessing import StandardScaler

# `set_output` and `fit` both return the estimator, so the calls are chained.
scaler = StandardScaler().set_output(transform="pandas").fit(X_train)

X_test_scaled = scaler.transform(X_test)
X_test_scaled.head()

# %%
# `set_output` may also be called on an already fitted estimator; it then
# configures the output of the `transform` calls that follow.
scaler2 = StandardScaler().fit(X_train)

# Before `set_output`: the default container is returned.
X_test_np = scaler2.transform(X_test)
print(f"Default output type: {type(X_test_np).__name__}")

# After `set_output`: the same fitted scaler now returns pandas output.
scaler2.set_output(transform="pandas")
X_test_df = scaler2.transform(X_test)
print(f"Configured pandas output type: {type(X_test_df).__name__}")

# %%
# Calling `set_output` on a :class:`pipeline.Pipeline` configures every step to
# output DataFrames.
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# `set_output` returns the pipeline itself, so it can be applied at creation.
clf = make_pipeline(
    StandardScaler(),
    SelectPercentile(percentile=75),
    LogisticRegression(),
).set_output(transform="pandas")
clf.fit(X_train, y_train)

# %%
# Because every transformer in the pipeline now returns DataFrames, the final
# logistic regression step is fitted on named columns and therefore exposes the
# feature names of its input.
clf[-1].feature_names_in_

# %%
# .. note:: Replacing a step through `set_params` installs a brand new
#    transformer, which uses the default output format.
clf.set_params(standardscaler=StandardScaler()).fit(X_train, y_train)
clf[-1].feature_names_in_

# %%
# To preserve the intended behavior, call `set_output` on the replacement
# transformer before handing it to `set_params`.
scaler = StandardScaler().set_output(transform="pandas")
clf.set_params(standardscaler=scaler).fit(X_train, y_train)
clf[-1].feature_names_in_

# %%
# Next we load the titanic dataset to demonstrate `set_output` with
# :class:`compose.ColumnTransformer` and heterogeneous data.
from sklearn.datasets import fetch_openml

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
# Seed the split so that everything derived from it (the fitted model, its
# score and its coefficients) is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

# %%
# Instead of configuring each estimator, the `set_output` API can be switched on
# globally: call :func:`set_config` with `transform_output` set to `"pandas"`.
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

set_config(transform_output="pandas")

# Numerical columns: impute missing values, then standardize.
num_cols = ["age", "fare"]
num_pipe = make_pipeline(SimpleImputer(), StandardScaler())

# Categorical columns: dense one-hot encoding (pandas output needs a dense
# result, hence `sparse_output=False`).
cat_encoder = OneHotEncoder(
    handle_unknown="ignore", drop="if_binary", sparse_output=False
)

ct = ColumnTransformer(
    (
        ("numerical", num_pipe, num_cols),
        ("categorical", cat_encoder, ["embarked", "sex", "pclass"]),
    ),
    verbose_feature_names_out=False,
)
clf = make_pipeline(ct, SelectPercentile(percentile=50), LogisticRegression())
clf.fit(X_train, y_train).score(X_test, y_test)

# %%
# Under the global configuration every transformer outputs DataFrames, so the
# logistic regression coefficients can be plotted directly against their
# corresponding feature names.
import pandas as pd

log_reg = clf[-1]
coef = pd.Series(data=log_reg.coef_.ravel(), index=log_reg.feature_names_in_)
sorted_coef = coef.sort_values()
_ = sorted_coef.plot.barh()

# %%
# Before demonstrating :func:`config_context` below, we first restore
# `transform_output` to its default value.
set_config(transform_output="default")

# %%
# With :func:`config_context`, what counts is the configuration in effect at
# the moment `transform` or `fit_transform` is called. Setting it only while
# constructing or fitting the transformer has no effect.
from sklearn import config_context

# Construct and fit outside of any `config_context` block.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

# %%
with config_context(transform_output="pandas"):
    # the output of transform will be a Pandas DataFrame
    X_test_scaled = scaler.transform(X_test[num_cols])
# Only the `transform` call needs the context; `head()` is kept at top level so
# that it remains the last expression of this cell.
X_test_scaled.head()

# %%
# Once the context manager has exited, the same call returns a NumPy array
# again.
X_test_scaled = scaler.transform(X_test[num_cols])
X_test_scaled[:5]