"""
=================================================
Concatenating multiple feature extraction methods
=================================================

In many real-world examples, there are many ways to extract features from a
dataset. Often it is beneficial to combine several methods to obtain good
performance. This example shows how to use ``FeatureUnion`` to combine
features obtained by PCA and univariate selection.

Combining features using this transformer has the benefit that it allows
cross validation and grid searches over the whole process.

The combination used in this example is not particularly helpful on this
dataset and is only used to illustrate the usage of FeatureUnion.
"""

# Author: Andreas Mueller <amueller@ais.uni-bonn.de>
#
# License: BSD 3 clause

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC

# Load the iris dataset and split it into the sample matrix and the targets.
iris = load_iris()
X, y = iris.data, iris.target

# First feature extractor: keep the two leading principal components.
# NOTE: as the module docstring says, this combination is not particularly
# helpful on this dataset; it is here only to illustrate FeatureUnion.
pca = PCA(n_components=2)

# Second feature extractor: univariate selection that keeps the single
# best-scoring original feature.
selection = SelectKBest(k=1)

# Build one transformer that combines the PCA output with the selected
# feature. The step names ("pca", "univ_select") are reused below as prefixes
# in the grid-search parameter names.
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Fit the union and transform the dataset in one call, then report how many
# features the combined representation has.
X_features = combined_features.fit_transform(X, y)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Chain feature extraction and the classifier so that the grid search below
# tunes the whole process rather than the classifier alone.
pipeline = Pipeline([("features", combined_features), ("svm", svm)])

# Do grid search over k, n_components and C. Each key addresses a nested
# parameter as <pipeline step>__<union step>__<parameter>.
param_grid = dict(
    features__pca__n_components=[1, 2, 3],
    features__univ_select__k=[1, 2],
    svm__C=[0.1, 1, 10],
)

# verbose=10 asks the search to log its progress while fitting.
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)