"""
|
|
==================================================
|
|
Column Transformer with Heterogeneous Data Sources
|
|
==================================================
|
|
|
|
Datasets can often contain components that require different feature
|
|
extraction and processing pipelines. This scenario might occur when:
|
|
|
|
1. your dataset consists of heterogeneous data types (e.g. raster images and
|
|
text captions),
|
|
2. your dataset is stored in a :class:`pandas.DataFrame` and different columns
|
|
require different processing pipelines.
|
|
|
|
This example demonstrates how to use
|
|
:class:`~sklearn.compose.ColumnTransformer` on a dataset containing
|
|
different types of features. The choice of features is not particularly
|
|
helpful, but serves to illustrate the technique.
|
|
|
|
"""
|
|
|
|

# Author: Matt Terry <matt.terry@gmail.com>
#
# License: BSD 3 clause

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC
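
##############################################################################
# As a minimal sketch of the :class:`pandas.DataFrame` scenario mentioned
# above (an addition to the original example; it assumes ``pandas`` is
# installed and uses made-up toy data), ``ColumnTransformer`` can route
# DataFrame columns, selected by name, to different preprocessing steps.

import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler

toy_frame = pd.DataFrame(
    {"city": ["Paris", "London", "Oslo"], "temp": [21.0, 15.5, 9.0]}
)
toy_ct = ColumnTransformer(
    [
        # one-hot encode the categorical column
        ("onehot", OneHotEncoder(sparse_output=False), ["city"]),
        # standardize the numeric column
        ("scale", StandardScaler(), ["temp"]),
    ]
)
print(toy_ct.fit_transform(toy_frame))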

##############################################################################
# 20 newsgroups dataset
# ---------------------
#
# We will use the :ref:`20 newsgroups dataset <20newsgroups_dataset>`, which
# comprises posts from newsgroups on 20 topics. The dataset is split into
# train and test subsets based on messages posted before and after a specific
# date. We will only use posts from two categories to reduce the running
# time.

categories = ["sci.med", "sci.space"]
X_train, y_train = fetch_20newsgroups(
    random_state=1,
    subset="train",
    categories=categories,
    remove=("footers", "quotes"),
    return_X_y=True,
)
X_test, y_test = fetch_20newsgroups(
    random_state=1,
    subset="test",
    categories=categories,
    remove=("footers", "quotes"),
    return_X_y=True,
)

##############################################################################
# Each sample is a single post; its text comprises meta information, such as
# the subject line, followed by the body of the post.

print(X_train[0])
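
##############################################################################
# As a quick orientation (an addition to the original example), we can also
# check how many posts ended up in each subset.

print("training posts: {}, test posts: {}".format(len(X_train), len(X_test)))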

##############################################################################
# Creating transformers
# ---------------------
#
# First, we would like a transformer that extracts the subject and
# body of each post. Since this is a stateless transformation (it does not
# require state information from the training data), we can define a function
# that performs the data transformation and then use
# :class:`~sklearn.preprocessing.FunctionTransformer` to create a scikit-learn
# transformer.


def subject_body_extractor(posts):
    # construct object dtype array with two columns
    # first column = 'subject' and second column = 'body'
    features = np.empty(shape=(len(posts), 2), dtype=object)
    for i, text in enumerate(posts):
        # temporary variable `_` stores '\n\n'
        headers, _, body = text.partition("\n\n")
        # store body text in second column
        features[i, 1] = body

        prefix = "Subject:"
        sub = ""
        # save text after 'Subject:' in first column
        for line in headers.split("\n"):
            if line.startswith(prefix):
                sub = line[len(prefix) :]
                break
        features[i, 0] = sub

    return features


subject_body_transformer = FunctionTransformer(subject_body_extractor)
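
##############################################################################
# As a quick sanity check (an addition to the original example, using a
# made-up post), applying the transformer to one raw post yields a (1, 2)
# object array holding the extracted subject and body.

toy_post = "Subject: Re: orbital mechanics\n\nGravity assists save fuel."
print(subject_body_transformer.transform([toy_post]))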

##############################################################################
# We will also create a transformer that extracts the
# length of the text and the number of sentences.


def text_stats(posts):
    # the number of periods is used as a rough proxy for the sentence count
    return [{"length": len(text), "num_sentences": text.count(".")} for text in posts]


text_stats_transformer = FunctionTransformer(text_stats)
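
##############################################################################
# For illustration (another small addition, with a made-up string), the
# transformer returns one dict of statistics per document; ``DictVectorizer``
# will later turn such dicts into a numeric feature matrix.

print(text_stats_transformer.transform(["Short. Two sentences."]))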

##############################################################################
# Classification pipeline
# -----------------------
#
# The pipeline below extracts the subject and body from each post using
# ``subject_body_transformer``, producing a (n_samples, 2) array. This array
# is then used to compute standard bag-of-words features for the subject and
# body, as well as the text length and number of sentences of the body, using
# ``ColumnTransformer``. We combine them, with weights, then train a
# classifier on the combined set of features.

pipeline = Pipeline(
    [
        # Extract subject & body
        ("subjectbody", subject_body_transformer),
        # Use ColumnTransformer to combine the subject and body features
        (
            "union",
            ColumnTransformer(
                [
                    # bag-of-words for subject (col 0)
                    ("subject", TfidfVectorizer(min_df=50), 0),
                    # bag-of-words with decomposition for body (col 1);
                    # PCA accepts the sparse tf-idf matrix when
                    # svd_solver="arpack" (scikit-learn >= 1.4)
                    (
                        "body_bow",
                        Pipeline(
                            [
                                ("tfidf", TfidfVectorizer()),
                                ("best", PCA(n_components=50, svd_solver="arpack")),
                            ]
                        ),
                        1,
                    ),
                    # Pipeline for pulling text stats from post's body
                    (
                        "body_stats",
                        Pipeline(
                            [
                                (
                                    "stats",
                                    text_stats_transformer,
                                ),  # returns a list of dicts
                                (
                                    "vect",
                                    DictVectorizer(),
                                ),  # list of dicts -> feature matrix
                            ]
                        ),
                        1,
                    ),
                ],
                # weight above ColumnTransformer features
                transformer_weights={
                    "subject": 0.8,
                    "body_bow": 0.5,
                    "body_stats": 1.0,
                },
            ),
        ),
        # Use an SVC classifier on the combined features
        ("svc", LinearSVC(dual=False)),
    ],
    verbose=True,
)
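
##############################################################################
# As an added illustration (not in the original example), we can check how
# many combined features the transformer part of the pipeline produces;
# slicing with ``pipeline[:-1]`` drops the final classifier.

print("combined feature matrix shape:", pipeline[:-1].fit_transform(X_train).shape)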

##############################################################################
# Finally, we fit our pipeline on the training data and use it to predict
# topics for ``X_test``. The performance metrics of our pipeline are then
# printed.

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Classification report:\n\n{}".format(classification_report(y_test, y_pred)))
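
##############################################################################
# As an optional follow-up (an addition to the original example, relying on
# standard scikit-learn attributes), the fitted ``ColumnTransformer`` exposes
# its fitted sub-transformers via ``named_transformers_``; for instance, we
# can look up how many terms survived ``min_df=50`` in the subject vocabulary.

union = pipeline.named_steps["union"]
subject_tfidf = union.named_transformers_["subject"]
print("subject vocabulary size:", len(subject_tfidf.vocabulary_))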