sklearn/examples/text/plot_document_clustering.py

"""
=======================================
Clustering text documents using k-means
=======================================

This is an example showing how the scikit-learn API can be used to cluster
documents by topics using a `Bag of Words approach
<https://en.wikipedia.org/wiki/Bag-of-words_model>`_.

Two algorithms are demonstrated, namely :class:`~sklearn.cluster.KMeans` and its more
scalable variant, :class:`~sklearn.cluster.MiniBatchKMeans`. Additionally,
latent semantic analysis is used to reduce dimensionality and discover latent
patterns in the data.

This example uses two different text vectorizers: a
:class:`~sklearn.feature_extraction.text.TfidfVectorizer` and a
:class:`~sklearn.feature_extraction.text.HashingVectorizer`. See the example
notebook :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`
for more information on vectorizers and a comparison of their processing times.

For document analysis via a supervised learning approach, see the example script
:ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`.

"""

# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Lars Buitinck
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Arturo Amor <david-arturo.amor-quiroz@inria.fr>
# License: BSD 3 clause

# %%
# Loading text data
# =================
#
# We load data from :ref:`20newsgroups_dataset`, which comprises around 18,000
# newsgroups posts on 20 topics. For illustrative purposes and to reduce the
# computational cost, we select a subset of 4 topics only accounting for around
# 3,400 documents. See the example
# :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
# to gain intuition on the overlap of such topics.
#
# Notice that, by default, the text samples contain some message metadata such
# as `"headers"`, `"footers"` (signatures) and `"quotes"` to other posts. We use
# the `remove` parameter from :func:`~sklearn.datasets.fetch_20newsgroups` to
# strip those features and have a more sensible clustering problem.

import numpy as np

from sklearn.datasets import fetch_20newsgroups

categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

dataset = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
    subset="all",
    categories=categories,
    shuffle=True,
    random_state=42,
)

labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]

print(f"{len(dataset.data)} documents - {true_k} categories")

# %%
# Quantifying the quality of clustering results
# =============================================
#
# In this section we define a function to score different clustering pipelines
# using several metrics.
#
# Clustering algorithms are fundamentally unsupervised learning methods.
# However, since we happen to have class labels for this specific dataset, it is
# possible to use evaluation metrics that leverage this "supervised" ground
# truth information to quantify the quality of the resulting clusters. Examples
# of such metrics are the following:
#
# - homogeneity, which quantifies how much clusters contain only members of a
#   single class;
#
# - completeness, which quantifies how much members of a given class are
#   assigned to the same clusters;
#
# - V-measure, the harmonic mean of completeness and homogeneity;
#
# - Rand-Index, which measures how frequently pairs of data points are grouped
#   consistently according to the result of the clustering algorithm and the
#   ground truth class assignment;
#
# - Adjusted Rand-Index, a chance-adjusted Rand-Index such that random cluster
#   assignment have an ARI of 0.0 in expectation.
#
# If the ground truth labels are not known, evaluation can only be performed
# using the model results itself. In that case, the Silhouette Coefficient comes in
# handy. See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`
# for an example on how to do it.
#
# For more reference, see :ref:`clustering_evaluation`.

from collections import defaultdict
from time import time

from sklearn import metrics

evaluations = []
evaluations_std = []


def fit_and_evaluate(km, X, name=None, n_runs=5):
    name = km.__class__.__name__ if name is None else name

    train_times = []
    scores = defaultdict(list)
    for seed in range(n_runs):
        km.set_params(random_state=seed)
        t0 = time()
        km.fit(X)
        train_times.append(time() - t0)
        scores["Homogeneity"].append(metrics.homogeneity_score(labels, km.labels_))
        scores["Completeness"].append(metrics.completeness_score(labels, km.labels_))
        scores["V-measure"].append(metrics.v_measure_score(labels, km.labels_))
        scores["Adjusted Rand-Index"].append(
            metrics.adjusted_rand_score(labels, km.labels_)
        )
        scores["Silhouette Coefficient"].append(
            metrics.silhouette_score(X, km.labels_, sample_size=2000)
        )
    train_times = np.asarray(train_times)

    print(f"clustering done in {train_times.mean():.2f} ± {train_times.std():.2f} s ")
    evaluation = {
        "estimator": name,
        "train_time": train_times.mean(),
    }
    evaluation_std = {
        "estimator": name,
        "train_time": train_times.std(),
    }
    for score_name, score_values in scores.items():
        mean_score, std_score = np.mean(score_values), np.std(score_values)
        print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")
        evaluation[score_name] = mean_score
        evaluation_std[score_name] = std_score
    evaluations.append(evaluation)
    evaluations_std.append(evaluation_std)


# %%
# K-means clustering on text features
# ===================================
#
# Two feature extraction methods are used in this example:
#
# - :class:`~sklearn.feature_extraction.text.TfidfVectorizer` uses an in-memory
#   vocabulary (a Python dict) to map the most frequent words to features
#   indices and hence compute a word occurrence frequency (sparse) matrix. The
#   word frequencies are then reweighted using the Inverse Document Frequency
#   (IDF) vector collected feature-wise over the corpus.
#
# - :class:`~sklearn.feature_extraction.text.HashingVectorizer` hashes word
#   occurrences to a fixed dimensional space, possibly with collisions. The word
#   count vectors are then normalized to each have l2-norm equal to one
#   (projected to the euclidean unit-sphere) which seems to be important for
#   k-means to work in high dimensional space.
#
# Furthermore it is possible to post-process those extracted features using
# dimensionality reduction. We will explore the impact of those choices on the
# clustering quality in the following.
#
# Feature Extraction using TfidfVectorizer
# ----------------------------------------
#
# We first benchmark the estimators using a dictionary vectorizer along with an
# IDF normalization as provided by
# :class:`~sklearn.feature_extraction.text.TfidfVectorizer`.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)
t0 = time()
X_tfidf = vectorizer.fit_transform(dataset.data)

print(f"vectorization done in {time() - t0:.3f} s")
print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

# %%
# After ignoring terms that appear in more than 50% of the documents (as set by
# `max_df=0.5`) and terms that are not present in at least 5 documents (set by
# `min_df=5`), the resulting number of unique terms `n_features` is around
# 8,000. We can additionally quantify the sparsity of the `X_tfidf` matrix as
# the fraction of non-zero entries divided by the total number of elements.

print(f"{X_tfidf.nnz / np.prod(X_tfidf.shape):.3f}")

# %%
# We find that around 0.7% of the entries of the `X_tfidf` matrix are non-zero.
#
# .. _kmeans_sparse_high_dim:
#
# Clustering sparse data with k-means
# -----------------------------------
#
# As both :class:`~sklearn.cluster.KMeans` and
# :class:`~sklearn.cluster.MiniBatchKMeans` optimize a non-convex objective
# function, their clustering is not guaranteed to be optimal for a given random
# init. Even further, on sparse high-dimensional data such as text vectorized
# using the Bag of Words approach, k-means can initialize centroids on extremely
# isolated data points. Those data points can stay their own centroids all
# along.
#
# The following code illustrates how the previous phenomenon can sometimes lead
# to highly imbalanced clusters, depending on the random initialization:

from sklearn.cluster import KMeans

for seed in range(5):
    kmeans = KMeans(
        n_clusters=true_k,
        max_iter=100,
        n_init=1,
        random_state=seed,
    ).fit(X_tfidf)
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")
print()
print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

# %%
# To avoid this problem, one possibility is to increase the number of runs with
# independent random initiations `n_init`. In such case the clustering with the
# best inertia (objective function of k-means) is chosen.

kmeans = KMeans(
    n_clusters=true_k,
    max_iter=100,
    n_init=5,
)

fit_and_evaluate(kmeans, X_tfidf, name="KMeans\non tf-idf vectors")

# %%
# All those clustering evaluation metrics have a maximum value of 1.0 (for a
# perfect clustering result). Higher values are better. Values of the Adjusted
# Rand-Index close to 0.0 correspond to a random labeling. Notice from the
# scores above that the cluster assignment is indeed well above chance level,
# but the overall quality can certainly improve.
#
# Keep in mind that the class labels may not reflect accurately the document
# topics and therefore metrics that use labels are not necessarily the best to
# evaluate the quality of our clustering pipeline.
#
# Performing dimensionality reduction using LSA
# ---------------------------------------------
#
# A `n_init=1` can still be used as long as the dimension of the vectorized
# space is reduced first to make k-means more stable. For such purpose we use
# :class:`~sklearn.decomposition.TruncatedSVD`, which works on term count/tf-idf
# matrices. Since SVD results are not normalized, we redo the normalization to
# improve the :class:`~sklearn.cluster.KMeans` result. Using SVD to reduce the
# dimensionality of TF-IDF document vectors is often known as `latent semantic
# analysis <https://en.wikipedia.org/wiki/Latent_semantic_analysis>`_ (LSA) in
# the information retrieval and text mining literature.

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
t0 = time()
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"LSA done in {time() - t0:.3f} s")
print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

# %%
# Using a single initialization means the processing time will be reduced for
# both :class:`~sklearn.cluster.KMeans` and
# :class:`~sklearn.cluster.MiniBatchKMeans`.

kmeans = KMeans(
    n_clusters=true_k,
    max_iter=100,
    n_init=1,
)

fit_and_evaluate(kmeans, X_lsa, name="KMeans\nwith LSA on tf-idf vectors")

# %%
# We can observe that clustering on the LSA representation of the document is
# significantly faster (both because of `n_init=1` and because the
# dimensionality of the LSA feature space is much smaller). Furthermore, all the
# clustering evaluation metrics have improved. We repeat the experiment with
# :class:`~sklearn.cluster.MiniBatchKMeans`.

from sklearn.cluster import MiniBatchKMeans

minibatch_kmeans = MiniBatchKMeans(
    n_clusters=true_k,
    n_init=1,
    init_size=1000,
    batch_size=1000,
)

fit_and_evaluate(
    minibatch_kmeans,
    X_lsa,
    name="MiniBatchKMeans\nwith LSA on tf-idf vectors",
)

# %%
# Top terms per cluster
# ---------------------
#
# Since :class:`~sklearn.feature_extraction.text.TfidfVectorizer` can be
# inverted we can identify the cluster centers, which provide an intuition of
# the most influential words **for each cluster**. See the example script
# :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
# for a comparison with the most predictive words **for each target class**.

original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

# %%
# HashingVectorizer
# -----------------
# An alternative vectorization can be done using a
# :class:`~sklearn.feature_extraction.text.HashingVectorizer` instance, which
# does not provide IDF weighting as this is a stateless model (the fit method
# does nothing). When IDF weighting is needed it can be added by pipelining the
# :class:`~sklearn.feature_extraction.text.HashingVectorizer` output to a
# :class:`~sklearn.feature_extraction.text.TfidfTransformer` instance. In this
# case we also add LSA to the pipeline to reduce the dimension and sparcity of
# the hashed vector space.

from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer

lsa_vectorizer = make_pipeline(
    HashingVectorizer(stop_words="english", n_features=50_000),
    TfidfTransformer(),
    TruncatedSVD(n_components=100, random_state=0),
    Normalizer(copy=False),
)

t0 = time()
X_hashed_lsa = lsa_vectorizer.fit_transform(dataset.data)
print(f"vectorization done in {time() - t0:.3f} s")

# %%
# One can observe that the LSA step takes a relatively long time to fit,
# especially with hashed vectors. The reason is that a hashed space is typically
# large (set to `n_features=50_000` in this example). One can try lowering the
# number of features at the expense of having a larger fraction of features with
# hash collisions as shown in the example notebook
# :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.
#
# We now fit and evaluate the `kmeans` and `minibatch_kmeans` instances on this
# hashed-lsa-reduced data:

fit_and_evaluate(kmeans, X_hashed_lsa, name="KMeans\nwith LSA on hashed vectors")

# %%
fit_and_evaluate(
    minibatch_kmeans,
    X_hashed_lsa,
    name="MiniBatchKMeans\nwith LSA on hashed vectors",
)

# %%
# Both methods lead to good results that are similar to running the same models
# on the traditional LSA vectors (without hashing).
#
# Clustering evaluation summary
# ==============================

import matplotlib.pyplot as plt
import pandas as pd

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(16, 6), sharey=True)

df = pd.DataFrame(evaluations[::-1]).set_index("estimator")
df_std = pd.DataFrame(evaluations_std[::-1]).set_index("estimator")

df.drop(
    ["train_time"],
    axis="columns",
).plot.barh(ax=ax0, xerr=df_std)
ax0.set_xlabel("Clustering scores")
ax0.set_ylabel("")

df["train_time"].plot.barh(ax=ax1, xerr=df_std["train_time"])
ax1.set_xlabel("Clustering time (s)")
plt.tight_layout()

# %%
# :class:`~sklearn.cluster.KMeans` and :class:`~sklearn.cluster.MiniBatchKMeans`
# suffer from the phenomenon called the `Curse of Dimensionality
# <https://en.wikipedia.org/wiki/Curse_of_dimensionality>`_ for high dimensional
# datasets such as text data. That is the reason why the overall scores improve
# when using LSA. Using LSA reduced data also improves the stability and
# requires lower clustering time, though keep in mind that the LSA step itself
# takes a long time, especially with hashed vectors.
#
# The Silhouette Coefficient is defined between 0 and 1. In all cases we obtain
# values close to 0 (even if they improve a bit after using LSA) because its
# definition requires measuring distances, in contrast with other evaluation
# metrics such as the V-measure and the Adjusted Rand Index which are only based
# on cluster assignments rather than distances. Notice that strictly speaking,
# one should not compare the Silhouette Coefficient between spaces of different
# dimension, due to the different notions of distance they imply.
#
# The homogeneity, completeness and hence v-measure metrics do not yield a
# baseline with regards to random labeling: this means that depending on the
# number of samples, clusters and ground truth classes, a completely random
# labeling will not always yield the same values. In particular random labeling
# won't yield zero scores, especially when the number of clusters is large. This
# problem can safely be ignored when the number of samples is more than a
# thousand and the number of clusters is less than 10, which is the case of the
# present example. For smaller sample sizes or larger number of clusters it is
# safer to use an adjusted index such as the Adjusted Rand Index (ARI). See the
# example
# :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py` for
# a demo on the effect of random labeling.
#
# The size of the error bars show that :class:`~sklearn.cluster.MiniBatchKMeans`
# is less stable than :class:`~sklearn.cluster.KMeans` for this relatively small
# dataset. It is more interesting to use when the number of samples is much
# bigger, but it can come at the expense of a small degradation in clustering
# quality compared to the traditional k-means algorithm.