"""
|
|
==========================================================
|
|
Adjustment for chance in clustering performance evaluation
|
|
==========================================================
|
|
This notebook explores the impact of uniformly-distributed random labeling on
|
|
the behavior of some clustering evaluation metrics. For such purpose, the
|
|
metrics are computed with a fixed number of samples and as a function of the number
|
|
of clusters assigned by the estimator. The example is divided into two
|
|
experiments:
|
|
|
|
- a first experiment with fixed "ground truth labels" (and therefore fixed
|
|
number of classes) and randomly "predicted labels";
|
|
- a second experiment with varying "ground truth labels", randomly "predicted
|
|
labels". The "predicted labels" have the same number of classes and clusters
|
|
as the "ground truth labels".
|
|
"""
|
|
|
|
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Arturo Amor <david-arturo.amor-quiroz@inria.fr>
# License: BSD 3 clause

# %%
# Defining the list of metrics to evaluate
# ----------------------------------------
#
# Clustering algorithms are fundamentally unsupervised learning methods.
# However, since we assign class labels for the synthetic clusters in this
# example, it is possible to use evaluation metrics that leverage this
# "supervised" ground truth information to quantify the quality of the resulting
# clusters. Examples of such metrics are the following:
#
# - V-measure, the harmonic mean of completeness and homogeneity;
#
# - Rand index, which measures how frequently pairs of data points are grouped
#   consistently according to the result of the clustering algorithm and the
#   ground truth class assignment;
#
# - Adjusted Rand index (ARI), a chance-adjusted Rand index such that a random
#   cluster assignment has an ARI of 0.0 in expectation;
#
# - Mutual Information (MI), an information theoretic measure that quantifies
#   how dependent the two labelings are. Note that the maximum value of MI for
#   perfect labelings depends on the number of clusters and samples;
#
# - Normalized Mutual Information (NMI), a Mutual Information defined between 0
#   (no mutual information) in the limit of a large number of data points and 1
#   (perfectly matching label assignments, up to a permutation of the labels).
#   It is not adjusted for chance: when the number of clustered data points is
#   not large enough, the expected values of MI or NMI for random labelings can
#   be significantly non-zero;
#
# - Adjusted Mutual Information (AMI), a chance-adjusted Mutual Information.
#   Similarly to ARI, random cluster assignment has an AMI of 0.0 in
#   expectation.
#
# For more information, see the :ref:`clustering_evaluation` module.

from sklearn import metrics

score_funcs = [
    ("V-measure", metrics.v_measure_score),
    ("Rand index", metrics.rand_score),
    ("ARI", metrics.adjusted_rand_score),
    ("MI", metrics.mutual_info_score),
    ("NMI", metrics.normalized_mutual_info_score),
    ("AMI", metrics.adjusted_mutual_info_score),
]
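
# %%
# As a quick sanity check (an illustrative sketch, not part of the experiments
# below), note that all of these metrics are invariant to a permutation of the
# label values: relabeling the clusters leaves the scores unchanged. Most
# metrics reach 1.0 on this perfect match, while raw MI instead reaches
# ln(2) ≈ 0.69, the entropy of the labeling, since its maximum depends on the
# number of clusters as noted above.

labels_true = [0, 0, 1, 1]
labels_perm = [1, 1, 0, 0]  # the same partition with permuted label values

for score_name, score_func in score_funcs:
    print(f"{score_name}: {score_func(labels_true, labels_perm):.3f}")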

# %%
# First experiment: fixed ground truth labels and growing number of clusters
# --------------------------------------------------------------------------
#
# We first define a function that creates uniformly-distributed random labelings.

import numpy as np

rng = np.random.RandomState(0)


def random_labels(n_samples, n_classes):
    return rng.randint(low=0, high=n_classes, size=n_samples)


# %%
# Another function will use the `random_labels` function to create a fixed set
# of ground truth labels (`labels_a`) distributed in `n_classes` and then score
# several sets of randomly "predicted" labels (`labels_b`) to assess the
# variability of a given metric at a given `n_clusters`.


def fixed_classes_uniform_labelings_scores(
    score_func, n_samples, n_clusters_range, n_classes, n_runs=5
):
    scores = np.zeros((len(n_clusters_range), n_runs))
    labels_a = random_labels(n_samples=n_samples, n_classes=n_classes)

    for i, n_clusters in enumerate(n_clusters_range):
        for j in range(n_runs):
            labels_b = random_labels(n_samples=n_samples, n_classes=n_clusters)
            scores[i, j] = score_func(labels_a, labels_b)
    return scores
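

# %%
# As a small illustration (arbitrary parameters, not reused below; note that it
# draws from the shared `rng`), the returned array has one row per candidate
# number of clusters and one column per run:

demo_scores = fixed_classes_uniform_labelings_scores(
    metrics.adjusted_rand_score,
    n_samples=50,
    n_clusters_range=[2, 5, 10],
    n_classes=3,
)
print(demo_scores.shape)  # -> (3, 5): 3 cluster counts, 5 runs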

# %%
# In this first example we set the number of classes (true number of clusters) to
# `n_classes=10`. The number of clusters varies over the values provided by
# `n_clusters_range`.

import matplotlib.pyplot as plt
import seaborn as sns

n_samples = 1000
n_classes = 10
n_clusters_range = np.linspace(2, 100, 10).astype(int)
plots = []
names = []

sns.set_palette("colorblind")  # use a colorblind-friendly palette
plt.figure(1)

for marker, (score_name, score_func) in zip("d^vx.,", score_funcs):
    scores = fixed_classes_uniform_labelings_scores(
        score_func, n_samples, n_clusters_range, n_classes=n_classes
    )
    plots.append(
        plt.errorbar(
            n_clusters_range,
            scores.mean(axis=1),
            scores.std(axis=1),
            alpha=0.8,
            linewidth=1,
            marker=marker,
        )[0]
    )
    names.append(score_name)

plt.title(
    "Clustering measures for random uniform labeling\n"
    f"against reference assignment with {n_classes} classes"
)
plt.xlabel(f"Number of clusters (Number of samples is fixed to {n_samples})")
plt.ylabel("Score value")
plt.ylim(bottom=-0.05, top=1.05)
plt.legend(plots, names, bbox_to_anchor=(0.5, 0.5))
plt.show()

# %%
# The Rand index saturates for `n_clusters` > `n_classes`. Other non-adjusted
# measures such as the V-measure increase with the number of clusters relative
# to the number of samples.
#
# Adjusted-for-chance measures, such as ARI and AMI, display some random
# variations centered around a mean score of 0.0, independently of the number of
# samples and clusters.
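
# %%
# We can check the "zero in expectation" claim numerically. The following quick
# sketch (using a dedicated random state so as not to perturb the experiments
# below) averages the ARI over many pairs of independent random labelings:

check_rng = np.random.RandomState(42)
ari_values = [
    metrics.adjusted_rand_score(
        check_rng.randint(low=0, high=10, size=1000),
        check_rng.randint(low=0, high=10, size=1000),
    )
    for _ in range(100)
]
print(f"Mean ARI over 100 pairs of random labelings: {np.mean(ari_values):.4f}")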

# %%
# Second experiment: varying number of classes and clusters
# ---------------------------------------------------------
#
# In this section we define a similar function that uses several metrics to
# score 2 uniformly-distributed random labelings. In this case, the number of
# classes and assigned number of clusters are matched for each possible value in
# `n_clusters_range`.


def uniform_labelings_scores(score_func, n_samples, n_clusters_range, n_runs=5):
    scores = np.zeros((len(n_clusters_range), n_runs))

    for i, n_clusters in enumerate(n_clusters_range):
        for j in range(n_runs):
            labels_a = random_labels(n_samples=n_samples, n_classes=n_clusters)
            labels_b = random_labels(n_samples=n_samples, n_classes=n_clusters)
            scores[i, j] = score_func(labels_a, labels_b)
    return scores


# %%
# In this case, we use `n_samples=100` to show the effect of having a number of
# clusters similar or equal to the number of samples.

n_samples = 100
n_clusters_range = np.linspace(2, n_samples, 10).astype(int)

plt.figure(2)

plots = []
names = []

for marker, (score_name, score_func) in zip("d^vx.,", score_funcs):
    scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)
    plots.append(
        plt.errorbar(
            n_clusters_range,
            np.median(scores, axis=1),
            scores.std(axis=1),
            alpha=0.8,
            linewidth=2,
            marker=marker,
        )[0]
    )
    names.append(score_name)

plt.title(
    "Clustering measures for 2 random uniform labelings\nwith equal number of clusters"
)
plt.xlabel(f"Number of clusters (Number of samples is fixed to {n_samples})")
plt.ylabel("Score value")
plt.legend(plots, names)
plt.ylim(bottom=-0.05, top=1.05)
plt.show()

# %%
# We observe similar results as for the first experiment: adjusted-for-chance
# metrics stay consistently near zero while other metrics tend to get larger
# with finer-grained labelings. The mean V-measure of random labeling increases
# significantly as the number of clusters gets closer to the total number of
# samples used to compute the measure. Furthermore, raw Mutual Information is
# unbounded from above and its scale depends on the dimensions of the clustering
# problem and the cardinality of the ground truth classes. This is why its
# curve goes off the chart.
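
# %%
# The unbounded growth of raw MI is easy to verify directly: for a perfect
# labeling, MI equals the entropy of the labeling, which grows roughly like
# log(n_clusters) for uniform random labelings (a small illustrative check; the
# sample size is arbitrary):

for n_clusters in (10, 100, 1000):
    labels = random_labels(n_samples=10_000, n_classes=n_clusters)
    mi = metrics.mutual_info_score(labels, labels)
    print(f"n_clusters={n_clusters}: MI={mi:.2f}")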

# %%
# Only adjusted measures can hence be safely used as a consensus index to
# evaluate the average stability of clustering algorithms for a given value of k
# on various overlapping sub-samples of the dataset.
#
# Non-adjusted clustering evaluation metrics can therefore be misleading, as
# they output large values for fine-grained labelings: one could be led to think
# that the labeling has captured meaningful groups while it may be totally
# random. In particular, such non-adjusted metrics should not be used to compare
# the results of different clustering algorithms that output different numbers
# of clusters.
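
# %%
# To make this last point concrete (an illustrative sketch, not part of the
# figures above): a random labeling with many clusters typically obtains a much
# higher NMI against a fixed ground truth than a random labeling with few
# clusters, even though neither captures any real structure, while AMI stays
# near zero for both.

labels_true = random_labels(n_samples=100, n_classes=10)
for n_clusters in (5, 90):
    labels_pred = random_labels(n_samples=100, n_classes=n_clusters)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    print(f"n_clusters={n_clusters}: NMI={nmi:.2f}, AMI={ami:.2f}")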