126 lines
4.1 KiB
Python
126 lines
4.1 KiB
Python
|
"""
|
||
|
========================================
|
||
|
Label Propagation digits active learning
|
||
|
========================================
|
||
|
|
||
|
Demonstrates an active learning technique to learn handwritten digits
|
||
|
using label propagation.
|
||
|
|
||
|
We start by training a label propagation model with only 40 labeled points,
then we select the top five most uncertain points to label. Next, we train
with 45 labeled points (original 40 + 5 new ones). We repeat this process
four times to have a model trained with 60 labeled examples. Note you can
increase this to label more than 60 by changing `max_iterations`. Labeling
more than 60 can be useful to get a sense for the speed of convergence of
this active learning technique.
|
||
|
|
||
|
A plot will appear showing the top 5 most uncertain digits for each iteration
|
||
|
of training. These may or may not contain mistakes, but we will train the next
|
||
|
model with their true labels.
|
||
|
|
||
|
"""
|
||
|
|
||
|
# Authors: Clay Woolam <clay@woolam.org>
|
||
|
# License: BSD
|
||
|
|
||
|
import matplotlib.pyplot as plt
|
||
|
import numpy as np
|
||
|
from scipy import stats
|
||
|
|
||
|
from sklearn import datasets
|
||
|
from sklearn.metrics import classification_report, confusion_matrix
|
||
|
from sklearn.semi_supervised import LabelSpreading
|
||
|
|
||
|
# Load the digits dataset and draw a reproducible random subset of 330 samples.
digits = datasets.load_digits()
rng = np.random.RandomState(0)
indices = np.arange(len(digits.data))
rng.shuffle(indices)

X = digits.data[indices[:330]]
y = digits.target[indices[:330]]
images = digits.images[indices[:330]]

n_total_samples = len(y)
n_labeled_points = 40
max_iterations = 5

# The first `n_labeled_points` samples start out labeled; every other sample
# is treated as unlabeled and is a candidate for being queried later.
unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
f = plt.figure()

for i in range(max_iterations):
    if unlabeled_indices.size == 0:
        print("No unlabeled items left to label.")
        break

    # Hide the true labels of the currently unlabeled samples by setting them
    # to -1 before fitting.
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    lp_model = LabelSpreading(gamma=0.25, max_iter=20)
    lp_model.fit(X, y_train)

    # Evaluate only on the samples whose labels were hidden from the model.
    predicted_labels = lp_model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]

    cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)

    print("Iteration %i %s" % (i, 70 * "_"))
    print(
        "Label Spreading model: %d labeled & %d unlabeled (%d total)"
        % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)
    )

    print(classification_report(true_labels, predicted_labels))

    print("Confusion matrix")
    print(cm)

    # compute the entropies of transduced label distributions (one value per
    # sample: the transpose puts the class axis first, which is the axis
    # `stats.entropy` reduces over by default)
    pred_entropies = stats.entropy(lp_model.label_distributions_.T)

    # select up to 5 digit examples that the classifier is most uncertain
    # about: sort all samples by decreasing entropy, then keep only those that
    # are still unlabeled
    uncertainty_index = np.argsort(pred_entropies)[::-1]
    uncertainty_index = uncertainty_index[
        np.isin(uncertainty_index, unlabeled_indices)
    ][:5]

    # for more than 5 iterations, visualize the gain only on the first 5
    # (the figure is laid out as a fixed 5 x 5 grid)
    if i < 5:
        # Annotate the row with the number of labels this model was actually
        # fit with, so the figure agrees with the printed report.
        f.text(
            0.05,
            (1 - (i + 1) * 0.183),
            "model %d\n\nfit with\n%d labels" % ((i + 1), n_labeled_points),
            size=10,
        )
        for index, image_index in enumerate(uncertainty_index):
            sub = f.add_subplot(5, 5, index + 1 + (5 * i))
            sub.imshow(images[image_index], cmap=plt.cm.gray_r, interpolation="none")
            sub.set_title(
                "predict: %i\ntrue: %i"
                % (lp_model.transduction_[image_index], y[image_index]),
                size=10,
            )
            sub.axis("off")

    # The queried points receive their true labels for the next iteration:
    # remove them from the unlabeled set and grow the labeled count.
    unlabeled_indices = unlabeled_indices[
        ~np.isin(unlabeled_indices, uncertainty_index)
    ]
    n_labeled_points += len(uncertainty_index)

f.suptitle(
    (
        "Active learning with Label Propagation.\nRows show 5 most "
        "uncertain labels to learn with the next model."
    ),
    y=1.15,
)
plt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2, hspace=0.85)
plt.show()
|