标签传播(Label Propagation)
本例演示如何借助标签传播,以主动学习的方式识别手写数字。首先用仅有 10 个已标记样本训练一个标签传播模型,然后挑选出模型最不确定的 5 个样本进行标记;接着用 15 个已标记样本(原有的 10 个加上新标记的 5 个)重新训练。如此重复 4 次,最终得到一个用 30 个已标记样本训练的模型。示例来自 scikit-learn 官网。
"""Active learning with label propagation on handwritten digits.

A LabelSpreading model is first fit with only 10 labeled points. The 5
unlabeled points whose predicted label distribution has the highest entropy
(i.e. the ones the model is least certain about) are then "labeled" with
their true digit, and the model is refit with 15 labels. The loop runs five
times in total, so the last model is fit with 30 labels.

Each round prints a classification report and a confusion matrix for the
still-unlabeled points, and one figure row shows the 5 queried digits.
"""
# Authors: Clay Woolam <clay@woolam.org>
# Licence: BSD

import numpy as np
from scipy import stats

# Size of the shuffled subset of the digits dataset used for the demo.
N_SAMPLES = 330
# Number of points whose true label is visible to the first model.
N_INITIAL_LABELS = 10
# Number of fit/query rounds (also the number of rows in the figure).
N_ROUNDS = 5
# Number of points queried per round (also the number of figure columns).
N_QUERIES_PER_ROUND = 5


def select_most_uncertain(entropies, candidate_indices, n_select=5):
    """Pick the candidates with the highest entropy.

    Only indices listed in ``candidate_indices`` can be returned, so points
    that already carry a label are never queried a second time.

    Parameters
    ----------
    entropies : array-like of float, shape (n_samples,)
        Entropy of the predicted label distribution of every sample.
    candidate_indices : array-like of int
        Indices into ``entropies`` that are eligible for selection.
    n_select : int, default=5
        Maximum number of indices to return.

    Returns
    -------
    numpy.ndarray of int
        At most ``n_select`` candidate indices, ordered by increasing
        entropy (the most uncertain one is last).
    """
    entropies = np.asarray(entropies, dtype=float)
    candidates = np.asarray(candidate_indices, dtype=int)
    n_select = max(0, min(n_select, candidates.size))
    # Stable sort keeps the result deterministic when entropies tie.
    order = np.argsort(entropies[candidates], kind="stable")
    # Slice from an explicit start: ``order[-0:]`` would return everything.
    return candidates[order[order.size - n_select:]]


def drop_indices(pool, chosen):
    """Return ``pool`` without the values listed in ``chosen``.

    Parameters
    ----------
    pool : array-like of int
        Current index pool (e.g. the unlabeled sample indices).
    chosen : array-like of int
        Index values to remove from the pool.

    Returns
    -------
    numpy.ndarray of int
        The remaining values of ``pool`` in their original order.
    """
    pool = np.asarray(pool, dtype=int)
    chosen = np.asarray(chosen, dtype=int)
    # Boolean masking avoids np.delete with a float index array, which is
    # what an empty ``np.array([])`` accumulator would produce.
    return pool[~np.isin(pool, chosen)]


def main():
    """Run the active-learning demo: fit, report, query and plot."""
    # Plotting and scikit-learn are imported here so that the index helpers
    # above remain importable with NumPy alone.
    import matplotlib.pyplot as plt
    from sklearn import datasets
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.semi_supervised import LabelSpreading

    print(__doc__)

    digits = datasets.load_digits()  # load the dataset
    rng = np.random.RandomState(0)
    # One index per sample: 0 .. len(digits.data) - 1.
    indices = np.arange(len(digits.data))
    rng.shuffle(indices)  # random order, reproducible through the fixed seed

    # Keep only the first N_SAMPLES shuffled samples.
    subset = indices[:N_SAMPLES]
    X = digits.data[subset]
    y = digits.target[subset]
    images = digits.images[subset]

    n_total_samples = len(y)
    n_labeled_points = N_INITIAL_LABELS
    # Everything after the first N_INITIAL_LABELS samples starts unlabeled.
    unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]

    fig = plt.figure()

    for i in range(N_ROUNDS):
        # Hide the true label of every unlabeled point (-1 marks "unlabeled").
        y_train = np.copy(y)
        y_train[unlabeled_indices] = -1

        lp_model = LabelSpreading(gamma=0.25, max_iter=5)
        lp_model.fit(X, y_train)

        # Labels the model inferred for the points it was not told about.
        predicted_labels = lp_model.transduction_[unlabeled_indices]
        true_labels = y[unlabeled_indices]

        cm = confusion_matrix(true_labels, predicted_labels,
                              labels=lp_model.classes_)

        print('Iteration %i %s' % (i, 70 * '_'))
        print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
              % (n_labeled_points, n_total_samples - n_labeled_points,
                 n_total_samples))
        print(classification_report(true_labels, predicted_labels))
        print("Confusion matrix")
        print(cm)

        # Entropy of the transduced label distribution of every sample
        # (transposed so the entropy is taken over the classes).
        pred_entropies = stats.entropy(lp_model.label_distributions_.T)

        # Unlabeled examples the classifier is most uncertain about.
        uncertainty_index = select_most_uncertain(
            pred_entropies, unlabeled_indices, N_QUERIES_PER_ROUND)

        fig.text(.05, (1 - (i + 1) * .183),
                 "model %d\n\nfit with\n%d labels"
                 % ((i + 1), n_labeled_points), size=10)
        for index, image_index in enumerate(uncertainty_index):
            sub = fig.add_subplot(N_ROUNDS, N_QUERIES_PER_ROUND,
                                  index + 1 + (N_QUERIES_PER_ROUND * i))
            sub.imshow(images[image_index], cmap=plt.cm.gray_r)
            sub.set_title('predict: %i\ntrue: %i' % (
                lp_model.transduction_[image_index], y[image_index]), size=10)
            sub.axis('off')

        # The queried points now count as labeled for the next round.
        unlabeled_indices = drop_indices(unlabeled_indices, uncertainty_index)
        n_labeled_points += len(uncertainty_index)

    fig.suptitle("Active learning with Label Propagation.\nRows show 5 most "
                 "uncertain labels to learn with the next model.")
    plt.subplots_adjust(left=0.12, bottom=0.03, right=0.9, top=0.8,
                        wspace=0.2, hspace=0.45)
    plt.show()


if __name__ == "__main__":
    main()