代码中实现了两种图的构建方法:RBF(径向基核)和 KNN(k 近邻)。
文件 labelpropagation.py:
import time
import numpy as np
def navie_knn(dataSet, query, k):
    """Return the indices of the ``k`` rows of ``dataSet`` nearest to ``query``.

    Rows are ranked by squared Euclidean distance, which yields the same
    ordering as the true Euclidean distance.

    Args:
        dataSet: Array with one sample per row (indexed as rows x features).
        query: Single sample with one value per column of ``dataSet``.
        k: Number of neighbours requested. Values larger than the number of
            rows are clamped to the number of rows.

    Returns:
        1-D integer array of row indices into ``dataSet``, nearest first.
    """
    # Broadcasting subtracts ``query`` from every row without building a
    # tiled copy of it first.
    diff = query - dataSet
    squaredDist = np.sum(diff ** 2, axis=1)
    # Ascending sort, so the closest rows come first.
    sortedDistIndices = np.argsort(squaredDist)
    k = min(k, len(sortedDistIndices))
    return sortedDistIndices[:k]
def buildGraph(MatX, kernel_type, rbf_sigma=None, knn_num_neighbors=None):
    """Build the dense affinity matrix of the graph over all samples.

    Args:
        MatX: Array with one sample per row.
        kernel_type: ``'rbf'`` for a Gaussian kernel or ``'knn'`` for a
            k-nearest-neighbour graph.
        rbf_sigma: Kernel width; required when ``kernel_type`` is ``'rbf'``.
        knn_num_neighbors: Neighbour count; required when ``kernel_type`` is
            ``'knn'``.

    Returns:
        ``float32`` array of shape ``(num_samples, num_samples)``. For
        ``'rbf'`` every row is normalised to sum to 1. For ``'knn'`` the
        entries of each row's nearest neighbours are set to
        ``1 / knn_num_neighbors`` and all others stay 0.

    Raises:
        ValueError: If the parameter required by the chosen kernel is missing.
        NameError: If ``kernel_type`` is neither ``'rbf'`` nor ``'knn'``.
    """
    num_samples = MatX.shape[0]
    affinity_matrix = np.zeros((num_samples, num_samples), np.float32)
    if kernel_type == 'rbf':
        if rbf_sigma is None:
            raise ValueError('You should input a sigma of rbf kernel!')
        for i in range(num_samples):
            # Squared distance from sample i to every sample, in one pass.
            sq_dist = np.sum((MatX - MatX[i, :]) ** 2, axis=1)
            weights = np.exp(sq_dist / (-2.0 * rbf_sigma ** 2))
            # Row-normalise so the row acts as transition probabilities.
            affinity_matrix[i, :] = weights / weights.sum()
    elif kernel_type == 'knn':
        if knn_num_neighbors is None:
            raise ValueError('You should input a k of knn kernel!')
        for i in range(num_samples):
            k_neighbors = navie_knn(MatX, MatX[i, :], knn_num_neighbors)
            affinity_matrix[i][k_neighbors] = 1.0 / knn_num_neighbors
    else:
        raise NameError('Not support kernel type! You can use knn of rbf!')
    return affinity_matrix
def labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type='rbf', rbf_sigma=1.5, knn_num_neighbors=10,
                     max_iter=500, tol=1e-3):
    """Assign a class to every unlabelled sample by label propagation.

    Labelled and unlabelled samples are stacked into one graph. The label
    matrix is repeatedly multiplied by the graph's affinity matrix, and the
    rows of the labelled samples are reset to their known one-hot labels
    after every step.

    Args:
        Mat_Label: Array of labelled samples, one per row.
        Mat_Unlabel: Array of unlabelled samples, one per row.
        labels: Class of each labelled sample. ``labels[i]`` is used directly
            as a column index into a matrix with one column per distinct
            label, so the values are expected to be integers in
            ``[0, number of distinct labels)``.
        kernel_type: Passed to ``buildGraph``.
        rbf_sigma: Passed to ``buildGraph``.
        knn_num_neighbors: Passed to ``buildGraph``.
        max_iter: Maximum number of propagation steps.
        tol: Iteration stops once the summed absolute change of the label
            matrix between two steps is no longer greater than this value.

    Returns:
        Float array of length ``Mat_Unlabel.shape[0]`` holding, for each
        unlabelled sample, the column index with the largest score.
    """
    num_label_samples = Mat_Label.shape[0]
    num_unlabel_samples = Mat_Unlabel.shape[0]
    num_samples = num_label_samples + num_unlabel_samples
    # Number of distinct classes present in ``labels``.
    num_classes = len(np.unique(labels))

    # Labelled rows first, unlabelled rows after them.
    MatX = np.vstack((Mat_Label, Mat_Unlabel))

    # One-hot encoding of the known labels; re-imposed after every step.
    clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32)
    for i in range(num_label_samples):
        clamp_data_label[i][labels[i]] = 1.0

    label_function = np.zeros((num_samples, num_classes), np.float32)
    label_function[:num_label_samples] = clamp_data_label
    # Unlabelled rows start at -1 in every class column.
    label_function[num_label_samples:] = -1

    affinity_matrix = buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors)

    iteration = 0
    pre_label_function = np.zeros((num_samples, num_classes), np.float32)
    changed = np.abs(pre_label_function - label_function).sum()
    while iteration < max_iter and changed > tol:
        print("------>>>迭代 %d/%d.changed: %f" % (iteration, max_iter, changed))
        # np.dot below returns a new array, so this keeps the previous state.
        pre_label_function = label_function
        iteration += 1
        # Propagate: each row becomes the affinity-weighted mix of all rows.
        label_function = np.dot(affinity_matrix, label_function)
        # Clamp the labelled rows back to their known labels.
        label_function[:num_label_samples] = clamp_data_label
        # Convergence measure: total absolute change in this step.
        changed = np.abs(pre_label_function - label_function).sum()

    # Class of each unlabelled sample = column with the highest score.
    unlabel_data_labels = np.zeros(num_unlabel_samples)
    if num_unlabel_samples:
        unlabel_data_labels[:] = np.argmax(label_function[num_label_samples:], axis=1)
    return unlabel_data_labels
文件 testlp.py(测试脚本):
from labelpropagation import labelPropagation
import time
import numpy as np
import math
def show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels):
    """Plot labelled samples as diamonds and unlabelled samples as circles.

    Class 0 is drawn in red, class 1 in blue and every other class in
    yellow. Only the first two columns of each sample are plotted, and the
    axes are fixed to the range 0..12.

    Args:
        Mat_Label: Array of labelled samples, one per row.
        labels: Class of each labelled sample.
        Mat_Unlabel: Array of unlabelled samples, one per row.
        unlabel_data_labels: Class of each unlabelled sample.
    """
    import matplotlib.pyplot as plt

    colour_of_class = {0: 'r', 1: 'b'}

    def _scatter(points, classes, marker):
        # Draw each row of ``points`` with ``marker`` in its class colour.
        for row in range(points.shape[0]):
            colour = colour_of_class.get(int(classes[row]), 'y')
            plt.plot(points[row, 0], points[row, 1], marker + colour)

    _scatter(Mat_Label, labels, 'D')
    _scatter(Mat_Unlabel, unlabel_data_labels, 'o')

    # Axis captions.
    plt.xlabel('X1')
    plt.ylabel('X2')
    # Fixed viewing window.
    plt.xlim(0.0, 12.)
    plt.ylim(0.0, 12.)
    plt.show()
def loadCircleData(num_data):
    """Generate two noisy concentric rings plus one labelled seed per ring.

    One third of the points (rounded down) lie on the inner ring of radius 2,
    spaced 2 degrees apart. The rest lie on the outer ring of radius 4,
    spaced 1 degree apart. Both rings are centred on (5, 5), and every
    coordinate is shifted by uniform noise in [0, 1) drawn from the global
    ``np.random`` state.

    Args:
        num_data: Total number of unlabelled points to generate.

    Returns:
        Tuple ``(Mat_Label, labels, Mat_Unlabel)``: a ``(2, 2)`` ``float32``
        array with one seed point per ring, the list ``[0, 1]`` of their
        classes, and a ``(num_data, 2)`` ``float32`` array of ring points
        (inner ring first).
    """
    center = np.array([5.0, 5.0])
    radiu_inner = 2
    radiu_outer = 4
    num_inner = int(num_data / 3)
    num_outer = num_data - num_inner

    def _noisy_ring(radius, count, step_deg):
        # ``count`` points on a circle, ``step_deg`` degrees apart, with noise.
        ring = np.zeros((count, 2), np.float32)
        theta = 0.0
        for point in ring:
            pho = (theta % 360) * math.pi / 180
            # Scalar draws: assigning a 1-element array to a scalar slot is
            # deprecated in recent NumPy releases.
            point[0] = radius * math.cos(pho) + np.random.rand() + center[0]
            point[1] = radius * math.sin(pho) + np.random.rand() + center[1]
            theta += step_deg
        return ring

    inner = _noisy_ring(radiu_inner, num_inner, 2)
    outer = _noisy_ring(radiu_outer, num_outer, 1)

    # Seed points sit on the negative-x side, half a unit inside each ring's
    # nominal radius.
    Mat_Label = np.zeros((2, 2), np.float32)
    Mat_Label[0] = center + np.array([-radiu_inner + 0.5, 0])
    Mat_Label[1] = center + np.array([-radiu_outer + 0.5, 0])
    labels = [0, 1]
    # Stacking pre-allocated arrays also works when there are no points.
    Mat_Unlabel = np.vstack((inner, outer))
    return Mat_Label, labels, Mat_Unlabel
def loadBandData(num_unlabel_samples):
    """Generate two horizontal bands of points plus one labelled seed each.

    The first half of the points (rounded down) is spread uniformly around
    the seed (5, 2) and the remaining points around the seed (5, 8). Each
    band is 3 wide and 1 high. Noise comes from the global ``np.random``
    state.

    Args:
        num_unlabel_samples: Total number of unlabelled points to generate.

    Returns:
        Tuple ``(Mat_label, labels, Mat_Unlabel)``: the ``(2, 2)`` array of
        seed points, the list ``[0, 1]`` of their classes, and a
        ``(num_unlabel_samples, 2)`` ``float32`` array of band points.
    """
    Mat_label = np.array([[5.0, 2.], [5.0, 8.0]])
    labels = [0, 1]
    num_dim = Mat_label.shape[1]
    Mat_Unlabel = np.zeros((num_unlabel_samples, num_dim), np.float32)

    # Integer split: slice bounds and array shapes must be ints, and the
    # second band takes the leftover sample when the total is odd.
    first_count = num_unlabel_samples // 2
    second_count = num_unlabel_samples - first_count
    band_extent = np.array([3, 1])

    Mat_Unlabel[:first_count, :] = (np.random.rand(first_count, num_dim) - 0.5) * band_extent + Mat_label[0]
    Mat_Unlabel[first_count:, :] = (np.random.rand(second_count, num_dim) - 0.5) * band_extent + Mat_label[1]
    return Mat_label, labels, Mat_Unlabel
# Script entry point: generate the ring data set, propagate labels, plot.
if __name__ == "__main__":
    num_unlabel_samples = 800
    Mat_label, labels, Mat_Unlabel = loadCircleData(num_unlabel_samples)
    # Author's note: with the RBF kernel the choice of sigma matters a lot.
    # It has to be picked per data set from the distance between data points,
    # and it also affects convergence speed, so the KNN kernel is the better,
    # more stable choice. The RBF variant would be:
    #   labelPropagation(Mat_label, Mat_Unlabel, labels, kernel_type='rbf', rbf_sigma=0.2)
    unlabel_data_labels = labelPropagation(Mat_label, Mat_Unlabel, labels, kernel_type='knn', knn_num_neighbors=10,max_iter=300)
    # Display the result.
    show(Mat_label,labels,Mat_Unlabel,unlabel_data_labels)
结果展示
可视化结果
迭代结果