机器学习（十一） ------python实现标签传播算法(LP算法)

最新推荐文章于 2022-05-16 12:00:00 发布

菜鸟08哥

最新推荐文章于 2022-05-16 12:00:00 发布

阅读量2.7k

点赞数 2

分类专栏： python、机器学习　　文章标签： 1024程序员节

本文链接：https://blog.csdn.net/weixin_43693650/article/details/120939787

版权

python 同时被 2 个专栏收录

15 篇文章 6 订阅

订阅专栏

机器学习

13 篇文章 2 订阅

订阅专栏

代码中实现了两种图的构建方法：RBF和KNN

labelpropagation.py

import time
import numpy as np


def navie_knn(dataSet, query, k):
    """Return the row indices of the ``k`` samples in ``dataSet`` closest to ``query``.

    Closeness is measured by squared Euclidean distance. The returned array
    holds indices into ``dataSet`` (not distances), ordered from nearest to
    farthest. When ``k`` is larger than the number of rows, all row indices
    are returned.
    """
    row_count = dataSet.shape[0]

    # Replicate the query once per row so the subtraction is element-wise.
    offsets = np.tile(query, (row_count, 1)) - dataSet
    sq_distances = np.square(offsets).sum(axis=1)

    # Rank every row by distance, then keep at most ``k`` of them.
    ranking = np.argsort(sq_distances)
    limit = min(k, len(ranking))
    return ranking[:limit]

def buildGraph(MatX, kernel_type, rbf_sigma=None, knn_num_neighbors=None):
    """Build the row-normalised affinity matrix over all samples.

    Args:
        MatX: 2-D array with one sample per row.
        kernel_type: ``'rbf'`` for a dense Gaussian-kernel graph or ``'knn'``
            for a sparse k-nearest-neighbour graph.
        rbf_sigma: Bandwidth of the Gaussian kernel; required for ``'rbf'``.
        knn_num_neighbors: Number of neighbours per node; required for ``'knn'``.

    Returns:
        A ``float32`` array of shape ``(num_samples, num_samples)`` in which
        row ``i`` holds the transition weights from sample ``i`` to every
        sample. Each row sums to 1.

    Raises:
        ValueError: If the parameter required by the chosen kernel is missing.
        NameError: If ``kernel_type`` is neither ``'rbf'`` nor ``'knn'``.
    """
    num_samples = MatX.shape[0]
    affinity_matrix = np.zeros((num_samples, num_samples), np.float32)

    if kernel_type == 'rbf':
        if rbf_sigma is None:
            raise ValueError('You should input a sigma of rbf kernel!')

        # exp(-||xi - xj||^2 / (2 * sigma^2)); the divisor is loop-invariant.
        scale = -2.0 * rbf_sigma ** 2
        for i in range(num_samples):
            diff = MatX - MatX[i, :]
            weights = np.exp(np.sum(diff ** 2, axis=1) / scale)
            # Normalise so the row is a probability distribution.
            affinity_matrix[i, :] = weights / weights.sum()
    elif kernel_type == 'knn':
        if knn_num_neighbors is None:
            raise ValueError('You should input a k of knn kernel!')
        for i in range(num_samples):
            k_neighbors = navie_knn(MatX, MatX[i, :], knn_num_neighbors)
            # Divide by the number of neighbours actually found so the row
            # still sums to 1 when k exceeds the number of samples.
            affinity_matrix[i, k_neighbors] = 1.0 / len(k_neighbors)
    else:
        raise NameError('Not support kernel type! You can use knn of rbf!')

    return affinity_matrix


def labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type='rbf', rbf_sigma=1.5, knn_num_neighbors=10,
                     max_iter=500, tol=1e-3):
    """Infer labels for unlabelled samples by iterative label propagation.

    A graph is built over the labelled and unlabelled samples together. The
    label distribution is then repeatedly multiplied by the graph's
    transition matrix, and the rows of the labelled samples are reset
    ("clamped") to their known one-hot labels after every step.

    Args:
        Mat_Label: 2-D array of labelled samples, one per row.
        Mat_Unlabel: 2-D array of unlabelled samples, one per row.
        labels: Sequence with the label of each row of ``Mat_Label``. Labels
            may be arbitrary values; they need not be ``0..C-1``.
        kernel_type: Graph kernel passed to ``buildGraph``.
        rbf_sigma: RBF bandwidth passed to ``buildGraph``.
        knn_num_neighbors: Neighbour count passed to ``buildGraph``.
        max_iter: Maximum number of propagation steps.
        tol: Propagation stops once the summed absolute change of the label
            distribution between two steps is no larger than this value.

    Returns:
        1-D array with the predicted label of each row of ``Mat_Unlabel``.
        Numeric labels are returned as ``float64`` (as in earlier versions);
        other label types keep their own dtype.
    """
    num_label_samples = Mat_Label.shape[0]
    num_unlabel_samples = Mat_Unlabel.shape[0]
    num_samples = num_label_samples + num_unlabel_samples

    # Map every label to its position in the sorted list of distinct labels,
    # so the one-hot encoding below works for any label values.
    labels_list, label_indices = np.unique(np.asarray(labels).ravel(), return_inverse=True)
    label_indices = label_indices.ravel()[:num_label_samples]
    num_classes = len(labels_list)

    MatX = np.vstack((Mat_Label, Mat_Unlabel))

    # One-hot rows for the labelled samples; these are re-imposed every step.
    clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32)
    clamp_data_label[np.arange(num_label_samples), label_indices] = 1.0

    # Unlabelled rows start at -1; labelled rows start at their one-hot value.
    label_function = np.full((num_samples, num_classes), -1, np.float32)
    label_function[:num_label_samples] = clamp_data_label

    affinity_matrix = buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors)

    iteration = 0
    pre_label_function = np.zeros((num_samples, num_classes), np.float32)
    changed = np.abs(pre_label_function - label_function).sum()

    while iteration < max_iter and changed > tol:
        print("------>>>迭代 %d/%d.changed: %f" % (iteration, max_iter, changed))
        pre_label_function = label_function
        iteration += 1

        # Propagate one step, then clamp the known labels back in place.
        label_function = np.dot(affinity_matrix, label_function)
        label_function[:num_label_samples] = clamp_data_label

        # Convergence measure: total absolute change since the previous step.
        changed = np.abs(pre_label_function - label_function).sum()

    # Pick the strongest class per unlabelled row and map it back to a label.
    predicted = np.argmax(label_function[num_label_samples:], axis=1)
    unlabel_data_labels = labels_list[predicted]
    if np.issubdtype(labels_list.dtype, np.number):
        unlabel_data_labels = unlabel_data_labels.astype(np.float64)
    return unlabel_data_labels

testlp.py

from labelpropagation import labelPropagation
import time
import numpy as np
import math


def show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels):
    """Plot labelled and unlabelled 2-D samples coloured by class.

    Labelled samples are drawn as diamonds and unlabelled samples as circles.
    Class 0 is red, class 1 is blue and any other class is yellow. The axes
    are fixed to the range 0..12 and the figure is displayed.
    """
    import matplotlib.pyplot as plt

    palette = {0: 'r', 1: 'b'}

    def draw(points, point_labels, marker):
        # One plot call per sample: marker shape plus the class colour.
        for row in range(points.shape[0]):
            colour = palette.get(int(point_labels[row]), 'y')
            plt.plot(points[row, 0], points[row, 1], marker + colour)

    draw(Mat_Label, labels, 'D')
    draw(Mat_Unlabel, unlabel_data_labels, 'o')

    # Axis titles and fixed viewing window.
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.xlim(0.0, 12.)
    plt.ylim(0.0, 12.)
    plt.show()

def _noisy_ring(center, radius, count, step_deg):
    """Return ``count`` float32 points on a circle, each shifted by uniform [0, 1) noise."""
    angles = np.deg2rad((np.arange(count) * step_deg) % 360)
    ring = radius * np.column_stack((np.cos(angles), np.sin(angles)))
    # Drawn row by row, i.e. x-noise then y-noise for each point in turn.
    noise = np.random.rand(count, 2)
    return (ring + noise + center).astype(np.float32)


def loadCircleData(num_data):
    """Generate the two-concentric-circles toy data set.

    One third of the samples (rounded down) lie on an inner circle of radius
    2 and the rest on an outer circle of radius 4, both centred at (5, 5).
    Inner points are spaced 2 degrees apart and outer points 1 degree apart.
    Every point is shifted by uniform noise in [0, 1) on each axis.

    Args:
        num_data: Total number of unlabelled samples to generate.

    Returns:
        A tuple ``(Mat_Label, labels, Mat_Unlabel)``: a ``(2, 2)`` float32
        array with one labelled point per circle, the list ``[0, 1]`` with
        their classes (0 = inner, 1 = outer), and a ``(num_data, 2)`` float32
        array of unlabelled points (inner circle first).
    """
    center = np.array([5.0, 5.0])
    radius_inner = 2
    radius_outer = 4

    num_inner = int(num_data / 3)
    num_outer = num_data - num_inner

    Mat_Unlabel = np.vstack((
        _noisy_ring(center, radius_inner, num_inner, 2),
        _noisy_ring(center, radius_outer, num_outer, 1),
    ))

    # One labelled seed per circle, placed near its leftmost point.
    Mat_Label = np.array([
        center + np.array([-radius_inner + 0.5, 0]),
        center + np.array([-radius_outer + 0.5, 0]),
    ], dtype=np.float32)
    labels = [0, 1]
    return Mat_Label, labels, Mat_Unlabel


def loadBandData(num_unlabel_samples):
    """Generate the two-horizontal-bands toy data set.

    Two labelled points sit at (5, 2) and (5, 8). The unlabelled samples are
    drawn uniformly from a 3-wide, 1-high rectangle centred on each labelled
    point: the first half (rounded down) around class 0 and the remainder
    around class 1.

    Args:
        num_unlabel_samples: Total number of unlabelled samples to generate.

    Returns:
        A tuple ``(Mat_label, labels, Mat_Unlabel)``: the ``(2, 2)`` array of
        labelled points, the list ``[0, 1]`` with their classes, and a
        ``(num_unlabel_samples, 2)`` float32 array of unlabelled points.
    """
    Mat_label = np.array([[5.0, 2.], [5.0, 8.0]])
    labels = [0, 1]
    num_dim = Mat_label.shape[1]
    band_size = np.array([3, 1])

    # Integer split; the second band takes the extra sample for odd totals.
    num_first = num_unlabel_samples // 2
    num_second = num_unlabel_samples - num_first

    Mat_Unlabel = np.zeros((num_unlabel_samples, num_dim), np.float32)
    Mat_Unlabel[:num_first, :] = (np.random.rand(num_first, num_dim) - 0.5) * band_size + Mat_label[0]
    Mat_Unlabel[num_first:, :] = (np.random.rand(num_second, num_dim) - 0.5) * band_size + Mat_label[1]

    return Mat_label, labels, Mat_Unlabel

# Script entry point: run label propagation on the circle data and plot it.
if __name__ == "__main__":
    sample_count = 800
    seed_points, seed_labels, free_points = loadCircleData(sample_count)

    # With the RBF kernel the sigma value matters a lot: it has to suit the
    # data set (the typical distance between two points) and it also affects
    # how fast the iteration converges. The KNN kernel is the more stable
    # choice and is used here. The RBF alternative would be
    # kernel_type='rbf' with rbf_sigma=0.2.
    predicted_labels = labelPropagation(
        seed_points,
        free_points,
        seed_labels,
        kernel_type='knn',
        knn_num_neighbors=10,
        max_iter=300,
    )

    # Display the result.
    show(seed_points, seed_labels, free_points, predicted_labels)

结果展示

可视化结果（原文此处为一张图片）
迭代结果

菜鸟08哥

关注

2
点赞
踩
5

收藏

觉得还不错? 一键收藏
打赏
0
评论
机器学习（十一） ------python实现标签传播算法(LP算法)

代码中实现了两种图的构建方法：RBF和KNNlabelpropagation.pyimport timeimport numpy as npdef navie_knn(dataSet, query, k): numSamples = dataSet.shape[0] # 1. 计算欧氏距离 diff = np.tile(query, (numSamples, 1)) - dataSet squaredDiff = diff ** 2 squaredDist
复制链接

扫一扫