SMDP代码实现

SMDP代码实现

时间:2022/6/29

1.utils模块

主要提供数据集的读取,交叉验证时数据集的划分等

import numpy as np
from scipy.io import loadmat


def get_index(num_bags=92, para_k=10, seed=None):
    """
    Build the train/test index splits for k-fold cross-validation.

    The bag indices ``0 .. num_bags - 1`` are shuffled once and cut into
    ``para_k`` consecutive folds whose sizes differ by at most one.  Fold ``i``
    is the test set of split ``i``; the remaining folds, kept in shuffled
    order, form its training set.

    :param num_bags: total number of bags to split
    :param para_k: number of folds (k); if it exceeds ``num_bags`` the
        surplus folds necessarily have an empty test set
    :param seed: optional seed; when given, NumPy's global random state is
        re-seeded with it before shuffling so the split is reproducible
    :return: ``(train, test)`` - two dicts mapping the fold number to a list
        of bag indices
    """
    if seed is not None:
        np.random.seed(seed)
    shuffled = np.random.permutation(num_bags)

    # A fixed fold width of ceil(num_bags / para_k) can use up all indices
    # before the last folds are reached (e.g. 11 bags and 10 folds), leaving
    # those folds with an empty test set.  array_split spreads the remainder
    # over the leading folds instead.
    folds = np.array_split(shuffled, para_k)

    ret_tr_idx = {}
    ret_te_idx = {}
    for i, fold in enumerate(folds):
        others = folds[:i] + folds[i + 1:]
        # With a single fold there is nothing left to train on.
        ret_tr_idx[i] = np.concatenate(others).tolist() if others else []
        ret_te_idx[i] = fold.tolist()
    return ret_tr_idx, ret_te_idx


def load_data(path='Benchmark/musk1+.mat'):
    """
    Load a multi-instance data set from a MATLAB ``.mat`` file.

    The file must hold a variable named ``data`` with one row per bag: column
    0 is the bag's 2-D instance array, whose last column is dropped here, and
    column 1 is an array whose ``[0, 0]`` element is the bag label.

    :param path: path of the ``.mat`` file
    :return: ``(bags, labels)`` - a list with one instance array per bag and a
        NumPy array with one label per bag
    """
    table = loadmat(path)['data']
    rows = range(len(table))
    # Strip the trailing column of every bag's instance array.
    bags = [table[row, 0][:, :-1] for row in rows]
    labels = np.array([table[row, 1][0, 0] for row in rows])
    return bags, labels


if __name__ == '__main__':
    # Manual smoke test: load the musk1+ benchmark when run as a script.
    demo_path = '../MILframe/data/benchmark/musk1+.mat'
    load_data(demo_path)

2.Density Peak类

提供密度峰值聚类相关功能。

# Coding:utf-8
# @Time:2022/6/23,18:46
# @Author:zhang
# @file:DensityPeak.py
# @Software:PyCharm
import numpy as np
import pandas as pd


class DensityPeak:
    """
    Density-peak clustering driven by a precomputed pairwise distance matrix.

    On construction the cut-off radius ``dc``, every instance's local density,
    its "master" (the nearest instance of higher density), the distance to
    that master and the representativeness (density * distance to master) are
    computed.  ``cluster`` then promotes the most representative instances to
    cluster centers and labels every other instance with its master's label.
    """

    def __init__(self, distanceMatrix, dcRatio=0.2, clusterNumRatio=0.05, dcType="max", kernel="gaussian"):
        """
        Store the parameters and run all density-peak precomputations.

        :param distanceMatrix: square matrix of pairwise distances
        :param dcRatio: fraction of the reference distance used as radius dc
        :param clusterNumRatio: fraction of the instances used as cluster centers
        :param dcType: reference distance for dc: 'max', 'ave' or 'min'
        :param kernel: density kernel: 'gaussian' or 'cutoff'
        :raises ValueError: if dcType or kernel is unknown, or if the gaussian
            kernel is requested while the radius dc is zero
        """
        # Pairwise distance matrix (converted so that NumPy indexing works).
        self.distance_m = np.asarray(distanceMatrix)
        # Radius ratio.
        self.dcRatio_f = dcRatio
        # Radius type.
        self.dcType = dcType
        # Density kernel.
        self.kernel = kernel
        # Fraction of instances that become cluster centers.
        self.clusterCenterRatio_f = clusterNumRatio
        # Local density of every instance.
        self.densities_l = []
        # Instance indices sorted by descending density.
        self.densityOrder_l = []
        # Master of every instance (-1 for the densest instance).
        self.masters_l = []
        # Distance from every instance to its master.
        self.distanceToMaster_l = []
        # Representativeness (priority) of every instance.
        self.representativeness_l = []
        # Indices of the cluster centers.
        self.clusterCenter_l = []
        # Number of instances.
        self.numSample = 0
        # Radius dc.
        self.dc_f = 0
        # Largest pairwise distance in the data set.
        self.maxDistance = 0
        # Cluster label of every instance.
        self.label_l = []
        # Clusters as a dict: cluster label -> list of member indices.
        self.clusters_d = {}

        self.__initDensityPeak()

    def __initDensityPeak(self):
        """
        Run the precomputation pipeline; the order matters because every step
        reads attributes written by the previous ones.
        """
        self.numSample = len(self.distance_m)
        self.maxDistance = self.getMaxDistance()
        self.dc_f = self.getDc()
        self.densities_l = self.computeDensities()
        self.computeDistanceToMaster()
        self.computePriority()

    def getDc(self):
        """
        Compute the radius dc as ``dcRatio`` times a reference distance.

        :return: the radius dc
        :raises ValueError: if ``dcType`` is not 'max', 'ave' or 'min'
        """
        if self.dcType == "max":
            # Largest pairwise distance.
            baseDistance = self.maxDistance
        elif self.dcType == "ave":
            # Mean over all entries of the distance matrix.
            baseDistance = np.mean(self.distance_m)
        elif self.dcType == "min":
            # Skip the diagonal: self-distances are zero for a proper distance
            # matrix and would always force dc to 0.
            offDiagonal = ~np.eye(len(self.distance_m), dtype=bool)
            if offDiagonal.any():
                baseDistance = np.min(self.distance_m[offDiagonal])
            else:
                baseDistance = 0.0
        else:
            raise ValueError(
                "unknown dcType %r, expected 'max', 'ave' or 'min'" % (self.dcType,))

        return baseDistance * self.dcRatio_f

    def getMaxDistance(self):
        """
        :return: the largest entry of the distance matrix
        """
        return np.max(self.distance_m)

    def computeDensities(self):
        """
        Compute the local density of every instance with the configured kernel.

        :return: array holding one density per instance
        :raises ValueError: if the kernel is unknown, or if the gaussian kernel
            is used with dc == 0 (the distances cannot be scaled)
        """
        if self.kernel == 'gaussian':
            if self.dc_f == 0:
                raise ValueError("radius dc is 0; the gaussian kernel is undefined")
            # Vectorised form of gaussian_kernel(): sum of exp(-(d / dc)^2)
            # over every row of the distance matrix.
            scaled = self.distance_m / self.dc_f
            return np.sum(np.exp(-np.power(scaled, 2)), axis=1)
        if self.kernel == 'cutoff':
            return np.array([self.cutoff_kernel(i) for i in range(self.numSample)])
        raise ValueError(
            "unknown kernel %r, expected 'gaussian' or 'cutoff'" % (self.kernel,))

    def gaussian_kernel(self, i):
        """
        Gaussian-kernel density of a single instance.

        :param i: instance index
        :return: sum of exp(-(d / dc)^2) over row i of the distance matrix
        """
        tempDensity = 0
        for tempDistance in self.distance_m[i]:
            tempDensity += np.exp(-(tempDistance / self.dc_f) ** 2)
        return tempDensity

    def cutoff_kernel(self, i):
        """
        Cut-off-kernel density of a single instance.

        :param i: instance index
        :return: number of entries in row i that are smaller than dc
        """
        tempDensity = 0
        for tempDistance in self.distance_m[i]:
            tempDensity += self.F(tempDistance - self.dc_f)
        return tempDensity

    def F(self, x):
        """
        Indicator used by the cut-off kernel.

        :param x: distance minus dc
        :return: 1 if x is negative, otherwise 0
        """
        return 1 if x < 0 else 0

    def computeDistanceToMaster(self):
        """
        Determine every instance's master and the distance to it.

        The master of an instance is the nearest instance among those that
        precede it in descending density order.  The densest instance has no
        master: it is stored with master -1 and is given the largest pairwise
        distance as its distance, which gives it the highest priority.
        """
        order = np.argsort(self.densities_l)[::-1]
        self.densityOrder_l = order

        self.distanceToMaster_l = np.zeros(self.numSample)
        self.masters_l = np.zeros(self.numSample, dtype=int)
        self.distanceToMaster_l[order[0]] = self.maxDistance
        self.masters_l[order[0]] = -1

        for rank in range(1, self.numSample):
            current = order[rank]
            # Only instances ranked before the current one are candidates.
            denser = order[:rank]
            candidateDistances = self.distance_m[current][denser]
            nearest = np.argmin(candidateDistances)
            self.masters_l[current] = denser[nearest]
            self.distanceToMaster_l[current] = candidateDistances[nearest]

    def computePriority(self):
        """
        Compute the representativeness (priority) of every instance as
        density * distance to master.
        """
        self.representativeness_l = np.multiply(self.densities_l, self.distanceToMaster_l)

    def getLabel(self, i):
        """
        Resolve the cluster label of an instance.

        Labelled instances carry a negative label.  An instance without one
        takes the label of the first labelled instance found by following the
        master links.

        :param i: instance index
        :return: the (negative) cluster label
        :raises ValueError: if the master chain ends without reaching a
            labelled instance
        """
        while self.label_l[i] >= 0:
            master = self.masters_l[i]
            if master < 0:
                # -1 marks "no master"; using it as an index would silently
                # read the last instance instead.
                raise ValueError("instance %d has no labelled master" % (i,))
            i = master
        return self.label_l[i]

    def _topRepresentatives(self, n):
        """
        Indices of the n most representative instances, best first.

        Reversing before slicing makes n == 0 yield an empty selection; the
        slice ``[-n:]`` would return every instance in that case.
        """
        return np.argsort(self.representativeness_l)[::-1][:max(n, 0)]

    def getClusterCenter(self):
        """
        Select the cluster centers according to the configured center ratio.

        :return: indices of the ``int(numSample * clusterNumRatio)`` most
            representative instances, best first (empty if that count is 0)
        """
        n = int(self.numSample * self.clusterCenterRatio_f)
        return self._topRepresentatives(n)

    def cluster(self, n=3):
        """
        Cluster the instances around the n most representative ones.

        Fills ``label_l`` (center number ``r`` and its members get label
        ``-r - 1``), ``clusterCenter_l`` and ``clusters_d``.

        :param n: number of cluster centers; pass ``None`` to derive it from
            the center ratio as ``int(numSample * clusterNumRatio)``
        :raises ValueError: if fewer than one center is requested
        """
        if n is None:
            n = int(self.numSample * self.clusterCenterRatio_f)
        if n < 1:
            raise ValueError("at least one cluster center is required, got n=%r" % (n,))

        self.label_l = np.zeros(self.numSample, dtype=int)
        self.clusterCenter_l = self._topRepresentatives(n)
        # The cluster number doubles as the label; labels are negative so
        # that 0 can mean "not labelled yet".
        for rank, center in enumerate(self.clusterCenter_l):
            self.label_l[center] = -rank - 1

        # Walk in descending density order: a master always precedes its
        # followers there, so its label is final when a follower needs it.
        for idx in self.densityOrder_l:
            if self.label_l[idx] < 0:
                continue
            master = self.masters_l[idx]
            if master < 0:
                # The densest instance was not chosen as a center (this should
                # only occur on representativeness ties); attach it to the
                # nearest center.
                centerDistances = self.distance_m[idx][self.clusterCenter_l]
                master = self.clusterCenter_l[np.argmin(centerDistances)]
            self.label_l[idx] = self.label_l[master]

        # Group the instance indices by label, keyed in center order.
        self.clusters_d = {label: [] for label in self.label_l[self.clusterCenter_l]}
        for idx, label in enumerate(self.label_l):
            self.clusters_d[label].append(idx)

    @staticmethod
    def getDistanceByEuclid(instance1, instance2):
        """
        Euclidean distance between two instances.

        :param instance1: first instance (indexable sequence of numbers)
        :param instance2: second instance, at least as long as instance1
        :return: the Euclidean distance
        """
        squared = sum(
            (float(instance1[key]) - float(instance2[key])) ** 2
            for key in range(len(instance1))
        )
        return squared ** 0.5


if __name__ == '__main__':
    # Demo: cluster the iris data set.
    dataset = pd.read_csv('dataset/iris.csv')

    # Columns 1-4 of the table are used as the feature vectors.
    data = dataset.to_numpy()[:, 1:5]
    # Pairwise Euclidean distance matrix.
    distanceMatrix = np.array([
        [DensityPeak.getDistanceByEuclid(rowA, rowB) for rowB in data]
        for rowA in data
    ])
    # dcType has to be passed by keyword: the third positional parameter of
    # DensityPeak is clusterNumRatio, so a positional "max" would end up there.
    dp = DensityPeak(distanceMatrix, 0.2, dcType="max")
    dp.cluster()
    for key, value in dp.clusters_d.items():
        print("簇号=", key, ",clustet= ", value)
        print("簇长度=", len(value))

3.SMDP类

SMDP核心代码

# Coding:utf-8
# @Time:2022/6/24,12:17
# @Author:zhang
# @file:SMDP.py
# @Software:PyCharm

import os
import numpy as np
import utils
from sklearn.metrics import euclidean_distances as eucl
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from DensityPeak import DensityPeak


class SMDP:
    """
    Multi-instance classifier built on density-peak key bags.

    Every bag is represented by the vector of its distances to the key bags
    (the cluster centers selected by ``DensityPeak``); these vectors are then
    classified with a 3-nearest-neighbour classifier.
    """

    def __init__(self, datasetPath, dcRatio, bagRatio, distanceType="ave", kernel="gaussian"):
        """
        Load the data set and precompute the bag representation.

        :param datasetPath: path of the data set file
        :param dcRatio: radius ratio handed to DensityPeak
        :param bagRatio: fraction of the bags used as key bags
        :param distanceType: bag-distance measure; only 'ave' (average
            Hausdorff distance) is implemented
        :param kernel: density kernel handed to DensityPeak
        """
        self.datasetPath = datasetPath
        # Load the bags and their labels.
        self.bags_l, self.labels_l = utils.load_data(self.datasetPath)
        self.numBags = len(self.bags_l)
        self.distanceType = distanceType
        self.kernel = kernel
        self.dcRatio = dcRatio
        self.keyBagRatio = bagRatio

        # Bag-to-bag distance matrix.
        self.distanceMatrix = self.getDistanceMatrix()
        # Density-peak analysis; the kernel is forwarded so that the
        # constructor argument actually takes effect.
        dp = DensityPeak(self.distanceMatrix, dcRatio=dcRatio,
                         clusterNumRatio=self.keyBagRatio, kernel=self.kernel)
        # The cluster centers are the key bags.
        center_idx = dp.getClusterCenter()
        # Row b holds the distances from bag b to every key bag.
        self.vectors = self.distanceMatrix[:, center_idx]

    def getDistanceMatrix(self):
        """
        Return the bag-to-bag distance matrix, using an on-disk cache.

        A previously saved matrix is loaded if its file exists; otherwise the
        matrix is computed and saved for later runs.

        :return: array of shape (numBags, numBags)
        :raises ValueError: if the matrix has to be computed and
            ``distanceType`` is not 'ave'
        """
        # Cache file, e.g. "distance/musk1+_ave.npy".
        distanceMatrixSavePath = 'distance/' + self.datasetPath.split('/')[-1].split('.')[
            0] + '_' + self.distanceType + '.npy'

        if os.path.exists(distanceMatrixSavePath):
            print('load computed distance matrix...')
            return np.load(distanceMatrixSavePath)

        # Refuse unknown measures instead of caching an all-zero matrix.
        if self.distanceType != 'ave':
            raise ValueError(
                "unsupported distanceType %r, only 'ave' is implemented" % (self.distanceType,))

        print('computing distance matrix...')
        distanceMatrix = np.zeros((self.numBags, self.numBags))
        for i in tqdm(range(self.numBags), desc='computing'):
            # The measure is symmetric, so only the upper triangle is computed.
            for j in range(i, self.numBags):
                distanceMatrix[i, j] = distanceMatrix[j, i] = self.ave_Hausdorff(i, j)

        # np.save does not create missing directories.
        os.makedirs('distance', exist_ok=True)
        np.save(distanceMatrixSavePath, distanceMatrix)
        return distanceMatrix

    def ave_Hausdorff(self, i, j):
        """
        Average Hausdorff distance between two bags.

        For every instance of either bag the distance to the nearest instance
        of the other bag is taken; the result is the mean of all those values.

        :param i: index of the first bag
        :param j: index of the second bag
        :return: the distance (0 when i == j)
        """
        if i == j:
            return 0
        # One pairwise matrix serves both directions: row minima are the
        # nearest-neighbour distances for bag i, column minima those for bag j.
        pairwise = eucl(self.bags_l[i], self.bags_l[j])
        tempDis1 = np.sum(np.min(pairwise, axis=1))
        tempDis2 = np.sum(np.min(pairwise, axis=0))
        return (tempDis1 + tempDis2) / (self.bags_l[i].shape[0] + self.bags_l[j].shape[0])

    def run(self, train_idx, test_idx):
        """
        Train on one split and evaluate on the other.

        :param train_idx: indices of the training bags
        :param test_idx: indices of the test bags
        :return: accuracy on the test bags, in percent
        """
        train_vector = self.vectors[train_idx]
        test_vector = self.vectors[test_idx]

        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(train_vector, self.labels_l[train_idx])
        predicts = knn.predict(test_vector)
        accuracy = np.sum(predicts == self.labels_l[test_idx]) / len(test_idx)
        return accuracy * 100

    def one_cv(self, k):
        """
        Run a single k-fold cross-validation.

        :param k: number of folds
        :return: mean accuracy over the k folds, in percent
        """
        trainset_idx_l, testset_idx_l = utils.get_index(self.numBags, k)
        accuracy_l = [self.run(trainset_idx_l[i], testset_idx_l[i]) for i in range(k)]
        return np.mean(accuracy_l)

    def n_cv(self, n, k):
        """
        Run k-fold cross-validation n times.

        :param n: number of repetitions
        :param k: number of folds
        :return: ``(mean, std)`` of the n cross-validation accuracies; the
            standard deviation uses ddof=1 and is therefore undefined (NaN)
            for n < 2
        """
        accuracy_l = [self.one_cv(k) for _ in range(n)]
        return float(np.mean(accuracy_l)), float(np.std(accuracy_l, ddof=1))


if __name__ == '__main__':
    # Experiment configuration.
    path = 'Benchmark/musk1+.mat'
    measure, kernel = 'ave', 'gaussian'
    r = 0.2  # radius ratio (dcRatio)
    u = 0.6  # key-bag ratio (bagRatio)
    k = 10  # folds per cross-validation
    n = 10  # number of cross-validation repetitions

    smdp = SMDP(path, dcRatio=r, bagRatio=u, distanceType=measure, kernel=kernel)
    acc, std = smdp.n_cv(n, k)

    # Report: data set file name, then accuracy with the std as a subscript.
    print(path.split('/')[-1])
    print('$%.1f_{%.1f}$' % (acc, std))
    print(smdp.bags_l[1].shape)

4.运行测试

使用musk1+数据集进行测试
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值