SMDP Code Implementation
Date: 2022/6/29
1. The utils module
Mainly provides dataset loading and the train/test splits used for cross validation.
import numpy as np
from scipy.io import loadmat


def get_index(num_bags=92, para_k=10, seed=None):
    '''
    Generate training and test index splits for k-fold cross validation.
    :param num_bags: number of bags
    :param para_k: number of folds k
    :param seed: random seed
    :return: dict of training indices, dict of test indices (keyed by fold)
    '''
    if seed is not None:
        np.random.seed(seed)
    temp_rand_idx = np.random.permutation(num_bags)
    temp_fold = int(np.ceil(num_bags / para_k))
    ret_tr_idx = {}
    ret_te_idx = {}
    for i in range(para_k):
        temp_tr_idx = temp_rand_idx[0: i * temp_fold].tolist()
        temp_tr_idx.extend(temp_rand_idx[(i + 1) * temp_fold:])
        ret_tr_idx[i] = temp_tr_idx
        ret_te_idx[i] = temp_rand_idx[i * temp_fold: (i + 1) * temp_fold].tolist()
    return ret_tr_idx, ret_te_idx


def load_data(path='Benchmark/musk1+.mat'):
    '''
    Load a dataset that contains both bag labels and instance labels.
    :param path: path to the .mat file
    :return: bags, bag labels
    '''
    data = loadmat(path)['data']
    bags, labels = [], []
    for i in range(len(data)):
        bags.append(data[i, 0][:, :-1])
        labels.append(data[i, 1][0, 0])
    labels = np.array(labels)
    return bags, labels


if __name__ == '__main__':
    load_data('../MILframe/data/benchmark/musk1+.mat')
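As a quick sanity check (not part of the original script), get_index can be exercised on its own. These lines could be appended to the __main__ block above; the bag count of 92 matches musk1+ but is only used for illustration:

    # Illustrative only: one 10-fold split over 92 bags.
    tr_idx, te_idx = get_index(num_bags=92, para_k=10, seed=42)
    print(len(tr_idx[0]), len(te_idx[0]))  # fold 0: 82 training bags, 10 test bags
    for fold in range(10):
        # training and test indices of each fold never overlap
        assert set(tr_idx[fold]).isdisjoint(te_idx[fold])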
2. The DensityPeak class
Provides the density peak clustering functionality.
# Coding:utf-8
# @Time:2022/6/23,18:46
# @Author:zhang
# @file:DensityPeak.py
# @Software:PyCharm
import numpy as np
import pandas as pd


class DensityPeak:
    """
    Density peak clustering algorithm.
    """

    def __init__(self, distanceMatrix, dcRatio=0.2, clusterNumRatio=0.05, dcType="max", kernel="gaussian"):
        '''
        Constructor, initialize the related parameters.
        :param distanceMatrix: distance matrix of the dataset
        :param dcRatio: cutoff-distance ratio, usually 0.2
        :param clusterNumRatio: fraction of samples used as cluster centers
        :param dcType: how the base radius is computed: 'max', 'ave' or 'min' Hausdorff distance
        :param kernel: density kernel, either 'cutoff' or 'gaussian'
        '''
        # pairwise distance matrix between instances
        self.distance_m = distanceMatrix
        # cutoff-distance ratio
        self.dcRatio_f = dcRatio
        # radius type
        self.dcType = dcType
        # density kernel
        self.kernel = kernel
        # fraction of samples taken as cluster centers
        self.clusterCenterRatio_f = clusterNumRatio
        # density of every instance
        self.densities_l = []
        # master of every instance
        self.masters_l = []
        # distance from every instance to its master
        self.distanceToMaster_l = []
        # representativeness of every instance
        self.representativeness_l = []
        # cluster centers
        self.clusterCenter_l = []
        # number of instances
        self.numSample = 0
        # cutoff distance dc
        self.dc_f = 0
        # maximum pairwise distance in the dataset
        self.maxDistance = 0
        # cluster labels
        self.label_l = []
        # clusters: a dict mapping cluster label -> list of member indices
        self.clusters_d = {}
        self.__initDensityPeak()

    def __initDensityPeak(self):
        '''
        Initialization.
        :return:
        '''
        # number of instances
        self.numSample = len(self.distance_m)
        # maximum pairwise distance
        self.maxDistance = self.getMaxDistance()
        # cutoff distance dc
        self.dc_f = self.getDc()
        # densities
        self.densities_l = self.computeDensities()
        # distance from each instance to its master
        self.computeDistanceToMaster()
        # representativeness of each instance
        self.computePriority()

    def getDc(self):
        '''
        Compute the cutoff distance dc.
        :return:
        '''
        resultDc = 0.0
        match self.dcType:
            case "max":
                # maximum Hausdorff distance
                resultDc = self.maxDistance
            case "ave":
                # average Hausdorff distance
                resultDc = np.mean(self.distance_m)
            case "min":
                # minimum Hausdorff distance
                resultDc = np.min(self.distance_m)
        return resultDc * self.dcRatio_f

    def getMaxDistance(self):
        '''
        Compute the maximum pairwise distance.
        :return:
        '''
        return np.max(self.distance_m)

    def computeDensities(self):
        '''
        Compute the densities with the chosen kernel.
        :return:
        '''
        # Gaussian kernel
        if self.kernel == 'gaussian':
            # Option 1: plain loop
            # temp_local_density_list = []
            # for i in range(0, self.numSample):
            #     temp_local_density_list.append(self.gaussian_kernel(i))
            # Option 2: vectorized matrix computation
            temp_local_density_list = np.sum(1 / (np.exp(np.power(self.distance_m / self.dc_f, 2))), axis=1)
            return temp_local_density_list
        # cutoff kernel
        elif self.kernel == 'cutoff':
            temp_local_density_list = []
            for i in range(0, self.numSample):
                temp_local_density_list.append(self.cutoff_kernel(i))
            return temp_local_density_list

    def gaussian_kernel(self, i):
        '''
        Density of instance i under the Gaussian kernel.
        :param i: instance index
        :return: density
        '''
        tempDensity = 0
        for j in range(len(self.distance_m[i])):
            tempDistance = self.distance_m[i][j]
            tempDensity += np.exp(-(tempDistance / self.dc_f) ** 2)
        return tempDensity

    def cutoff_kernel(self, i):
        '''
        Density of instance i under the cutoff kernel.
        :param i: instance index
        :return: density
        '''
        tempDensity = 0
        for j in range(len(self.distance_m[i])):
            tempDistance = self.distance_m[i][j]
            tempDensity += self.F(tempDistance - self.dc_f)
        return tempDensity

    def F(self, x):
        '''
        Helper function for the cutoff kernel.
        :param x: distance difference
        :return:
        '''
        if x < 0:
            return 1
        else:
            return 0

    def computeDistanceToMaster(self):
        '''
        Compute the distance from each instance to its master and record the master.
        :return:
        '''
        # indices of the densities sorted in descending order
        tempSortDensityIndex = np.argsort(self.densities_l)[::-1]
        # initialize the distance vector
        self.distanceToMaster_l = np.zeros(self.numSample)
        # the densest instance gets the maximum distance (highest priority)
        self.distanceToMaster_l[tempSortDensityIndex[0]] = self.maxDistance
        # initialize the master vector
        self.masters_l = np.zeros(self.numSample, dtype=int)
        # the densest instance is its own master
        self.masters_l[tempSortDensityIndex[0]] = -1
        # for every other instance, the master is the nearest instance with higher density
        for i in range(1, self.numSample):
            tempIndex = tempSortDensityIndex[i]
            self.masters_l[tempIndex] = tempSortDensityIndex[
                np.argmin(self.distance_m[tempIndex][tempSortDensityIndex[:i]])]
            self.distanceToMaster_l[tempIndex] = np.min(self.distance_m[tempIndex][tempSortDensityIndex[:i]])

    def computePriority(self):
        '''
        Compute the representativeness (priority).
        :return:
        '''
        self.representativeness_l = np.multiply(self.densities_l, self.distanceToMaster_l)

    def getLabel(self, i):
        '''
        Get the cluster label of an instance.
        :param i: instance index
        :return: cluster label of the instance
        '''
        if self.label_l[i] < 0:
            return self.label_l[i]
        else:
            # the instance has no label yet, so it inherits the label of its master,
            # i.e. the cluster number of its cluster center
            return self.getLabel(self.masters_l[i])

    def getClusterCenter(self):
        '''
        Return the indices of the cluster centers, i.e. the instances with the
        highest representativeness, according to clusterCenterRatio_f.
        '''
        n = int(self.numSample * self.clusterCenterRatio_f)
        return np.argsort(self.representativeness_l)[-n:][::-1]

    def cluster(self, n=None):
        '''
        Cluster with the given number of cluster centers.
        If n is not given, it is derived from the cluster-center ratio.
        :param n: number of cluster centers
        :return:
        '''
        if n is None:
            n = int(self.numSample * self.clusterCenterRatio_f)
        # initialize the label vector
        self.label_l = np.zeros(self.numSample, dtype=int)
        # initialize the cluster centers
        self.clusterCenter_l = np.argsort(self.representativeness_l)[-n:][::-1]
        # give every center a negative cluster number, which serves as the cluster label
        for i in range(n):
            self.label_l[self.clusterCenter_l[i]] = -i - 1
        # propagate the labels from the centers to all other instances
        for i in range(self.numSample):
            if self.label_l[i] < 0:
                continue
            self.label_l[i] = self.getLabel(self.masters_l[i])
        # initialize the clusters
        self.clusters_d = {key: [] for key in self.label_l[self.clusterCenter_l]}
        # group the instances by cluster label
        for i in self.label_l[self.clusterCenter_l]:
            self.clusters_d[i] += [j for j in range(self.numSample) if self.label_l[j] == i]

    @staticmethod
    def getDistanceByEuclid(instance1, instance2):
        '''
        Euclidean distance between two instances.
        :param instance1: instance 1
        :param instance2: instance 2
        :return: Euclidean distance
        '''
        dist = 0
        for key in range(len(instance1)):
            dist += (float(instance1[key]) - float(instance2[key])) ** 2
        return dist ** 0.5


if __name__ == '__main__':
    # test with the iris dataset
    dataset = pd.read_csv('dataset/iris.csv')
    distanceMatrix = []
    data = dataset.to_numpy()[:, 1:5]
    # compute the distance matrix
    for i in range(len(data)):
        tempdistances_l = [DensityPeak.getDistanceByEuclid(data[i], data[j]) for j in range(len(data))]
        distanceMatrix.append(tempdistances_l)
    distanceMatrix = np.array(distanceMatrix)
    # cluster with 3 centers; dcRatio and dcType are passed as keyword arguments
    dp = DensityPeak(distanceMatrix, dcRatio=0.2, dcType="max")
    dp.cluster(n=3)
    for key, value in dp.clusters_d.items():
        print("cluster label =", key, ", cluster =", value)
        print("cluster size =", len(value))
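SMDP itself never calls cluster(); it only asks the fitted DensityPeak object for the most representative instances via getClusterCenter(). The following lines are not part of the original code, but if appended to the iris demo above they show what that call returns:

    # Illustrative only: the indices SMDP would treat as key bags, ordered by
    # decreasing representativeness (density multiplied by distance to master).
    centers = dp.getClusterCenter()
    print("cluster centers:", centers)
    print("their representativeness:", dp.representativeness_l[centers])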
3. The SMDP class
Core SMDP code.
# Coding:utf-8
# @Time:2022/6/24,12:17
# @Author:zhang
# @file:SMDP.py
# @Software:PyCharm
import os
import numpy as np
import utils
from sklearn.metrics import euclidean_distances as eucl
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from DensityPeak import DensityPeak


class SMDP:
    def __init__(self, datasetPath, dcRatio, bagRatio, distanceType="ave", kernel="gaussian"):
        '''
        Initialization.
        :param datasetPath: path to the dataset
        :param dcRatio: cutoff-distance ratio
        :param bagRatio: fraction of bags kept as key bags
        :param distanceType: type of bag-level distance
        :param kernel: density kernel
        '''
        self.datasetPath = datasetPath
        # load the dataset
        self.bags_l, self.labels_l = utils.load_data(self.datasetPath)
        self.numBags = len(self.bags_l)
        self.distanceType = distanceType
        self.kernel = kernel
        self.dcRatio = dcRatio
        self.keyBagRatio = bagRatio
        # compute the bag-level distance matrix of the dataset
        self.distanceMatrix = self.getDistanceMatrix()
        # run density peak computation
        dp = DensityPeak(self.distanceMatrix, dcRatio=dcRatio, clusterNumRatio=self.keyBagRatio)
        # the cluster centers are the key bags
        center_idx = dp.getClusterCenter()
        # map every bag to the vector of its distances to the key bags
        self.vectors = self.distanceMatrix[:, center_idx]

    def getDistanceMatrix(self):
        '''
        Compute the bag-level distance matrix of the dataset.
        :return:
        '''
        # path where the distance matrix is cached, like "distance/musk1+_ave.npy"
        distanceMatrixSavePath = 'distance/' + self.datasetPath.split('/')[-1].split('.')[
            0] + '_' + self.distanceType + '.npy'
        if os.path.exists(distanceMatrixSavePath):
            print('load computed distance matrix...')
            distanceMatrix = np.load(distanceMatrixSavePath)
            return distanceMatrix
        # compute the bag-level distances with the average Hausdorff distance
        print('computing distance matrix...')
        distanceMatrix = np.zeros((self.numBags, self.numBags))
        if self.distanceType == 'ave':
            for i in tqdm(range(self.numBags), desc='computing'):
                for j in range(i, self.numBags):
                    distanceMatrix[i, j] = distanceMatrix[j, i] = self.ave_Hausdorff(i, j)
        # make sure the cache directory exists before saving
        os.makedirs(os.path.dirname(distanceMatrixSavePath), exist_ok=True)
        np.save(distanceMatrixSavePath, distanceMatrix)
        return distanceMatrix

    def ave_Hausdorff(self, i, j):
        '''
        Average Hausdorff distance between two bags.
        :param i: index of bag i
        :param j: index of bag j
        :return: distance
        '''
        if i == j:
            return 0
        tempDis1 = np.sum(np.min(eucl(self.bags_l[i], self.bags_l[j]), axis=1))
        tempDis2 = np.sum(np.min(eucl(self.bags_l[j], self.bags_l[i]), axis=1))
        return (tempDis1 + tempDis2) / (self.bags_l[i].shape[0] + self.bags_l[j].shape[0])

    def run(self, train_idx, test_idx):
        '''
        Run SMDP once.
        :param train_idx: indices of the training bags
        :param test_idx: indices of the test bags
        :return:
        '''
        train_vector = self.vectors[train_idx]
        test_vector = self.vectors[test_idx]
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(train_vector, self.labels_l[train_idx])
        predicts = knn.predict(test_vector)
        accuracy = np.sum(predicts == self.labels_l[test_idx]) / len(test_idx)
        return accuracy * 100

    def one_cv(self, k):
        '''
        One round of k-fold cross validation.
        :param k: number of folds
        :return: cross-validation accuracy
        '''
        trainset_idx_l, testset_idx_l = utils.get_index(self.numBags, k)
        accuracy_l = []
        for i in range(k):
            tempAcc = self.run(trainset_idx_l[i], testset_idx_l[i])
            accuracy_l.append(tempAcc)
        return np.mean(accuracy_l)

    def n_cv(self, n, k):
        '''
        n rounds of k-fold cross validation.
        :param n: number of rounds
        :param k: number of folds
        :return: mean accuracy and standard deviation
        '''
        accuracy_l = []
        for i in range(n):
            tempAcc = self.one_cv(k)
            accuracy_l.append(tempAcc)
        return float(np.mean(accuracy_l)), float(np.std(accuracy_l, ddof=1))


if __name__ == '__main__':
    path = 'Benchmark/musk1+.mat'
    measure = 'ave'
    kernel = 'gaussian'
    r, u = 0.2, 0.6
    k, n = 10, 10  # n rounds of k-fold CV
    smdp = SMDP(path, distanceType=measure, kernel=kernel, dcRatio=r, bagRatio=u)
    acc, std = smdp.n_cv(n, k)
    print(path.split('/')[-1])
    print('$%.1f_{%.1f}$' % (acc, std))
    print(smdp.bags_l[1].shape)
4. Running the test
Test on the musk1+ dataset.
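A minimal driver for this test could look like the sketch below. It simply mirrors the __main__ block of SMDP.py; the file name run_musk1.py, the dataset path and the parameter values are assumptions that have to match the local directory layout:

# run_musk1.py -- minimal sketch, assuming utils.py, DensityPeak.py and SMDP.py
# are importable and Benchmark/musk1+.mat exists locally.
from SMDP import SMDP

smdp = SMDP('Benchmark/musk1+.mat', dcRatio=0.2, bagRatio=0.6,
            distanceType='ave', kernel='gaussian')
# 10 rounds of 10-fold cross validation
acc, std = smdp.n_cv(10, 10)
print('musk1+ accuracy: %.1f +- %.1f' % (acc, std))

On the first run the average Hausdorff distance matrix is computed and cached under distance/; subsequent runs load it from there.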