import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler
class DPC(object):
    """Density Peaks Clustering with a K-nearest-neighbour based cutoff distance.

    The whole pipeline runs in the constructor; results are exposed as attributes:

    * ``distMatrix``  -- pairwise Euclidean distances between the rows of ``X``.
    * ``distCut``     -- cutoff distance (see ``_knn_cutoff``).
    * ``rho``         -- Gaussian-kernel density of every sample (self term included).
    * ``delta``       -- distance to the nearest sample ranked higher in density;
                         for the densest sample, its largest distance to any sample.
    * ``leader``      -- index of that nearest denser sample (-1 for the densest one).
    * ``gamma``       -- ``rho * delta``, the score used to pick cluster centres.
    * ``order_rho`` / ``order_gamma`` -- sample indices by decreasing rho / gamma.
    * ``clusterIndex`` -- cluster label of every sample.
    """

    def __init__(self, X, clusterNum, numK):
        """Cluster the rows of ``X``.

        Args:
            X: array-like with a ``shape`` attribute; ``shape[0]`` is taken as the
                number of samples and ``X`` is handed to ``pairwise_distances``.
            clusterNum: number of cluster centres (samples with the largest gamma).
            numK: neighbour rank used when estimating the cutoff distance.

        Raises:
            IndexError: if ``numK + 1`` is not a valid column of the distance
                matrix, or if ``clusterNum`` exceeds the number of samples.
        """
        self.X = X
        self.nSample = X.shape[0]
        self.clusterNum = clusterNum
        self.numK = numK
        self.delta = np.zeros(self.nSample)
        self.leader = np.full(self.nSample, -1, dtype=int)
        self.distMatrix = pairwise_distances(X, metric="euclidean")
        self.distCut = self._knn_cutoff()
        self.rho = np.sum(np.exp(-(self.distMatrix / self.distCut) ** 2), axis=1)
        self.order_rho = np.flipud(np.argsort(self.rho))
        self._compute_delta_and_leader()
        self.gamma = self.rho * self.delta
        self.order_gamma = np.flipud(np.argsort(self.gamma))
        self.clusterIndex = self._assign_labels()

    def _knn_cutoff(self):
        """Return mean + sample standard deviation of the per-sample KNN distance."""
        # Each sorted row starts with the zero self-distance, so column
        # ``numK + 1`` is the distance to the (numK + 1)-th nearest neighbour.
        # NOTE(review): confirm whether column ``numK`` (the numK-th neighbour)
        # was intended; the original index is kept here to preserve results.
        knn_dist = np.sort(self.distMatrix, axis=1)[:, self.numK + 1]
        return np.mean(knn_dist) + np.std(knn_dist, ddof=1)

    def _compute_delta_and_leader(self):
        """Fill ``delta`` and ``leader`` by walking the samples in density order."""
        order = self.order_rho
        densest = order[0]
        # The densest sample has no denser neighbour: use its largest distance.
        self.delta[densest] = np.max(self.distMatrix[densest, :])
        self.leader[densest] = -1
        for rank in range(1, self.nSample):
            point = order[rank]
            denser = order[:rank]
            # argmin returns the first minimum, i.e. the densest of tied candidates.
            nearest = denser[np.argmin(self.distMatrix[point, denser])]
            self.delta[point] = self.distMatrix[point, nearest]
            self.leader[point] = nearest

    def _assign_labels(self):
        """Label the centres, then let every other sample inherit its leader's label."""
        labels = np.full(self.nSample, -1, dtype=int)
        for label in range(self.clusterNum):
            labels[self.order_gamma[label]] = label
        centres = self.order_gamma[:self.clusterNum]
        # Density order guarantees a leader is labelled before its followers.
        for point in self.order_rho:
            if labels[point] != -1:
                continue
            parent = self.leader[point]
            if parent != -1:
                labels[point] = labels[parent]
            elif centres.size:
                # Densest sample that was not picked as a centre: indexing with
                # its leader (-1) would silently read the LAST sample's label,
                # so attach it to the nearest centre instead.
                nearest = centres[np.argmin(self.distMatrix[point, centres])]
                labels[point] = labels[nearest]
        return labels
if __name__ == '__main__':
    # Demo: cluster a 2-D benchmark data set and colour the points by label.
    # Other files tried with this script:
    #   r'D:\ExperimentalData\R15\R15.csv'
    #   r'D:\ExperimentalData\Jain\Jain.csv'
    csv_path = r'D:\ExperimentalData\Aggregation\aggregation.csv'
    data = np.array(pd.read_csv(csv_path, header=None))
    # Last column is split off as y; the remaining columns are the features.
    X, y = data[:, :-1], data[:, -1]
    model = DPC(X, clusterNum=21, numK=16)
    cluster_id = model.clusterIndex
    plt.scatter(X[:, 0], X[:, 1], c=cluster_id)
    plt.show()
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler
class DPC(object):
    """Density Peaks Clustering with a percentage-based cutoff distance.

    The whole pipeline runs in the constructor; results are exposed as attributes:

    * ``distMatrix``  -- pairwise Euclidean distances between the rows of ``X``.
    * ``distCut``     -- ``distPercent`` percent of the largest pairwise distance.
    * ``rho``         -- Gaussian-kernel density of every sample (self term included).
    * ``delta``       -- distance to the nearest sample ranked higher in density;
                         for the densest sample, its largest distance to any sample.
    * ``leader``      -- index of that nearest denser sample (-1 for the densest one).
    * ``gamma``       -- ``rho * delta``, the score used to pick cluster centres.
    * ``order_rho`` / ``order_gamma`` -- sample indices by decreasing rho / gamma.
    * ``clusterIndex`` -- cluster label of every sample.
    """

    def __init__(self, X, clusterNum, distPercent):
        """Cluster the rows of ``X``.

        Args:
            X: array-like with a ``shape`` attribute; ``shape[0]`` is taken as the
                number of samples and ``X`` is handed to ``pairwise_distances``.
            clusterNum: number of cluster centres (samples with the largest gamma).
            distPercent: cutoff distance as a percentage of the largest distance.

        Raises:
            IndexError: if ``clusterNum`` exceeds the number of samples.
        """
        self.X = X
        self.nSample = X.shape[0]
        self.clusterNum = clusterNum
        self.distPercent = distPercent
        self.delta = np.zeros(self.nSample)
        self.leader = np.full(self.nSample, -1, dtype=int)
        self.distMatrix = pairwise_distances(X, metric="euclidean")
        self.distCut = np.max(self.distMatrix) * (self.distPercent / 100)
        self.rho = np.sum(np.exp(-(self.distMatrix / self.distCut) ** 2), axis=1)
        self.order_rho = np.flipud(np.argsort(self.rho))
        self._compute_delta_and_leader()
        self.gamma = self.rho * self.delta
        self.order_gamma = np.flipud(np.argsort(self.gamma))
        self.clusterIndex = self._assign_labels()

    def _compute_delta_and_leader(self):
        """Fill ``delta`` and ``leader`` by walking the samples in density order."""
        order = self.order_rho
        densest = order[0]
        # The densest sample has no denser neighbour: use its largest distance.
        self.delta[densest] = np.max(self.distMatrix[densest, :])
        self.leader[densest] = -1
        for rank in range(1, self.nSample):
            point = order[rank]
            denser = order[:rank]
            # argmin returns the first minimum, i.e. the densest of tied candidates.
            nearest = denser[np.argmin(self.distMatrix[point, denser])]
            self.delta[point] = self.distMatrix[point, nearest]
            self.leader[point] = nearest

    def _assign_labels(self):
        """Label the centres, then let every other sample inherit its leader's label."""
        labels = np.full(self.nSample, -1, dtype=int)
        for label in range(self.clusterNum):
            labels[self.order_gamma[label]] = label
        centres = self.order_gamma[:self.clusterNum]
        # Density order guarantees a leader is labelled before its followers.
        for point in self.order_rho:
            if labels[point] != -1:
                continue
            parent = self.leader[point]
            if parent != -1:
                labels[point] = labels[parent]
            elif centres.size:
                # Densest sample that was not picked as a centre: indexing with
                # its leader (-1) would silently read the LAST sample's label,
                # so attach it to the nearest centre instead.
                nearest = centres[np.argmin(self.distMatrix[point, centres])]
                labels[point] = labels[nearest]
        return labels
if __name__ == '__main__':
    # Demo: cluster a 2-D benchmark data set and colour the points by label.
    # Other files tried with this script:
    #   r'D:\ExperimentalData\Aggregation\aggregation.csv'
    #   r'D:\ExperimentalData\R15\R15.csv'
    csv_path = r'D:\ExperimentalData\Jain\Jain.csv'
    data = np.array(pd.read_csv(csv_path, header=None))
    # Last column is split off as y; the remaining columns are the features.
    X, y = data[:, :-1], data[:, -1]
    model = DPC(X, clusterNum=2, distPercent=2)
    cluster_id = model.clusterIndex
    plt.scatter(X[:, 0], X[:, 1], c=cluster_id)
    plt.show()
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from scipy.spatial.distance import pdist,squareform
from collections import OrderedDict
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.io as scio
############%%%%%%-------------------%%%%%%%%%############
############%%%%%%-------------------%%%%%%%%%############
# -------- Cutoff distance --------
def getDistCut(distList,distPercent):
    """Return the cutoff distance: ``distPercent`` percent of the largest distance.

    Args:
        distList: non-empty sequence or array of pairwise distances.
        distPercent: percentage (0-100 scale) of the maximum distance to use.

    Returns:
        ``max(distList) * distPercent / 100``.

    Raises:
        ValueError: if ``distList`` is empty.
    """
    distances = np.asarray(distList, dtype=float)
    if distances.size == 0:
        raise ValueError("distList is empty: at least two samples are needed "
                         "to derive a cutoff distance")
    # np.max reduces in native code; builtin max() walks the array in Python.
    return np.max(distances) * distPercent / 100
# -------- Local density --------
def getRho(n,distMatrix,distCut):
    """Return the cutoff-kernel density of the first ``n`` samples.

    A pair ``(i, j)`` with ``i < j`` and ``distMatrix[i, j] < distCut`` adds one
    to the density of both samples. Only entries above the diagonal are read.

    Args:
        n: number of samples.
        distMatrix: square array of pairwise distances.
        distCut: cutoff distance (strict ``<`` comparison).

    Returns:
        Float array of length ``n`` holding the neighbour counts.
    """
    # Upper-triangle mask of the close pairs; row sums credit sample i,
    # column sums credit sample j.
    close = np.triu(np.asarray(distMatrix)[:n, :n] < distCut, k=1)
    return (close.sum(axis=1) + close.sum(axis=0)).astype(float)
def getGammaOrderIndex(n,rho,distMatrix):
    """Compute delta, leader and gamma for every sample and rank the samples.

    Args:
        n: number of samples.
        rho: density of every sample (length ``n``).
        distMatrix: square array of pairwise distances (assumed finite).

    Returns:
        Tuple ``(rhoOrdIndex, gamma, gammaOrderIndex, leader)``:
        sample indices by decreasing density, ``delta * rho`` per sample,
        sample indices by decreasing gamma, and for every sample the index of
        its nearest denser sample (-1 for the densest sample).
    """
    rhoOrdIndex = np.flipud(np.argsort(rho))
    delta = np.zeros(n, dtype=float)
    leader = np.full(n, -1, dtype=int)
    if n > 0:
        # The densest sample has no denser neighbour: use its largest distance.
        densest = rhoOrdIndex[0]
        delta[densest] = np.max(distMatrix[densest, :n])
    for rank in range(1, n):
        point = rhoOrdIndex[rank]
        denser = rhoOrdIndex[:rank]
        # argmin returns the first minimum, i.e. the densest of tied candidates.
        nearest = denser[np.argmin(distMatrix[point, denser])]
        delta[point] = distMatrix[point, nearest]
        leader[point] = nearest
    gamma = delta * rho
    gammaOrderIndex = np.flipud(np.argsort(gamma))
    return rhoOrdIndex,gamma,gammaOrderIndex,leader
def getDPCA(n,rhoOrdIndex,gammaOrderIndex,leader,blockNum):
    """Assign every sample to a cluster and return the clusters as index lists.

    The first ``blockNum`` entries of ``gammaOrderIndex`` become the centres of
    clusters ``0 .. blockNum - 1``; every other sample, visited in density
    order, inherits the label of its leader.

    Args:
        n: number of samples.
        rhoOrdIndex: sample indices sorted by decreasing density.
        gammaOrderIndex: sample indices sorted by decreasing gamma.
        leader: for every sample, the index of its nearest denser sample
            (-1 when there is none).
        blockNum: number of clusters; expected to satisfy ``1 <= blockNum <= n``.

    Returns:
        ``OrderedDict`` mapping cluster label to the list of member indices.

    Raises:
        IndexError: if ``blockNum`` exceeds the length of ``gammaOrderIndex``.
        KeyError: if a sample ends up without a label (no centre available).
    """
    # ----------- Initialise the label of every sample -----------
    clusterIndex = np.full(n, -1, dtype=int)
    # ----------- Label the cluster centres -----------
    for label in range(blockNum):
        clusterIndex[gammaOrderIndex[label]] = label
    # ----------- Propagate labels in density order -----------
    for rank in range(n):
        point = rhoOrdIndex[rank]
        if clusterIndex[point] != -1:
            continue
        parent = leader[point]
        if parent != -1:
            clusterIndex[point] = clusterIndex[parent]
        else:
            # No denser sample and not a centre. Indexing with -1 would read
            # the LAST sample's label, so fall back to the densest sample that
            # already carries a label (no distances are available here).
            clusterIndex[point] = next(
                (clusterIndex[p] for p in rhoOrdIndex if clusterIndex[p] != -1),
                -1,
            )
    # ----------- Collect the members of every cluster -----------
    clusterSet = OrderedDict((label, []) for label in range(blockNum))
    for sample in range(n):
        clusterSet[clusterIndex[sample]].append(sample)
    return clusterSet
if __name__ == '__main__':
    # Demo: run the function-based pipeline on one data set and plot each
    # cluster with its own colour.
    #
    # Alternative inputs used with this script (each loaded the same way, with
    # X = data[:, :-1] and y = data[:, -1]):
    #   r'E:\dataset\ExperimentalData\1-Glass\glass.csv'
    #   r'E:\dataset\ExperimentalData\Spiral\spiral.csv'
    #   r'E:\dataset\ExperimentalData\Three blobs\ThreeBlobs.csv'
    #   r'E:\dataset\ExperimentalData\COIL-20\COIL20_PCA.csv'
    #   r'E:\dataset\ExperimentalData\Banknote\banknote.csv'
    #   r'E:\dataset\ExperimentalData\1-Semeion\semeion.csv'
    #       (features additionally reduced with PCA(0.9).fit_transform)
    #   r'E:\dataset\ExperimentalData\1-Robot Navigation(5456)\Robot_Navigation_24.csv'
    #   r'E:\dataset\ExperimentalData\Twonorm\twonorm.csv'
    #   r'E:\dataset\ExperimentalData\1-Electrical Grid Stability Simulated Data Data Set\ELectricalGrid.csv'
    #   r'E:\dataset\ExperimentalData\Pendigits\pendigits.csv'
    #   r'E:\dataset\ExperimentalData\HTRU2 Data Set\HTRU_2.csv'
    #   r'E:\dataset\ExperimentalData\Avila\avila.csv'
    #   r'E:\dataset\ExperimentalData\1-Dataset for Sensorless Drive Diagnosis(58509)\Sensorless_drive_diagnosis.csv'
    #   r'E:\dataset\ExperimentalData\Satlog(shuttle)\Satlog(shuttle).csv'
    #   fetch_mldata('MNIST original')  -> X = mnist['data'], y = mnist['target']
    #   fetch_covtype(return_X_y=True)
    csv_path = r'E:\dataset\ExperimentalData\1-Aggregation(788)\aggregation.csv'
    data = np.array(pd.read_csv(csv_path, header=None))
    X, y = data[:, :-1], data[:, -1]
    # ------------------ end of data loading ------------------
    n = X.shape[0]
    classNum = len(set(y))
    blockNum = 7
    # Condensed city-block distances, then their square form.
    distList = pdist(X, metric='cityblock')
    distMatrix = squareform(distList)
    distCut = getDistCut(distList, distPercent=7)
    rho = getRho(n, distMatrix, distCut)
    rhoOrdIndex, gamma, gammaOrderIndex, leader = getGammaOrderIndex(n, rho, distMatrix)
    clusterSet = getDPCA(n, rhoOrdIndex, gammaOrderIndex, leader, blockNum)
    budget = 50
    # One scatter call per cluster so each gets its own colour.
    for k, v in clusterSet.items():
        E = X[v]
        plt.scatter(E[:, 0], E[:, 1])
    plt.show()
# Variant: the same algorithm wrapped in a class.
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from scipy.spatial.distance import pdist, squareform
from collections import OrderedDict
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.io as scio
############%%%%%%-------------------%%%%%%%%%############
############%%%%%%-------------------%%%%%%%%%############
class DPC(object):
    """Density Peaks Clustering with a cutoff-kernel density.

    Call ``getDPC()`` to run the pipeline; intermediate results stay available
    as attributes (``distCut``, ``rho``, ``delta``, ``gamma``, ``leader``,
    ``clusterIdx``).
    """

    def __init__(self,X,clusterNum,distPercent):
        """Store the inputs and pre-compute the Euclidean distances.

        Args:
            X: array-like with a ``shape`` attribute; ``shape[0]`` is taken as
                the number of samples and ``X`` is handed to ``pdist``.
            clusterNum: number of cluster centres to pick.
            distPercent: cutoff distance as a percentage of the largest distance.
        """
        self.X = X
        self.N = X.shape[0]
        self.clusterNum = clusterNum
        self.distPercent = distPercent
        self.distCut = 0
        self.rho = np.zeros(self.N, dtype=float)
        self.delta = np.zeros(self.N, dtype=float)
        self.gamma = np.zeros(self.N, dtype=float)
        self.leader = np.full(self.N, -1, dtype=int)
        self.distList = pdist(self.X, metric='euclidean')
        self.distMatrix = squareform(self.distList)
        self.clusterIdx = np.full(self.N, -1, dtype=int)

    def getDistCut(self):
        """Return ``distPercent`` percent of the largest pairwise distance.

        Raises:
            ValueError: if there are no pairwise distances (fewer than 2 samples).
        """
        return np.max(self.distList) * self.distPercent / 100

    def getRho(self):
        """Update ``distCut`` and return, per sample, the number of other
        samples strictly closer than ``distCut``."""
        self.distCut = self.getDistCut()
        close = self.distMatrix < self.distCut
        # A sample is not its own neighbour.
        np.fill_diagonal(close, False)
        return close.sum(axis=1).astype(float)

    def getGammaOrderIndex(self):
        """Fill ``rho``, ``delta``, ``leader`` and ``gamma``.

        Returns:
            Tuple ``(gammaOrderIndex, rhoOrdIndex)``: sample indices sorted by
            decreasing gamma and by decreasing density.
        """
        self.rho = self.getRho()
        rhoOrdIndex = np.flipud(np.argsort(self.rho))
        # The densest sample has no denser neighbour: use its largest distance.
        densest = rhoOrdIndex[0]
        self.delta[densest] = np.max(self.distMatrix[densest, :])
        self.leader[densest] = -1
        for rank in range(1, self.N):
            point = rhoOrdIndex[rank]
            denser = rhoOrdIndex[:rank]
            # argmin returns the first minimum, i.e. the densest of tied candidates.
            nearest = denser[np.argmin(self.distMatrix[point, denser])]
            self.delta[point] = self.distMatrix[point, nearest]
            self.leader[point] = nearest
        self.gamma = self.delta * self.rho
        gammaOrderIndex = np.flipud(np.argsort(self.gamma))
        return gammaOrderIndex,rhoOrdIndex

    def getDPC(self):
        """Run the clustering.

        Returns:
            ``OrderedDict`` mapping each label ``0 .. clusterNum - 1`` to the
            list of its member indices; labels are also stored in ``clusterIdx``.
        """
        gammaOrderIndex,rhoOrdIndex = self.getGammaOrderIndex()
        # Start from a clean slate so a repeated call cannot reuse stale labels.
        self.clusterIdx[:] = -1
        # ----------- Label the cluster centres -----------
        for label in range(self.clusterNum):
            self.clusterIdx[gammaOrderIndex[label]] = label
        centres = gammaOrderIndex[:self.clusterNum]
        # ----------- Propagate labels in density order -----------
        for point in rhoOrdIndex:
            if self.clusterIdx[point] != -1:
                continue
            parent = self.leader[point]
            if parent != -1:
                self.clusterIdx[point] = self.clusterIdx[parent]
            elif centres.size:
                # Densest sample that was not picked as a centre: indexing with
                # its leader (-1) would silently read the LAST sample's label,
                # so attach it to the nearest centre instead.
                nearest = centres[np.argmin(self.distMatrix[point, centres])]
                self.clusterIdx[point] = self.clusterIdx[nearest]
        # ----------- Collect the members of every cluster -----------
        clusterSet = OrderedDict((label, []) for label in range(self.clusterNum))
        for sample in range(self.N):
            clusterSet[self.clusterIdx[sample]].append(sample)
        return clusterSet
if __name__ == '__main__':
    # Demo: cluster the Aggregation data set and plot each cluster with its
    # own colour.
    csv_path = r'D:\牛牛\ExperimentalData\Aggregation\aggregation.csv'
    data = np.array(pd.read_csv(csv_path, header=None))
    # Last column is split off as y; the remaining columns are the features.
    X, y = data[:, :-1], data[:, -1]
    dpc = DPC(X, clusterNum=7, distPercent=7)
    clusterSet = dpc.getDPC()
    budget = 50
    # One scatter call per cluster so each gets its own colour.
    for k, v in clusterSet.items():
        E = X[v]
        plt.scatter(E[:, 0], E[:, 1])
    plt.show()
# Variant: class-based implementation that derives the cutoff distance from K-nearest-neighbour distances; the code follows.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict
from scipy.spatial.distance import pdist,squareform
class DPCA(object):
    """Density Peaks Clustering with a K-nearest-neighbour based cutoff distance.

    Intended call order: ``get_distCut()``, ``get_Rho()``, then ``DPCA()``.
    """

    def __init__(self,X,neighborNum,blockNum):
        """Store the inputs and pre-compute the Euclidean distance matrix.

        Args:
            X: array-like with a ``shape`` attribute; ``shape[0]`` is taken as
                the number of samples and ``X`` is handed to ``pdist``.
            neighborNum: neighbour rank K used by ``get_distCut``.
            blockNum: number of clusters to produce.
        """
        self.X = X
        self.N = X.shape[0]
        self.K = neighborNum
        self.blockNum = blockNum
        self.distCut = 0
        self.rho = np.zeros(self.N, dtype=float)
        self.delta = np.zeros(self.N, dtype=float)
        self.gamma = np.zeros(self.N, dtype=float)
        self.leader = np.full(self.N, -1, dtype=int)
        self.distMatrix = squareform(pdist(self.X, metric='euclidean'))
        self.clusterIdx = np.full(self.N, -1, dtype=int)

    def get_distCut(self):
        """Set ``distCut`` to mean + sample standard deviation of the KNN distance.

        Raises:
            IndexError: if ``K + 1`` is not a valid column of the distance matrix.
        """
        # Each sorted row starts with the zero self-distance, so column
        # ``K + 1`` is the distance to the (K + 1)-th nearest neighbour.
        # NOTE(review): confirm whether column ``K`` (the K-th neighbour) was
        # intended; the original index is kept here to preserve results.
        knn_dist = np.sort(self.distMatrix, axis=1)[:, self.K + 1]
        self.distCut = np.mean(knn_dist) + np.std(knn_dist, ddof=1)

    def get_Rho(self):
        """Set ``rho`` to the Gaussian-kernel density of every sample.

        The self term is excluded. The result overwrites ``rho`` so calling the
        method more than once no longer doubles the densities.
        """
        kernel = np.exp(-(self.distMatrix / self.distCut) ** 2)
        np.fill_diagonal(kernel, 0.0)
        self.rho = kernel.sum(axis=1)

    def DPCA(self):
        """Run the clustering on the current ``rho``.

        Returns:
            ``OrderedDict`` mapping each label ``0 .. blockNum - 1`` to the list
            of its member indices; labels are also stored in ``clusterIdx``.
        """
        rhoOrdIndex = np.flipud(np.argsort(self.rho))
        # The densest sample has no denser neighbour: use its largest distance.
        densest = rhoOrdIndex[0]
        self.delta[densest] = np.max(self.distMatrix[densest, :])
        for rank in range(1, self.N):
            point = rhoOrdIndex[rank]
            denser = rhoOrdIndex[:rank]
            # argmin returns the first minimum, i.e. the densest of tied candidates.
            nearest = denser[np.argmin(self.distMatrix[point, denser])]
            self.delta[point] = self.distMatrix[point, nearest]
            self.leader[point] = nearest
        self.gamma = self.delta * self.rho
        gammaOrdIdx = np.flipud(np.argsort(self.gamma))
        # Start from a clean slate so a repeated call cannot reuse stale labels.
        self.clusterIdx[:] = -1
        # Label the cluster centres.
        for label in range(self.blockNum):
            self.clusterIdx[gammaOrdIdx[label]] = label
        centres = gammaOrdIdx[:self.blockNum]
        # Every other sample inherits the label of its leader (density order
        # guarantees the leader is labelled first).
        for point in rhoOrdIndex:
            if self.clusterIdx[point] != -1:
                continue
            parent = self.leader[point]
            if parent != -1:
                self.clusterIdx[point] = self.clusterIdx[parent]
            elif centres.size:
                # Densest sample that was not picked as a centre: indexing with
                # its leader (-1) would silently read the LAST sample's label,
                # so attach it to the nearest centre instead.
                nearest = centres[np.argmin(self.distMatrix[point, centres])]
                self.clusterIdx[point] = self.clusterIdx[nearest]
        clusterSet = OrderedDict((label, []) for label in range(self.blockNum))
        for sample in range(self.N):
            clusterSet[self.clusterIdx[sample]].append(sample)
        return clusterSet
if __name__ == '__main__':
    # Demo on the Aggregation data set.
    # NOTE(review): the original header comment said neighborNum=3 for this
    # data set, while the code below uses 10 -- confirm which is intended.
    csv_path = r'D:\牛牛\ExperimentalData\Aggregation\aggregation.csv'
    data = np.array(pd.read_csv(csv_path, header=None))
    # Last column is split off as y; the remaining columns are the features.
    X, y = data[:, :-1], data[:, -1]
    neighborNum = 10
    blockNum = 7
    dpc = DPCA(X, neighborNum, blockNum)
    # The cutoff must exist before the densities, and the densities before
    # the clustering step.
    dpc.get_distCut()
    dpc.get_Rho()
    clusterSet = dpc.DPCA()
    # One scatter call per cluster so each gets its own colour.
    for v in clusterSet.values():
        E = X[v]
        plt.scatter(E[:, 0], E[:, 1])
    plt.show()