Python: a fairly standard DPC (density peaks clustering) implementation
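
This post collects several Python implementations of DPC (density peaks clustering). As a quick reference for the code below, the quantities every version computes are roughly the following (the density kernel and the cutoff rule differ between versions):

\[
\rho_i = \sum_{j \neq i} \exp\!\left(-\Big(\tfrac{d_{ij}}{d_c}\Big)^{2}\right)
\;\;\text{or}\;\;
\rho_i = \bigl|\{\, j \neq i : d_{ij} < d_c \,\}\bigr|,
\qquad
\delta_i = \min_{j:\, \rho_j > \rho_i} d_{ij},
\qquad
\gamma_i = \rho_i\,\delta_i .
\]

The highest-density point instead gets \(\delta_i = \max_j d_{ij}\). The clusterNum points with the largest \(\gamma\) become cluster centers, and every other point inherits the label of its "leader", i.e. the nearest point with higher density.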

import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler

class DPC(object):
    def __init__(self, X, clusterNum, numK):
        self.X = X
        self.nSample = X.shape[0]
        self.clusterNum = clusterNum
        self.numK = numK
        self.rho = np.zeros(self.nSample)
        self.delta = np.zeros(self.nSample)
        self.gamma = np.zeros(self.nSample)
        self.leader = np.ones(self.nSample, dtype=int) * int(-1)
        self.distMatrix = pairwise_distances(X,metric="euclidean")
        # ----------------------------------------------

        # ------------- KNN-based cutoff distance ------------------
        # deltaK[i]: distance from sample i to its (numK+1)-th nearest neighbor
        # (index 0 of each sorted row is the sample itself, at distance 0)
        deltaK = np.zeros(self.nSample)
        ordids = np.argsort(self.distMatrix)
        for i in range(self.nSample):
            deltaK[i] = self.distMatrix[i][ordids[i, self.numK + 1]]
        # distCut = mean + sample standard deviation of these neighbor distances
        miuK = np.mean(deltaK)
        tempSum = 0
        for i in range(self.nSample):
            tempSum += (deltaK[i] - miuK) ** 2
        self.distCut = miuK + np.sqrt(tempSum / (self.nSample - 1))
        print("distCut:", self.distCut)

        # ------------- Gaussian-kernel density ------------------
        # (the i == j term adds a constant 1 to every rho, so the ranking is unaffected)
        self.rho = np.sum(np.exp(-(self.distMatrix/self.distCut)**2), axis=1)
        self.order_rho = np.flipud(np.argsort(self.rho))   # sample indices in descending density


        # ------------- delta of the highest-density point -------------
        self.delta[self.order_rho[0]] = np.max(self.distMatrix[self.order_rho[0],:])
        self.leader[self.order_rho[0]] = -1
        # ----------- delta and leader for all remaining points --------
        for i in range(1, self.nSample):
            min_dist = np.inf
            min_idx = -1
            for j in range(i):
                dist = self.distMatrix[self.order_rho[i], self.order_rho[j]]
                if dist < min_dist:
                    min_dist = dist
                    min_idx = self.order_rho[j]
            self.delta[self.order_rho[i]] = min_dist
            self.leader[self.order_rho[i]] = min_idx
        self.gamma = self.rho * self.delta
        self.order_gamma = np.flipud(np.argsort(self.gamma))

        # -------- Assign cluster labels to the cluster centers ----------
        self.clusterIndex = np.ones(self.nSample, dtype=int) * (-1)
        for i in range(self.clusterNum):
            self.clusterIndex[self.order_gamma[i]] = i

        # Every remaining point inherits the label of its leader, processed in descending density order
        for i in range(self.nSample):
            if self.clusterIndex[self.order_rho[i]] == -1:
                self.clusterIndex[self.order_rho[i]] = self.clusterIndex[self.leader[self.order_rho[i]]]



if __name__ == '__main__':
    # --------------------------------------#
    data = np.array(pd.read_csv(r'D:\ExperimentalData\Aggregation\aggregation.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\R15\R15.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\Jain\Jain.csv', header=None))


    X = data[:, :-1]
    y = data[:, -1]
    model = DPC(X,clusterNum=21,numK=16)
    cluster_id = model.clusterIndex


    # plt.scatter(X[:,0],X[:,1])
    # plt.show()
    plt.scatter(X[:,0],X[:,1],c=cluster_id)
    plt.show()
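
As a side note (not part of the original code), the inner j-loop that searches for the nearest higher-density point can be shortened to one NumPy slice per sample. A minimal sketch, assuming rho and distMatrix are computed as in the class above:

import numpy as np

def delta_leader(rho, distMatrix):
    """Semi-vectorized delta/leader computation; same result as the double loop above."""
    order = np.flipud(np.argsort(rho))            # sample indices in descending density
    n = rho.shape[0]
    delta = np.zeros(n)
    leader = np.full(n, -1, dtype=int)
    delta[order[0]] = distMatrix[order[0]].max()  # highest-density point: farthest distance
    for k in range(1, n):
        i = order[k]
        higher = order[:k]                        # all points already placed, i.e. with higher density
        d = distMatrix[i, higher]
        j = int(np.argmin(d))
        delta[i] = d[j]                           # distance to the nearest higher-density point
        leader[i] = higher[j]                     # that point becomes i's leader
    return delta, leader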

The same DPC, but with the cutoff distance set to a fixed percentage of the maximum pairwise distance instead of the KNN-based rule:

import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler

class DPC(object):
    def __init__(self, X, clusterNum, distPercent):
        self.X = X
        self.nSample = X.shape[0]
        self.clusterNum = clusterNum
        self.distPercent = distPercent
        self.rho = np.zeros(self.nSample)
        self.delta = np.zeros(self.nSample)
        self.gamma = np.zeros(self.nSample)
        self.leader = np.ones(self.nSample, dtype=int) * int(-1)
        self.distMatrix = pairwise_distances(X,metric="euclidean")
        # Cutoff distance: distPercent percent of the maximum pairwise distance
        self.distCut = np.max(self.distMatrix) * (self.distPercent / 100)
        self.rho = np.sum(np.exp(-(self.distMatrix/self.distCut)**2), axis=1)
        self.order_rho = np.flipud(np.argsort(self.rho))


        # ------------- delta of the highest-density point -------------
        self.delta[self.order_rho[0]] = np.max(self.distMatrix[self.order_rho[0],:])
        self.leader[self.order_rho[0]] = -1
        # ----------- delta and leader for all remaining points --------
        for i in range(1, self.nSample):
            min_dist = np.inf
            min_idx = -1
            for j in range(i):
                dist = self.distMatrix[self.order_rho[i], self.order_rho[j]]
                if dist < min_dist:
                    min_dist = dist
                    min_idx = self.order_rho[j]
            self.delta[self.order_rho[i]] = min_dist
            self.leader[self.order_rho[i]] = min_idx
        self.gamma = self.rho * self.delta
        self.order_gamma = np.flipud(np.argsort(self.gamma))

        # -------- Assign cluster labels to the cluster centers ----------
        self.clusterIndex = np.ones(self.nSample, dtype=int) * (-1)
        for i in range(self.clusterNum):
            self.clusterIndex[self.order_gamma[i]] = i

        for i in range(self.nSample):
            if self.clusterIndex[self.order_rho[i]] == -1:
                self.clusterIndex[self.order_rho[i]] = self.clusterIndex[self.leader[self.order_rho[i]]]



if __name__ == '__main__':
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\Aggregation\aggregation.csv', header=None))
    # data = np.array(pd.read_csv(r'D:\ExperimentalData\R15\R15.csv', header=None))
    data = np.array(pd.read_csv(r'D:\ExperimentalData\Jain\Jain.csv', header=None))


    X = data[:, :-1]
    y = data[:, -1]
    model = DPC(X,clusterNum=2,distPercent=2)
    cluster_id = model.clusterIndex


    # plt.scatter(X[:,0],X[:,1])
    # plt.show()
    plt.scatter(X[:,0],X[:,1],c=cluster_id)
    plt.show()
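
The ground-truth labels y are read from the CSV but never used. A minimal evaluation sketch with scikit-learn (assuming model and y exist as in the script above):

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Compare the DPC assignment against the last column of the CSV
print("ARI:", adjusted_rand_score(y, model.clusterIndex))
print("NMI:", normalized_mutual_info_score(y, model.clusterIndex))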

A procedural (function-based) implementation of the same pipeline, using a hard-threshold density count and the Manhattan (cityblock) distance:

import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from scipy.spatial.distance import pdist,squareform
from collections import OrderedDict
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.io as scio

############%%%%%%-------------------%%%%%%%%%############

############%%%%%%-------------------%%%%%%%%%############

#-------- Function to get the cutoff distance: works fine -----------#
def getDistCut(distList,distPercent):
    maxDist = max(distList)
    distCut = maxDist * distPercent / 100
    return distCut
#-------- Function to compute each sample's density: works fine -----------#
def getRho(n,distMatrix,distCut):
    rho = np.zeros(n,dtype=float)
    for i in range(n-1):
        for j in range(i+1,n):
            if distMatrix[i,j] < distCut:
                rho[i] += 1
                rho[j] += 1
    return rho

def getGammaOrderIndex(n,rho,distMatrix):
    rhoOrdIndex = np.flipud(np.argsort(rho))
    delta = np.zeros(n,dtype=float)
    leader = np.ones(n,dtype=int) * (-1)
    #----------- Delta of the highest-density point ----------------#
    maxdist = 0
    for i in range(n):
        if distMatrix[rhoOrdIndex[0],i] > maxdist:
            maxdist = distMatrix[rhoOrdIndex[0],i]
    delta[rhoOrdIndex[0]] = maxdist
    leader[rhoOrdIndex[0]] = -1
    # ----------- Delta and leader for the remaining points ----------------#
    for i in range(1,n):
        mindist = np.inf
        minindex = -1
        for j in range(i):
            if distMatrix[rhoOrdIndex[i],rhoOrdIndex[j]] < mindist:
                mindist = distMatrix[rhoOrdIndex[i],rhoOrdIndex[j]]
                minindex = rhoOrdIndex[j]
        delta[rhoOrdIndex[i]] = mindist
        leader[rhoOrdIndex[i]] = minindex
    gamma = delta * rho
    gammaOrderIndex = np.flipud(np.argsort(gamma))
    return rhoOrdIndex,gamma,gammaOrderIndex,leader

def getDPCA(n,rhoOrdIndex,gammaOrderIndex,leader,blockNum):
    #----------- Initialize the cluster index of every sample ----------------------#
    clusterIndex = np.ones(n,dtype=int) * (-1)
    # -------- Assign cluster labels to the cluster centers ----------------------#
    for i in range(blockNum):
        clusterIndex[gammaOrderIndex[i]] = i
    #--------- Propagate labels to the remaining samples ---------------------------------#
    for i in range(n):
        if clusterIndex[rhoOrdIndex[i]] == -1:
            clusterIndex[rhoOrdIndex[i]] = clusterIndex[leader[rhoOrdIndex[i]]]
    ##------------- Initialize an empty dict to hold the clusters ---------------##
    clusterSet = OrderedDict()
    #-------- One list per cluster -----------#
    for i in range(blockNum):
        clusterSet[i] = []
    #--- Put each sample into its cluster's list ---#
    for i in range(n):
        clusterSet[clusterIndex[i]].append(i)
    return clusterSet

if __name__ == '__main__':
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\1-Glass\glass.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\Spiral\spiral.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\1-Aggregation(788)\aggregation.csv', header=None))
    X = data[:, :-1]
    y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\Three blobs\ThreeBlobs.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\COIL-20\COIL20_PCA.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\Banknote\banknote.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # pca = PCA(0.9)
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\1-Semeion\semeion.csv', header=None))
    # X = data[:, :-1]
    # X = pca.fit_transform(X)
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\1-Robot Navigation(5456)\Robot_Navigation_24.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\Twonorm\twonorm.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\1-Electrical Grid Stability Simulated Data Data Set\ELectricalGrid.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\Pendigits\pendigits.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\HTRU2 Data Set\HTRU_2.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\Avila\avila.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\1-Dataset for Sensorless Drive Diagnosis(58509)\Sensorless_drive_diagnosis.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # data = np.array(pd.read_csv(r'E:\dataset\ExperimentalData\Satlog(shuttle)\Satlog(shuttle).csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    # --------------------------------------#
    # mnist = fetch_mldata('MNIST original')
    # X = mnist['data']
    # y = mnist['target']
    # --------------------------------------#
    # X, y = fetch_covtype(return_X_y=True)
    ################# Data loading ends here ##########################
    n = X.shape[0]
    classNum = len(set(y))
    blockNum = 7
    distList = pdist(X, metric='cityblock')
    distMatrix = squareform(distList)
    distCut = getDistCut(distList,distPercent=7)
    rho = getRho(n,distMatrix,distCut)
    rhoOrdIndex, gamma, gammaOrderIndex, leader = getGammaOrderIndex(n,rho,distMatrix)
    clusterSet = getDPCA(n, rhoOrdIndex, gammaOrderIndex, leader, blockNum)

    budget = 50
    for k,v in clusterSet.items():
        E = X[v]
        plt.scatter(E[:,0],E[:,1])
    plt.show()
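
For larger datasets, the pure-Python double loop in getRho becomes the bottleneck. A minimal vectorized sketch of the same hard-threshold density (assuming distMatrix and distCut are computed as above):

import numpy as np

def get_rho_vectorized(distMatrix, distCut):
    """Same counts as getRho: number of other samples closer than distCut."""
    # The comparison includes the diagonal (distance 0), so subtract 1 for the sample itself
    return (np.sum(distMatrix < distCut, axis=1) - 1).astype(float)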

The same procedure, encapsulated in a class:

import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from scipy.spatial.distance import pdist, squareform
from collections import OrderedDict
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.io as scio


############%%%%%%-------------------%%%%%%%%%############

############%%%%%%-------------------%%%%%%%%%############
class DPC(object):
    def __init__(self,X,clusterNum,distPercent):
        self.X = X
        self.N = X.shape[0]
        self.clusterNum = clusterNum
        self.distPercent = distPercent
        self.distCut = 0
        self.rho = np.zeros(self.N,dtype=float)
        self.delta = np.zeros(self.N,dtype=float)
        self.gamma = np.zeros(self.N,dtype=float)
        self.leader = np.ones(self.N,dtype=int) * int(-1)
        self.distList = pdist(self.X,metric='euclidean')
        self.distMatrix = squareform(self.distList)
        self.clusterIdx = np.ones(self.N,dtype=int) * (-1)

    def getDistCut(self):
        maxDist = max(self.distList)
        distCut = maxDist * self.distPercent /100
        return distCut

    def getRho(self):
        self.distCut = self.getDistCut()
        rho = np.zeros(self.N, dtype=float)
        for i in range(self.N -1):
            for j in range(i+1,self.N):
                if self.distMatrix[i,j] < self.distCut:
                    rho[i] += 1
                    rho[j] += 1
        return rho
    def getGammaOrderIndex(self):
        self.rho = self.getRho()
        rhoOrdIndex = np.flipud(np.argsort(self.rho))
        # ----------- Delta of the highest-density point ----------------#
        maxdist = 0
        for i in range(self.N):
            if self.distMatrix[rhoOrdIndex[0], i] > maxdist:
                maxdist = self.distMatrix[rhoOrdIndex[0], i]
        self.delta[rhoOrdIndex[0]] = maxdist
        self.leader[rhoOrdIndex[0]] = -1
        # ----------- Delta and leader for the remaining points ----------------#
        for i in range(1, self.N):
            mindist = np.inf
            minindex = -1
            for j in range(i):
                if self.distMatrix[rhoOrdIndex[i], rhoOrdIndex[j]] < mindist:
                    mindist = self.distMatrix[rhoOrdIndex[i], rhoOrdIndex[j]]
                    minindex = rhoOrdIndex[j]
            self.delta[rhoOrdIndex[i]] = mindist
            self.leader[rhoOrdIndex[i]] = minindex
        self.gamma = self.delta * self.rho
        gammaOrderIndex = np.flipud(np.argsort(self.gamma))
        return gammaOrderIndex,rhoOrdIndex
    def getDPC(self):
        gammaOrderIndex,rhoOrdIndex = self.getGammaOrderIndex()
        # ----------- Assign cluster labels to the cluster centers ------------------#
        for i in range(self.clusterNum):
            self.clusterIdx[gammaOrderIndex[i]] = i
        # -------- Propagate labels to the remaining samples -----------------------#
        for i in range(self.N):
            if self.clusterIdx[rhoOrdIndex[i]] == -1:
                self.clusterIdx[rhoOrdIndex[i]] = self.clusterIdx[self.leader[rhoOrdIndex[i]]]
        ##------------- Initialize an empty dict to hold the clusters ---------------##
        clusterSet = OrderedDict()
        # -------- One list per cluster -----------#
        for i in range(self.clusterNum):
            clusterSet[i] = []
        # --- Put each sample into its cluster's list ---#
        for i in range(self.N):
            clusterSet[self.clusterIdx[i]].append(i)
        return clusterSet

if __name__ == '__main__':
    # --------------------------------------#
    data = np.array(pd.read_csv(r'D:\牛牛\ExperimentalData\Aggregation\aggregation.csv', header=None))
    X = data[:, :-1]
    y = data[:, -1]
    dpc = DPC(X,clusterNum=7,distPercent=7)
    clusterSet = dpc.getDPC()

    budget = 50
    for k, v in clusterSet.items():
        E = X[v]
        plt.scatter(E[:, 0], E[:, 1])
    plt.show()
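
getDPC returns an OrderedDict mapping each cluster id to a list of sample indices. If a flat per-sample label array is more convenient (e.g. for sklearn metrics or a single scatter call), a small hypothetical helper like the following converts it:

import numpy as np

def clusters_to_labels(clusterSet, n):
    """Turn {cluster_id: [sample indices]} into a length-n label array."""
    labels = np.full(n, -1, dtype=int)
    for cid, members in clusterSet.items():
        labels[members] = cid
    return labels

# labels = clusters_to_labels(clusterSet, X.shape[0])
# plt.scatter(X[:, 0], X[:, 1], c=labels); plt.show()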

A class-encapsulated version that determines the cutoff distance with the KNN-based rule:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict
from scipy.spatial.distance import pdist,squareform


class DPCA(object):
    def __init__(self,X,neighborNum,blockNum):
        self.X = X
        self.N = X.shape[0]
        self.K = neighborNum
        self.blockNum = blockNum
        self.distCut = 0
        self.rho = np.zeros(self.N,dtype=float)
        self.delta = np.zeros(self.N,dtype=float)
        self.gamma = np.zeros(self.N,dtype=float)
        self.leader = np.ones(self.N,dtype=int) * int(-1)
        self.distMatrix = squareform(pdist(self.X,metric='euclidean'))
        self.clusterIdx = np.ones(self.N,dtype=int) * (-1)
    def get_distCut(self):
        deltaK = np.zeros(self.N,dtype=float)
        for i in range(self.N):
            ordIdx = np.argsort(self.distMatrix[i])
            deltaK[i] = self.distMatrix[i][ordIdx[self.K+1]]
        miuK = np.mean(deltaK)
        tempSum = 0
        for i in range(self.N):
            tempSum += (deltaK[i] - miuK)**2
        self.distCut = miuK + np.sqrt(tempSum/(self.N-1))
    def get_Rho(self):
        for i in range(self.N-1):
            for j in range(i+1,self.N):
                self.rho[i] = self.rho[i] + np.exp(-(self.distMatrix[i,j]/self.distCut)**2)
                self.rho[j] = self.rho[j] + np.exp(-(self.distMatrix[i,j]/self.distCut)**2)
    def DPCA(self):
        rhoOrdIndex = np.flipud(np.argsort(self.rho))
        maxdist = 0
        for ele in range(self.N):
            if self.distMatrix[rhoOrdIndex[0],ele]>maxdist:
                maxdist = self.distMatrix[rhoOrdIndex[0],ele]
        self.delta[rhoOrdIndex[0]] = maxdist

        for i in range(1,self.N):
            mindist = np.inf
            minindex = -1
            for j in range(i):
                if self.distMatrix[rhoOrdIndex[i],rhoOrdIndex[j]] < mindist:
                    mindist = self.distMatrix[rhoOrdIndex[i],rhoOrdIndex[j]]
                    minindex = rhoOrdIndex[j]
            self.delta[rhoOrdIndex[i]] = mindist
            self.leader[rhoOrdIndex[i]] = minindex
        self.gamma = self.delta * self.rho
        gammaOrdIdx = np.flipud(np.argsort(self.gamma))
        # Assign labels to the cluster centers
        for k in range(self.blockNum):
            self.clusterIdx[gammaOrdIdx[k]] = k
        # Propagate labels to the samples that are not centers
        for i in range(self.N):
            if self.clusterIdx[rhoOrdIndex[i]] == -1:
                self.clusterIdx[rhoOrdIndex[i]] = self.clusterIdx[self.leader[rhoOrdIndex[i]]]
        clusterSet = OrderedDict()
        for k in range(self.blockNum):
            clusterSet[k] = []
        for i in range(self.N):
            clusterSet[self.clusterIdx[i]].append(i)
        return clusterSet

if __name__ == '__main__':
    # ----------------Aggregation(neighborNum=3)----------------------#
    data = np.array(pd.read_csv(r'D:\牛牛\ExperimentalData\Aggregation\aggregation.csv', header=None))
    X = data[:, :-1]
    y = data[:, -1]
    neighborNum = 10
    blockNum = 7
    dpc = DPCA(X,neighborNum,blockNum)
    dpc.get_distCut()
    dpc.get_Rho()
    clusterSet = dpc.DPCA()
    for v in clusterSet.values():
        E = X[v]
        plt.scatter(E[:,0],E[:,1])
    plt.show()
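
The loop in get_distCut can also be written as two NumPy calls. A minimal sketch, assuming distMatrix and K are defined as in the class above:

import numpy as np

def knn_dist_cut(distMatrix, K):
    """Cutoff = mean + sample std of each point's (K+1)-th nearest-neighbor distance."""
    # Column 0 of each sorted row is the point itself (distance 0), so column K+1
    # matches distMatrix[i][ordIdx[K+1]] in get_distCut above.
    deltaK = np.sort(distMatrix, axis=1)[:, K + 1]
    return deltaK.mean() + deltaK.std(ddof=1)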
