import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist,squareform
from collections import OrderedDict
from itertools import combinations,product
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelPropagation
from sklearn import metrics
from sklearn import datasets
from sklearn.metrics import mean_squared_error,accuracy_score,mean_absolute_error,f1_score
def getDistCut(distList, distPercent):
    """Return the cutoff distance as a percentage of the largest distance.

    Args:
        distList: Non-empty collection of pairwise distances (for example the
            condensed vector produced by ``scipy.spatial.distance.pdist``).
        distPercent: Percentage (0-100 scale) of the maximum distance to use
            as the cutoff.

    Returns:
        ``max(distList) * distPercent / 100``.

    Raises:
        ValueError: If ``distList`` is empty.
    """
    # np.max reduces an ndarray in native code; the builtin max would walk it
    # element by element in Python. Other iterables keep using the builtin.
    if isinstance(distList, np.ndarray):
        largest = np.max(distList)
    else:
        largest = max(distList)
    return largest * distPercent / 100
def getRho(n, distMatrix, distCut):
    """Compute the Gaussian-kernel local density of the first ``n`` samples.

    The density of sample ``i`` is the sum, over every other sample ``j``, of
    ``exp(-(d / distCut) ** 2)`` where ``d`` is the distance stored above the
    diagonal for the pair (``distMatrix[min(i, j), max(i, j)]``). Diagonal
    entries never contribute.

    Args:
        n: Number of samples.
        distMatrix: 2-D array of pairwise distances, at least ``n`` x ``n``.
        distCut: Kernel width (cutoff distance) dividing every distance.

    Returns:
        Float ndarray of length ``n`` holding the density of each sample.

    Raises:
        IndexError: If ``distMatrix`` is smaller than ``n`` x ``n``.
    """
    # Fewer than two samples means there are no pairs to accumulate.
    if n < 2:
        return np.zeros(n, dtype=float)

    dist = np.asarray(distMatrix, dtype=float)
    if dist.ndim != 2 or dist.shape[0] < n or dist.shape[1] < n:
        raise IndexError("distMatrix must be at least n x n")

    kernel = np.exp(-(dist[:n, :n] / distCut) ** 2)
    # Keep only pairs i < j; each pair adds its weight to both endpoints, so
    # row sums cover the "i" role and column sums cover the "j" role.
    upper = np.triu(kernel, k=1)
    return upper.sum(axis=1) + upper.sum(axis=0)
#------------Density peak clustering------------------#
def DPCA(n, distMatrix, rho, blockNum):
    """Cluster ``n`` samples with density-peak clustering.

    Every sample gets a ``delta`` (distance to the nearest sample that ranks
    higher in density) and a ``leader`` (that nearest sample). The
    ``blockNum`` samples with the largest ``delta * rho`` become cluster
    centres; every other sample, visited from highest to lowest density,
    inherits the label of its leader.

    Args:
        n: Number of samples.
        distMatrix: 2-D array of pairwise distances, at least ``n`` x ``n``.
        rho: Local density of each sample (length ``n``).
        blockNum: Number of clusters to produce; must satisfy
            ``1 <= blockNum <= n``.

    Returns:
        ``OrderedDict`` mapping each cluster label ``0 .. blockNum - 1`` to
        the list of sample indices assigned to it, in ascending order.

    Raises:
        ValueError: If ``blockNum`` is outside ``1 .. n``.
    """
    if not 1 <= blockNum <= n:
        raise ValueError("blockNum must be between 1 and n")

    dist = np.asarray(distMatrix)
    density = np.asarray(rho)

    # Sample indices ordered from highest to lowest density.
    rhoOrdIndex = np.flipud(np.argsort(density))
    delta = np.zeros(n, dtype=float)
    leader = np.full(n, -1, dtype=int)

    # The densest sample has no denser neighbour, so its delta is its largest
    # distance to any sample (never below 0) and it keeps leader == -1.
    peak = rhoOrdIndex[0]
    delta[peak] = np.max(dist[peak, :n], initial=0)

    # Every other sample: nearest sample among those ranked denser.
    for rank in range(1, n):
        cur = rhoOrdIndex[rank]
        denser = rhoOrdIndex[:rank]
        # argmin returns the first minimum, i.e. ties go to the denser sample.
        nearest = denser[np.argmin(dist[cur, denser])]
        delta[cur] = dist[cur, nearest]
        leader[cur] = nearest

    gamma = delta * density
    gammaOrdIdx = np.flipud(np.argsort(gamma))

    # Start clustering: the blockNum largest gamma values are the centres.
    clusterIdx = np.full(n, -1, dtype=int)
    centers = gammaOrdIdx[:blockNum]
    clusterIdx[centers] = np.arange(blockNum)

    # The densest sample has no leader. If it was not picked as a centre,
    # following leader == -1 would wrap around to the last sample's label,
    # so attach it to its nearest centre instead.
    if clusterIdx[peak] == -1:
        clusterIdx[peak] = clusterIdx[centers[np.argmin(dist[peak, centers])]]

    # Propagate labels in decreasing density order, so a sample's leader is
    # always labelled before the sample itself.
    for cur in rhoOrdIndex:
        if clusterIdx[cur] == -1:
            clusterIdx[cur] = clusterIdx[leader[cur]]

    # Collect the members of each cluster in a dictionary.
    clusterSet = OrderedDict((k, []) for k in range(blockNum))
    for i in range(n):
        clusterSet[int(clusterIdx[i])].append(i)
    return clusterSet
# Demo: generate three Gaussian blobs and cluster them.
X, y = datasets.make_blobs(n_samples=500, n_features=2, centers=3, cluster_std=[1, 1, 1], random_state=104)
n = len(X)
# NOTE(review): Gamma was only passed to SpecClust; presumably a kernel
# coefficient for spectral clustering -- confirm before reusing or removing.
Gamma = 0.5
distPercent = 5
# Condensed pairwise distances, then the full square distance matrix.
distList = pdist(X, metric='euclidean')
distMatrix = squareform(distList)
distCut = getDistCut(distList, distPercent)
rho = getRho(n, distMatrix, distCut)
blockNum = 3
# NOTE(review): this line previously called SpecClust(n, X, Gamma, blockNum),
# which is neither defined nor imported in this file and raised NameError
# (the same holds for K_means). DPCA is the clustering routine defined here
# and the one that consumes distMatrix and rho, so it is used instead.
# TODO: restore the spectral / k-means variants once their helpers exist.
clusterSet = DPCA(n, distMatrix, rho, blockNum)