参考博客:https://blog.csdn.net/songbinxu/article/details/80838865
修正原文中代码瑕疵!
谱聚类
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist,squareform
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets
from itertools import cycle,islice
def genTwoCircles(n_samples=1000):
X,y = datasets.make_circles(n_samples,factor=0.5,noise=0.05)
return X,y
def distance(A,B): #计算两点之间距离 距离计算不一样
return np.linalg.norm(A-B)
def adjacentMatrix_KNN(data,k,sigma=1.0): #利用KNN获取邻接矩阵
n = len(data)
dist_matrix = squareform(pdist(data,metric='euclidean'))
AdjMat = np.zeros((n,n))
for i in range(n):
dist_with_index = zip(dist_matrix[i],range(n))
dist_with_index = sorted(dist_with_index,key= lambda x:x[0])
neighbours_id = [dist_with_index[m][1] for m in range(k+1)]
for j in neighbours_id:
AdjMat[i,j] = np.exp(-((dist_matrix[i,j])**2)/2/sigma/sigma)
AdjMat[j,i] = AdjMat[i,j]
return AdjMat
# '''
# for idx,each in enumerate(dist_matrix):
# index_array = np.argsort(each)
# AdjMat[idx][index_array[1:k+1]] = 1
# # for i in range(n):
# # for j in range(n):
# # AdjMat[i]
# AdjMatrix = (AdjMat + AdjMat.T)/2
# return AdjMatrix
# '''
def LaplacianMatrix(adjacentMatrix): #获取标准的拉普拉斯矩阵
# compute the Degree Matrix: D = diag(sum(邻接矩阵))
degreeMatrix = np.sum(adjacentMatrix,axis=1)
#计算拉普拉斯矩阵: L= D-A
laplacianMatrix = np.diag(degreeMatrix) - adjacentMatrix
#下面是进行标准化normalize :D^(-1/2) L D^(-1/2)
sqrtDegreeMatrix = np.diag(1.0 / (degreeMatrix ** (0.5)))
return np.dot(np.dot(sqrtDegreeMatrix,laplacianMatrix),sqrtDegreeMatrix)
def getEigVec(LaMat,n_cluster): #获取k个最小特征值
eigVal,eigVec = np.linalg.eig(LaMat)
# index_eigVal = np.argsort(eigVal)
# O_index = index_eigVal[0:n_cluster]
# O_eigVec = eigVec[:,O_index]
# return np.real(O_eigVec)
return np.real(eigVec)
def plot(X,y_sp,y_km):
colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a','#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00']),int(max(y_km) + 1))))
plt.subplot(121)
plt.scatter(X[:,0], X[:,1], s=10, color=colors[y_sp])
plt.title("Spectral Clustering")
plt.subplot(122)
plt.scatter(X[:,0], X[:,1], s=10, color=colors[y_km])
plt.title("Kmeans Clustering")
plt.show()
if __name__ =="__main__":
#获取数据
# data1 = np.loadtxt(r'E:\dataset\clusterData\ringData.txt')
# data2 = np.loadtxt(r'E:\dataset\clusterData\Gaussian.txt')
# y1 = np.zeros((len(data1),1))
# y2 = np.ones((len(data2),1))
# X = np.vstack((data1,data2)) #数据
# y = np.vstack((y1,y2)) #真实标记
X,y = genTwoCircles(n_samples=1000)
k = 2
A = adjacentMatrix_KNN(X,5,sigma=1.0)
LapMat = LaplacianMatrix(A)
H = getEigVec(LapMat,n_cluster=k)
y_sp = KMeans(n_clusters=k).fit_predict(H)
y_km = KMeans(n_clusters=k).fit_predict(X)
plot(X,y_sp,y_km)