数据集地址:https://download.csdn.net/download/fanzonghao/10855060
import numpy as np
import math as m
import random
import matplotlib.pyplot as plt
import evaluate as eva
# flame.txt
# Jain_cluster=2.txt
# Aggregation_cluster=7.txt
# Spiral_cluster=3.txt
# Pathbased_cluster=3.txt
data_path = "./Aggregation_cluster=7.txt"
# 导入数据
def load_data():
    """Load the sample points from the tab-separated data file.

    :return: ndarray of points (columns: x, y, and — for labeled sets — a class label)
    """
    return np.loadtxt(data_path, delimiter='\t')
def cal_dis(data, clu, k):
    """
    Compute Euclidean distances between every sample and every centroid.

    :param data: sample points, shape (n, >=2); only columns 0 and 1 are used
    :param clu: centroid array, shape (>=k, >=2)
    :param k: number of clusters (only the first k centroids are used)
    :return: distance matrix of shape (n, k)
    """
    data = np.asarray(data)
    clu = np.asarray(clu)
    # Broadcast (n, 1, 2) - (1, k, 2) -> (n, k, 2), then reduce over the
    # coordinate axis. Replaces the original O(n*k) Python loop with one
    # vectorized NumPy expression; values are identical.
    diff = data[:, np.newaxis, 0:2] - clu[np.newaxis, :k, 0:2]
    return np.sqrt((diff ** 2).sum(axis=2))
def divide(data, dis):
    """
    Assign each sample to its nearest centroid.

    :param data: sample set (only its length matters here; kept for interface
                 compatibility with existing callers)
    :param dis: distance matrix of shape (n, k) from cal_dis
    :return: ndarray of n cluster indices
    """
    # The original sorted each full row (O(k log k)) just to read the first
    # element; argmin gives the same index directly in O(k).
    return np.argmin(np.asarray(dis), axis=1)
def center(data, clusterRes, k):
    """
    Recompute the centroid of each cluster.

    :param data: sample set, shape (n, >=2); only columns 0 and 1 are averaged
    :param clusterRes: cluster index per sample, shape (n,)
    :param k: number of clusters
    :return: new centroids, shape (k, 2)

    NOTE(review): an empty cluster yields a nan centroid (mean of empty slice,
    with a RuntimeWarning) — same outcome as the original sum/0 code. Callers
    relying on the convergence test should be aware of this.
    """
    centroids = []
    for i in range(k):
        # Boolean indexing selects the members of cluster i; avoids shadowing
        # the builtin `sum` as the original did.
        members = data[clusterRes == i]
        centroids.append(members[:, 0:2].mean(axis=0))
    return np.asarray(centroids)
def classfy(data, clu, k):
    """
    Run one k-means update step: assign points, then recompute centroids.

    :param data: sample set
    :param clu: current centroids
    :param k: number of clusters
    :return: (centroid shift, new centroids, k, cluster assignment)
    """
    dist_matrix = cal_dis(data, clu, k)
    print('clulist=', dist_matrix)
    assignment = divide(data, dist_matrix)
    print('clusterRes=', assignment)
    new_centroids = center(data, assignment, k)
    # The per-coordinate shift drives the caller's convergence test.
    shift = new_centroids - clu
    return shift, new_centroids, k, assignment
def plotRes(data, clusterRes, clusterNum):
    """
    Visualize the clustering result, one color per cluster.

    :param data: sample set, shape (n, >=2)
    :param clusterRes: cluster index per sample
    :param clusterNum: number of clusters
    :return: None (shows a matplotlib window)
    """
    scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
    n = len(data)
    for label in range(clusterNum):
        # Cycle through the palette if there are more clusters than colors.
        color = scatterColors[label % len(scatterColors)]
        xs = [data[j, 0] for j in range(n) if clusterRes[j] == label]
        ys = [data[j, 1] for j in range(n) if clusterRes[j] == label]
        plt.scatter(xs, ys, c=color, alpha=1, marker='+')
    plt.show()
if __name__ == '__main__':
    k = 5  # number of clusters
    data = load_data()
    print(data)
    # Pick k random sample points (x, y only) as the initial centroids.
    clu = random.sample(data[:, 0:2].tolist(), k)
    print('clu=', clu)
    clu = np.asarray(clu)
    err, clunew, k, clusterRes = classfy(data, clu, k)
    # Iterate until no centroid moved between two consecutive steps.
    while np.any(abs(err) > 0):
        print(clunew)
        err, clunew, k, clusterRes = classfy(data, clunew, k)
        print('err=', err)
    # Mark the converged centroids on the plot.
    plt.plot(clunew[:, 0], clunew[:, 1], 'o', color='red')
    clulist = cal_dis(data, clunew, k)
    clusterResult = divide(data, clulist)
    print('clusterResult', clusterResult)
    # Column 2 of the data file holds the ground-truth label — TODO confirm
    # for each dataset.
    nmi, acc, purity = eva.eva(clusterResult, np.asarray(data[:, 2]))
    print(nmi, acc, purity)
    plotRes(data, clusterResult, k)
evaluate.py:
import math
import numpy as np
def eva(A, B):
    """
    Evaluate a clustering against ground truth.

    :param A: predicted cluster labels (1-D numpy array)
    :param B: ground-truth labels (1-D numpy array, same length)
    :return: (NMI, acc, purity)

    NOTE(review): `acc` counts only samples whose predicted label id equals
    the ground-truth id literally (idA == idB) — it assumes the label ids are
    already aligned between A and B; verify against the caller.
    """
    total = len(A)  # number of samples
    A_ids = set(A)
    B_ids = set(B)
    MI = 0
    eps = 1.4e-45  # keeps log() finite when a joint probability is 0
    acc = 0
    purity = 0
    for idA in A_ids:
        # Hoisted out of the inner loop: both are invariant in idB.
        idAOccur = np.where(A == idA)
        px = 1.0 * len(idAOccur[0]) / total
        best_overlap = 0
        for idB in B_ids:
            idBOccur = np.where(B == idB)
            idABOccur = np.intersect1d(idAOccur, idBOccur)
            py = 1.0 * len(idBOccur[0]) / total
            pxy = 1.0 * len(idABOccur) / total
            MI = MI + pxy * math.log(pxy / (px * py) + eps, 2)  # mutual information
            if idA == idB:
                acc = acc + pxy  # accuracy
            best_overlap = max(best_overlap, len(idABOccur))
        # BUGFIX: the original added a term to `purity` every time a new
        # maximum overlap was found inside the inner loop, over-counting
        # whenever a larger overlap appeared after a smaller one. Purity is
        # the sum over clusters of only the LARGEST overlap with any true
        # class, divided by the sample count.
        purity = purity + 1.0 * best_overlap / total
    # Normalized mutual information: NMI = 2*MI / (H(A) + H(B)).
    Hx = 0
    for idA in A_ids:
        idAOccurCount = 1.0 * len(np.where(A == idA)[0])
        Hx = Hx - (idAOccurCount / total) * math.log(idAOccurCount / total + eps, 2)
    Hy = 0
    for idB in B_ids:
        idBOccurCount = 1.0 * len(np.where(B == idB)[0])
        Hy = Hy - (idBOccurCount / total) * math.log(idBOccurCount / total + eps, 2)
    NMI = 2.0 * MI / (Hx + Hy)
    return NMI, acc, purity
k等于2:
k等于3:
k等于4:
k==5:
k==7:
另一个数据集:
另一个数据集:
另一个数据集:
另一个数据集:
通过上述结果展示,我们可以看到K-means算法有很多明显的缺陷:
1. K-means算法在多种不同情况下的聚类表现得并不太好。我们可以看到K-means得到的簇更偏向于球形,这意味着K-means算法不能处理非球形簇的聚类问题,而现实中数据的分布情况是十分复杂的,所以K-means算法不太适用于现实大多数情况。
2. K-means算法需要预先确定聚类的个数。
3. K-means算法对初始选取的聚类中心点敏感。我们可以看到在第一个数据聚类中,K-means算法会把不同簇聚合在一起来满足球形簇的聚类,而这种情况下得到的中心点在两个簇中间。这说明此时K-means陷入了一个局部最优解,而陷入局部最优的一个原因是初始化中心点的位置不太好。