影响 Kmeans 算法的可能因素:
• 如何选择距离?
对数据点进行类别划分时,需要计算点到点之间的距离。距离有很多种。例如欧式距离 $d_2(X,Y)=\sqrt{(x_1-y_1)^2+(x_2-y_2)^2}$,其中 $X=(x_1,x_2)$ 和 $Y=(y_1,y_2)$ 是两个不同的点;L1 范数距离 $d_1(X,Y)=|x_1-y_1|+|x_2-y_2|$。
• 如何确定合理的 K 值?K-Means算法之K值的选择 – 标点符
K-means聚类最优k值的选取_qq_15738501的博客-CSDN博客_轮廓系数判断k的最优
• 如何选择 K 个初始类簇的中心点?
1、随机选择K个点作为初始的类簇中心点,但是该方法在有些情况下的效果较差
2、选择彼此距离尽可能远的K个点
首先随机选择一个点作为第一个初始类簇中心点,然后选择距离该点最远的那个点作为第二个初始类簇中心点,然后再选择距离前两个点的最近距离最大的点作为第三个初始类簇的中心点,以此类推,直至选出K个初始类簇中心点。
3、选用层次聚类或者Canopy算法进行初始聚类,然后利用这些类簇的中心点作为KMeans算法初始类簇中心点
中心选取方式一:从x-y中选取min、max
- x = np.random.randint(min(data[:,0].tolist()),max(data[:,0].tolist()), size=(K,1))
- y = np.random.randint(min(data[:,1].tolist()),max(data[:,1].tolist()), size=(K,1))
- ctrs = np.hstack((x,y))
中心选取方式二:随机产生中心点
- #ctrs = np.random.randint(-10,20, size=(K, 2))
中心选取方式三:观察数据后自定义
- #ctrs = [[0,0],[6,-6],[13,0],[8,5],[6,0],[13,13],[-6,6],[8,-5],[0,-6],[2,2]]
# Dependencies: NumPy for the math, matplotlib for plotting,
# sys for sys.maxsize used in the nearest-center search below.
import numpy as np
import matplotlib.pyplot as plt
import sys

# Data set: one 2-D sample point per row.
data = np.loadtxt("./Lab4.dat")
def Kmeans(X, K):
    """Cluster the rows of X into K groups with randomly initialised centers.

    X : (N, 2) array of data points.
    K : number of clusters.
    Returns (ctrs, cidx): the (K, 2) center array and the (N, 1) label array.
    """
    cidx = np.zeros((X.shape[0], 1), dtype=np.uint8)
    # BUG FIX: initialise the centers from the argument X, not the global
    # `data`, so the function works on any data set passed to it.
    x = np.random.randint(X[:, 0].min(), X[:, 0].max(), size=(K, 1))
    y = np.random.randint(X[:, 1].min(), X[:, 1].max(), size=(K, 1))
    # Cast to float: an integer center array would silently truncate the
    # cluster means computed during the update step.
    ctrs = np.hstack((x, y)).astype(float)
    ctrs, cidx = update_center(cidx, ctrs, X, K)
    return ctrs, cidx
18.
def Distance(p1, p2):
    """Return the Euclidean distance between two points.

    Generalises the original 2-D-only formula: accepts sequences or NumPy
    arrays of any matching dimension.
    """
    return float(np.linalg.norm(np.asarray(p1, dtype=float) - np.asarray(p2, dtype=float)))
21.
def update_center(cidx, ctrs, X, K):
    """Run K-means iterations (assign, then update) until labels stop changing.

    cidx : (N, 1) integer array of current cluster labels (re-derived here).
    ctrs : (K, d) array-like of current cluster centers.
    X    : (N, d) data points.
    K    : number of clusters.
    Returns the converged (ctrs, cidx).
    """
    # Work on a float copy so center updates are not truncated to integers
    # (the caller may pass an int array produced by np.random.randint).
    ctrs = np.asarray(ctrs, dtype=float)
    while True:
        # BUG FIX: the original did `cidx_former = cidx`, which is an alias,
        # not a copy — the convergence test compared cidx with itself and the
        # algorithm always stopped after a single pass.
        cidx_former = cidx.copy()
        # Assignment step: label each point with its nearest center
        # (computed once per point/center pair instead of twice).
        dists = np.linalg.norm(X[:, None, :] - ctrs[None, :, :], axis=2)
        cidx = np.argmin(dists, axis=1).astype(np.uint8).reshape(-1, 1)
        # Update step: move each center to the mean of its members.
        for k in range(K):
            members = X[cidx.flatten() == k]
            if members.shape[0]:
                # BUG FIX: the original divided by (count + 1), biasing every
                # center toward the origin. An empty cluster keeps its center.
                ctrs[k] = members.mean(axis=0)
        if cidx_former.tolist() == cidx.tolist():
            return ctrs, cidx
45.
def calSSE(X):
    """Return the mean squared point-to-center distance for K = 1..8.

    Runs Kmeans once per K and averages the squared distance of every point
    to its assigned center; used to draw the elbow curve.
    """
    N = X.shape[0]
    SSE = []
    for k in range(1, 9):
        ctrs, cidx = Kmeans(X, k)
        total = 0
        for j in range(N):
            total += Distance(X[j], ctrs[cidx[j][0]]) ** 2
        SSE.append(total / N)
    return SSE
57.
58.
# Elbow plot: mean SSE as a function of the number of clusters K.
SSE = calSSE(data)
plt.figure()
plt.plot(range(1, 9), SSE, label='SSE-K', c='purple')
plt.xlabel('K')
plt.ylabel("SSE")
plt.legend()
plt.show()
66.
# Scatter plot of the final clustering with K = 4, one color per cluster.
ctrs, cidx = Kmeans(data, 4)
color_1234 = {0: 'r', 1: 'b', 2: 'g', 3: 'purple', 4: 'y', 5: 'k', 6: 'm'}
plt.figure()
for i in range(4):
    rows = np.where(cidx == i)[0]
    data_1234 = data[rows]
    if data_1234.size:
        plt.scatter(data_1234[:, 0], data_1234[:, 1], label='class:' + str((i + 1)), c=color_1234[i])

plt.legend(loc='upper left')  # show the legend in the upper-left corner
plt.show()
进阶代码:
def Kmeans(X, K):
    """K-means clustering seeded with farthest-first initial centers."""
    ctrs = get_farthest_k_center(X, K)
    labels = np.zeros((X.shape[0], 1), dtype=np.uint8)
    ctrs, labels = update_center(labels, ctrs, X, K)
    return ctrs, labels
def Distance(p1, p2):
    """Return the Euclidean distance between two points.

    BUG FIX: the original hard-coded exactly three components, but the rest
    of this listing (get_farthest_k_center, update_center) produces 2-D
    centers, so indexing p1[2] would raise IndexError. Accept any matching
    dimension instead.
    """
    return float(np.linalg.norm(np.asarray(p1, dtype=float) - np.asarray(p2, dtype=float)))
def update_center(cidx, ctrs, X, K):
    """Run K-means iterations (assign, then update) until labels stop changing.

    cidx : (N, 1) integer array of current cluster labels (re-derived here).
    ctrs : (K, d) array-like of current cluster centers.
    X    : (N, d) data points.
    K    : number of clusters.
    Returns the converged (ctrs, cidx).
    """
    # Float copy so mean updates are not truncated into an integer array.
    ctrs = np.asarray(ctrs, dtype=float)
    while True:
        # BUG FIX: `cidx_former = cidx` in the original was an alias, not a
        # copy, so the convergence comparison was always true-after-one-pass.
        cidx_former = cidx.copy()
        # Assignment step: nearest center per point.
        dists = np.linalg.norm(X[:, None, :] - ctrs[None, :, :], axis=2)
        cidx = np.argmin(dists, axis=1).astype(np.uint8).reshape(-1, 1)
        # Update step: each center moves to the mean of its members.
        for k in range(K):
            members = X[cidx.flatten() == k]
            if members.shape[0]:
                # BUG FIX: the original divided by (count + 1), pulling every
                # center toward the origin; empty clusters are left in place.
                ctrs[k] = members.mean(axis=0)
        if cidx_former.tolist() == cidx.tolist():
            return ctrs, cidx
def cal_Silhouette_Coeff(data):
    """Choose the best K in [2, 6] by the mean silhouette coefficient.

    For each point i: a = mean distance to the other members of its own
    cluster, b = smallest mean distance to the members of any other cluster,
    s(i) = (b - a) / max(a, b).  Returns the K with the highest mean s.
    """
    S_list = []
    n = int(data.shape[0])
    for k in range(2, 7):
        ctrs, cidx = Kmeans(data, k)
        labels = (cidx.flatten()).tolist()
        counts = [labels.count(c) for c in range(k)]
        s = 0.0
        for i in range(n):
            # Total distance from point i to every cluster's members.
            # BUG FIX: the original tested `cidx[j] == cidx[l]`, comparing the
            # labels of two *points* rather than testing whether point l
            # belongs to cluster j.
            tot = [0.0] * k
            for l in range(n):
                tot[labels[l]] += Distance(data[i], data[l])
            own = labels[i]
            if counts[own] < 2:
                continue  # silhouette of a singleton cluster is defined as 0
            # Exclude the zero self-distance from the own-cluster mean.
            a = tot[own] / (counts[own] - 1)
            # BUG FIX: the original popped b[own] and then indexed cidx with
            # argmin(b), confusing list positions with point indices; compute
            # the nearest-other-cluster mean distance directly instead.
            others = [tot[c] / counts[c] for c in range(k) if c != own and counts[c] > 0]
            if not others:
                continue
            b = min(others)
            s += (b - a) / max(a, b)
        s /= data.shape[0]
        S_list.append(s)
    return int(np.argmax(S_list) + 2)
def get_farthest_k_center(data, k):
    """Farthest-first selection of k initial centers (k-means++-style).

    The first center is a random integer point inside the bounding box of
    `data`; every subsequent center is the data point whose distance to its
    nearest already-chosen center is largest.
    Returns a list of k [x, y] centers.
    """
    x0 = int(np.random.randint(data[:, 0].min(), data[:, 0].max()))
    y0 = int(np.random.randint(data[:, 1].min(), data[:, 1].max()))
    # BUG FIX: the original stored the first center as a pair of 1-element
    # arrays (shape (2, 1)), which broke the `data - center` broadcasting
    # below; store every center as a flat [x, y] list instead.
    center = [[x0, y0]]
    while len(center) < k:
        # Squared distance from every point to each already-chosen center.
        sq_dists = [np.sum((data - np.asarray(c)) ** 2, axis=1) for c in center]
        # Per point, the distance to its *nearest* chosen center; the point
        # maximising that distance becomes the next center. (With a single
        # center the original's np.max branch is identical to this np.min.)
        nearest = np.min(sq_dists, axis=0)
        center.append(data[np.argmax(nearest)].flatten().tolist())
    return center