1.实验要求
使用k_means.txt作为数据集,编写程序实现K-均值聚类算法,要求定义欧氏距离作为距离函数,K为4,随机初始化。聚类后对结果进行可视化,使用不同颜色标记聚类结果
2.实验步骤
3.代码实现
1.计算欧氏距离,封装为函数
# Compute the Euclidean distance between two points.
def calEuclidean(x, y):
    # np.ndarray supports elementwise vector/matrix arithmetic; np.square
    # squares each element (not "averages" — sum of squares, then sqrt).
    dist = np.sqrt(np.sum(np.square(x-y)))
    return dist
2.绘制数据分布图,观察数据分布
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
plt.figure(num=1)
# Scatter-plot every sample point in black to inspect the raw distribution.
plt.scatter(data_set[:, 0], data_set[:, 1], color='k')
plt.title("数据分布图")
plt.show()
3.随机选择4个点作为初始质心
K = 4
# Build a 1-based index array for the samples; NOTE(review): `label` is
# computed here but never used later in the program.
temp = list(range(1, len(data_set)+1))
label = np.array(temp)
sample_list = [i for i in range(len(data_set))]
sample_list = random.sample(sample_list, K)  # randomly pick K=4 distinct sample indices
zhixin = data_set[sample_list, :]  # "zhixin" = centroid: the K chosen points
label = label[sample_list]
4.初始化4个簇
初始化cluster数组,cluster1~cluster4分别对应4个簇。
# Seed each cluster with its own centroid point. NOTE(review): the centroid
# is itself a data point, so it gets appended a second time during the
# assignment pass below — the seed point is double-counted.
cluster1 = np.array(zhixin[0])
cluster2 = np.array(zhixin[1])
cluster3 = np.array(zhixin[2])
cluster4 = np.array(zhixin[3])
5.计算每个点到4个初始质心的距离
j = 0
dist = np.zeros([80, 4])  # distance matrix: 80 points x 4 centroids (80 is hard-coded to this data set's size)
for i in data_set:
    dist[j][0] = calEuclidean(i, zhixin[0])
    dist[j][1] = calEuclidean(i, zhixin[1])
    dist[j][2] = calEuclidean(i, zhixin[2])
    dist[j][3] = calEuclidean(i, zhixin[3])
    # Assign the point to the cluster whose centroid is nearest.
    min_dist = min(dist[j])
    if min_dist == dist[j][0]:
        cluster1 = np.append(cluster1, i)  # np.append flattens — see reshape below
    elif min_dist == dist[j][1]:
        cluster2 = np.append(cluster2, i)
    elif min_dist == dist[j][2]:
        cluster3 = np.append(cluster3, i)
    else:
        cluster4 = np.append(cluster4, i)
    j = j + 1
# np.append produced flat 1-D arrays; reshape back to (n, 2) rows of points.
cluster1 = np.reshape(cluster1, [-1, 2])
cluster2 = np.reshape(cluster2, [-1, 2])
cluster3 = np.reshape(cluster3, [-1, 2])
cluster4 = np.reshape(cluster4, [-1, 2])
由于向cluster append数据点后cluster不会自动添加为新的一行,所以需要reshape来实现向cluster加入一行元素,其中[-1,2]的2表示2列,-1表示行数未知,由数组而定。
6.绘制初始分类图
# Plot each initial cluster in its own color; centroids in black.
plt.scatter(cluster1[:, 0], cluster1[:, 1], color='red')
plt.scatter(cluster2[:, 0], cluster2[:, 1], color='blue')
plt.scatter(cluster3[:, 0], cluster3[:, 1], color='yellow')
plt.scatter(cluster4[:, 0], cluster4[:, 1], color='purple')
plt.scatter(zhixin[:, 0], zhixin[:, 1], color='k')
plt.title("初始分类图")
plt.show()
7.循环
重新计算每个簇的质心,再根据新的质心重新划分各点,直到每个簇的质心不再改变,或循环次数超过 loop_limit;两个停止条件满足其一即可终止循环。
status = 1  # loop flag: 1 = keep iterating, 0 = converged or limit reached
temp_zhixin = np.zeros([4, 2])  # holds the previous iteration's centroids
loop = 0
loop_limit = 1000  # maximum number of iterations
while status:
    # Save the current centroids so convergence can be tested below.
    temp_zhixin[0] = zhixin[0]
    temp_zhixin[1] = zhixin[1]
    temp_zhixin[2] = zhixin[2]
    temp_zhixin[3] = zhixin[3]
    # Recompute each cluster's centroid as the mean of its member points.
    zhixin[0] = np.mean(cluster1, axis=0)
    zhixin[1] = np.mean(cluster2, axis=0)
    zhixin[2] = np.mean(cluster3, axis=0)
    zhixin[3] = np.mean(cluster4, axis=0)
    j = 0
    # Clear each cluster: deleting both columns leaves an empty (n, 0) array,
    # so the np.append calls below effectively start from scratch.
    cluster1 = np.delete(cluster1, [0, 1], axis=1)
    cluster2 = np.delete(cluster2, [0, 1], axis=1)
    cluster3 = np.delete(cluster3, [0, 1], axis=1)
    cluster4 = np.delete(cluster4, [0, 1], axis=1)
    # Reassign every point to its nearest (updated) centroid.
    for i in data_set:
        dist[j][0] = calEuclidean(i, zhixin[0])
        dist[j][1] = calEuclidean(i, zhixin[1])
        dist[j][2] = calEuclidean(i, zhixin[2])
        dist[j][3] = calEuclidean(i, zhixin[3])
        min_dist = min(dist[j])
        if min_dist == dist[j][0]:
            cluster1 = np.append(cluster1, i)
        elif min_dist == dist[j][1]:
            cluster2 = np.append(cluster2, i)
        elif min_dist == dist[j][2]:
            cluster3 = np.append(cluster3, i)
        else:
            cluster4 = np.append(cluster4, i)
        j = j + 1
    # np.append flattens, so reshape each cluster back to (n, 2).
    cluster1 = np.reshape(cluster1, [-1, 2])
    cluster2 = np.reshape(cluster2, [-1, 2])
    cluster3 = np.reshape(cluster3, [-1, 2])
    cluster4 = np.reshape(cluster4, [-1, 2])
    loop = loop + 1
    # Stop once the centroids did not move this iteration...
    if np.array_equal(temp_zhixin, zhixin):
        status = 0
    # ...or once the iteration cap is exceeded.
    if loop > loop_limit:
        status = 0
关于numpy.delete()
numpy中的delete函数有三个参数:
numpy.delete(arr, obj, axis)
arr:需要处理的矩阵
obj:在什么位置处理
axis:可选参数,取值为 None、0 或 1。axis = 0 时按行删除,axis = 1 时按列删除。所有操作都是在 arr 的副本上进行的,因此需要用变量接收返回值。
如果想要删除多行或多列,则第二个参数使用列表即可定义要删除的行或列
关于np.array_equal()
官方解释:
Parameters: 参数
a1, a2 : array_like
Input arrays.Returns: 返回值
b : bool
Returns True if the arrays are equal.样例1,两个np类型数组:
np.array_equal(np.array([1, 2]), np.array([1, 2])) # True
样例2,两个列表:
np.array_equal([1, 2], [1, 2]) # True
样例3,一个np数组和一个列表:
np.array_equal(np.array([1, 2]), [1, 2]) #True
样例4:
np.array_equal([1, 2], [1, 2, 3]) #False
8.绘制最终结果图
# Plot the final clustering result, one color per cluster; centroids in black.
plt.scatter(cluster1[:, 0], cluster1[:, 1], color='red')
plt.scatter(cluster2[:, 0], cluster2[:, 1], color='blue')
plt.scatter(cluster3[:, 0], cluster3[:, 1], color='yellow')
plt.scatter(cluster4[:, 0], cluster4[:, 1], color='purple')
plt.scatter(zhixin[:, 0], zhixin[:, 1], color='k')
plt.title("聚类结果图")
plt.show()
9.数据集 k_means.txt
1.658985,4.285136
-3.453687,3.424321
4.838138,-1.151539
-5.379713,-3.362104
0.972564,2.924086
-3.567919,1.531611
0.450614,-3.302219
-3.487105,-1.724432
2.668759,1.594842
-3.156485,3.191137
3.165506,-3.999838
-2.786837,-3.099354
4.208187,2.984927
-2.123337,2.943366
0.704199,-0.479481
-0.392370,-3.963704
2.831667,1.574018
-0.790153,3.343144
2.943496,-3.357075
-3.195883,-2.283926
2.336445,2.875106
-1.786345,2.554248
2.190101,-1.906020
-3.403367,-2.778288
1.778124,3.880832
-1.688346,2.230267
2.592976,-2.054368
-4.007257,-3.207066
2.257734,3.387564
-2.679011,0.785119
0.939512,-4.023563
-3.674424,-2.261084
2.046259,2.735279
-3.189470,1.780269
4.372646,-0.822248
-2.579316,-3.497576
1.889034,5.190400
-0.798747,2.185588
2.836520,-2.658556
-3.837877,-3.253815
2.096701,3.886007
-2.709034,2.923887
3.367037,-3.184789
-2.121479,-4.232586
2.329546,3.179764
-3.284816,3.273099
3.091414,-3.815232
-3.762093,-2.432191
3.542056,2.778832
-1.736822,4.241041
2.127073,-2.983680
-4.323818,-3.938116
3.792121,5.135768
-4.786473,3.358547
2.624081,-3.260715
-4.009299,-2.978115
2.493525,1.963710
-2.513661,2.642162
1.864375,-3.176309
-3.171184,-3.572452
2.894220,2.489128
-2.562539,2.884438
3.491078,-3.947487
-2.565729,-2.012114
3.332948,3.983102
-1.616805,3.573188
2.280615,-2.559444
-2.651229,-3.103198
2.321395,3.154987
-1.685703,2.939697
3.031012,-3.620252
-4.599622,-2.185829
4.196223,1.126677
-2.133863,3.093686
4.668892,-2.562705
-2.793241,-2.149706
2.884105,3.043438
-2.967647,2.848696
4.479332,-1.764772
-4.905566,-2.911070
4.完整代码
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

# Load the sample points: one "x,y" pair per line, no header row.
# data_set becomes an (N, 2) float ndarray (80 points in the provided file).
data_set = pd.read_csv('k_means.txt', header=None).values
# Euclidean (L2) distance between two points.
def calEuclidean(x, y):
    """Return the Euclidean distance between points x and y.

    Both arguments are expected to be numeric ndarrays of equal shape.
    """
    diff = x - y
    return np.sqrt((diff * diff).sum())
def K_means(K=4):
    """Cluster the module-level `data_set` with K-means and plot the result.

    Uses Euclidean distance and random initial centroids; iterates until the
    centroids stop moving or a safety cap is hit. Draws three figures: the
    raw distribution, the initial partition, and the final clustering.

    Args:
        K: number of clusters (default 4, matching the assignment spec).

    Fixes vs. the original: the distance matrix size is no longer hard-coded
    to 80; the unused `temp`/`label` arrays are gone; centroid points are no
    longer double-counted in the initial clusters; an empty cluster keeps its
    old centroid instead of producing NaN from an empty mean; nearest-centroid
    dispatch uses argmin instead of float-equality chains.
    """
    # Matplotlib setup so Chinese labels and the minus sign render correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Figure 1: raw data distribution in black.
    plt.figure(num=1)
    plt.scatter(data_set[:, 0], data_set[:, 1], color='k')
    plt.title("数据分布图")

    n = len(data_set)
    # Randomly pick K distinct sample points as the initial centroids.
    # .copy() so centroid updates never alias rows of data_set.
    zhixin = data_set[random.sample(range(n), K), :].copy()
    plt.scatter(zhixin[:, 0], zhixin[:, 1], color='r')

    # Initial assignment: each point joins its nearest centroid's cluster.
    labels = _assign_clusters(data_set, zhixin)

    # Figure 2: initial partition.
    _plot_clusters(data_set, labels, zhixin, 2, "初始分类图")

    loop_limit = 1000  # safety cap on the number of iterations
    for _ in range(loop_limit):
        old_zhixin = zhixin.copy()
        # Recompute each centroid as the mean of its members; keep the old
        # centroid when a cluster is empty (avoids NaN from an empty mean).
        for k in range(K):
            members = data_set[labels == k]
            if len(members) > 0:
                zhixin[k] = members.mean(axis=0)
        labels = _assign_clusters(data_set, zhixin)
        # Converged: no centroid moved this iteration.
        if np.array_equal(old_zhixin, zhixin):
            break

    # Figure 3: final clustering result.
    _plot_clusters(data_set, labels, zhixin, 3, "聚类结果图")
    plt.show()


def _assign_clusters(points, centroids):
    """Return an int array giving each point's nearest-centroid index."""
    dist = np.array([[calEuclidean(p, c) for c in centroids] for p in points])
    return dist.argmin(axis=1)


def _plot_clusters(points, labels, centroids, fig_num, title):
    """Scatter-plot each cluster in its own color, centroids in black."""
    colors = ['red', 'blue', 'yellow', 'purple',
              'green', 'orange', 'cyan', 'magenta']  # cycled if K > 8
    plt.figure(num=fig_num)
    for k in range(len(centroids)):
        members = points[labels == k]
        plt.scatter(members[:, 0], members[:, 1], color=colors[k % len(colors)])
    plt.scatter(centroids[:, 0], centroids[:, 1], color='k')
    plt.title(title)
if __name__ == '__main__':
    # Run the clustering demo only when executed as a script.
    K_means()