# Uses a plain-text (.txt) file as the dataset for testing.
import numpy as np
import operator
import matplotlib.pyplot as plt
def mindistance(X, Y):
    """Return the index of the center in ``Y`` that lies closest to ``X``.

    Distance is the Euclidean norm of the coordinate difference.

    Args:
        X: A single sample point (array-like of coordinates).
        Y: A sequence of cluster centers, each with the same shape as ``X``.

    Returns:
        The integer index into ``Y`` of the nearest center. Ties resolve to
        the lowest index, and ``0`` is returned when ``Y`` is empty.
    """
    point = np.asarray(X, dtype=float)
    best_index = 0
    best_dist = float('inf')
    for index, center in enumerate(Y):
        dist = np.linalg.norm(point - np.asarray(center, dtype=float))
        # Strict "<" keeps the first of several equidistant centers.
        if dist < best_dist:
            best_index = index
            best_dist = dist
    return best_index
def k_means(k, Cluster, examples, C):
    """Run one k-means pass: assign every sample, then recompute the centers.

    Args:
        k: Number of clusters to partition the samples into.
        Cluster: Mutable sequence holding the current cluster centers. Its
            first ``k`` entries are replaced in place with the new centers.
        examples: Sequence of sample points (NumPy arrays of equal shape).
        C: Ignored. Kept only so existing callers that pass the previous
            assignment keep working.

    Returns:
        A list of ``k`` lists; entry ``i`` holds the samples (the same
        objects as in ``examples``, in their original order) assigned to
        center ``i``.
    """
    # Snapshot the centers so every sample is assigned against the same set.
    centers = list(Cluster[:k])
    assignments = [[] for _ in range(k)]
    for sample in examples:
        assignments[mindistance(sample, centers)].append(sample)

    # Update the center points.
    for index, members in enumerate(assignments):
        # An empty cluster has no mean; keep its previous center instead of
        # dividing by zero and turning the center into NaN.
        if members:
            Cluster[index] = np.mean(members, axis=0)
    return assignments
def dataload(filename):
    """Load 2-D sample points from a whitespace-delimited text file.

    Each non-blank line must start with at least two numeric fields; only
    the first two are used. Fields may be separated by any run of spaces or
    tabs.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of NumPy arrays, one ``[x, y]`` float array per data line.

    Raises:
        ValueError: If a non-blank line has fewer than two fields or a field
            cannot be parsed as a float.
    """
    X = []
    with open(filename, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # split() with no argument handles tabs and repeated spaces.
            fields = line.split()
            if not fields:
                continue  # skip blank lines (e.g. a trailing newline)
            if len(fields) < 2:
                raise ValueError(
                    f'{filename}:{line_number}: expected at least 2 values, '
                    f'got {len(fields)}'
                )
            X.append(np.array([float(value) for value in fields[:2]]))
    return X
# Driver: cluster the samples in testSet.txt into three groups and plot them.
dataSet = dataload('testSet.txt')
if len(dataSet) < 3:
    raise ValueError('testSet.txt must contain at least 3 samples')

C = [[] for _ in range(3)]
# Use the first three samples as the initial cluster centers.
Cluster = [dataSet[i] for i in range(3)]

# Repeat the assign/update pass until the centers stop moving. Comparing the
# centers avoids comparing lists of NumPy arrays with "==", which raises
# "truth value of an array is ambiguous" as soon as two clusters have the
# same size but different members.
C1 = k_means(3, Cluster, dataSet, C)
for _ in range(300):  # safety cap in case the centers never settle exactly
    previous_centers = np.array(Cluster, dtype=float)
    C1 = k_means(3, Cluster, dataSet, C1)
    current_centers = np.array(Cluster, dtype=float)
    # Zero tolerances make this an exact comparison; equal_nan lets an
    # undefined (NaN) center count as unchanged so the loop still stops.
    if np.allclose(previous_centers, current_centers,
                   rtol=0.0, atol=0.0, equal_nan=True):
        break

# Report each cluster and collect all samples in cluster order.
picture = []
for i, members in enumerate(C1):
    print(f'第{i + 1}类有{len(members)}个样本,分别是:')
    for sample in members:
        print(sample)
        picture.append(np.array(sample))

# Split every cluster into x / y coordinate lists for plotting.
x_1, x_2, x_3 = ([point[0] for point in members] for members in C1)
y_1, y_2, y_3 = ([point[1] for point in members] for members in C1)
plt.scatter(x_1, y_1, color='r', label='first kind')
plt.scatter(x_2, y_2, color='y', label='second kind')
plt.scatter(x_3, y_3, color='b', label='third kind')
plt.legend()
plt.show()
# Supply your own dataset file; this example partitions the samples into three clusters.