要求:不使用 scikit-learn 等机器学习框架,手动实现 k-means 聚类。
数据使用的是 UCI 机器学习数据库中的 Iris(鸢尾花)数据集。
iris数据
# -*- coding: utf-8 -*-
import random
import matplotlib.pyplot as plt
def _load_iris_columns(path):
    """Parse the data file at ``path`` into four feature columns and names.

    The file is read as one stream of tokens separated by whitespace and/or
    commas. Every five consecutive tokens form a record: four numeric
    features followed by a class name. A trailing incomplete record is
    ignored.

    Args:
        path: Location of the text file to read.

    Returns:
        A tuple ``(col1, col2, col3, col4, names)``. The first four items
        are lists of floats (one per record) and ``names`` is the list of
        class-name strings.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If a token in a feature position is not a number.
    """
    tokens = []
    # The context manager closes the file even when reading fails.
    with open(path) as handle:
        for line in handle:
            for chunk in line.split():
                tokens.extend(chunk.split(","))

    # Drop a trailing partial record so every column has the same length.
    usable = len(tokens) - len(tokens) % 5
    features = [
        [float(value) for value in tokens[column:usable:5]]
        for column in range(4)
    ]
    names = tokens[4:usable:5]
    return features[0], features[1], features[2], features[3], names


data1, data2, data3, data4, name = _load_iris_columns(
    r"C:\Users\65465\Documents\data\iris.txt"
)
def ran(data1, k):
    """Return ``k`` random indices that are valid positions in ``data1``.

    Indices are drawn independently, so the same index may appear more
    than once.

    Args:
        data1: Sequence whose length bounds the generated indices.
        k: Number of indices to draw; a value <= 0 yields an empty list.

    Returns:
        A list of ``k`` integers, each in ``range(len(data1))``.

    Raises:
        ValueError: If ``k`` is positive but ``data1`` is empty.
    """
    size = len(data1)
    if k > 0 and size == 0:
        raise ValueError("cannot draw indices from an empty sequence")
    # randrange excludes the upper bound; randint(0, size) could return
    # ``size`` itself, which is one past the last valid index.
    return [random.randrange(size) for _ in range(k)]
def _assign_clusters(xs, ys, centroids):
    """Return, for every point, the index of its nearest centroid."""
    labels = []
    for x, y in zip(xs, ys):
        # Squared distance is sufficient for picking the minimum; on a tie
        # ``min`` keeps the lowest cluster index.
        nearest = min(
            range(len(centroids)),
            key=lambda j: (x - centroids[j][0]) ** 2 + (y - centroids[j][1]) ** 2,
        )
        labels.append(nearest)
    return labels


def _update_centroids(xs, ys, labels, centroids):
    """Return each cluster's mean point; an empty cluster keeps its centroid."""
    k = len(centroids)
    sum_x = [0.0] * k
    sum_y = [0.0] * k
    counts = [0] * k
    for x, y, label in zip(xs, ys, labels):
        sum_x[label] += x
        sum_y[label] += y
        counts[label] += 1
    return [
        (sum_x[j] / counts[j], sum_y[j] / counts[j]) if counts[j] else centroids[j]
        for j in range(k)
    ]


def _plot_clusters(xs, ys, labels, k):
    """Draw one scatter series per cluster and display the figure."""
    # The first three entries reproduce the original red/green/blue styling;
    # the palettes have different lengths so later clusters stay distinct.
    colors = ('red', 'green', 'blue', 'orange', 'purple', 'cyan', 'magenta')
    markers = ('o', '*', '+', 'x', 's', '^')
    for j in range(k):
        plt.scatter(
            [x for x, label in zip(xs, labels) if label == j],
            [y for y, label in zip(ys, labels) if label == j],
            c=colors[j % len(colors)],
            marker=markers[j % len(markers)],
            label='Cluster{}'.format(j + 1),
        )
    plt.xlabel('data1')
    plt.ylabel('data2')
    plt.legend()
    plt.show()


def kmeans(k, data1, data2, *, init=None, max_iter=100, show=True):
    """Cluster 2-D points into ``k`` groups with the k-means algorithm.

    Each round assigns every point to its nearest centroid and then moves
    every centroid to the mean of its points. Iteration stops as soon as
    the assignment no longer changes, or after ``max_iter`` rounds.

    Args:
        k: Number of clusters; must be at least 1.
        data1: x coordinates of the points.
        data2: y coordinates of the points; same length as ``data1``.
        init: Optional sequence of ``k`` indices into the data that select
            the starting centroids. When omitted the indices come from
            ``ran(data1, k)``.
        max_iter: Maximum number of assign/update rounds; at least 1.
        show: When true (the default), plot the clusters with matplotlib.

    Returns:
        A tuple ``(centroids, labels)``: ``centroids`` is a list of ``k``
        ``(x, y)`` tuples and ``labels[i]`` is the cluster index assigned
        to point ``i``.

    Raises:
        ValueError: If ``k`` or ``max_iter`` is below 1, the data is empty,
            the coordinate lists differ in length, or ``init`` does not
            contain exactly ``k`` indices.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")
    if len(data1) != len(data2):
        raise ValueError("data1 and data2 must have the same length")
    if len(data1) == 0:
        raise ValueError("cannot cluster an empty data set")

    seeds = ran(data1, k) if init is None else list(init)
    if len(seeds) != k:
        raise ValueError("init must contain exactly k indices")
    centroids = [(data1[s], data2[s]) for s in seeds]

    labels = None
    for _ in range(max_iter):
        new_labels = _assign_clusters(data1, data2, centroids)
        if new_labels == labels:
            # Stable assignment: the centroids would not move any more.
            break
        labels = new_labels
        centroids = _update_centroids(data1, data2, labels, centroids)

    if show:
        _plot_clusters(data1, data2, labels, k)
    return centroids, labels
# Run k-means on the first two feature columns loaded above.
k = 3 # number of clusters k
kmeans(k,data1,data2)
运行结果如图:
可以看出聚类效果较好。