K均值(K-means)聚类的作用:
对于给定的数据集,通过欧氏距离将数据点聚成若干类
假定对给定的数据集分成3类
算法步骤
1 读取数据集,将数据集读取成矩阵
2 随机在矩阵中找三个点A,B, C作为中心点
3 计算每一个点到三个中心点的距离,将该点归到距离最近的中心点所在的类,最终分成A,B,C三类
4 将已归好的三类求每类的平均值作为新的中心点记为A1,B1,C1
5 重复步骤3与4,直至这3类的划分不再发生改变
数据集文件
链接:https://pan.baidu.com/s/1uBya2epJuiqYC5K8DSkImw
提取码:4y28
代码如下
import csv
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # 空间三维画图
import time
# Record the wall-clock start time; the elapsed time is printed near the end
# of the script, just before the plots are shown.
start = time.time()
# Read a CSV file.
# Variant for UTF-8 encoded files.
def readCSV_UTF8(path):
    """Return every record of the UTF-8 CSV file at *path*.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A list with one entry per CSV record (the first row included);
        each entry is a list of the record's fields as strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are translated before the parser sees them.
    with open(path, "r", encoding='UTF-8', newline='') as file:
        # A comprehension is used rather than list(...) because this script
        # rebinds the module-level name `list` to the loaded data.
        return [row for row in csv.reader(file)]
# Variant for GBK encoded files.
def readCSV_gbk(path):
    """Return every record of the GBK-encoded CSV file at *path*.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A list with one entry per CSV record (the first row included);
        each entry is a list of the record's fields as strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid GBK.
    """
    # The encoding is stated explicitly: relying on the platform default only
    # decodes GBK correctly on systems whose locale happens to be GBK.
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires.
    with open(path, "r", encoding='gbk', newline='') as file:
        # A comprehension is used rather than list(...) because this script
        # rebinds the module-level name `list` to the loaded data.
        return [row for row in csv.reader(file)]
# Load the data set; at this point every field is still a string.
# NOTE: the name `list` shadows the builtin. It is kept because the rest of
# the script refers to the loaded rows by this name.
list = readCSV_UTF8("iris.csv")
del list[0]  # discard the first row (presumably the header line)
# Keep only the first four columns of each row and convert them to floats so
# they can be used in arithmetic.
list = [[float(field) for field in row[:4]] for row in list]
data = np.array(list)  # the same values as a NumPy matrix
def distE(zhidian, list):  # distance helper used to compare points with centres
    """Return the squared Euclidean distance between *zhidian* and *list*.

    No square root is taken, so the value is the sum of the squared
    per-coordinate differences.

    Args:
        zhidian: Coordinates of the first point (the centre in this script).
        list: Coordinates of the second point.

    Returns:
        The sum of squared coordinate differences, or ``False`` when the two
        sequences differ in length.
    """
    if len(zhidian) != len(list):
        return False
    return sum(
        (coord - centre_coord) ** 2
        for centre_coord, coord in zip(zhidian, list)
    )
# Clustering step: assign every point to its nearest centre.
def cluster(x, y, z, list):
    """Split the points in *list* into three groups by nearest centre.

    Distances are obtained from ``distE``. Ties are resolved in favour of the
    earlier centre: *x* before *y* before *z*.

    Args:
        x: Centre of the first group.
        y: Centre of the second group.
        z: Centre of the third group.
        list: Iterable of points to assign.

    Returns:
        A tuple ``(list1, list2, list3)``. Each list begins with its own
        centre (*x*, *y* and *z* respectively), followed by the points
        assigned to that centre in input order.
    """
    # Every group is seeded with its centre, so no group is ever empty.
    # NOTE(review): because of this seed, the centre itself is part of the
    # returned group and is averaged/plotted by callers along with the real
    # points - confirm this is intended.
    list1, list2, list3 = [x], [y], [z]
    for point in list:
        # Compute each distance once instead of re-evaluating distE inside
        # every comparison.
        dist_x = distE(x, point)
        dist_y = distE(y, point)
        dist_z = distE(z, point)
        if dist_x <= dist_y and dist_x <= dist_z:
            list1.append(point)
        elif dist_y <= dist_x and dist_y <= dist_z:
            list2.append(point)
        else:
            list3.append(point)
    return list1, list2, list3
# Centroid computation.
def centre(a, b, c):
    """Return the per-column mean of each of the three groups.

    Args:
        a: Points of the first group (sequence of equal-length rows).
        b: Points of the second group.
        c: Points of the third group.

    Returns:
        A tuple of three plain Python lists, one centroid per group, in the
        order (a, b, c).
    """
    centroids = []
    for members in (a, b, c):
        matrix = np.array(members)
        # Column-wise sum divided by the number of rows gives the mean point.
        mean_point = np.sum(matrix, axis=0) / len(matrix)
        centroids.append(mean_point.tolist())
    return tuple(centroids)
# Initial assignment: three rows of the data set act as the starting centres.
a, b, c = cluster(list[1], list[50], list[149], list)
centre1, centre2, centre3 = centre(a, b, c)

# Iterate: reassign the points to the current centres, then recompute the
# centres. Stop as soon as the centres no longer change, with a hard cap on
# the number of rounds. Once the centres repeat exactly, every further round
# would reproduce the same groups, so leaving early does not alter the result.
MAX_ITERATIONS = 100
m, n, p = centre1, centre2, centre3
for _ in range(MAX_ITERATIONS):
    list1, list2, list3 = cluster(m, n, p, list)
    previous_centres = (m, n, p)
    m, n, p = centre(list1, list2, list3)
    if (m, n, p) == previous_centres:
        break

array1 = np.array(list1)
array2 = np.array(list2)
array3 = np.array(list3)

# Visualise the clusters: the first three columns are the x/y/z coordinates
# and the marker size is scaled by the fourth column.
fig = plt.figure()
# add_subplot(..., projection='3d') attaches the 3D axes to the figure;
# constructing Axes3D(fig) directly no longer does so in current matplotlib.
ax = fig.add_subplot(111, projection='3d')
for members, colour in ((array1, 'r'), (array2, 'b'), (array3, 'y')):
    ax.scatter(
        members[:, 0],
        members[:, 1],
        members[:, 2],
        c=colour,
        marker='o',
        s=100 * members[:, 3],
    )

# Second figure: the raw, unclustered data for comparison.
x0 = data[:, 0]
y0 = data[:, 1]
z0 = data[:, 2]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x0, y0, z0)

# Report the elapsed time before blocking on the plot windows.
end = time.time()
print(end - start)
plt.show()