import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
#K = 4 需要的质心数量
# 初始化数据
def init_data(path='/mnt/d/EDA/Cluster/cluster_data.csv'):
    """Load a CSV file and min-max normalise every column.

    Args:
        path: Location of the CSV file handed to ``pandas.read_csv``.
            Defaults to the data set this script was written for.

    Returns:
        A 2-D float ``numpy.ndarray`` with one row per CSV record, where each
        column has been rescaled as ``(x - min) / (max - min)``.  A column
        whose values are all identical is mapped to zeros.
    """
    df = pd.read_csv(path)
    nd = df.to_numpy(dtype=float)

    col_min = np.min(nd, axis=0)
    col_range = np.max(nd, axis=0) - col_min
    # A constant column has a range of 0; dividing by it would produce NaN
    # (0 / 0).  Use 1 instead so that such a column normalises to all zeros.
    col_range[col_range == 0] = 1

    return (nd - col_min) / col_range
# 距离使用欧几里得距离
def euclidean_metric(vector, centre):
    """Return the Euclidean distance between ``vector`` and ``centre``.

    The difference ``vector - centre`` is squared element-wise, summed along
    axis 1 and square-rooted, so the (broadcast) difference must have at
    least two dimensions; the result holds one distance per row of it.
    """
    offset = vector - centre
    squared = np.square(offset)
    row_totals = np.sum(squared, axis=1)
    return np.sqrt(row_totals)
# 确定初始质心
def init_centre(data, K):
    """Pick ``K`` rows of ``data`` at random to serve as initial centroids.

    Args:
        data: 2-D array of samples, one sample per row.
        K: Number of centroids to choose.

    Returns:
        A float array of shape ``(K, data.shape[1])`` whose rows are copies
        of randomly chosen rows of ``data``.  The chosen row indices are
        distinct whenever ``K <= len(data)``.

    Raises:
        ValueError: If ``data`` has no rows and ``K`` is positive.
    """
    n = len(data)
    if K <= n:
        # Distinct indices: choosing the same row twice would start two
        # centroids at the same position.
        indices = random.sample(range(n), K)
    else:
        # More centroids than samples: repeats are unavoidable.  randrange
        # excludes its upper bound (randint(0, n) could return n, which is
        # out of range) and raises ValueError when n == 0.
        indices = [random.randrange(n) for _ in range(K)]

    z = np.zeros((K, data.shape[1]))
    for row, idx in enumerate(indices):
        z[row] = data[idx]
    return z
# 开始计算距离质心最小距离
def _assign_clusters(data, centres):
    """Return, for every row of ``data``, the index of its nearest centre."""
    # One row of distances per sample, one column per centre.
    distances = np.array([euclidean_metric(point, centres) for point in data])
    return np.argmin(distances, axis=1)


def _update_centres(data, labels, centres):
    """Return new centres: the mean of the samples assigned to each one."""
    updated = np.array(centres, dtype=float)
    for i in range(len(updated)):
        members = data[labels == i]
        # The mean of an empty selection is NaN, and a NaN centre never
        # compares equal to itself, so the convergence test would never
        # succeed.  An empty cluster therefore keeps its previous centre.
        if len(members):
            updated[i] = np.mean(members, axis=0)
    return updated


def K_means(K=4, max_iter=100):
    """Run K-means on the data from ``init_data()`` and plot the clusters.

    Initial centres come from ``init_centre``.  Assignment and centre update
    alternate until the centres stop changing or ``max_iter`` updates have
    been performed.  The samples are then drawn as a scatter plot of their
    first two columns, one colour per cluster, and the figure is shown.

    Args:
        K: Number of clusters.
        max_iter: Upper bound on the number of centre updates.

    Returns:
        A tuple ``(centres, labels)``: the final centres as an array with one
        row per cluster, and an array with the cluster index of every sample.
    """
    data = init_data()
    z = np.array(init_centre(data, K), dtype=float)

    labels = _assign_clusters(data, z)
    for _ in range(max_iter):
        new_z = _update_centres(data, labels, z)
        if np.array_equal(new_z, z):
            # Centres did not move: the assignment is stable.
            break
        z = new_z
        labels = _assign_clusters(data, z)

    # Draw each cluster in its own colour; the palette is reused cyclically
    # when K exceeds the number of listed colours.
    colorlist = ['b', 'r', 'g', 'purple']
    plt.figure()
    for i in range(K):
        members = data[labels == i]
        plt.scatter(members[:, 0], members[:, 1],
                    c=colorlist[i % len(colorlist)])
    plt.show()

    return z, labels