一、K-means 是一种用于聚类的算法。聚类的相关知识可以自行查询,也可以进一步了解聚类与分类的区别、监督式与非监督式学习等概念。
二、思路
1. 列出 m 个待聚类的点 (x, y),初始时全部画成黑色;取 K=3,随机给出 3 个初始中心点,并为每个中心点分配一种颜色。
2. 计算每个点到各个中心点的距离,找出离它最近的中心点,并把该点染成与这个中心点相同的颜色,表示它们属于同一类。
3. 更新中心点:把每一类中所有点的坐标平均值作为该类新的中心点。
4. 重复步骤 2、3,直到各点的归类(以及由此得到的中心点)不再变化。
三、代码
可以用 Jupyter 在线运行,有兴趣的话可以把代码复制过去运行体验一下。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Sample data set: 19 two-dimensional points to be grouped into clusters.
df = pd.DataFrame(
    {
        'x': [
            12, 20, 28, 18, 29, 33, 24, 45, 45, 52,
            51, 52, 55, 53, 55, 61, 64, 69, 72,
        ],
        'y': [
            39, 36, 30, 52, 54, 46, 55, 59, 63, 70,
            66, 63, 58, 23, 14, 8, 19, 7, 24,
        ],
    }
)
print(df)

# Number of clusters to look for.
k = 3

# Fix the seed so the random starting centroids are reproducible.
np.random.seed(234)

# Draw one random [x, y] starting centroid per cluster, labelled 1..k.
# The x coordinate is drawn before the y coordinate for every label.
centroids = {}
for label in range(1, k + 1):
    start_x = np.random.randint(0, 80)
    start_y = np.random.randint(0, 80)
    centroids[label] = [start_x, start_y]
print(centroids)

# Plot colour used for each cluster label.
colmap = {1: 'r', 2: 'g', 3: 'b'}

# First figure: every point in black plus the random starting centroids.
plt.scatter(df['x'], df['y'], color='k')
for label, center in centroids.items():
    plt.scatter(center[0], center[1], color=colmap[label])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()
def assignment(df, centroids, color_map=None):
    """Assign every point in ``df`` to its nearest centroid.

    For each centroid a ``distance_from_<label>`` column holding the
    Euclidean distance from every point to that centroid is written to
    ``df``. The label of the nearest centroid is stored in ``closest`` and
    the matching plot colour in ``color``.

    Args:
        df: DataFrame with numeric ``x`` and ``y`` columns. It is modified
            in place.
        centroids: Mapping of integer cluster label to an ``[x, y]`` pair.
        color_map: Optional mapping of cluster label to a plot colour.
            Defaults to the module-level ``colmap``.

    Returns:
        The same ``df`` object, with the distance, ``closest`` and ``color``
        columns added or overwritten.

    Raises:
        KeyError: If a cluster label has no entry in the colour mapping.
    """
    if color_map is None:
        color_map = colmap

    prefix = 'distance_from_'

    # One distance column per centroid.
    distance_columns = []
    for label, center in centroids.items():
        column = '{}{}'.format(prefix, label)
        df[column] = np.sqrt(
            (df['x'] - center[0]) ** 2 + (df['y'] - center[1]) ** 2
        )
        distance_columns.append(column)

    # idxmin returns, per row, the *name* of the column holding the smallest
    # distance (e.g. "distance_from_2").
    nearest_column = df.loc[:, distance_columns].idxmin(axis=1)

    # Recover the label by slicing off the known prefix. str.lstrip is not
    # suitable here: it strips a set of characters, not a prefix.
    df['closest'] = nearest_column.map(lambda name: int(name[len(prefix):]))

    df['color'] = df['closest'].map(lambda label: color_map[label])
    return df
# Label and colour every point by its nearest starting centroid.
df = assignment(df, centroids)
print(df)

# Second figure: points take the colour of their closest centroid and are
# drawn semi-transparent (alpha=0.5) so the solid centroids stand out.
plt.scatter(df['x'], df['y'], color=df['color'], alpha=0.5)
for label, center in centroids.items():
    plt.scatter(center[0], center[1], color=colmap[label])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()
def update(points=None, centers=None):
    """Move each centroid to the mean position of the points assigned to it.

    Args:
        points: DataFrame with ``x``, ``y`` and ``closest`` columns. Defaults
            to the module-level ``df``.
        centers: Mapping of cluster label to a mutable ``[x, y]`` pair; the
            pairs are updated in place. Defaults to the module-level
            ``centroids``.

    Returns:
        None. The centroid pairs are modified in place.
    """
    if points is None:
        points = df
    if centers is None:
        centers = centroids

    for label, center in centers.items():
        # Rows previously assigned to this cluster.
        members = points[points['closest'] == label]
        if members.empty:
            # Averaging an empty selection yields NaN, which would overwrite
            # the centroid's coordinates; keep the old position instead.
            continue
        center[0] = np.mean(members['x'])
        center[1] = np.mean(members['y'])
# Move every centroid to the mean of the points currently assigned to it.
update()

# Third figure: same point colouring, centroids at their updated positions.
plt.scatter(df['x'], df['y'], color=df['color'], alpha=0.5)
for label, center in centroids.items():
    plt.scatter(center[0], center[1], color=colmap[label])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()
# Alternate centroid updates and point reassignment until no point changes
# cluster between two consecutive rounds.
while True:
    # Take an independent copy: assignment() rewrites df['closest'] on the
    # same frame, so the snapshot must not share data with the live column.
    old_closest = df['closest'].copy(deep=True)
    update()
    df = assignment(df, centroids)
    if old_closest.equals(df['closest']):
        break

# Final figure: converged clusters and their centroids.
plt.scatter(df['x'], df['y'], color=df['color'], alpha=0.5)
for label, center in centroids.items():
    plt.scatter(center[0], center[1], color=colmap[label])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()
     x   y
0   12  39
1   20  36
2   28  30
3   18  52
4   29  54
5   33  46
6   24  55
7   45  59
8   45  63
9   52  70
10  51  66
11  52  63
12  55  58
13  53  23
14  55  14
15  61   8
16  64  19
17  69   7
18  72  24
{1: [72, 31], 2: [68, 57], 3: [33, 3]}
     x   y  distance_from_1  distance_from_2  distance_from_3  closest color
0   12  39        60.530984        58.821765        41.677332        3     b
1   20  36        52.239832        52.392748        35.468296        3     b
2   28  30        44.011362        48.259714        27.459060        3     b
3   18  52        57.939624        50.249378        51.244512        2     g
4   29  54        48.764741        39.115214        51.156622        2     g
5   33  46        41.785165        36.687873        43.000000        2     g
6   24  55        53.665631        44.045431        52.773099        2     g
7   45  59        38.897301        23.086793        57.271284        2     g
8   45  63        41.868843        23.769729        61.188234        2     g
9   52  70        43.829214        20.615528        69.641941        2     g
10  51  66        40.816663        19.235384        65.520989        2     g
11  52  63        37.735925        17.088007        62.936476        2     g
12  55  58        31.906112        13.038405        59.236813        2     g
13  53  23        20.615528        37.161808        28.284271        1     r
14  55  14        24.041631        44.922155        24.596748        1     r
15  61   8        25.495098        49.497475        28.442925        1     r
16  64  19        14.422205        38.209946        34.885527        1     r
17  69   7        24.186773        50.009999        36.221541        1     r
18  72  24         7.000000        33.241540        44.294469        1     r