介绍
K-Means是一种经典的无监督聚类算法,可以将N个样本划分到K个簇中,使相似的样本尽量被分到同一个簇中。其衡量相似度的计算方法是欧氏距离。
优点:
- 速度快。
- 原理简单。
缺点:
- 需要预先确定K值;数据较多时,一般可以尝试多个K值进行多次实验比较。
- 对异常点很敏感。
- 初始点的选择可能有影响。
代码
import numpy as np
class K_Means(object):
    """Naive K-Means clustering with Euclidean distance.

    Attributes set by ``fit``:
        centers_: dict mapping cluster index -> center coordinates (ndarray).
        clf_: dict mapping cluster index -> list of samples assigned to it
              on the final iteration.
    """

    def __init__(self, n_clusters=2, tolerance=0.0001, max_iter=200):
        # n_clusters: number of clusters K.
        # tolerance: convergence threshold — maximum distance a center may
        #            still move between iterations and count as converged.
        # max_iter: upper bound on assignment/update iterations.
        self.k_ = n_clusters
        self.tolerance_ = tolerance
        self.max_iter_ = max_iter

    def fit(self, data):
        """Cluster `data` (sequence of 1-D numeric vectors) into k_ groups.

        Raises:
            ValueError: if there are fewer samples than clusters.
        """
        if len(data) < self.k_:
            raise ValueError("need at least k samples to initialize k centers")
        # Initialize centers from the first k samples. (The original comment
        # claimed random selection, but the code never randomized; kept
        # deterministic so results are reproducible.)
        self.centers_ = {i: np.asarray(data[i], dtype=float)
                         for i in range(self.k_)}

        for _ in range(self.max_iter_):
            # Assignment step: each sample joins its nearest center's cluster.
            self.clf_ = {i: [] for i in range(self.k_)}
            for feature in data:
                distances = [np.linalg.norm(feature - self.centers_[c])
                             for c in range(self.k_)]
                self.clf_[distances.index(min(distances))].append(feature)

            # Update step: each center becomes the mean of its members.
            prev_centers = dict(self.centers_)
            for c in self.clf_:
                # Guard: np.average([]) would yield NaN and poison the center;
                # an emptied cluster keeps its previous center instead.
                if self.clf_[c]:
                    self.centers_[c] = np.average(self.clf_[c], axis=0)

            # Convergence: every center moved no more than tolerance_.
            # BUG FIX: the original summed *signed* percent changes, so shifts
            # of opposite sign canceled out, and a zero coordinate in the
            # previous center produced inf/nan (and `nan > tol` is False,
            # faking convergence). Compare the absolute Euclidean shift.
            optimized = all(
                np.linalg.norm(self.centers_[c] - prev_centers[c])
                <= self.tolerance_
                for c in self.centers_
            )
            if optimized:
                break

    def predict(self, p_datas):
        """Return the index of the nearest learned center for each sample."""
        result = []
        for feature in p_datas:
            distances = [np.linalg.norm(feature - self.centers_[c])
                         for c in self.centers_]
            result.append(distances.index(min(distances)))
        return result
if __name__ == '__main__':
    # Demo: two well-separated groups of 2-D points.
    samples = np.array([[1, 2], [3, 4], [5, 6],
                        [10, 11], [12, 13], [14, 15],
                        [1.5, 2.5]])
    model = K_Means(n_clusters=2)
    model.fit(samples)
    labels = model.predict(samples)
    print(labels)
结果如下:[0, 0, 0, 1, 1, 1, 0]
参考
https://blog.csdn.net/xc_zhou/article/details/88247783
https://blog.csdn.net/hanxia159357/article/details/81530361
https://blog.csdn.net/hqh131360239/article/details/79061535