K-means聚类步骤:
1.随机选取K个中心点;
2.将每个数据点分配给距离最近的中心点;
3.通过每个类的均值重新计算中心点;
4.对步骤2-3进行迭代计算,直到中心点收敛或达到最大迭代次数。
import numpy as np
import random
class K_means():
    """Plain K-means clustering (Lloyd's algorithm) on top of NumPy.

    Attributes:
        k_: Number of clusters.
        tolerance: Per-cluster convergence threshold; fitting stops once the
            summed absolute movement of all centres drops below
            ``tolerance * k_``.
        max_iter_: Upper bound on the number of assign/update rounds.
        centers: ``(k_, n_features)`` float array of cluster centres, or
            ``None`` until ``fit`` has run.
        fitted: ``True`` once ``fit`` has completed.
    """

    def __init__(self, n_clusters=2, tolerance=0.0001, max_iter=300):
        """Store the hyper-parameters; no computation happens here.

        Args:
            n_clusters: Number of clusters (k).
            tolerance: Allowed centre movement per cluster before stopping.
            max_iter: Maximum number of iterations.
        """
        self.k_ = n_clusters
        self.tolerance = tolerance
        self.max_iter_ = max_iter
        # Defined up front so predict() can be called safely before fit().
        self.centers = None
        self.fitted = False

    @staticmethod
    def _nearest_center(points, centers):
        """Return, for each row of *points*, the index of the closest centre."""
        # Broadcast to (n_points, n_centers, n_features) and reduce over the
        # feature axis to get every point-to-centre Euclidean distance.
        distances = np.linalg.norm(
            points[:, None, :] - centers[None, :, :], axis=2
        )
        return np.argmin(distances, axis=1)

    def fit(self, data):
        """Estimate the cluster centres from *data*.

        Each round assigns every sample to its nearest centre and then moves
        each centre to the mean of the samples assigned to it.

        Args:
            data: Array-like of shape ``(n_samples, n_features)``.

        Raises:
            ValueError: If *data* is not two-dimensional, if the number of
                clusters is smaller than one, or if there are fewer samples
                than clusters.
        """
        # Work in float so that means are not truncated for integer input.
        data = np.asarray(data, dtype=float)
        if data.ndim != 2:
            raise ValueError("data must be a 2-D array (n_samples, n_features)")
        if self.k_ < 1:
            raise ValueError("n_clusters must be at least 1")
        n_samples = data.shape[0]
        if n_samples < self.k_:
            raise ValueError("data has fewer samples than n_clusters")

        # random.sample picks k_ distinct row indices to seed the centres.
        seed_rows = random.sample(range(n_samples), self.k_)
        centers = data[seed_rows].copy()

        for _ in range(self.max_iter_):
            # Assignment step: memberships are recomputed from scratch every
            # round so stale assignments never leak into the means.
            assignment = self._nearest_center(data, centers)

            # Update step.
            updated = centers.copy()
            for cluster in range(self.k_):
                members = data[assignment == cluster]
                # An empty cluster keeps its previous centre rather than
                # turning into NaN.
                if len(members):
                    updated[cluster] = members.mean(axis=0)

            shift = np.sum(np.abs(updated - centers))
            centers = updated
            if shift < self.tolerance * self.k_:
                break

        self.centers = centers
        self.fitted = True

    def predict(self, p_data):
        """Return the index of the nearest fitted centre for each sample.

        Args:
            p_data: Array-like of shape ``(n_samples, n_features)``. A flat
                sequence is read as samples with a single feature.

        Returns:
            A list of integer cluster indices, one per sample. An empty list
            is returned (after printing ``"unfitted"``) when the model has
            not been fitted, or when *p_data* is empty.

        Raises:
            ValueError: If the feature count does not match the fitted centres.
        """
        if not self.fitted:
            print("unfitted")
            return []

        points = np.asarray(p_data, dtype=float)
        if points.size == 0:
            return []
        if points.ndim == 1:
            points = points[:, None]
        if points.shape[1] != self.centers.shape[1]:
            raise ValueError(
                "p_data has a different number of features than the fitted data"
            )
        return self._nearest_center(points, self.centers).tolist()