一、初始化聚类中心
随机选择k个聚类中心:
def init_centroids(X, k):
    """Pick k rows of X at random to serve as the initial cluster centroids.

    Args:
        X: 2-D array of shape (m, n), one sample per row.
        k: Number of centroids to draw.

    Returns:
        A float array of shape (k, n) whose rows are copies of rows of X.
        The rows come from k distinct sample indices whenever k <= m; sample
        indices are only repeated when k > m, because k distinct indices do
        not exist in that case.
    """
    m, n = X.shape
    # Draw without replacement so two centroids never start on the same
    # sample index (which would make one of the clusters redundant).
    chosen = np.random.choice(m, size=k, replace=k > m)
    centroids = np.zeros((k, n))
    centroids[:] = X[chosen, :]
    return centroids
二、簇分配
为每个样本找到离它最近的簇中心:
def find_closest_centroids(X, centroids):
    """Assign every sample to the centroid nearest to it.

    Nearness is measured by squared Euclidean distance. On a tie the centroid
    with the lowest index wins, because a later centroid only replaces the
    current best when it is strictly closer.

    Args:
        X: 2-D array of shape (m, n), one sample per row.
        centroids: 2-D array of shape (k, n), one centroid per row.

    Returns:
        A float array of length m; entry i is the index of the centroid
        closest to X[i]. A sample with no finite distance to any centroid
        (e.g. NaN features) keeps the initial index 0.
    """
    m = X.shape[0]
    k = centroids.shape[0]
    idx = np.zeros(m)
    # Start from +inf instead of a fixed numeric cap so that samples lying
    # arbitrarily far from every centroid are still assigned to the nearest.
    best = np.full(m, np.inf)
    for j in range(k):
        # Squared distance from every sample to centroid j in one shot.
        dist = np.sum((X - centroids[j, :]) ** 2, axis=1)
        closer = dist < best
        idx[closer] = j
        best[closer] = dist[closer]
    return idx
三、更新各个聚簇中心坐标
计算每个簇的聚类中心。聚类中心是当前分配给该簇的所有样本的平均值。
def compute_centroids(X, idx, k, prev_centroids=None):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        X: 2-D array of shape (m, n), one sample per row.
        idx: Length-m array-like; idx[i] is the cluster index of X[i].
        k: Number of clusters.
        prev_centroids: Optional array of shape (k, n). When given, a cluster
            that received no samples keeps its row from this array.

    Returns:
        A float array of shape (k, n). The row of a cluster with no assigned
        samples is NaN unless prev_centroids is supplied.
    """
    idx = np.asarray(idx)
    n = X.shape[1]
    # NaN marks "no mean available"; rows are overwritten below when possible.
    centroids = np.full((k, n), np.nan)
    for i in range(k):
        members = X[idx == i]
        count = members.shape[0]
        if count > 0:
            centroids[i, :] = members.sum(axis=0) / count
        elif prev_centroids is not None:
            # Empty cluster: fall back to the caller-provided centroid rather
            # than dividing by zero.
            centroids[i, :] = prev_centroids[i]
    return centroids
四、K-Means算法
def run_k_means(X, initial_centroids, max_iters):
    """Run K-Means by alternating cluster assignment and centroid update.

    Args:
        X: 2-D array of shape (m, n), one sample per row.
        initial_centroids: Array of shape (k, n) with the starting centroids.
        max_iters: Upper bound on the number of assign/update rounds.

    Returns:
        A tuple (idx, centroids): idx is the length-m array of cluster indices
        from the last assignment step, centroids is the (k, n) array from the
        last update step. With max_iters == 0 the result is an all-zero idx
        and initial_centroids unchanged.
    """
    m = X.shape[0]
    k = initial_centroids.shape[0]
    idx = np.zeros(m)
    centroids = initial_centroids
    for _ in range(max_iters):
        idx = find_closest_centroids(X, centroids)
        updated = compute_centroids(X, idx, k)
        # A cluster that received no samples has no mean; keep its previous
        # centroid so it does not turn into NaN and drop out for good.
        empty = ~np.isin(np.arange(k), idx)
        updated[empty] = centroids[empty]
        # Once the centroids stop moving, further rounds would reproduce the
        # same assignment and the same centroids, so stop early.
        converged = np.array_equal(updated, centroids)
        centroids = updated
        if converged:
            break
    return idx, centroids
五、使用sklearn库进行K-Means算法
https://blog.csdn.net/sinat_26917383/article/details/70240628
from sklearn.cluster import KMeans
# n_init is the number of runs with different initial centroids; the best run
# is kept, which lowers the risk of ending in a poor local optimum.
# n_jobs is intentionally not passed: it was deprecated in scikit-learn 0.23
# and removed in 1.0, where passing it raises a TypeError.
model = KMeans(n_clusters=16, n_init=100)
model.fit(X)  # run the clustering
centroids = model.cluster_centers_  # coordinates of the cluster centres
C = model.predict(X)  # index of the nearest cluster centre for each sample