import numpy as np
def euclDistance(vector1, vector2):
    """Return the Euclidean distance between two vectors.

    Args:
        vector1: Array-like of numbers (typically a 1-D feature vector).
        vector2: Array-like of numbers broadcastable against ``vector1``.

    Returns:
        The square root of the squared differences summed along the first
        axis. For 1-D inputs this is a single scalar distance.
    """
    # Convert to float before subtracting: unsigned-integer arrays would wrap
    # around on negative differences, and plain Python lists do not support
    # element-wise ``-`` at all.
    diff = np.asarray(vector2, dtype=float) - np.asarray(vector1, dtype=float)
    # np.sum reduces in native code; axis=0 sums over the first axis, which is
    # what iterating the array with the built-in sum() would do.
    return np.sqrt(np.sum(diff ** 2, axis=0))
def initCentroids(dataSet, k):
    """Pick ``k`` random samples of ``dataSet`` to serve as initial centroids.

    Args:
        dataSet: 2-D array of shape (n_samples, n_feature).
        k: Number of centroids to select.

    Returns:
        A float array of shape (k, n_feature); each row is a copy of one
        randomly chosen sample.
    """
    dataSet = np.asarray(dataSet)
    n_samples, n_feature = dataSet.shape
    # One row per centroid.
    centroids = np.zeros((k, n_feature))
    # Draw distinct sample indices whenever there are enough samples, so two
    # centroids never start on the same sample (which would leave one of the
    # clusters empty). Only fall back to sampling with replacement when more
    # centroids than samples are requested.
    indices = np.random.choice(n_samples, size=k, replace=k > n_samples)
    centroids[:, :] = dataSet[indices, :]
    return centroids
def kmeans(dataSet, k, init_centroids=None):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Args:
        dataSet: 2-D array-like of shape (n_samples, n_feature).
        k: Number of clusters.
        init_centroids: Optional array-like of shape (k, n_feature) giving the
            starting centroids. When omitted, ``initCentroids`` picks random
            samples. The given array is copied, never modified.

    Returns:
        A tuple ``(centroids, samples_info)``:
        ``centroids`` is a (k, n_feature) float array of cluster centres;
        ``samples_info`` is an (n_samples, 2) float array whose first column
        is each sample's cluster index and whose second column is the
        distance from the sample to that cluster's centroid.

    Raises:
        ValueError: If ``init_centroids`` does not have shape (k, n_feature).
    """
    dataSet = np.asarray(dataSet, dtype=float)
    n_samples = dataSet.shape[0]
    # Column 0: cluster index of the sample; column 1: distance to its centroid.
    samples_info = np.zeros((n_samples, 2))
    # -1 means "not assigned yet", so the first pass always counts as a change
    # even when every sample ends up in cluster 0.
    samples_info[:, 0] = -1

    if init_centroids is None:
        centroids = initCentroids(dataSet, k)
    else:
        centroids = np.array(init_centroids, dtype=float)
        expected_shape = (k, dataSet.shape[1])
        if centroids.shape != expected_shape:
            raise ValueError(
                "init_centroids must have shape %r, got %r"
                % (expected_shape, centroids.shape)
            )

    sample_rows = np.arange(n_samples)
    cluster_changed = True
    while cluster_changed:
        # Distance from every sample to every centroid: shape (n_samples, k).
        diffs = dataSet[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        distances = np.sqrt(np.sum(diffs ** 2, axis=2))
        # argmin returns the first minimum, so ties go to the lowest index.
        labels = np.argmin(distances, axis=1)

        # Stop once a full pass leaves every assignment unchanged.
        cluster_changed = bool(np.any(labels != samples_info[:, 0]))
        samples_info[:, 0] = labels
        samples_info[:, 1] = distances[sample_rows, labels]

        # Move each centroid to the mean of the samples assigned to it.
        for j in range(k):
            members = dataSet[labels == j]
            # An empty cluster keeps its previous centroid; the mean of zero
            # rows would be NaN and poison every later distance.
            if members.shape[0] > 0:
                centroids[j, :] = np.mean(members, axis=0)
    return centroids, samples_info
if __name__ == '__main__':
    # Number of clusters to fit; it must be defined before calling kmeans().
    # NOTE(review): 4 is a placeholder — set it to suit the data in kmeans.txt.
    k = 4
    # Load the samples from a space-delimited text file.
    data = np.genfromtxt("kmeans.txt", delimiter=" ")
    centroids, samples_info = kmeans(data, k)
    # Final cluster centres, then the per-sample (cluster index, distance) table.
    print(centroids)
    print(samples_info)
# k-Means algorithm implementation
# (Scraped page footer: "latest recommended article published 2022-08-11 22:23:36".)