K-means核心算法:
def kMeans(dataSet, k, distMeans=distEclud, createCent=randCent):
"""
输入:数据集, 聚类个数, 距离计算函数, 生成随机质心函数
输出:质心矩阵, 簇分配和距离矩阵
"""
m = shape(dataSet)[0]
clusterAssment = mat(zeros((m, 2)))
centroids = createCent(dataSet, k)
clusterChanged = True
while clusterChanged:
clusterChanged = False
for i in range(m): # 寻找最近的质心
minDist = INF
minIndex = -1
for j in range(k):
distJI = distMeans(centroids[j, :], dataSet[i, :])
if distJI < minDist:
minDist = distJI
minIndex = j
if clusterAssment[i, 0] != minIndex:
clusterChanged = True
clusterAssment[i, :] = minIndex, minDist**2
for cent in range(k): # 更新质心的位置
# 取出clusterAssment第一列索引,转换为numpy类型,判断索引是否为cent,返回为元素为bool类型的numpy数组
# 然后取出值为真的位置索引,二维元组类型, 取出第一维,为值为真的行索引
# 取出相关索引的样本, nonzero(clusterAssment[:, 0].A &#