Implementing k-Means (k-Means Clustering) in Python

Runtime environment

Python 3

numpy (scientific computing package)

matplotlib (needed for plotting only; optional if you skip the plots)
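If you want to confirm the two packages are importable before running the script, a quick check like the following is enough (the versions printed are whatever happens to be installed; nothing specific is required):

import numpy
import matplotlib
print('numpy', numpy.__version__)
print('matplotlib', matplotlib.__version__)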

Computation procedure

st=>start: Start
e=>end: End
op1=>operation: Read in the data
op2=>operation: Randomly initialize the cluster centroids
cond=>condition: Did any cluster assignment change?
op3=>operation: Assign each point to its nearest centroid
op4=>operation: Update the cluster centroids
op5=>operation: Output the results

st->op1->op2->op3->op4->cond
cond(yes)->op3
cond(no)->op5->e
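The loop in the flowchart boils down to two alternating steps: assign every point to its nearest centroid, then move each centroid to the mean of the points assigned to it, and stop once no assignment changes. As a minimal sketch of a single iteration, assuming plain numpy arrays rather than the matrix type used in the full code below:

import numpy as np

def one_kmeans_iteration(points, centroids):
    # points: (m, n) array of samples, centroids: (k, n) array of current centers
    # assignment step: squared distance from every point to every centroid, shape (m, k)
    dists = ((points[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    labels = dists.argmin(axis=1)  # index of the nearest centroid for each point
    # update step: move each centroid to the mean of the points assigned to it
    new_centroids = np.array([points[labels == j].mean(axis=0) if np.any(labels == j)
                              else centroids[j]  # keep an empty cluster's centroid in place
                              for j in range(len(centroids))])
    return new_centroids, labels

The implementation further down does the same thing with explicit loops and numpy's matrix type.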

Sample input

/* 788points.txt */
15.55,28.65
14.9,27.55
14.45,28.35
14.15,28.8
13.75,28.05
13.35,28.45
13,29.15
13.45,27.5
13.6,26.5
12.8,27.35
12.4,27.85
12.3,28.4
12.2,28.65
13.4,25.1
12.95,25.95

Complete 788points.txt file: download
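If the download is not available, any two-column comma-separated file of points in the same format will do. As a stand-in only (the blob centers, counts, and spread below are placeholders of my own, not the original 788-point data set), something like this produces a file the script can read:

import numpy as np

rng = np.random.default_rng(0)
centers = [(13, 28), (25, 15), (5, 8), (30, 30), (18, 5), (8, 20)]
blobs = [rng.normal(loc=c, scale=0.8, size=(130, 2)) for c in centers]
np.savetxt('788points.txt', np.vstack(blobs), fmt='%.2f', delimiter=',')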

Code implementation

# -*- coding: utf-8 -*-
__author__ = 'Wsine'

from numpy import *
import matplotlib.pyplot as plt
import operator
import time

INF = 9999999.0  # sentinel "infinite" distance used when searching for the nearest centroid

def loadDataSet(fileName, splitChar='\t'):
    """
    Input:  file name
    Output: data set
    Description: read the data set from a file
    """
    dataSet = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curline = line.strip().split(splitChar)
            fltline = list(map(float, curline))
            dataSet.append(fltline)
    return dataSet
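# Usage sketch (hypothetical session; assumes 788points.txt sits next to this script):
#   data = loadDataSet('788points.txt', splitChar=',')
#   data[0]  ->  [15.55, 28.65]   (a list of [x, y] floats, one per line of the file)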

# def createDataSet():
#     """
#     Output: data set
#     Description: generate a small hard-coded data set (kept for testing)
#     """
#     dataSet = [[0.0, 2.0],
#                [0.0, 0.0],
#                [1.5, 0.0],
#                [5.0, 0.0],
#                [5.0, 2.0]]
#     return dataSet

def distEclud(vecA, vecB):
    """
    Input:  vector A, vector B
    Output: Euclidean distance between the two vectors
    """
    return sqrt(sum(power(vecA - vecB, 2)))
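# Usage sketch (made-up values, using the numpy matrix type the rest of the code passes in):
#   distEclud(mat([0.0, 0.0]), mat([3.0, 4.0]))  ->  5.0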

def randCent(dataSet, k):
    """
    Input:  data set, number of clusters
    Output: matrix of k random centroids
    """
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        # draw each coordinate uniformly within the range of that column
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids
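# Usage sketch: randCent(mat(loadDataSet('788points.txt', splitChar=',')), 6) returns a
# 6 x 2 matrix of random points inside the bounding box of the data, so every run of
# kMeans below starts from different centroids and may converge to a different result.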

def kMeans(dataSet, k, distMeans=distEclud, createCent=randCent):
    """
    Input:  data set, number of clusters, distance function, random-centroid generator
    Output: centroid matrix, cluster-assignment-and-distance matrix
    """
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))  # column 0: assigned cluster index, column 1: squared distance to that centroid
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # assignment step: find the nearest centroid for each point
            minDist = INF   # sentinel, larger than any real distance
            minIndex = -1   # index of the nearest centroid found so far
            for j in range(k):  # compare point i against centroids 0 .. k-1
                distJI = distMeans(centroids[j, :], dataSet[i, :])  # Euclidean distance from point i to centroid j
                if distJI < minDist:  # keep the closest centroid seen so far
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:  # point i switched clusters, so another pass is needed
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2  # record the cluster index (0 .. k-1) and the squared distance
        for cent in range(k):  # update step: move each centroid to the mean of its assigned points
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]  # rows of dataSet currently assigned to cluster `cent`
            centroids[cent, :] = mean(ptsInClust, axis=0)  # axis=0: column-wise mean, giving a 1 x n row vector
    return centroids, clusterAssment
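# Caveat: if a cluster ends up with no points, ptsInClust is empty and mean() returns nan,
# so that centroid stays nan for the rest of the run. One simple guard (not in the original
# code) is to re-draw such a centroid, e.g. centroids[cent, :] = randCent(dataSet, 1),
# whenever ptsInClust has zero rows.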

def plotFeature(dataSet, centroids, clusterAssment):  # clusterAssment must have one row per row of dataSet
    m = shape(centroids)[0]  # number of centroids (rows); shape(...)[1] would give the number of columns
    fig = plt.figure()  # 2-D scatter plot
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    scatterColors = ['blue', 'green', 'yellow', 'purple', 'orange', 'black', 'brown']  # one colour per cluster
    ax = fig.add_subplot(111)  # a single subplot
    for i in range(m):  # i runs over the k clusters
        ptsInCurCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]  # points assigned to cluster i
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        colorSytle = scatterColors[i % len(scatterColors)]
        ax.scatter(ptsInCurCluster[:, 0].flatten().A[0], ptsInCurCluster[:, 1].flatten().A[0], marker=markerStyle, c=colorSytle, s=90)  # https://www.jianshu.com/p/53e49c02c469
    ax.scatter(centroids[:, 0].flatten().A[0], centroids[:, 1].flatten().A[0], marker='+', c='red', s=300)  # mark the centroids with large red crosses
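# Note: plotFeature only draws onto the current figure; the plt.show() call at the bottom of
# the script is what actually opens the window, so nothing appears until main() has returned.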

def main():
    #dataSet = loadDataSet('testSet2.txt')
    dataSet = loadDataSet('788points.txt', splitChar=',')
    #dataSet = createDataSet()
    dataSet = mat(dataSet)
    resultCentroids, clustAssing = kMeans(dataSet, 6)
    print('*******************')
    print(resultCentroids)
    print('*******************')
    plotFeature(dataSet, resultCentroids, clustAssing)

if __name__ == '__main__':
    start = time.perf_counter()  # time.perf_counter() replaces time.clock(), which was removed in Python 3.8
    main()
    end = time.perf_counter()
    print('finish all in %s' % str(end - start))
    plt.show()
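The choice of k = 6 in main() is fixed by hand. For a quick sanity check, the second column of clusterAssment already holds each point's squared distance to its centroid, so the within-cluster sum of squared errors (SSE) for several values of k can be compared directly. A minimal sketch that reuses the functions above (the helper name sseForK is my own, not part of the original script):

def sseForK(dataSet, kValues):
    """Run kMeans once per k and return the within-cluster SSE for each (smaller is tighter)."""
    results = {}
    for k in kValues:
        _, clustAssing = kMeans(dataSet, k)
        results[k] = float(sum(clustAssing[:, 1]))  # column 1 stores squared distances
    return results

# e.g. print(sseForK(mat(loadDataSet('788points.txt', splitChar=',')), [2, 4, 6, 8]))
# SSE always drops as k grows; look for the "elbow" where the drop levels off.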
