K-Means 聚类的 Python 实现

from numpy import *
import matplotlib.pyplot as plt

#辅助函数
#载入数据集
def loadDataSet(filename):
    """Read a tab-separated numeric text file into a list of float rows.

    Args:
        filename: Path of the text file to read. Each non-blank line is
            split on tab characters and every field is converted with
            ``float``.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats in the order the fields appear on that line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # would otherwise reach float('') and raise ValueError.
            if not stripped:
                continue
            # map() is lazy on Python 3, so materialise the row as a list.
            dataMat.append(list(map(float, stripped.split('\t'))))
    return dataMat

#返回两个点的欧氏距离
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two points.

    The element-wise difference ``vecA - vecB`` is squared, summed over all
    elements, and the square root of that total is returned.
    """
    delta = vecA - vecB
    # square() is element-wise for both ndarray and matrix operands.
    squared = square(delta)
    return sqrt(squared.sum())

#构建一个包含k个随机质心的集合
def randCent(dataSet, k):
    """Build ``k`` random centroids inside the bounding box of ``dataSet``.

    Args:
        dataSet: 2-D array-like of points, one row per point and one column
            per dimension (nested list, ndarray or matrix).
        k: Number of centroids to generate.

    Returns:
        A ``(k, n)`` numpy matrix, where ``n`` is the number of columns of
        ``dataSet``. Column ``j`` holds values drawn uniformly between the
        minimum and maximum of column ``j`` of the data.
    """
    # asarray lets plain nested lists (as returned by loadDataSet) be sliced
    # by column, and gives matrix input ordinary ndarray reduction semantics.
    data = asarray(dataSet)
    # Number of dimensions (columns) per point.
    n = data.shape[1]
    mins = data.min(axis=0)
    ranges = (data.max(axis=0) - mins).astype(float)
    # Drawing an (n, k) block and transposing consumes the random stream in
    # the same order as filling one column of k values at a time, so seeded
    # runs stay reproducible.
    draws = random.rand(n, k).T
    # asmatrix rather than mat: the mat alias was removed in NumPy 2.0.
    return asmatrix(mins + ranges * draws)

def kMeans(dataSet, k, dist=distEclud, createCent=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Each pass assigns every point to its nearest centroid and then moves
    every centroid to the mean of the points assigned to it. Iteration stops
    after the first pass in which no assignment changes.

    Args:
        dataSet: 2-D collection of points, one row per point. ndarray and
            matrix inputs are used as given; anything else is converted with
            ``asarray``.
        k: Number of clusters.
        dist: Callable ``dist(centroid_row, point_row)`` returning a distance.
        createCent: Callable ``createCent(data, k)`` returning the initial
            centroids as a 2-D container with ``k`` rows.

    Returns:
        A tuple ``(centroids, label)``: the final centroids and a float array
        of length ``m`` holding the cluster index assigned to each point.
    """
    # Nested lists (e.g. straight from loadDataSet) do not support 2-D
    # slicing; ndarray/matrix inputs pass through unchanged.
    data = dataSet if isinstance(dataSet, ndarray) else asarray(dataSet)
    m = shape(data)[0]
    # -1 means "not assigned yet". Starting at 0 would hide the first
    # assignment of every point whose nearest centroid is index 0 and could
    # end the loop before the centroids have settled.
    label = full(m, -1.0)
    centroids = createCent(data, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: find the nearest centroid for every point.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = dist(centroids[j, :], data[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if label[i] != minIndex:
                clusterChanged = True
                label[i] = minIndex
        # Progress trace: centroids used for the assignment pass above.
        print(centroids)

        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            members = nonzero(label == cent)[0]
            # An empty cluster has no mean; overwriting its centroid with
            # NaN would make it unreachable for good, so keep the old one.
            if members.size == 0:
                continue
            centroids[cent, :] = mean(data[members], axis=0)

    return centroids, label


if __name__ == '__main__':
    # Demo: cluster the points stored in testSet.txt and draw the result.
    k = 4
    filename = 'testSet.txt'
    dataArray = array(loadDataSet(filename))
    centroids, label = kMeans(dataArray, k)

    # One marker/colour pair per cluster. The pairs are cycled so that
    # raising k above the number of listed styles does not raise IndexError.
    markers = 'o*s^'
    colors = 'bgrc'
    for cluster in range(k):
        members = dataArray[label == cluster]
        style = cluster % len(markers)
        # A single plot call per cluster instead of one call per point.
        plt.plot(members[:, 0], members[:, 1], linestyle='none',
                 color=colors[style], marker=markers[style])

    # Centroids are drawn as black plus signs; asarray turns the matrix
    # columns into plain 1-D coordinate arrays.
    centers = asarray(centroids)
    plt.plot(centers[:, 0], centers[:, 1], 'k+')
    # Without show() the figure is built but never displayed by a script.
    plt.show()
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值