kmeans算法原理及python代码实现

聚类是一种无监督学习算法。kmeans是一种发现给定数据集的k个簇的算法。簇个数k是用户给定的,每一个簇通过其质心,即簇中所有点的中心来描述。


请打开链接下载测试 testSet.txt 文件: https://github.com/caichunbing/kmeans/blob/master/testSet.zip


#================================================================
#   Copyright (C) 2019 * Ltd. All rights reserved.
#
#   Editor      : pycharm
#   File name   : kmeans.py
#   Author      : caichunbing
#   Created date: 2019-10-18 
#   Description :kmeans聚类算法及可视化
#
#================================================================


import numpy as np
import matplotlib.pyplot as plt


def loadDataSet(fileName):
    """Load a tab-delimited text file of floats into a NumPy matrix.

    Args:
        fileName: Path of the text file. Each non-blank line holds one
            sample whose fields are separated by tab characters.

    Returns:
        np.matrix with one row per non-blank line and one column per field.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager closes the handle even when parsing fails.
    with open(fileName) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            dataMat.append([float(x) for x in line.split('\t')])
    # np.asmatrix is the spelling that exists in both NumPy 1.x and 2.x.
    return np.asmatrix(dataMat)


def distEclud(vecA, vecB):
    """Return the Euclidean distance between vecA and vecB.

    The two operands are subtracted element-wise, the differences are
    squared and summed, and the square root of that sum is returned.
    """
    delta = vecA - vecB
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

def randCent(dataSet, k):
    """Create k random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D samples, one row per sample. An np.matrix or any
            array-like that np.asmatrix can convert.
        k: Number of centroids to generate.

    Returns:
        np.matrix of shape (k, n), n being the number of columns of
        dataSet. Every entry of column j lies between the minimum and the
        maximum of column j of dataSet.

    Raises:
        ValueError: If dataSet has no rows (min/max of an empty column).
    """
    dataSet = np.asmatrix(dataSet)
    n = np.shape(dataSet)[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    # One np.random.rand(k, 1) draw per column, in column order, so a seeded
    # generator yields the same sequence of draws as a per-column loop.
    for j in range(n):
        column = dataSet[:, j]
        # Array reductions instead of the builtin min/max, which iterate the
        # matrix row by row and return 1x1 matrices rather than scalars.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of dataSet into k groups with the k-means algorithm.

    Points are repeatedly assigned to their nearest centroid and every
    centroid is then moved to the mean of its points, until a full pass
    changes no assignment.

    Args:
        dataSet: 2-D samples, one row per sample.
        k: Number of clusters.
        distMeas: Callable (centroid_row, sample_row) -> distance.
        createCent: Callable (dataSet, k) -> initial centroids, one per row.

    Returns:
        A tuple (centroids, clusterAssment). centroids is whatever
        createCent returned, updated in place. clusterAssment is an
        np.matrix of shape (m, 2) whose column 0 is the index of the
        assigned centroid and column 1 the squared distance to it.
    """
    m = np.shape(dataSet)[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible cluster index. With a zero start, a first pass
    # that puts every point in cluster 0 would look like "no change" and stop
    # the loop before the stored distances match the final centroids.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # assign each point to its closest centroid
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Per-iteration trace of the centroids, kept from the original code.
        print(centroids)
        for cent in range(k):  # move each centroid to the mean of its points
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            if ptsInClust.shape[0] == 0:
                # A cluster that attracted no point keeps its old position;
                # the mean of zero rows would be NaN and never recover.
                continue
            centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment


def show(w,h,centroid_w,centroid_h):
    """Plot the samples in blue and the centroids in red on one figure.

    w / h are the coordinates of the samples, centroid_w / centroid_h the
    coordinates of the centroids. The figure is titled "kmeans" and is
    displayed with plt.show().
    """
    figure = plt.figure()
    figure.suptitle("kmeans")
    axes = figure.add_subplot(1, 1, 1)

    # Samples are drawn first, then the centroids on top of them.
    layers = (
        (w, h, 'b'),
        (centroid_w, centroid_h, 'r'),
    )
    for xs, ys, colour in layers:
        axes.scatter(xs, ys, s=10, color=colour)

    plt.show()

filename = "./testSet.txt"

if __name__ == '__main__':
    # Cluster the sample file into four groups, then plot the samples
    # together with the resulting centroids.
    samples = loadDataSet(filename)
    centers, _assignments = kMeans(samples, 4, distEclud, randCent)

    point_x, point_y = samples[:, 0].tolist(), samples[:, 1].tolist()
    center_x, center_y = centers[:, 0].tolist(), centers[:, 1].tolist()

    show(point_x, point_y, center_x, center_y)

效果图:

在这里插入图片描述

  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

菜菜菜菜菜菜菜

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值