Python：手工代码之K-Means

最新推荐文章于 2022-06-24 01:17:47 发布

DeniuHe

最新推荐文章于 2022-06-24 01:17:47 发布

阅读量247

点赞数

分类专栏： Python学习算法

本文链接：https://blog.csdn.net/DeniuHe/article/details/102366675

版权

Python学习同时被 2 个专栏收录

239 篇文章 14 订阅

订阅专栏

算法

193 篇文章 2 订阅

订阅专栏

#DeniuHe手工代码之KMeans
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist,squareform

def randomCenters(dataSet,k):
    """Pick k distinct rows of ``dataSet`` to serve as initial cluster centers.

    Rows are drawn without replacement, so the same sample is never selected
    twice. Two identical centers would leave one of them without any members
    after the first assignment pass.

    Args:
        dataSet: 2-D array with one sample per row.
        k: number of centers to draw.

    Returns:
        A new float array of shape (k, n_features) holding copies of the
        selected rows.

    Raises:
        ValueError: raised by ``np.random.choice`` when k is larger than the
            number of rows.
    """
    n_row = dataSet.shape[0]
    selected_index = np.random.choice(n_row, size=k, replace=False)
    # Fancy indexing already copies; dtype=float keeps the result type the
    # same as the zero-filled float buffer this function used to fill.
    return np.array(dataSet[selected_index], dtype=float)

def euclideanDist(A,B):
    """Return the Euclidean distance between two points.

    Args:
        A: coordinates of the first point (1-D array-like).
        B: coordinates of the second point, same length as A.

    Returns:
        The square root of the summed squared coordinate differences. For
        2-D inputs the reduction runs along the first axis only.
    """
    # asarray lets plain lists and tuples be subtracted element-wise.
    diff = np.asarray(A) - np.asarray(B)
    # np.sum reduces in native code instead of iterating in Python.
    return np.sqrt(np.sum(diff ** 2, axis=0))

def K_Means(dataSet,k):
    """Cluster the rows of ``dataSet`` into k groups with Lloyd's k-means.

    Starting from centers chosen by ``randomCenters``, the function
    alternates between assigning every sample to its nearest center and
    moving each center to the mean of its members, until no assignment
    changes.

    Args:
        dataSet: 2-D numeric array with one sample per row.
        k: number of clusters; must be at least 1.

    Returns:
        A tuple ``(centers, distMatrix)``. ``centers`` is a float array of
        shape (k, n_features). ``distMatrix`` is a float array of shape
        (n_samples, 2): column 0 holds the index of the nearest center,
        column 1 the Euclidean distance to it.

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    dataSet = np.asarray(dataSet, dtype=float)
    n = dataSet.shape[0]
    # Private float copy: means must not be truncated to an integer dtype.
    centers = np.array(randomCenters(dataSet, k), dtype=float)
    distMatrix = np.zeros((n, 2))
    # -1 is never a valid center index, so the first pass always counts as a
    # change; starting from 0 could end the loop before any center update.
    distMatrix[:, 0] = -1
    sample_ids = np.arange(n)
    centerChanged = True
    while centerChanged:
        # Distances from every sample to every center, shape (n, k).
        diffs = dataSet[:, np.newaxis, :] - centers[np.newaxis, :, :]
        dists = np.sqrt(np.sum(diffs ** 2, axis=2))
        # argmin returns the first minimum, i.e. ties go to the lowest index.
        nearest = np.argmin(dists, axis=1)
        centerChanged = bool(np.any(distMatrix[:, 0] != nearest))
        distMatrix[:, 0] = nearest
        distMatrix[:, 1] = dists[sample_ids, nearest]
        if centerChanged:
            for j in range(k):
                cluster = dataSet[nearest == j]
                # An empty cluster keeps its previous center; averaging an
                # empty slice would produce NaN and a RuntimeWarning.
                if len(cluster):
                    centers[j] = cluster.mean(axis=0)
    return centers, distMatrix

# Demo: cluster the iris petal measurements into three groups and plot them.
if __name__ == "__main__":
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target  # true species labels; not used by the clustering
    DataSet = X[:,2:4]  # column 0: petal length, column 1: petal width
    RandomCents,DistMat = K_Means(DataSet,3)
    # One scatter call per cluster gives each cluster a single legend entry.
    styles = (('red','o'),('green','*'),('blue','+'))
    for cluster_id,(color,marker) in enumerate(styles):
        members = DataSet[DistMat[:,0] == cluster_id]
        plt.scatter(members[:,0],members[:,1],c=color,marker=marker,
                    label='cluster %d' % cluster_id)
    # The x axis shows DataSet column 0 (petal length), the y axis column 1.
    plt.xlabel('petal length')
    plt.ylabel('petal width')
    plt.legend(loc=2)
    plt.show()

该版本时不时会抛异常。

版本2：无异常版

import numpy as np
import pandas as pd
from sklearn import datasets as DS
import matplotlib.pyplot as plt

def euclideanDist(A,B):
    """Compute the Euclidean distance between points A and B.

    Args:
        A: coordinates of the first point (1-D array-like).
        B: coordinates of the second point, same length as A.

    Returns:
        The square root of the sum of squared coordinate differences. For
        2-D inputs the sum is taken along the first axis only.
    """
    # Converting first makes list and tuple arguments work as well.
    delta = np.asarray(A) - np.asarray(B)
    # Reduce with NumPy rather than the element-by-element builtin sum.
    return np.sqrt(np.sum(delta ** 2, axis=0))

def RandomCenters(dataSet,k):
    """Draw k distinct rows of ``dataSet`` as initial cluster centers.

    Args:
        dataSet: 2-D numeric array with one sample per row.
        k: number of centers to draw (without replacement).

    Returns:
        A new float array of shape (k, n_features) with copies of the chosen
        rows. The result is always float so that cluster means written into
        it later are not truncated when the data has an integer dtype.

    Raises:
        ValueError: raised by ``np.random.choice`` when k is larger than the
            number of rows.
    """
    n = dataSet.shape[0]
    centerIndex = np.random.choice(n, size=k, replace=False)
    return np.array(dataSet[centerIndex], dtype=float)
def KMeans(dataSet,k):
    """Partition the rows of ``dataSet`` into k clusters (Lloyd's algorithm).

    Initial centers come from ``RandomCenters``. Each pass assigns every
    sample to its nearest center and, if any assignment changed, moves each
    center to the mean of its members. The loop stops once a pass changes
    no assignment.

    Args:
        dataSet: 2-D numeric array with one sample per row.
        k: number of clusters; must be at least 1.

    Returns:
        A tuple ``(Centers, DistMatrix)``. ``Centers`` is a float array of
        shape (k, n_features). ``DistMatrix`` is a float array of shape
        (n_samples, 2): column 0 is the index of the nearest center and
        column 1 the Euclidean distance to that center.

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    dataSet = np.asarray(dataSet, dtype=float)
    n = dataSet.shape[0]
    # Float copy of the initial centers: assigning means into an integer
    # array would silently truncate them.
    Centers = np.array(RandomCenters(dataSet, k), dtype=float)
    DistMatrix = np.zeros((n, 2))
    # Start from an impossible label so the first pass is always treated as
    # a change, even when every sample ends up in cluster 0.
    DistMatrix[:, 0] = -1
    rows = np.arange(n)
    centerChanged = True
    while centerChanged:
        # Pairwise sample-to-center distances, shape (n, k).
        offsets = dataSet[:, np.newaxis, :] - Centers[np.newaxis, :, :]
        allDist = np.sqrt(np.sum(offsets ** 2, axis=2))
        # The first minimum wins, so ties go to the lowest center index.
        labels = np.argmin(allDist, axis=1)
        centerChanged = bool(np.any(DistMatrix[:, 0] != labels))
        DistMatrix[:, 0] = labels
        DistMatrix[:, 1] = allDist[rows, labels]
        # Centers only need recomputing when some assignment changed.
        if centerChanged:
            for i in range(k):
                members = dataSet[labels == i]  # samples currently in cluster i
                # Keep the old center for an empty cluster; the mean of an
                # empty slice is NaN and triggers a RuntimeWarning.
                if len(members):
                    Centers[i] = members.mean(axis=0)
    return Centers, DistMatrix

# Demo: cluster the first two columns of a CSV file and plot the result.
if __name__ == "__main__":
    Data = np.array(pd.read_csv(r'E:\data set\clusterData\bolbs_1.csv',header=None))
    X = Data[:,:2]
    k = 24
    Center,DistMat = KMeans(X,k)
    print(set(DistMat[:,0]))
    # A single scatter call draws every sample, colored by its cluster index;
    # repeating the same call once per sample only redraws the same plot.
    plt.scatter(X[:,0],X[:,1],c=DistMat[:,0])
    plt.show()