K-Means works on MNIST

1. Experimental Environment
Programming language: Python 3.8
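Dependencies (as used by the imports in the code below): numpy, matplotlib, and munkres.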

2. A Small Suggestion
On a small dataset we can still get away with nested loops, but on a dataset as (moderately) large as MNIST, a single iteration of nested loops already takes a noticeable amount of time. We therefore need to learn vectorized programming, and even two-dimensional-array (matrix) programming. Readers who have implemented a BPNN by hand will especially appreciate the convenience these two styles bring; this is also why I am so fond of matrices. The K-Means and GMM implementations below both use these two styles, as the short sketch after this paragraph illustrates.
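A minimal sketch of the difference, with array shapes chosen to match MNIST (the random data here is purely illustrative):

import numpy as np

# Hypothetical data: 60000 flattened images, 10 centroids.
X = np.random.rand(60000, 784)
C = np.random.rand(10, 784)

# Nested-loop version: one scalar distance at a time -- far too slow here.
def dists_loop(X, C):
    d = np.zeros((len(C), len(X)))
    for i in range(len(C)):
        for j in range(len(X)):
            d[i, j] = np.sum((X[j] - C[i]) ** 2)
    return d

# Vectorized version: one broadcasted subtraction per centroid --
# exactly the pattern used inside kMeans below.
def dists_vec(X, C):
    return np.array([np.sum((X - c) ** 2, axis=1) for c in C])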

3. Basic Principles
I will come back and fill this section in when I have time.
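Until then, the one-line version (standard K-Means, nothing specific to this post): the algorithm alternates between assigning every sample to its nearest centroid and recomputing each centroid as the mean of its assigned samples, which locally minimizes the within-cluster sum of squares

$$\sum_{i=1}^{m} \lVert x_i - \mu_{c(i)} \rVert^2$$

where $c(i)$ is the cluster assigned to sample $x_i$ and $\mu_j$ is the centroid of cluster $j$.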

4. Code
K-Means

# Filename: kMeans.py
# Usage: python kMeans.py

import numpy as np
from munkres import Munkres
import matplotlib.pyplot as plt
import random


'''
np.load loads the data from train/test-images/labels.npy.
train-images.npy contains 60000 images, each stored as a flattened 784-dimensional vector (28*28).
train-labels.npy contains the corresponding labels, ranging from 0 to 9.
The 10000 test instances are in test-images.npy and test-labels.npy.
'''
# Load dataset, including train_images, train_labels, test_images, test_labels
train_images = np.load("train-images.npy")
train_labels = np.load("train-labels.npy")
test_images = np.load("test-images.npy")
test_labels = np.load("test-labels.npy")
'''
print(train_images.shape)       # (60000, 784)
print(train_labels.shape)       # (60000,)  
print(test_images.shape)        # (10000, 784)
print(test_labels.shape)        # (10000,)
'''


def distEclud(vecA, vecB):
    '''
    Compute the Euclidean distance between two vectors.
    '''
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


def randCent_1(dataSet, k):
    '''
    Build a set of k random centroids for the given dataset.
    The random centroids must lie within the bounds of the dataset, which we
    ensure by finding the minimum and maximum of each dimension, drawing
    values in [0, 1), and scaling them by the range and offsetting by the
    minimum.
    '''
    n = np.shape(dataSet)[1]
    centroids = np.mat(np.zeros((k, n)))
    for j in range(n):
        minJ = np.min(dataSet[:, j])
        rangeJ = float(np.max(dataSet[:, j]) - minJ)    # built-in float: np.float was removed from recent NumPy
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids


def randCent_2(dataSet, k):
    '''
    Distance-based method
    Start from one random data instance, then repeatedly choose the point
    whose total squared distance to the already chosen centers is largest.
    Issue: may choose outliers.
    '''
    m = np.shape(dataSet)[0]
    centroids = []
    index = random.randint(0, m - 1)
    centroids.append(dataSet[index])
    mySet = set()
    mySet.add(index)

    for i in range(1, k):
        dist = []
        for j in range(len(centroids)):
            dist.append(np.sum(np.power(dataSet - centroids[j], 2), axis = 1))
        dist = np.array(dist)
        max_dist = np.sum(dist, axis = 0)   # total squared distance to all chosen centers
        max_dist[list(mySet)] = -1          # never re-select an already chosen point, so we always get k centroids
        max_index = np.argmax(max_dist)
        mySet.add(max_index)
        centroids.append(dataSet[max_index])
    centroids = np.mat(centroids)
    return centroids


def randCent_3(dataSet, k):
    '''
    Random method
    Choose k data instances uniformly at random.
    Issue: may choose instances that are close to each other.
    '''
    centroids = np.array(random.sample(list(dataSet), k))
    centroids = np.mat(centroids)
    return centroids


def kMeans(dataSet, labels, k, iter_num, distMeas = distEclud, createCent = randCent_2):
    '''
    First determine the total number of samples in the dataset and create a
    matrix to store the cluster assignment of each point.
    The assignment matrix clusterAssment has two columns: the first records
    the cluster index, the second stores the error, i.e. the squared distance
    from the point to its cluster centroid.
    Iterate the assign-points / recompute-centroids cycle until no assignment
    changes (or iter_num is reached).
    '''
    epoch = []
    acc = []
    num = 0
    m = np.shape(dataSet)[0]
    clusterAssment = np.mat(np.zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged and num < iter_num:
        num += 1                        # iteration count + 1
        clusterChanged = False          # whether any cluster assignment changed
        # Find the nearest centroid for every point
        minDist = []
        for i in range(k):
            minDist.append(np.sum(np.power(dataSet - np.array(centroids)[i], 2), axis = 1))
        minDist = np.array(minDist)
        min_dist = np.min(minDist, axis = 0)            # smallest squared distance per point
        min_index = np.argmin(minDist, axis = 0)        # index of the nearest centroid
        if (clusterAssment.T[0] == min_index.T).all():
            clusterChanged = False                      # no assignment changed
        else:
            clusterChanged = True                       # some assignment changed
            clusterAssment.T[0] = min_index.T
        clusterAssment.T[1] = min_dist.T                # store the per-point error

        # Update the centroid positions
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            if len(ptsInClust) > 0:     # guard: the mean of an empty cluster would be NaN (see Section 6)
                centroids[cent, :] = np.mean(ptsInClust, axis = 0)
        min_index = maplabels(labels + 1, min_index + 1)    # Hungarian algorithm
        count = np.sum(labels + 1 == min_index)             # number of correctly classified samples
        accuracy = count / len(labels)                      # accuracy
        print("After iteration", num, "the clustering accuracy on the training set is", accuracy)
        epoch.append(num)
        acc.append(accuracy)
    plt.figure(1)
    plt.xlabel("iteration")
    plt.ylabel("accuracy")
    plt.title("accuracy-iteration")
    plt.plot(epoch, acc)
    plt.show()
    return centroids, clusterAssment


def calcAcc(dataSet, labels, centroids):
    # Given a dataset, its ground-truth labels and the centroids, assign each
    # point to its nearest centroid and compute the accuracy.
    # Find the nearest centroid
    minDist = []
    k = len(centroids)
    for i in range(k):
        minDist.append(np.sum(np.power(dataSet - np.array(centroids)[i], 2), axis = 1))
    minDist = np.array(minDist)
    min_index = np.argmin(minDist, axis = 0)
    # Compute the accuracy
    min_index = maplabels(labels + 1, min_index + 1)
    count = np.sum(labels + 1 == min_index)     # number of correctly classified samples
    accuracy = count / len(labels)              # accuracy
    return accuracy


def maplabels(L1, L2):
    '''
    Map the predicted cluster labels L2 onto the ground-truth labels L1 using
    the Hungarian algorithm (munkres): build the overlap-count matrix G, then
    find the label permutation that maximizes the total overlap.
    '''
    Label1 = np.unique(L1)
    Label2 = np.unique(L2)
    nClass1 = len(Label1)
    nClass2 = len(Label2)
    nClass = np.maximum(nClass1, nClass2)
    G = np.zeros((nClass, nClass))
    for i in range(nClass1):
        ind_cla1 = (L1 == Label1[i]).astype(float)
        for j in range(nClass2):
            ind_cla2 = (L2 == Label2[j]).astype(float)
            G[i, j] = np.sum(ind_cla2 * ind_cla1)

    m = Munkres()
    index = m.compute((-G.T).tolist())  # minimizing -overlap == maximizing overlap
    index = np.array(index) + 1
    newL2 = np.zeros(L2.shape, dtype=int)
    for i in range(nClass2):
        for j in range(len(L2)):
            if L2[j] == index[i, 0]:
                newL2[j] = index[i, 1]
    return newL2
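
# A tiny hypothetical example of what maplabels does (my addition, not part of
# the original script): if the clustering swaps the names of classes 1 and 2,
# the Hungarian matching undoes the swap:
#   L1 = np.array([1, 1, 2, 2, 3])   # ground-truth labels (1-based)
#   L2 = np.array([2, 2, 1, 1, 3])   # raw cluster ids (1-based)
#   maplabels(L1, L2)                # -> array([1, 1, 2, 2, 3])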


def k_foldCrossValidation(dataSet, k, i):
    '''
    k-fold cross-validation.
    First split the dataset D into k mutually exclusive subsets of similar
    size; each time, use the union of k-1 subsets as the training set and the
    remaining subset as the validation set. This yields k training/validation
    pairs, so we can train and validate k times and report the mean of the k
    validation results.
    :param dataSet: the dataset
    :param k: the number of mutually exclusive subsets of similar size
    :param i: the index of the subset used as the validation set
    :return: train_set: the training set
             validation_set: the validation set
    '''
    quotient = len(dataSet) // k
    remainder = len(dataSet) - quotient * k
    if i < remainder:
        count = quotient + 1
        first = i * count
    else:
        count = quotient
        first = i * count + remainder
    last = first + count
    # '+' on NumPy arrays is element-wise addition, not concatenation
    train_set = np.concatenate((dataSet[:first], dataSet[last:]))
    validation_set = dataSet[first:last]
    return train_set, validation_set


def main():
    print("Running...")
    centroids, clusterAssment = kMeans(train_images, train_labels, 10, 40)
    accuracy = calcAcc(test_images, test_labels, centroids)     # clustering accuracy on the test set
    print("Clustering accuracy on the test set:", accuracy)

if __name__ == '__main__':
    main()
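
Note that k_foldCrossValidation is defined but never called in main. A minimal sketch of how it could be wired in (the fold count of 5 and the reuse of kMeans/calcAcc here are my own choices, not from the original script; kMeans will pop up its accuracy plot once per fold):

# Hypothetical 5-fold validation loop (not part of the original script)
accs = []
for i in range(5):
    tr_x, va_x = k_foldCrossValidation(train_images, 5, i)
    tr_y, va_y = k_foldCrossValidation(train_labels, 5, i)  # same split for the labels
    cents, _ = kMeans(tr_x, tr_y, 10, 40)
    accs.append(calcAcc(va_x, va_y, cents))
print("mean validation accuracy:", np.mean(accs))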

5. Experimental Results

(Figures: accuracy-iteration curves, one for the Random method and one for the Distance-based method.)

6. Notes
Note that since the Random method picks k samples from the training set uniformly at random as the initial centroids, some of those k samples may end up close to each other. When running the code above with randCent_1 or randCent_3 as the centroid initializer, a cluster can therefore become empty in the first iteration; without the empty-cluster guard in kMeans this raises a RuntimeWarning (mean of an empty slice). If the clustering degenerates this way, press Ctrl+C to stop the program and rerun it with python kMeans.py.
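
To make such reruns reproducible, one option (my addition, not in the original code) is to fix the seeds of both random-number generators the script uses:

import random
import numpy as np

random.seed(42)       # used by randCent_2 / randCent_3 (42 is an arbitrary choice)
np.random.seed(42)    # used by randCent_1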
