python knn算法数据集和结果_python用K近邻（KNN）算法分类MNIST数据集和Fashion MNIST数据集...

最新推荐文章于 2021-10-24 19:04:06 发布

weixin_39521068

最新推荐文章于 2021-10-24 19:04:06 发布

阅读量712

点赞数

文章标签： python knn算法数据集和结果

三、数据集介绍

MNIST数据集，训练集60000张图片和标签；测试集有10000张图片和标签。读取28*28图片以后，要将每张图片转换为1*784的向量。

四、KNN算法实现和结果分析

代码实现：

from numpy import *

import operator

import os

import numpy as np

import matplotlib.pyplot as plt

from matplotlib import cm

from os import listdir

from mpl_toolkits.mplot3d import Axes3D

import struct

#读取图片

def read_image(file_name):
    """Load an IDX3 image file (MNIST / Fashion-MNIST) as flattened rows.

    The file begins with four big-endian unsigned 32-bit integers; the
    second, third and fourth are the image count, row count and column
    count. The first field (the IDX magic number) is read but not checked.
    One unsigned byte per pixel follows the header.

    Args:
        file_name: Path of the uncompressed image file.

    Returns:
        A float64 array of shape ``(image_count, rows * cols)``; each row is
        one image flattened in storage order.

    Raises:
        OSError: If the file cannot be opened.
        struct.error: If the file is shorter than the 16-byte header.
        ValueError: If the file holds fewer pixel bytes than the header
            announces.
    """
    header_fmt = '>IIII'

    # The context manager closes the handle even when reading fails.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    _magic, img_num, rows, cols = struct.unpack_from(header_fmt, file_content, 0)
    offset = struct.calcsize(header_fmt)

    # The row width comes from the header instead of a fixed 784, so files
    # with other image dimensions load correctly.
    image_size = rows * cols

    # Decode every pixel in one pass rather than unpacking image by image.
    pixels = np.frombuffer(
        file_content,
        dtype=np.uint8,
        count=img_num * image_size,
        offset=offset,
    )

    # Convert to float64 (the dtype callers already receive) so later
    # subtraction of two images cannot wrap around as it would in uint8.
    return pixels.reshape(img_num, image_size).astype(np.float64)

#读取标签

def read_label(file_name):
    """Load an IDX1 label file (MNIST / Fashion-MNIST).

    The file begins with two big-endian unsigned 32-bit integers; the second
    is the label count. The first field (the IDX magic number) is read but
    not checked. One unsigned byte per label follows the header.

    Args:
        file_name: Path of the uncompressed label file.

    Returns:
        A 1-D array of NumPy's default integer dtype holding one label per
        sample, in file order.

    Raises:
        OSError: If the file cannot be opened.
        struct.error: If the file is shorter than the 8-byte header.
        ValueError: If the file holds fewer labels than the header announces.
    """
    header_fmt = '>II'

    # The context manager closes the handle even when reading fails.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    _magic, label_num = struct.unpack_from(header_fmt, file_content, 0)

    raw_labels = np.frombuffer(
        file_content,
        dtype=np.uint8,
        count=label_num,
        offset=struct.calcsize(header_fmt),
    )

    # Widen to the default integer dtype, matching what callers received
    # when the labels were built from a tuple of Python ints.
    return raw_labels.astype(np.int_)

#KNN算法

def KNN(test_data, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. The training set may have any number of rows
    and features; it only has to agree with ``test_data`` in feature count.

    Args:
        test_data: 1-D feature vector of the sample to classify.
        dataSet: 2-D array of training samples, one per row.
        labels: Non-negative integer class labels, one per training row.
        k: Number of nearest neighbours that vote. Values larger than the
            training set use every training sample.

    Returns:
        The winning class label as a Python ``int``. When several classes
        tie, the largest label among them wins. With ``k <= 0`` nobody
        votes and the result is 9, the last entry of the ten-class table.
    """
    # Work in float64 so integer inputs (e.g. uint8 pixels) cannot wrap
    # around during subtraction.
    samples = np.asarray(dataSet, dtype=np.float64)
    query = np.asarray(test_data, dtype=np.float64)

    # Broadcasting subtracts the query from every training row, whatever
    # the size of the training set.
    distances = np.sqrt(((samples - query) ** 2).sum(axis=1))

    neighbour_count = k if k > 0 else 0
    nearest = distances.argsort()[:neighbour_count]

    # The table has at least ten slots (labels 0-9) and grows automatically
    # when a larger label value occurs.
    votes = np.bincount(np.asarray(labels)[nearest], minlength=10)

    # argmax on the reversed table finds the LAST maximum, i.e. the largest
    # label among tied classes.
    return int(votes.size - 1 - np.argmax(votes[::-1]))

def test_KNN():
    """Classify the test set with k-NN (k = 30) and print the scores.

    Loads the four IDX files from the current working directory, classifies
    the leading share of the test images against the full training set,
    prints every prediction next to its true label, and finally prints the
    error count, the error rate and the accuracy.
    """
    # MNIST and Fashion-MNIST use the same file names, so these relative
    # paths load whichever dataset is in the working directory.
    image_paths = ("train-images-idx3-ubyte", "t10k-images-idx3-ubyte")
    label_paths = ("train-labels-idx1-ubyte", "t10k-labels-idx1-ubyte")

    train_x, test_x = [read_image(path) for path in image_paths]
    train_y, test_y = [read_label(path) for path in label_paths]

    # Share of the test set to evaluate, counted from the first sample;
    # 1 means every test image.
    test_fraction = 1
    sample_total = int(test_x.shape[0] * test_fraction)

    mistakes = 0
    for idx in range(sample_total):
        predicted = KNN(test_x[idx], train_x, train_y, 30)
        print(predicted, test_y[idx])
        if predicted != test_y[idx]:
            # A prediction that differs from the true label is an error.
            mistakes += 1.0

    error_rate = mistakes / float(sample_total)

    print(mistakes)
    print("\nthe total number of errors is: %d" % mistakes)
    print("\nthe total error rate is: %f" % (error_rate))
    print("\nthe total accuracy rate is: %f" % (1.0 - error_rate))

if __name__ == "__main__":
    # test_KNN() loads the dataset, classifies the test samples and reports
    # the resulting error rate and accuracy.
    test_KNN()

结果分析：

输入：mnist数据集或者fashion mnist数据集

输出：出错率和准确率

Mnist数据集：

取k=30，使用测试集前50个样本时，准确率是1；

取k=30，使用测试集前500个样本时，准确率是0.98；

取k=30，使用全部10000个测试样本时，准确率是0.84。

Fashion Mnist数据集

取k=30，使用全部10000个测试样本时，出错个数共1666，准确率是0.8334。

本文对上述数据集采用KNN算法得到了较高的准确率，但算法默认各特征属性值对类别判断的重要性相同。改进算法时应考虑不同特征属性值对类别判断的重要性不同：可以用两样本间属性的相关距离来度量属性值对类别的重要性，相关距离熵越小，两样本的相似程度越大，类可信度越大。此外，还应对不同取值的k分别进行试验，找出使准确率较高的k；在试验多个k时，可以采用多线程并行运行实验，以缩短时间。

weixin_39521068

关注

0
点赞
踩
8

收藏

觉得还不错? 一键收藏
0
评论
python knn算法数据集和结果_python用K近邻（KNN）算法分类MNIST数据集和Fashion MNIST数据集...

三、数据集介绍MNIST数据集，训练集60000张图片和标签；测试集有10000张图片和标签。读取28*28图片以后，要将每张图片转换为1*784的向量。四、KNN算法实现和结果分析代码实现：from numpy import *import operatorimport osimport numpy as npimport matplotlib.pyplot as pltfrom matplot...
复制链接

扫一扫