掌控MNIST数据集

最新推荐文章于 2023-09-20 11:34:00 发布

木盏

最新推荐文章于 2023-09-20 11:34:00 发布

阅读量386

点赞数

分类专栏： python Computer Vision

本文链接：https://blog.csdn.net/leviopku/article/details/80526644

版权

python 同时被 2 个专栏收录

130 篇文章 25 订阅

订阅专栏

Computer Vision

75 篇文章 79 订阅

订阅专栏

MNIST数据集号称是神经网络界的“Hello World”，凡入门CNN，必先把MNIST玩得666才行。

MNIST有四个文件：训练集，训练标签，测试集，测试标签。
（四个 gzip 压缩包合计约 11 MB）
下载链接: http://yann.lecun.com/exdb/mnist/

数据集提供的是图像和对应的标签（数字），但这些文件是以 IDX 二进制格式存储、再用 gzip 压缩的，并不能直接当作图片打开。那么，如果想看到图像该怎么做？如果想不依靠任何深度学习框架来玩MNIST数据集，又该怎么做？
废话不多说，直接看代码：

#!/usr/bin/env python
# encoding: utf-8
import struct
from PIL import Image
import numpy as np
import gzip
import os.path as osp


class Format_decoder():
	"""Decode gzip-compressed MNIST IDX files without any deep-learning framework.

	The public methods are static: call them on the class
	(``Format_decoder.read_label(path)``) or on an instance. Each one reads a
	``.gz`` file, decodes the big-endian IDX header followed by the
	unsigned-byte payload, and writes its result to disk. They return ``None``.
	"""

	# Magic numbers that open IDX files holding unsigned-byte data:
	# 0x00000803 marks a 3-D (image) file, 0x00000801 a 1-D (label) file.
	_IMAGE_MAGIC = 2051
	_LABEL_MAGIC = 2049

	def __init__(self):
		pass

	@staticmethod
	def _read_gzip(filename):
		"""Return the fully decompressed contents of ``filename`` as bytes."""
		# The context manager closes the file even if decompression fails.
		with gzip.GzipFile(filename) as g_file:
			return g_file.read()

	@staticmethod
	def _output_path(filename, separator, suffix):
		"""Build an output path next to ``filename``.

		The base name is cut at the first ``separator`` and ``suffix`` is
		appended. Only the base name is split, so dots or dashes in the
		directory part (``./data``, ``my-data/``) do not corrupt the result.
		"""
		directory, base = osp.split(filename)
		return osp.join(directory, base.split(separator)[0] + suffix)

	@staticmethod
	def _decode_images(buf):
		"""Decode an IDX3 image buffer into a uint8 array.

		Args:
			buf: decompressed file contents (bytes).

		Returns:
			Read-only array of shape ``(images, rows, columns)``, dtype uint8.

		Raises:
			struct.error: if ``buf`` is shorter than the 16-byte header.
			ValueError: if the magic number is not that of an IDX3 ubyte file,
				or the payload is shorter than the header announces.
		"""
		header = '>IIII'
		magic, images, rows, columns = struct.unpack_from(header, buf, 0)
		if magic != Format_decoder._IMAGE_MAGIC:
			raise ValueError('not an IDX3 image file (magic number %d)' % magic)
		offset = struct.calcsize(header)
		count = images * rows * columns
		if len(buf) - offset < count:
			raise ValueError('image file is truncated: expected %d pixel bytes, found %d'
							 % (count, len(buf) - offset))
		if count == 0:
			return np.empty((images, rows, columns), dtype=np.uint8)
		# One vectorised read replaces a struct.unpack_from call per pixel.
		# The payload is row-major, so reshaping yields [image, row, column].
		pixels = np.frombuffer(buf, dtype=np.uint8, count=count, offset=offset)
		return pixels.reshape(images, rows, columns)

	@staticmethod
	def save_images_into_file(filename):
		"""Write every image of an IDX3 ``.gz`` file to ``imgs/<index>.png``.

		The ``imgs`` directory is created in the current working directory
		when it does not exist yet.

		Args:
			filename: path of the gzip-compressed IDX3 image file.

		Raises:
			ValueError: if the file is not a well-formed IDX3 ubyte file.
		"""
		import os  # local import: the module only imports os.path (as osp)

		all_arrays = Format_decoder._decode_images(Format_decoder._read_gzip(filename))
		os.makedirs('imgs', exist_ok=True)
		for i, pixels in enumerate(all_arrays):
			print('save ' + str(i) + 'image')
			# A 2-D uint8 array becomes an 8-bit greyscale ('L') image.
			Image.fromarray(pixels).save('imgs/' + str(i) + '.png')

	@staticmethod
	def reform_data_into_npy(filename):
		"""Convert an IDX3 ``.gz`` image file into a ``.npy`` array file.

		The output sits next to the input and is named after the part of the
		base name before its first dot, e.g. ``t10k-images-idx3-ubyte.gz`` ->
		``t10k-images-idx3-ubyte.npy``. Nothing is done when that file already
		exists.

		Args:
			filename: path of the gzip-compressed IDX3 image file.

		Raises:
			ValueError: if the file is not a well-formed IDX3 ubyte file.
		"""
		saveFilename = Format_decoder._output_path(filename, '.', '.npy')
		if osp.exists(saveFilename):
			print(saveFilename+' has already existed')
			return
		all_arrays = Format_decoder._decode_images(Format_decoder._read_gzip(filename))
		print('processing successfully!')
		print(np.shape(all_arrays))
		np.save(saveFilename, all_arrays)

	@staticmethod
	def read_label(filename):
		"""Convert an IDX1 ``.gz`` label file into a comma-separated text file.

		The output sits next to the input and is named after the part of the
		base name before its first dash, e.g. ``t10k-labels-idx1-ubyte.gz`` ->
		``t10k-label.txt``. It holds one line with all labels joined by commas.
		Nothing is done when that file already exists.

		Args:
			filename: path of the gzip-compressed IDX1 label file.

		Raises:
			ValueError: if the magic number is not that of an IDX1 ubyte file.
			struct.error: if the file is shorter than its header announces.
		"""
		saveFilename = Format_decoder._output_path(filename, '-', '-label.txt')
		if osp.exists(saveFilename):
			print(saveFilename+' has already existed')
			return
		buf = Format_decoder._read_gzip(filename)
		header = '>II'
		magic, labels = struct.unpack_from(header, buf, 0)
		if magic != Format_decoder._LABEL_MAGIC:
			raise ValueError('not an IDX1 label file (magic number %d)' % magic)
		# Unpack all label bytes in one call; struct raises if the payload is short.
		labelArr = struct.unpack_from('>%dB' % labels, buf, struct.calcsize(header))
		with open(saveFilename, 'w') as save:
			save.write(','.join(map(str, labelArr)))
			save.write('\n')
		print('save labels success')


if __name__ == '__main__':
	# Convert the test split first, then the training split; for each split
	# the image file is turned into .npy before its labels are written out.
	for split in ('t10k', 'train'):
		Format_decoder.reform_data_into_npy(split + '-images-idx3-ubyte.gz')
		Format_decoder.read_label(split + '-labels-idx1-ubyte.gz')