官网提供的 MNIST 数据集通常是压缩的 idx-ubyte 二进制文件,实际使用时往往需要先将其转换为以下两种形式之一:
(1)、解压成图片格式存入文件夹
(2)、或者保存成csv格式的文档
(1)保存成图片格式(windows下)
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 18 17:36:14 2020
unzip_mnist
"""
import struct
from array import array
import numpy as np
import os
from PIL import Image
import cv2
# Locations of the four raw IDX input files and of the two output image trees.
trainimg = './fashion_mnist_origin/train-images.idx3-ubyte'
trainlabel = './fashion_mnist_origin/train-labels.idx1-ubyte'
testimg = './fashion_mnist_origin/t10k-images.idx3-ubyte'
testlabel = './fashion_mnist_origin/t10k-labels.idx1-ubyte'
trainfolder = './fashion_mnist_origin/train'
testfolder = './fashion_mnist_origin/test'
# exist_ok=True creates a folder only when it is missing, without a separate
# os.path.exists() check that could race with another process creating it.
os.makedirs(trainfolder, exist_ok=True)
os.makedirs(testfolder, exist_ok=True)
# Load the raw pixel and label bytes of both splits into memory.
# All four files are opened in binary read mode ('rb') inside one `with`
# statement, so every handle is closed even if a read or a header unpack
# fails part-way through.
with open(trainimg, 'rb') as trimg, open(testimg, 'rb') as teimg, \
        open(trainlabel, 'rb') as trlab, open(testlabel, 'rb') as telab:
    # Consume the headers so that only payload bytes remain: four big-endian
    # unsigned 32-bit fields for the image files, two for the label files.
    # struct.unpack raises struct.error if a file is shorter than its header.
    struct.unpack(">IIII", trimg.read(16))
    struct.unpack(">IIII", teimg.read(16))
    struct.unpack(">II", trlab.read(8))
    struct.unpack(">II", telab.read(8))
    # array("B", ...) stores the remaining bytes as unsigned 8-bit integers.
    # The labels are read unsigned as well: with the signed "b" typecode a
    # byte of 128 or more would become negative and, when used as a list
    # index, would silently select an entry counted from the end of the list.
    trimage = array("B", trimg.read())
    teimage = array("B", teimg.read())
    trlabel = array("B", trlab.read())
    telabel = array("B", telab.read())
# One sub-folder per class label, named "0" .. "9", under each split folder.
trainfolders = [os.path.join(trainfolder, str(i)) for i in range(10)]
testfolders = [os.path.join(testfolder, str(i)) for i in range(10)]
# The loop variable is deliberately not named `dir`, which would shadow the
# builtin for the rest of the module. exist_ok=True skips folders that
# already exist without a separate, race-prone os.path.exists() check.
for class_folder in trainfolders + testfolders:
    os.makedirs(class_folder, exist_ok=True)
# Save every training image as <trainfolders[label]>/<index>.png.
# np.frombuffer views the flat byte buffer as uint8 values without copying.
# `count` makes it raise ValueError up front when the buffer holds fewer than
# len(trlabel) images of 28 * 28 pixels; extra trailing bytes are ignored.
train_pixels = np.frombuffer(
    trimage, dtype=np.uint8, count=len(trlabel) * 28 * 28
).reshape(len(trlabel), 28, 28)
for i, label in enumerate(trlabel):
    filename = os.path.join(trainfolders[label], str(i) + ".png")
    print("writing " + filename)
    # cv2.imwrite creates and writes the file on its own, so no separate
    # open() of the same path is needed. It reports failure through its
    # return value, which is checked so a failed write is not silently lost.
    if not cv2.imwrite(filename, train_pixels[i]):
        raise IOError("failed to write " + filename)
# Save every test image as <testfolders[label]>/<index>.png.
# np.frombuffer views the flat byte buffer as uint8 values without copying.
# `count` makes it raise ValueError up front when the buffer holds fewer than
# len(telabel) images of 28 * 28 pixels; extra trailing bytes are ignored.
test_pixels = np.frombuffer(
    teimage, dtype=np.uint8, count=len(telabel) * 28 * 28
).reshape(len(telabel), 28, 28)
for i, label in enumerate(telabel):
    filename = os.path.join(testfolders[label], str(i) + ".png")
    print("writing " + filename)
    # cv2.imwrite creates and writes the file on its own, so no separate
    # open() of the same path is needed. It reports failure through its
    # return value, which is checked so a failed write is not silently lost.
    if not cv2.imwrite(filename, test_pixels[i]):
        raise IOError("failed to write " + filename)
以上代码在下面这位作者原文的基础上稍作了修改:
https://blog.csdn.net/SongGu1996/article/details/98849274
(2)存成CSV格式的文档
def convert(imgf, labelf, outf, n):
    """Write the first *n* records of an image/label file pair as CSV.

    Each output line holds one record: the label value followed by the
    28 * 28 pixel values, all written as decimal integers separated by
    commas and terminated by a newline.

    Args:
        imgf: Path of the image file; its first 16 bytes are skipped as header.
        labelf: Path of the label file; its first 8 bytes are skipped as header.
        outf: Path of the CSV file to create (an existing file is overwritten).
        n: Number of records to convert.

    Raises:
        ValueError: If either input file ends before *n* records were read.
            Rows converted before that point remain in *outf*.
    """
    pixels_per_image = 28 * 28
    # `with` closes all three files even when a read or a write fails.
    with open(imgf, "rb") as f, open(outf, "w") as o, open(labelf, "rb") as l:
        f.read(16)
        l.read(8)
        for i in range(n):
            # Read one whole record per iteration instead of one byte at a time.
            label = l.read(1)
            pixels = f.read(pixels_per_image)
            if len(label) != 1 or len(pixels) != pixels_per_image:
                raise ValueError(
                    "input ended after {} of {} records".format(i, n)
                )
            # Iterating a bytearray yields the byte values as integers.
            record = bytearray(label + pixels)
            # Rows are written as they are produced, so memory use does not
            # grow with n.
            o.write(",".join(str(value) for value in record) + "\n")
# Export both MNIST splits to CSV. Each job lists the arguments of one
# convert() call: image file, label file, output CSV, number of records.
conversion_jobs = (
    (
        "mnist_origin/train-images.idx3-ubyte",
        "mnist_origin/train-labels.idx1-ubyte",
        "mnist_train.csv",
        60000,
    ),
    (
        "mnist_origin/t10k-images.idx3-ubyte",
        "mnist_origin/t10k-labels.idx1-ubyte",
        "mnist_test.csv",
        10000,
    ),
)
for job in conversion_jobs:
    convert(*job)
print("Convert Finished!")