MNIST数据集简介
# MNIST 数据集合共包含70000张手写数字图片
# 其中60000张用作训练集
# 10000张用作测试集
# 数据集包含了0-9共10类手写数字图片,每张
# 图片都做了尺寸归一化,都是28x28大小的灰
# 度图。每张图片中像素值大小在0-255之间,
# 其中0是黑色背景,255是白色前景
# 文件数据是以二进制文件进行存储的
# **************************************#
# train-images-idx3-ubyte.gz # 训练集图片
# train-labels-idx1-ubyte.gz # 训练集标签
# t10k-images-idx3-ubyte.gz # 测试集图片
# t10k-labels-idx1-ubyte.gz # 测试集标签
# ******************************图像文件**********************#
# 一张照片包含28*28=784 个像素点,需要784bytes的存储空间
# 60000张图片则需要784*60000=47040000bytes的存储空间
# 文件开始处使用了16个bytes用于存储magic number、图像数量
# 图像高度和图像宽度,因此,训练集图像文件的大小应该是
# 47040000+16 = 47040016 bytes
# 1-4 个bytes 存的是文件的magic number
# 5-8 个bytes 存的是图像的个数,60000
# 9-12 个bytes 存的是每张图片的行数/高度,28
# 13-16 个bytes 存的是每张图片的列数/宽度,28
# 从第17个byte开始,每个byte存储一张图片中的一个像素点
# *******标签文件#
# 一个数字占有1Byte
# 1*60000+8=60008bytes
# 1-4 个bytes存储的是文件的magic number
# 5-8 个bytes存的是number of items 即label数量60000
# 从第9个byte开始,每个byte存一张图片的label信息,即数字0-9中的一个
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
代码
# coding=utf-8
'''将二进制格式的MNIST数据集转成.jpg图片格式并保存,图片标签包含在图片名中'''
import numpy as np
import cv2
import os
def save_mnist_to_jpg(mnist_image_file, mnist_label_file, save_dir):
    """Export every image of an MNIST idx file pair as ``<index>_<label>.jpg``.

    The image file starts with a 16-byte header (magic number, image count,
    rows, columns) and the label file with an 8-byte header (magic number,
    item count). Each header field is a 4-byte big-endian integer. After the
    header, the image file stores one byte per pixel and the label file one
    byte per label.

    The image count and image size are read from the header instead of being
    guessed from the file name, so subsets and non-28x28 idx files work too.

    Args:
        mnist_image_file: Path to the uncompressed idx3-ubyte image file.
        mnist_label_file: Path to the uncompressed idx1-ubyte label file.
        save_dir: Directory that receives the .jpg files; created if missing.

    Raises:
        ValueError: If a file is shorter than its header, the two files
            disagree on the item count, or the payload is truncated.
        OSError: If OpenCV fails to write an image.
    """
    image_header_size = 16
    label_header_size = 8

    with open(mnist_image_file, 'rb') as f1:
        image_file = f1.read()
    with open(mnist_label_file, 'rb') as f2:
        label_file = f2.read()

    if len(image_file) < image_header_size or len(label_file) < label_header_size:
        raise ValueError("MNIST file is shorter than its idx header")

    # Header fields are 4-byte big-endian integers; bytes 0-3 hold the magic number.
    num_file = int.from_bytes(image_file[4:8], 'big')
    rows = int.from_bytes(image_file[8:12], 'big')
    cols = int.from_bytes(image_file[12:16], 'big')
    num_labels = int.from_bytes(label_file[4:8], 'big')
    print("file num is ", num_file)

    pixels_per_image = rows * cols
    labels = label_file[label_header_size:]
    pixel_bytes = len(image_file) - image_header_size
    print(pixel_bytes)
    print(len(labels))

    if num_labels != num_file:
        raise ValueError(
            "image file declares %d images but label file declares %d labels"
            % (num_file, num_labels)
        )
    if num_file and pixels_per_image == 0:
        raise ValueError("image header declares an empty image size")
    if pixel_bytes < num_file * pixels_per_image or len(labels) < num_file:
        raise ValueError("MNIST payload is shorter than its header declares")

    # Preview the first labels; zip() stops early when fewer than two exist.
    for ordinal, label in zip(("first", "second"), labels):
        print("The %s number is " % ordinal, label)

    # cv2.imwrite does not create missing directories, it just reports failure.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for i in range(num_file):
        label_name = labels[i]  # indexing bytes yields the label as an int
        # View this image's pixels directly in the file buffer (no per-pixel
        # Python conversion); the trailing axis is the single gray channel.
        image_array = np.frombuffer(
            image_file,
            dtype=np.uint8,
            count=pixels_per_image,
            offset=image_header_size + i * pixels_per_image,
        ).reshape(rows, cols, 1)
        # The file name carries the 1-based index and the label.
        image_name = os.path.join(save_dir, str(i + 1) + '_' + str(label_name) + '.jpg')
        if not cv2.imwrite(image_name, image_array):
            raise OSError("failed to write image: " + image_name)
# 参考网址:https://www.jianshu.com/p/e7c286530ab9
# 运动小爽博主对MNIST数据集介绍的很清楚
if __name__ == '__main__':
    # Uncompressed MNIST idx files (image/label pairs for both splits).
    train_image_file = './mnist_dataset/train-images.idx3-ubyte'
    train_label_file = './mnist_dataset/train-labels.idx1-ubyte'
    test_image_file = './mnist_dataset/t10k-images.idx3-ubyte'
    test_label_file = './mnist_dataset/t10k-labels.idx1-ubyte'

    # Output directories for the exported .jpg files.
    save_train_dir = './train_images/'
    save_test_dir = './test_images/'

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    for out_dir in (save_train_dir, save_test_dir):
        os.makedirs(out_dir, exist_ok=True)

    save_mnist_to_jpg(train_image_file, train_label_file, save_train_dir)
    save_mnist_to_jpg(test_image_file, test_label_file, save_test_dir)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
重要部分
for i in range(num_file):
    label_name = label_file[i]  # label that belongs to image i
    # Convert the 784 raw bytes of image i into a list of integer pixel values.
    image_list = [int(item) for item in image_file[i * 784: i*784+784]]
    image_array = np.array(image_list,dtype=np.uint8).reshape(28,28,1)
    print(image_array.shape)
    # Output path: 1-based image index and label are encoded in the file name.
    image_name = os.path.join(save_dir,str(i+1)+'_'+str(label_name)+'.jpg')
    cv2.imwrite(image_name,image_array)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
参考
- 参考网址:https://www.jianshu.com/p/e7c286530ab9
- 运动小爽博主对MNIST数据集介绍的很清楚[嘻嘻]
- 感谢博主
结果(百度网盘自取)
链接:https://pan.baidu.com/s/1VGEgS5-mYb6RQc8NaPMwnw
提取码:ck0r
</div><div data-report-view="{"mod":"1585297308_001","dest":"https://blog.csdn.net/weixin_42473844/article/details/103995329","extend1":"pc","ab":"new"}"><div></div></div>
<link href="https://csdnimg.cn/release/phoenix/mdeditor/markdown_views-60ecaf1f42.css" rel="stylesheet">
</div>
</article>