Python byte streams and the struct module: a short tutorial (parsing MNIST IDX files)

# -*- coding=utf-8 -*-

"""

解析MNIST数据集的IDX格式文件

"""

import scipy.misc

import numpy as np

import struct

import matplotlib.pyplot as plt

import os

# 数据集存放目录

dataset_path = "/home/ryancrj/data/mnist-dataset/"

# 训练数据集文件

train_image_idx_ubyte_file = 'train-images.idx3-ubyte'

train_labels_idx_ubyte_file = 'train-labels.idx1-ubyte'

save_train_images_path = "train_images"

save_train_labels_file= "train_labels.txt"

# 测试数据集文件

test_image_idx_ubyte_file = 't10k-images.idx3-ubyte'

test_labels_idx_ubyte_file = 't10k-labels.idx1-ubyte'

save_test_images_path = "test_images"

save_test_labels_file = "test_labels.txt"

def decode_idx3_ubyte(idx3_ubyte_file, save_path=None):
    """Parse an MNIST idx3-ubyte image file.

    :param idx3_ubyte_file: path to the idx3-ubyte file
    :param save_path: optional directory; when given, every image is also
        written there as ``<index>.jpg`` (1-based index). ``None`` skips saving.
    :return: numpy array of shape (num_images, num_rows, num_cols)
    """
    # Read the whole file as one binary blob; `with` guarantees the handle
    # is closed (the original leaked it).
    with open(idx3_ubyte_file, 'rb') as f:
        bin_data = f.read()

    # Header: magic number, image count, rows, cols -- four big-endian int32.
    header = struct.Struct('>iiii')
    magic_number, num_images, num_rows, num_cols = header.unpack_from(bin_data, 0)
    print('魔术数: {},图片数量: {},图片大小: {} * {}'.format(
        magic_number, num_images, num_rows, num_cols))

    # Body: each image is rows*cols unsigned bytes, big-endian.
    # Pre-compile the per-image format once instead of re-parsing it per image.
    image_size = num_rows * num_cols
    fmt_image = struct.Struct('>' + str(image_size) + 'B')
    offset = header.size
    images = np.empty((num_images, num_rows, num_cols))
    for i in range(num_images):
        if (i + 1) % 10000 == 0:
            print("已经解析 %d" % (i + 1) + " 张")
        images[i] = np.array(fmt_image.unpack_from(bin_data, offset)).reshape(
            (num_rows, num_cols))
        offset += fmt_image.size  # advance by the size of one packed image
        if save_path is not None:
            # scipy.misc.imsave was removed from SciPy (>=1.3); matplotlib's
            # imsave (already imported by this file) is the drop-in replacement.
            plt.imsave(os.path.join(save_path, '{}.jpg'.format(i + 1)),
                       images[i], cmap='gray')
    return images

def decode_idx1_ubyte(idx1_ubyte_file, save_file=None):
    """Parse an MNIST idx1-ubyte label file.

    :param idx1_ubyte_file: path to the idx1-ubyte file
    :param save_file: optional text-file path; when given, one label per line
        is written there. ``None`` skips writing.
    :return: numpy array of shape (num_labels,)
    """
    # Read the whole file as one binary blob; `with` closes the handle
    # (the original leaked both this handle and the output handle).
    with open(idx1_ubyte_file, 'rb') as f:
        bin_data = f.read()

    # Header: magic number and label count -- two big-endian int32.
    header = struct.Struct('>ii')
    magic_number, num_labels = header.unpack_from(bin_data, 0)
    print('魔术数: {},标签数量: {}'.format(magic_number, num_labels))

    # Body: one unsigned byte per label.
    fmt_label = struct.Struct('>B')
    offset = header.size
    labels = np.empty(num_labels)
    for i in range(num_labels):
        if (i + 1) % 10000 == 0:
            print("已经解析 %d" % (i + 1) + " 个")
        labels[i] = fmt_label.unpack_from(bin_data, offset)[0]
        offset += fmt_label.size  # advance by one packed label
    if save_file is not None:
        # Batch the writes; the resulting file is identical to the original's
        # line-by-line output.
        with open(save_file, 'w') as fout:
            fout.writelines(str(int(v)) + '\n' for v in labels)
    return labels

def load_train_images():
    """Decode the training images, saving each one as a JPEG under dataset_path."""
    save_image_path = os.path.join(dataset_path, save_train_images_path)
    # makedirs(..., exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and also creates any missing parent dirs.
    os.makedirs(save_image_path, exist_ok=True)
    return decode_idx3_ubyte(
        os.path.join(dataset_path, train_image_idx_ubyte_file), save_image_path)

def load_train_labels():
    """Decode the training labels, also writing them to the labels text file."""
    label_file = os.path.join(dataset_path, train_labels_idx_ubyte_file)
    out_file = os.path.join(dataset_path, save_train_labels_file)
    return decode_idx1_ubyte(label_file, out_file)

def load_test_images():
    """Decode the test images, saving each one as a JPEG under dataset_path."""
    save_image_path = os.path.join(dataset_path, save_test_images_path)
    # makedirs(..., exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and also creates any missing parent dirs.
    os.makedirs(save_image_path, exist_ok=True)
    return decode_idx3_ubyte(
        os.path.join(dataset_path, test_image_idx_ubyte_file), save_image_path)

def load_test_labels():
    """Decode the test labels, also writing them to the labels text file."""
    label_file = os.path.join(dataset_path, test_labels_idx_ubyte_file)
    out_file = os.path.join(dataset_path, save_test_labels_file)
    return decode_idx1_ubyte(label_file, out_file)

def test():
    """Smoke test: decode the test set and display the first 10 digits.

    Blocks on plt.show() for each image; intended for interactive use only.
    """
    test_images = load_test_images()
    test_labels = load_test_labels()
    # Inspect the first 10 labels together with their images.
    for i in range(10):
        print(test_labels[i])
        plt.imshow(test_images[i], cmap='gray')
        plt.show()
    print('done')

def parse_data():
    """Decode all four MNIST files, writing images and label text files to disk.

    Return values are intentionally discarded: this entry point exists only
    for its file-writing side effects.
    """
    load_train_images()
    load_train_labels()
    load_test_images()
    load_test_labels()


if __name__ == '__main__':
    parse_data()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值