正犯愁mnist数据集太小了,搜到了SVHN
看起来需要二次加工一下:用数据集自带的真值(ground truth)标签把数字抠出来
这样就是一个新的专门用来做数字分类的数据集
mat格式的数据真是对Python太不友好了
要先建好0-9这十个文件夹,这样抠出来的数字就直接写到对应的0-9文件夹里
import os
import h5py
from PIL import Image
%pylab inline
import cv2
import glob
def get_attrs(digit_struct_mat_file, index):
    """
    Return the bounding-box attributes stored for one entry of ``digitStruct/bbox``.

    Args:
        digit_struct_mat_file: An open HDF5 file object (e.g. an ``h5py.File``
            for ``digitStruct.mat``). It is indexed both by group name and by
            the object references stored inside the file.
        index: Zero-based row of ``digitStruct/bbox`` to read.

    Returns:
        A dictionary with the keys ``label``, ``left``, ``top``, ``width`` and
        ``height``. Each key maps to a list holding one value per annotated
        box of that entry (a one-element list when there is a single box).
    """
    f = digit_struct_mat_file
    # The selected bbox row holds exactly one object reference; it points at
    # the group that carries this entry's annotations.
    item = f['digitStruct']['bbox'][index].item()

    attrs = {}
    for key in ['label', 'left', 'top', 'width', 'height']:
        attr = f[item][key]
        # ``dataset[()]`` reads the whole dataset. The older ``Dataset.value``
        # accessor was removed in h5py 3.0, so it must not be used here.
        data = attr[()]
        if len(attr) > 1:
            # Several boxes: every row is a reference to another dataset whose
            # [0][0] element is the actual value.
            values = [f[row.item()][()][0][0] for row in data]
        else:
            # Single box: the value is stored inline in the dataset itself.
            values = [data[0][0]]
        attrs[key] = values
    return attrs
# Location of the annotation file that sits next to the training images.
path_to_digit_struct_mat_file = os.path.join('/home/nvidia/SVHN/train', 'digitStruct.mat')

# Path of the folder that stores the images (note the trailing slash).
path_to_dir = '/home/nvidia/SVHN/train/'

# Every PNG in that folder, ordered by plain string comparison of the paths.
paths = sorted(glob.glob(os.path.join(path_to_dir, '*.png')))

# Counter starting at zero; NOTE(review): its use is outside this span.
num = 0
for i in paths:
path_to_image_file = os.path.join(i)
index = int(path_to_image_file.split('/')[-1].split('.')[0]) - 1
# print(index, path_to_image_