一、Creating the HDF5 Dataset
1、Introduction to HDF5
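HDF5 (Hierarchical Data Format, version 5) is a container format for large multi-dimensional arrays, organized as named datasets inside a single file. Caffe's HDF5Data layer can read both image data and multi-dimensional float labels from such files, which makes HDF5 a convenient data source for multi-label or regression tasks (such as the landmark coordinates used below) that the usual single integer label of an LMDB source cannot express. As a quick orientation, here is a minimal sketch (the file name and zero-filled arrays are made up for illustration) of writing and reading the two named datasets that the script below produces and that the HDF5Data layer later looks for:
import h5py
import numpy as np

# write a tiny hdf5 file containing the two datasets used throughout this post
with h5py.File('demo.h5', 'w') as f:
    f.create_dataset('img_data', data=np.zeros((4, 3, 106, 106), dtype=np.float32))
    f.create_dataset('img_label', data=np.zeros((4, 8, 1, 1), dtype=np.float32))

# read it back; each dataset behaves like a numpy array
with h5py.File('demo.h5', 'r') as f:
    print(list(f.keys()))        # ['img_data', 'img_label']
    print(f['img_data'].shape)   # (4, 3, 106, 106)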
2、Overall workflow
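The pipeline is straightforward: prepare a text file listing each image path followed by its label values (here 8 landmark coordinates per image), read and preprocess every image with OpenCV, pack the image and label arrays into one or more .h5 files with h5py, and finally record the .h5 file names in a list file that Caffe's HDF5Data layer points to via its source parameter.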
3、Code example
# -*- coding: utf-8 -*-
"""
# train_img_list.txt content
./landmark/1.jpg 0.01 1.01 0.02 0.98 -0.04 0.44 0.54 1.06
./landmark/2.jpg 0.04 0.99 0.06 0.98 0.21 0.05 0.95 0.78
...
# train_h5_list.txt content
train0.h5
train1.h5
train2.h5
...
"""
import os
import sys
import cv2
import h5py
import random
import numpy as np
def get_img_info(img_list, img_h, img_w, label_len):
    """
    Args:
        img_list: str, path to the txt file listing image paths and labels
        img_h: int, height of the resized img
        img_w: int, width of the resized img
        label_len: int, number of label values per image
    Returns:
        number of successfully loaded images, the filled image array, the filled label array
    """
    with open(img_list, 'r') as f:
        lines = f.readlines()
    random.shuffle(lines)
    img_num_f = len(lines)
    # keep the image list from getting too long, to avoid running out of memory (<= 200k entries)
    images_f = np.zeros([img_num_f, 3, img_h, img_w], dtype=np.float32)
    labels_f = np.zeros([img_num_f, label_len, 1, 1], dtype=np.float32)
    cnt = 0
    for index in range(img_num_f):
        line = lines[index]
        img_path = line.strip().split(' ')[0]
        img_label = np.array(line.strip().split(' ')[1:]).astype(np.float32)
        try:
            img = cv2.imread(img_path)
            img = cv2.resize(img, (img_w, img_h))  # cv2.resize takes (width, height)
            # img = img[:, :, ::-1]  # BGR2RGB
            img = (img - 127.5) / 128  # pre_process
            # images are loaded as H x W x C, but Caffe expects C x H x W, so transpose
            img = img.transpose(2, 0, 1)
            # write to position cnt so that unreadable images do not leave zero-filled entries
            images_f[cnt, :, :, :] = img.astype(np.float32)
            labels_f[cnt, :] = img_label.reshape(label_len, 1, 1)
            cnt += 1
            if cnt % 2 == 0:
                sys.stdout.write('\r>> %d/%d images have been processed' % (cnt, img_num_f))
                sys.stdout.flush()  # force-flush the buffer so progress prints immediately
        except Exception as e:
            print('error reason is {}'.format(e))
            print('error img is {}'.format(img_path))
            continue
    # only the first cnt entries are valid
    return cnt, images_f[:cnt], labels_f[:cnt]

def gen_hdf5(img_list, h5_list, out_prefix):
    """
    Args:
        img_list: str, path to the txt file listing image paths and labels
        h5_list: str, path of the txt file that will list the generated hdf5 files
        out_prefix: str, prefix of the generated hdf5 file names
    Returns:
        None
    """
    img_num, images, labels = get_img_info(img_list=img_list, img_h=106, img_w=106, label_len=8)
    if os.path.exists(h5_list):
        os.remove(h5_list)
    # use gzip compression; levels range from 1 to 9, 1 is the fastest but compresses the least
    comp_kwargs = {'compression': 'gzip', 'compression_opts': 1}
    # number of samples per hdf5 file; keep each file under ~4 GB, or set it very large to get a single file
    # when choosing h5_capacity, note: chunk shape must not be greater than the data shape in any dimension,
    # e.g. chunks of (128, 3, 106, 106) are not compatible with data of shape (48, 3, 106, 106)
    h5_capacity = 200000
    batch_num = int(np.ceil(1.0 * img_num / h5_capacity))
    for i in range(batch_num):
        start = i * h5_capacity
        end = min((i + 1) * h5_capacity, img_num)
        file_name = '{}{}.h5'.format(out_prefix, i)
        print('\nWriting hdf5, this will take a few minutes!')
        # when chunking, set chunks[0] close to the training batch_size (128 here); layout is [N, C, H, W]
        # cap chunks[0] at the actual slice size so a smaller last file does not violate the rule above
        chunk_n = min(128, end - start)
        with h5py.File(file_name, 'w') as f:
            f.create_dataset('img_data', data=images[start:end].astype(np.float32),
                             chunks=(chunk_n, 3, 106, 106), **comp_kwargs)
            f.create_dataset('img_label', data=labels[start:end].astype(np.float32),
                             chunks=(chunk_n, 8, 1, 1), **comp_kwargs)
        # append to the list file; there can be multiple hdf5 files
        with open(h5_list, 'a') as f:
            f.write(file_name + '\n')


if __name__ == '__main__':
    gen_hdf5(img_list='test_img_list.txt', h5_list='test_h5_list.txt', out_prefix='test')
    print('gen test h5 done!')
    gen_hdf5(img_list='train_img_list.txt', h5_list='train_h5_list.txt', out_prefix='train')
    print('gen train h5 done!')
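After generation it is worth a quick sanity check (not part of the original script) that the dataset names and shapes in the generated files match the top blobs that the prototxt in the next section expects. Assuming the train prefix used above, the first file can be inspected like this:
import h5py

# inspect the first generated training file (the name follows the out_prefix above)
with h5py.File('train0.h5', 'r') as f:
    print(f['img_data'].shape, f['img_data'].dtype)    # (N, 3, 106, 106) float32
    print(f['img_label'].shape, f['img_label'].dtype)  # (N, 8, 1, 1) float32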
二、Configuring the HDF5Data Layer in Caffe
name: "HDF5 Input Demo"
# Data Layer
layer {
name: "data"
type: "HDF5Data"
top: "img_data"
top: "img_label"
include {
phase: TRAIN
}
hdf5_data_param {
source: "train_h5_list.txt" # 所有需要读取的 hdf5 文件的路径
batch_size: 128
shuffle: true
}
}
layer {
name: "data"
type: "HDF5Data"
top: "img_data"
top: "img_label"
include {
phase: TEST
}
hdf5_data_param {
source: "test_h5_list.txt"
batch_size: 128
}
}
# Loss Layer
layer {
name: "regression_loss"
type: "EuclideanLoss"
bottom: "conv9"
bottom: "img_label"
top: "regression_loss"
}
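For reference, the EuclideanLoss layer computes 1/(2N) * sum_i ||x1_i - x2_i||^2 over the batch, where x1 here is conv9 (assumed to be the network's 8-dimensional landmark prediction; the layers producing it are omitted from this demo) and x2 is img_label. A small numpy sketch of the same computation:
import numpy as np

def euclidean_loss(pred, label):
    # pred, label: float arrays of shape (N, 8, 1, 1), matching conv9 and img_label
    n = pred.shape[0]
    return np.sum((pred - label) ** 2) / (2.0 * n)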
三、References
1、Using Python to create HDF5 data sources for Caffe
2、Generating HDF5 files for multi-label training
3、An HDF5Data example in Caffe
4、Using LMDB and HDF5 together in Caffe