最近要用caffe处理一个multi-label的回归问题,也就是输出是一个向量,而不是一个具体的数值。这个时候之前的leveldb格式就不奏效了,因为caffe源代码里面默认label是一个数值。网上搜了一下,都说hdf5格式可以解决这个问题。
在caffe里面,有一个hdf5的data layer作为数据输入,从源代码来看,它对label的维数没做限制。剩下的问题就是如何生成hdf5的数据,目前只找到了github上一个人共享的、用matlab写的hdf5数据读写代码,在这里我把代码粘贴出来:
testHDF5.m
-
%% WRITING TO HDF5
% Demo script: writes synthetic data/label batches to an HDF5 file through
% store2hdf5, reads a slice back with h5read to verify it, and finally
% writes a list.txt that names the HDF5 file.
filename = 'trial.h5';

num_total_samples = 10000;

% Stand-ins for data that would normally be read from disk or generated.
data_disk  = rand(5, 5, 1, num_total_samples);
label_disk = rand(10, num_total_samples);

chunksz      = 100;    % samples handed to store2hdf5 per call
created_flag = false;  % false until the first batch has been written
totalct      = 0;      % last dimension of /data as reported by store2hdf5

num_batches = num_total_samples / chunksz;
for batchno = 1:num_batches
    fprintf('batch no. %d\n', batchno);

    % Indices of the samples belonging to this batch; simulates the
    % maximum amount of data held in memory before dumping to the file.
    last_read  = (batchno - 1) * chunksz;
    sample_idx = last_read + 1 : last_read + chunksz;
    batchdata  = data_disk(:, :, 1, sample_idx);
    batchlabs  = label_disk(:, sample_idx);

    % Write position: right after the samples stored so far.
    startloc = struct('dat', [1, 1, 1, totalct + 1], ...
                      'lab', [1, totalct + 1]);

    % The create argument is true only on the first call.
    curr_dat_sz = store2hdf5(filename, batchdata, batchlabs, ...
                             ~created_flag, startloc, chunksz);
    created_flag = true;

    % Updated dataset size (#samples).
    totalct = curr_dat_sz(end);
end

% Display the structure of the stored HDF5 file.
h5disp(filename);

%% READING FROM HDF5

% Read data and labels for samples #1000 to 1999.
data_rd  = h5read(filename, '/data',  [1 1 1 1000], [5, 5, 1, 1000]);
label_rd = h5read(filename, '/label', [1 1000],     [10, 1000]);

fprintf('Testing ...\n');
try
    % Compared against single(...) of the in-memory arrays.
    assert(isequal(data_rd,  single(data_disk(:, :, :, 1000:1999))), 'Data do not match');
    assert(isequal(label_rd, single(label_disk(:, 1000:1999))),      'Labels do not match');

    fprintf('Success!\n');
catch err
    fprintf('Test failed ...\n');
    getReport(err)
end

%delete(filename);

% CREATE list.txt containing filename, to be used as source for HDF5_DATA_LAYER
FILE = fopen('list.txt', 'w');
fprintf(FILE, '%s', filename);
fclose(FILE);
fprintf('HDF5 filename listed in %s \n', 'list.txt');

% NOTE: In net definition prototxt, use list.txt as input to HDF5_DATA as:
% layers {
%   name: "data"
%   type: HDF5_DATA
%   top: "data"
%   top: "labelvec"
%   hdf5_data_param {
%     source: "/path/to/list.txt"
%     batch_size: 64
%   }
% }
store2hdf5.m
-
function [curr_dat_sz, curr_lab_sz] = store2hdf5(filename, data, labels, create, startloc, chunksz)
% STORE2HDF5  Write, or append, a batch of samples and labels to an HDF5 file.
%
%   The file holds two single-precision datasets, '/data' and '/label',
%   whose last dimension is unlimited so batches can be appended.
%
%   *data*     W*H*C*N matrix of images; should be normalized (e.g. to lie
%              between 0 and 1) beforehand.
%   *labels*   D*N matrix of labels (D labels per sample).
%   *create*   [0/1] whether to create the file newly or to append to a
%              previously created file; useful to store information in
%              batches when a dataset is too big to be held in memory
%              (default: 1). In create mode an existing file with the same
%              name is deleted first.
%   *startloc* struct with fields 'dat' and 'lab': the 1-based positions at
%              which to start writing into '/data' and '/label'. Defaults:
%                create=1 (create mode): startloc.dat=[1 1 1 1],
%                                        startloc.lab=[1 1];
%                create=0 (append mode): startloc.dat=[1 1 1 K+1],
%                                        startloc.lab=[1 K+1],
%              where K is the current number of samples stored in the file.
%   *chunksz*  (used only in create mode) number of samples stored per HDF5
%              chunk (default: 1000). See the HDF5 documentation on chunking
%              for files with unbounded maximum size; higher chunk sizes
%              allow faster read-write operations.
%
%   Returns the current sizes of '/data' and '/label' after the write. They
%   are only queried when at least one output is requested.
%
%   NOTE(review): size() drops trailing singleton dimensions, so a batch
%   containing a single sample (N == 1) is not recognised as W*H*C*1 here;
%   callers should pass at least two samples per call - TODO confirm/handle.

  % verify that format is right
  dat_dims = size(data);
  lab_dims = size(labels);
  num_samples = dat_dims(end);

  assert(lab_dims(end) == num_samples, 'Number of samples should be matched between data and labels');

  if ~exist('create', 'var')
    create = true;
  end

  if create
    if ~exist('chunksz', 'var')
      chunksz = 1000;
    end
    if exist(filename, 'file')
      fprintf('Warning: replacing existing file %s \n', filename);
      delete(filename);
    end
    % Last dimension is Inf (unlimited) so later calls can append samples.
    h5create(filename, '/data', [dat_dims(1:end-1) Inf], 'Datatype', 'single', 'ChunkSize', [dat_dims(1:end-1) chunksz]); % width, height, channels, number
    h5create(filename, '/label', [lab_dims(1:end-1) Inf], 'Datatype', 'single', 'ChunkSize', [lab_dims(1:end-1) chunksz]); % labels per sample, number
    if ~exist('startloc', 'var')
      startloc.dat = ones(1, length(dat_dims));
      startloc.lab = ones(1, length(lab_dims));
    end
  else % append mode
    if ~exist('startloc', 'var')
      prev_dat_sz = dataset_size(filename, '/data');
      prev_lab_sz = dataset_size(filename, '/label');
      % isequal yields a scalar logical (assert rejects a vector condition)
      % and also copes with a rank mismatch, so the message below is what
      % the caller actually sees.
      assert(isequal(prev_dat_sz(1:end-1), dat_dims(1:end-1)), 'Data dimensions must match existing dimensions in dataset');
      assert(isequal(prev_lab_sz(1:end-1), lab_dims(1:end-1)), 'Label dimensions must match existing dimensions in dataset');
      % Continue right after the last stored sample.
      startloc.dat = [ones(1, length(dat_dims)-1), prev_dat_sz(end)+1];
      startloc.lab = [ones(1, length(lab_dims)-1), prev_lab_sz(end)+1];
    end
  end

  if ~isempty(data)
    h5write(filename, '/data', single(data), startloc.dat, size(data));
    h5write(filename, '/label', single(labels), startloc.lab, size(labels));
  end

  if nargout
    curr_dat_sz = dataset_size(filename, '/data');
    curr_lab_sz = dataset_size(filename, '/label');
  end
end

function sz = dataset_size(filename, dataset)
% Current size of one named dataset; looked up by name so the result does
% not depend on the order in which h5info lists the file's datasets.
  info = h5info(filename, dataset);
  sz = info.Dataspace.Size;
end