HDF 文件结构包括一个 file id(文件号)、至少一个 data descriptor(数据描述符),以及零个或多个 data element(数据内容)。
file id(文件号)是一个 32 比特的值,占用 HDF 文件的头 4 个字节。通过读取这个值,应用程序就可以判断此文件是否是一个 HDF 文件。
Data descriptor block(数据描述符块)包含若干个数据描述符。所有的数据描述符都是 12 字节长,包含 4 个域,即一个 16 比特长的标签,一个 16 比特的引用字,一个 32 比特的数据偏移量和一个 32 比特的数据长度。
tag(标记)是数据描述符域,表示存于相应数据内容的数据类型。例如 306 是光栅图像对象的识别符。
Reference number(引用号)是一个 16 比特无符号整型数。对 HDF 文件中的每一个对象,HDF 库为其数据描述符中的标签分配一个唯一的引用字。在数据对象的生存期内,标签和引用字不能改变;标签和引用字的结合可以唯一确定文件中对应的数据对象。
引用字没有必要连续指定,因此,在一个具有相同标签区分对象的方法后,不能假设引用字的值有任何意义。有时应用程序员也会发现在他们的程序中把一些另外的信息加到引用字中是很方便的,但必须强调的是,HDF 库本身并不识别这些含义。
Data offset field(数据偏移量)是一个 32 比特无符号整型字。通过存储文件开始时的字节数和数据内容开始时的字节数,指明文件中数据内容的位置。
Length field (长度域)是一个 32 比特无符号整型字。它表示整个数据内容的字节大小。数据内容增加,其长度也要增加。
Data element(数据成分)是数据对象的原始数据部分,包含每个象素的值。
最简单hdf5的matlab读写操作
testHDF5.m
%% WRITING TO HDF5
% Demo: stream a large dataset into an HDF5 file in fixed-size batches via
% store2hdf5, then read a slice back and verify it round-trips, and finally
% emit the list.txt that Caffe's HDF5 data layer consumes.
filename='trial.h5';
num_total_samples=10000;
% to simulate data being read from disk / generated etc.
data_disk=rand(5,5,1,num_total_samples);
label_disk=rand(10,num_total_samples);
chunksz=100;
created_flag=false;
totalct=0;
for batchno=1:num_total_samples/chunksz
fprintf('batch no. %d\n', batchno);
last_read=(batchno-1)*chunksz;
% to simulate maximum data to be held in memory before dumping to hdf5 file
batchdata=data_disk(:,:,1,last_read+1:last_read+chunksz);
batchlabs=label_disk(:,last_read+1:last_read+chunksz);
% store to hdf5
% startloc is the first index to write at; totalct tracks how many samples
% are already stored, so each new batch is appended right after the last.
startloc=struct('dat',[1,1,1,totalct+1], 'lab', [1,totalct+1]);
curr_dat_sz=store2hdf5(filename, batchdata, batchlabs, ~created_flag, startloc, chunksz);
created_flag=true;% flag set so that file is created only once
totalct=curr_dat_sz(end);% updated dataset size (#samples)
end
% display structure of the stored HDF5 file
h5disp(filename);
%% READING FROM HDF5
% Read data and labels for samples #1000 to 1999
data_rd=h5read(filename, '/data', [1 1 1 1000], [5, 5, 1, 1000]);
label_rd=h5read(filename, '/label', [1 1000], [10, 1000]);
fprintf('Testing ...\n');
try
% store2hdf5 stores single precision, so compare against single() casts.
assert(isequal(data_rd, single(data_disk(:,:,:,1000:1999))), 'Data do not match');
assert(isequal(label_rd, single(label_disk(:,1000:1999))), 'Labels do not match');
fprintf('Success!\n');
catch err
fprintf('Test failed ...\n');
getReport(err)
end
%delete(filename);
% CREATE list.txt containing filename, to be used as source for HDF5_DATA_LAYER
FILE=fopen('list.txt', 'w');
fprintf(FILE, '%s', filename);
fclose(FILE);
fprintf('HDF5 filename listed in %s \n', 'list.txt');
% NOTE: In net definition prototxt, use list.txt as input to HDF5_DATA as:
% layers {
% name: "data"
% type: HDF5_DATA
% top: "data"
% top: "labelvec"
% hdf5_data_param {
% source: "/path/to/list.txt"
% batch_size: 64
% }
% }
store2hdf5.m
function [curr_dat_sz, curr_lab_sz] = store2hdf5(filename, data, labels, create, startloc, chunksz)
% STORE2HDF5 Write (create or append) image data and labels to an HDF5 file.
% *data* is W*H*C*N matrix of images should be normalized (e.g. to lie between 0 and 1) beforehand
% *label* is D*N matrix of labels (D labels per sample)
% *create* [0/1] specifies whether to create file newly or to append to previously created file, useful to store information in batches when a dataset is too big to be held in memory (default: 1)
% *startloc* (point at which to start writing data). By default,
% if create=1 (create mode), startloc.dat=[1 1 1 1], and startloc.lab=[1 1];
% if create=0 (append mode), startloc.dat=[1 1 1 K+1], and startloc.lab = [1 K+1]; where K is the current number of samples stored in the HDF
% *chunksz* (used only in create mode), specifies number of samples to be stored per chunk (see HDF5 documentation on chunking) for creating HDF5 files with unbounded maximum size - TLDR; higher chunk sizes allow faster read-write operations
% Returns the current sizes of the /data and /label datasets (only computed
% when output arguments are requested).
% verify that format is right
dat_dims = size(data);
lab_dims = size(labels);
num_samples = dat_dims(end);
assert(lab_dims(end) == num_samples, 'Number of samples should be matched between data and labels');
if ~exist('create', 'var')
    create = true;
end
if create
    %fprintf('Creating dataset with %d samples\n', num_samples);
    if ~exist('chunksz', 'var')
        chunksz = 1000;
    end
    if exist(filename, 'file')
        fprintf('Warning: replacing existing file %s \n', filename);
        delete(filename);
    end
    % Last dimension is unbounded (Inf) so later batches can be appended.
    h5create(filename, '/data', [dat_dims(1:end-1) Inf], 'Datatype', 'single', 'ChunkSize', [dat_dims(1:end-1) chunksz]); % width, height, channels, number
    h5create(filename, '/label', [lab_dims(1:end-1) Inf], 'Datatype', 'single', 'ChunkSize', [lab_dims(1:end-1) chunksz]); % labels, number
    if ~exist('startloc', 'var')
        startloc.dat = [ones(1, length(dat_dims)-1), 1];
        startloc.lab = [ones(1, length(lab_dims)-1), 1];
    end
else % append mode
    if ~exist('startloc', 'var')
        info = h5info(filename);
        prev_dat_sz = info.Datasets(1).Dataspace.Size;
        prev_lab_sz = info.Datasets(2).Dataspace.Size;
        % BUG FIX: the original used `==`, which yields an element-wise
        % logical vector (assert needs a scalar) and hard-errors whenever
        % the number of dimensions differs; isequal does the intended
        % whole-vector comparison.
        assert(isequal(prev_dat_sz(1:end-1), dat_dims(1:end-1)), 'Data dimensions must match existing dimensions in dataset');
        assert(isequal(prev_lab_sz(1:end-1), lab_dims(1:end-1)), 'Label dimensions must match existing dimensions in dataset');
        startloc.dat = [ones(1, length(dat_dims)-1), prev_dat_sz(end)+1];
        startloc.lab = [ones(1, length(lab_dims)-1), prev_lab_sz(end)+1];
    end
end
if ~isempty(data)
    % Data is always stored in single precision.
    h5write(filename, '/data', single(data), startloc.dat, size(data));
    h5write(filename, '/label', single(labels), startloc.lab, size(labels));
end
if nargout
    info = h5info(filename);
    curr_dat_sz = info.Datasets(1).Dataspace.Size;
    curr_lab_sz = info.Datasets(2).Dataspace.Size;
end
end
Python解析HDF文件
# NOTE: xlwt writes Excel (.xls) files — the HDF reading below is done with
# pandas' HDFStore. (The original comment wrongly claimed xlwt writes HDF.)
import math
import pandas as pd
import xlwt
# BUG FIX: `closing` is used below but was never imported -> NameError.
from contextlib import closing

# HDF_FILR_URL, date and dt are defined elsewhere in the original article.
with closing(pd.HDFStore(HDF_FILR_URL)) as store:
    df = store[date]

# index should be end -> region -> group
df.reset_index(inplace=True)
df.set_index(["end", "region", "group"], inplace=True)
df.sort_index(inplace=True)

# With the MultiIndex in place, slice out the frame for timestamp `dt` and
# pivot each metric into a 2-D table (level 0 of the index becomes columns).
slice_df = df.loc[dt]
rtt = slice_df.rtt.unstack(level=0) / 1000  # presumably us -> ms; confirm units
cwnd = slice_df.cwnd.unstack(level=0)
total = slice_df.total.unstack(level=0)
rows = rtt.index.tolist()
columns = rtt.columns.tolist()
# Finally, dump the collected data to an Excel workbook.
def writexcel(listname, name, time):
    # Write each record of `listname` (dicts of key -> (v0, (v1a, v1b)))
    # into sheet 'tcpinfo' of "<AVG_RTT>_<time>_<name>.xls" (Python 2 code:
    # note the str.decode call).
    target_path = EXCEL_FILR_URL + '%s_%s_%s.xls' % (AVG_RTT, time, name)
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('tcpinfo')
    row = 0
    for record in listname:
        for key, value in record.items():
            sheet.write(row, 0, str(key))
            sheet.write(row, 1, str(value[1][0]))
            sheet.write(row, 2, str(value[1][1]))
            sheet.write(row, 3, str(value[0]).decode('utf-8'))
            row += 1
    workbook.save(target_path)
先看一个mnist的hdf5的输入例子:
loadMNISTImages.m
function images = loadMNISTImages(filename)
%loadMNISTImages Read an MNIST IDX3-ubyte image file and return the images
%as a (#pixels x #examples) double matrix with values rescaled to [0,1].
fid = fopen(filename, 'rb');
assert(fid ~= -1, ['Could not open ', filename, '']);
% Header is four big-endian int32s: magic, image count, rows, cols.
magic = fread(fid, 1, 'int32', 0, 'ieee-be');
assert(magic == 2051, ['Bad magic number in ', filename, '']);
numImages = fread(fid, 1, 'int32', 0, 'ieee-be');
numRows = fread(fid, 1, 'int32', 0, 'ieee-be');
numCols = fread(fid, 1, 'int32', 0, 'ieee-be');
% Pixels are stored row-major per image; permute to MATLAB's column-major
% orientation, then flatten each image into one column.
rawPixels = fread(fid, inf, 'unsigned char');
images = permute(reshape(rawPixels, numCols, numRows, numImages), [2 1 3]);
fclose(fid);
images = reshape(images, size(images, 1) * size(images, 2), size(images, 3));
images = double(images) / 255;
end
loadMNISTLabels.m
function labels = loadMNISTLabels(filename)
%loadMNISTLabels Read an MNIST IDX1-ubyte label file and return the labels
%as a [number of MNIST images]x1 column vector.
fid = fopen(filename, 'rb');
assert(fid ~= -1, ['Could not open ', filename, '']);
% Header is two big-endian int32s: magic and label count.
magic = fread(fid, 1, 'int32', 0, 'ieee-be');
assert(magic == 2049, ['Bad magic number in ', filename, '']);
declaredCount = fread(fid, 1, 'int32', 0, 'ieee-be');
labels = fread(fid, inf, 'unsigned char');
% Sanity-check the payload length against the declared header count.
assert(size(labels,1) == declaredCount, 'Mismatch in label count');
fclose(fid);
end
mnist2hdf5.m
clc
close all
clear all
%%
% Convert the raw MNIST IDX files into Caffe-ready HDF5 files containing
% /data (28x28x1xN, width-major) and /label (1xN) datasets.
addpath mnistHelper;
addpath datasets;
% train-images.idx3-ubyte / train-labels.idx1-ubyte
images = loadMNISTImages('train-images-idx3-ubyte');
labels = loadMNISTLabels('train-labels-idx1-ubyte');
% reshape images to 4-D: [rows,col,channel,numbers]
trainData=reshape(images,[28 28 1 size(images,2)]);
% permute to [cols,rows,channel,numbers]
trainData=permute(trainData,[2 1 3 4]);
% permute labels to [labels, number of labels]
trainLabels=permute(labels,[2,1]);
% FIX: h5create errors out if the dataset already exists, which made the
% script fail on a second run; remove any previous output first.
if exist('train.hdf5','file')
    delete('train.hdf5');
end
h5create('train.hdf5','/data',size(trainData),'Datatype','double');
h5create('train.hdf5','/label',size(trainLabels),'Datatype','double');
h5write('train.hdf5','/data',trainData);
h5write('train.hdf5','/label',trainLabels);
%%
% test images
images = loadMNISTImages('t10k-images-idx3-ubyte');
labels = loadMNISTLabels('t10k-labels-idx1-ubyte');
% reshape images to 4-D: [rows,col,channel,numbers]
testData=reshape(images,[28 28 1 size(images,2)]);
% permute to [cols,rows,channel,numbers]
testData=permute(testData,[2 1 3 4]);
% permute labels to [labels, number of labels]
testLabels=permute(labels,[2,1]);
% Same re-runnability guard for the test file.
if exist('test.hdf5','file')
    delete('test.hdf5');
end
h5create('test.hdf5','/data',size(testData),'Datatype','double');
h5create('test.hdf5','/label',size(testLabels),'Datatype','double');
h5write('test.hdf5','/data',testData);
h5write('test.hdf5','/label',testLabels);
lenet_train_test.prototxt
# LeNet trained from HDF5 inputs: each HDF5Data layer reads a text file
# listing one .h5 path per line; every .h5 must hold /data and /label.
name: "LeNet"
# Training input (batch of 64).
layer {
name: "mnist"
type: "HDF5Data"
top: "data"
top: "label"
include {
phase: TRAIN
}
hdf5_data_param {
source: "examples/mnist/mnist_train_hdf/train_loc.txt"
batch_size: 64
}
}
# Test input (batch of 100), same tops, separate list file.
layer {
name: "mnist"
type: "HDF5Data"
top: "data"
top: "label"
include {
phase: TEST
}
hdf5_data_param {
source: "examples/mnist/mnist_test_hdf/test_loc.txt"
batch_size: 100
}
}
# conv1: 20 filters of 5x5, stride 1; biases learn at 2x the base LR.
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 20
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# pool1: 2x2 max pooling, stride 2.
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# conv2: 50 filters of 5x5, stride 1.
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 50
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# pool2: 2x2 max pooling, stride 2.
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# ip1: fully connected layer with 500 outputs.
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool2"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# relu1: in-place ReLU on ip1 (bottom == top saves memory).
layer {
name: "relu1"
type: "ReLU"
bottom: "ip1"
top: "ip1"
}
# ip2: fully connected layer producing the 10 class scores.
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# accuracy: reported only during the TEST phase.
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
# loss: softmax cross-entropy between class scores and labels.
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
由于可能存在多个H5文件,所以HDF5Data的输入是从一个TXT文件读取的列表,train_loc.txt内容示例如下:
train1.h5
train2.h5
利用python生成供给Caffe的hdf5数据
Requirements:
sudo pip install pydot
sudo apt-get install -y graphviz
Interesting resources on Caffe:
- https://github.com/BVLC/caffe/tree/master/examples
- http://nbviewer.ipython.org/github/joyofdata/joyofdata-articles/blob/master/deeplearning-with-caffe/Neural-Networks-with-Caffe-on-the-GPU.ipynb
Interesting resources on Iris with ANNs:
- iris data set test bed: http://deeplearning4j.org/iris-flower-dataset-tutorial.html
- http://se.mathworks.com/help/nnet/examples/iris-clustering.html
- http://lab.fs.uni-lj.si/lasin/wp/IMIT_files/neural/doc/seminar8.pdf
Synonyms:
- output = label = target
- input = feature
'''
import subprocess
import platform
import copy
from sklearn.datasets import load_iris
import sklearn.metrics
import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import h5py
import caffe
import caffe.draw
def load_data():
    '''
    Load the Iris data set and return it as a dict with a 4-D 'input'
    array of shape (150, 1, 1, 4) and a one-hot 'output' array (150, 3).
    '''
    data = load_iris()
    print(data.data)
    print(data.target)
    # One-hot encode the three class labels.
    targets = np.zeros((len(data.target), 3))
    for row, cls in enumerate(data.target):
        targets[row][cls] = 1
    print(targets)
    # Caffe wants NCHW-shaped inputs, hence the extra singleton axes.
    new_data = {
        'input': np.reshape(data.data, (150, 1, 1, 4)),
        'output': targets,
    }
    return new_data
def save_data_as_hdf5(hdf5_data_filename, data):
    '''
    Store data['input']/data['output'] as float32 'data'/'label' datasets
    in an HDF5 file — one of the input formats Caffe accepts.
    '''
    with h5py.File(hdf5_data_filename, 'w') as f:
        f.create_dataset('data', data=data['input'].astype(np.float32))
        f.create_dataset('label', data=data['output'].astype(np.float32))
def main():
    '''
    Prepare Iris HDF5 data, train the Caffe network, then print the
    trained network and its predictions/accuracy.
    '''
    # Set parameters
    solver_prototxt_filename = 'iris_solver.prototxt'
    train_test_prototxt_filename = 'iris_train_test.prototxt'
    # FIX: this assignment was duplicated in the original; one copy removed.
    deploy_prototxt_filename = 'iris_deploy.prototxt'
    deploy_prototxt_batch2_filename = 'iris_deploy_batchsize2.prototxt'
    hdf5_train_data_filename = 'iris_train_data.hdf5'
    hdf5_test_data_filename = 'iris_test_data.hdf5'
    caffemodel_filename = 'iris__iter_1000000.caffemodel'  # generated by train()

    # Prepare data — note the same full Iris set is used for both train
    # and test here (no split), as in the original article.
    data = load_data()
    print(data)
    train_data = data
    test_data = data
    save_data_as_hdf5(hdf5_train_data_filename, train_data)
    save_data_as_hdf5(hdf5_test_data_filename, test_data)

    # Train network
    train(solver_prototxt_filename)

    # Get predicted outputs (renamed from `input`, which shadowed the builtin)
    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    print(get_predicted_output(deploy_prototxt_filename, caffemodel_filename, sample))
    sample = np.array([[[[5.1, 3.5, 1.4, 0.2]]], [[[5.9, 3., 5.1, 1.8]]]])
    #print(get_predicted_output(deploy_prototxt_batch2_filename, caffemodel_filename, sample))

    # Print network
    print_network(deploy_prototxt_filename, caffemodel_filename)
    print_network(train_test_prototxt_filename, caffemodel_filename)
    print_network_weights(train_test_prototxt_filename, caffemodel_filename)

    # Compute performance metrics over the whole data set
    inputs = data['input']
    outputs = get_predicted_outputs(deploy_prototxt_filename, caffemodel_filename, inputs)
    get_accuracy(data['output'], outputs)


if __name__ == "__main__":
    main()
    #cProfile.run('main()') # if you want to do some profiling
这个脚本是一个 MIT 的学生写的,鸢尾花卉数据集的训练,我后面补一下这个例子,先挖个坑。
可以参考
http://blog.csdn.net/shadow_guo/article/details/50382446
https://github.com/Franck-Dernoncourt/caffe_demos
http://fuel.readthedocs.io/en/latest/new_dataset.html
接下来看看多标签回归的hdf5的输入:
python实现
import random
from PIL import Image
import numpy as np
import h5py
IMAGE_DIR = ['image_train', 'image_test']
HDF5_FILE = ['hdf5_train.h5', 'hdf5_test.h5']
LIST_FILE = ['list_train.txt', 'list_test.txt']
LABELS = dict(
# (kind_1, kind_2)
A_0 = (0, 0),
B_0 = (1, 0),
A_1 = (0, 1),
B_1 = (1, 1),
A_2 = (0, 2),
B_2 = (1, 2),
)
print '\nplease wait...'
for kk, image_dir in enumerate(IMAGE_DIR):
# 读取文件列表于file_list
file_list = ...
# 文件列表乱序
random.shuffle(file_list)
# 标签类别
kind_index = ...
# 图片大小为96*32,单通道
datas = np.zeros((len(file_list), 1, 32, 96))
# label大小为1*2
labels = np.zeros((len(file_list), 2))
for ii, _file in enumerate(file_list):
# hdf5文件要求数据是float或者double格式
# 同时caffe中Hdf5DataLayer不允许使用transform_param,
# 所以要手动除以256
datas[ii, :, :, :] = \
np.array(Image.open(_file)).astype(np.float32) / 256
labels[ii, :] = np.array(LABELS[kind_index ]).astype(np.int)
# 写入hdf5文件
with h5py.File(HDF5_FILE[kk], 'w') as f:
f['data'] = datas
f['labels'] = labels
f.close()
# 写入列表文件,可以有多个hdf5文件
with open(LIST_FILE[kk], 'w') as f:
f.write(os.path.abspath(HDF5_FILE[kk]) + '\n')
f.close()
print '\ndone...'
一个生成hdf5并转为LMDB的python脚本:
import os
import random

import caffe
import lmdb
# numpy is required by convert_data_lmdb below (np.float, np.newaxis, np.nbytes)
import numpy as np
def convert_data_lmdb(train_data,train_label,output_data_lmdb,output_labels_lmdb):
    """
    Save image data and multi-label targets into two LMDB databases.

    call: convert_data_lmdb(train_X, train_y, 'train_data_lmdb', 'train_labels_lmdb')

    NOTE(review): `shuffle` below is not imported anywhere in this file —
    presumably sklearn.utils.shuffle; confirm before running. `np.float`
    is deprecated/removed in modern numpy.
    """
    X = train_data.astype(np.float)
    y = train_label.astype(np.float)
    X, y = shuffle(X, y, random_state=42) # shuffle train data
    # creating images lmdb; map_size leaves 10x headroom over the raw bytes
    in_db = lmdb.open(output_data_lmdb, map_size=X.nbytes*10)
    with in_db.begin(write=True) as in_txn :
        for in_idx,in_ in enumerate(X) :
            im = in_;
            # assumes HWC input: reverse the channel axis (RGB->BGR?) then
            # transpose to CHW for Caffe — TODO confirm against the caller
            im = im[:,:,::-1]
            im = im.transpose((2, 0, 1))
            im_dat = caffe.io.array_to_datum(im)
            #in_txn.put(in_idx.encode('ascii'), im_dat.SerializeToString())
            # keys are zero-padded indices so lexicographic order == numeric order
            in_txn.put('{:0>10d}'.format(in_idx), im_dat.SerializeToString())
    in_db.close()
    # labels go into a second lmdb, one (1,1,D) datum per sample
    in_label = lmdb.open(output_labels_lmdb, map_size=y.nbytes*10)
    counter_label = 0
    with in_label.begin(write=True) as in_txn :
        for idx in range(y.shape[0]):
            datum = caffe.io.array_to_datum(y[np.newaxis,np.newaxis,idx])
            in_txn.put("{:0>10d}".format(counter_label), datum.SerializeToString())
            counter_label += 1
    in_label.close()
def write_hdf5(filename):
    """
    Read an "image_path score" list file and write the images and scores
    into <setname>.h5 ('data'/'label' datasets), plus a <setname>_h5.txt
    list file for Caffe's HDF5 data layer.

    NOTE(review): `pyplot` is used below but never imported in this file —
    presumably matplotlib.pyplot; confirm. Also note a second function named
    write_hdf5 is defined later in this file and shadows this one.
    """
    import h5py
    IMAGE_SIZE = (96, 96)
    LABEL_SIZE = 30 # Multi-labels
    MEAN_VALUE = 128
    #filename = sys.argv[1]
    setname, ext = filename.split('.')
    with open(filename, 'r') as f:
        lines = f.readlines()
    np.random.shuffle(lines)
    sample_size = len(lines)
    imgs = np.zeros((sample_size, 1,) + IMAGE_SIZE, dtype=np.float32)
    # BUG FIX: the original `(sample_size,1) + LABEL_SIZE` adds a tuple and
    # an int, which raises TypeError at runtime.
    scores = np.zeros((sample_size, LABEL_SIZE), dtype=np.float32)
    h5_filename = '{}.h5'.format(setname)
    with h5py.File(h5_filename, 'w') as h:
        for i, line in enumerate(lines):
            image_name, score = line[:-1].split()
            img = pyplot.imread(image_name)[:, :, 0].astype(np.float32)
            img = img.reshape((1, ) + img.shape)
            #img -= MEAN_VALUE
            imgs[i] = img
            # NOTE(review): only column 1 of the 30-wide label row is filled,
            # as in the original — looks suspicious; confirm intended layout.
            scores[i, 1] = float(score)
            if (i + 1) % 1000 == 0:
                print('processed {} images!'.format(i + 1))
        h.create_dataset('data', data=imgs)
        h.create_dataset('label', data=scores)
    with open('{}_h5.txt'.format(setname), 'w') as f:
        f.write(h5_filename)
def write_hdf5(data,labels,output_filename):
    """
    This function is used to save image data and its label(s) to hdf5 file.
    output_file.h5,contain data and label
    data.shape is (n,c,h,w)
    label.shape is (n,labels)

    NOTE(review): `shuffle` is not imported in this file — presumably
    sklearn.utils.shuffle; confirm. The per-image slicing/transpose below
    treats each sample as HWC, which conflicts with the documented
    (n,c,h,w) input shape — verify against the caller.
    """
    import h5py
    X = data.astype(np.float32)
    y = labels.astype(np.float32)
    X, y = shuffle(X, y, random_state=42) # shuffle train data
    IMAGE_SIZE = (96, 96)
    LABEL_SIZE = 30 # Multi-labels
    MEAN_VALUE = 128
    #filename = sys.argv[1]
    setname, ext = output_filename.split('.')
    sample_size = X.shape[0]
    # Pre-allocated output buffers: (n, 1, 96, 96) images, (n, 30) labels.
    imgs = np.zeros((sample_size, 1,) + IMAGE_SIZE, dtype=np.float32)
    scores = np.zeros((sample_size,LABEL_SIZE), dtype=np.float32)
    h5_filename = '{}.h5'.format(setname)
    with h5py.File(h5_filename, 'w') as h:
        i = 0;
        for in_,label in zip(X,y) :
            im = in_;
            # reverse last axis, then move it in front (HWC -> CHW?)
            im = im[:,:,::-1]
            im = im.transpose((2, 0, 1))
            imgs[i] = im
            scores[i] = label
            i = i + 1;
            print('processed {} images!'.format(i))
        h.create_dataset('data', data=imgs)
        h.create_dataset('label', data=scores)
    # list file consumed by Caffe's HDF5 data layer
    with open('{}_h5.txt'.format(setname), 'w') as f:
        f.write(h5_filename)
篇幅有限,多标签的例子也挖一个坑,哈哈=============================================================================
参考:
http://blog.csdn.net/u011762313/article/details/48830561
http://blog.csdn.net/eagelangel/article/details/51811519
http://blog.csdn.net/kuaitoukid/article/details/43448517
https://github.com/mravendi/caffe-mnist-hdf5/tree/master/MATLAB_HDF5_Converter
http://www.hust.site/%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0/2016-04-26/225.html
http://blog.csdn.net/shuzfan?viewmode=contents