接上篇 caffe增加自己的layer实战(中)–caffe学习(11)
先放出完整的修改后的video_data_layers.cpp:
#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "caffe/data_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/util/io.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/rng.hpp"
#ifdef USE_MPI
#include "mpi.h"
#include <boost/filesystem.hpp>
using namespace boost::filesystem;
#endif
namespace caffe{
template <typename Dtype>
VideoDataLayer<Dtype>::~VideoDataLayer() {
  // Stop the background prefetch thread before the layer's buffers are
  // torn down, otherwise the worker could touch freed memory.
  this->JoinPrefetchThread();
}
template <typename Dtype>
void VideoDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){
  // Layer setup: parse video_data_param, load the index file whose lines are
  // "<video dir> <frame count> <label>", optionally shuffle the list, then
  // read one sample to infer the shapes of the top blobs.
  const int new_height   = this->layer_param_.video_data_param().new_height();
  const int new_width    = this->layer_param_.video_data_param().new_width();
  const int new_length   = this->layer_param_.video_data_param().new_length();    // frames per snippet
  const int num_segments = this->layer_param_.video_data_param().num_segments();  // segments per video
  const string& source   = this->layer_param_.video_data_param().source();

  LOG(INFO) << "Opening file: " << source;
  std::ifstream infile(source.c_str());
  string filename;
  int label;
  int length;
  while (infile >> filename >> length >> label){
    lines_.push_back(std::make_pair(filename, label));
    lines_duration_.push_back(length);
  }

  if (this->layer_param_.video_data_param().shuffle()){
    // Seed both RNGs identically so lines_ and lines_duration_ receive the
    // same permutation and stay index-aligned after ShuffleVideos().
    const unsigned int prefetch_rng_seed = caffe_rng_rand();
    prefetch_rng_1_.reset(new Caffe::RNG(prefetch_rng_seed));
    prefetch_rng_2_.reset(new Caffe::RNG(prefetch_rng_seed));
    ShuffleVideos();
  }

  LOG(INFO) << "A total of " << lines_.size() << " videos.";
  lines_id_ = 0;

  // Check name pattern; fall back to a modality-specific default.
  if (this->layer_param_.video_data_param().name_pattern() == ""){
    if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_RGB){
      name_pattern_ = "image_%04d.jpg";
    } else if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
      name_pattern_ = "flow_%c_%04d.jpg";
    }
  } else {
    name_pattern_ = this->layer_param_.video_data_param().name_pattern();
  }

  // Read one sample so we can infer the top blob shapes below.
  Datum datum;
  const unsigned int frame_prefetch_rng_seed = caffe_rng_rand();
  frame_prefetch_rng_.reset(new Caffe::RNG(frame_prefetch_rng_seed));
  int average_duration = (int) lines_duration_[lines_id_] / num_segments;
  vector<int> offsets;
  for (int i = 0; i < num_segments; ++i){
    if (average_duration >= new_length){
      // Random starting frame inside segment i (same scheme as
      // InternalThreadEntry uses at TRAIN time).
      caffe::rng_t* frame_rng = static_cast<caffe::rng_t*>(frame_prefetch_rng_->generator());
      int offset = (*frame_rng)() % (average_duration - new_length + 1);
      offsets.push_back(offset + i * average_duration);
    } else {
      // Bug fix: the original computed "% (average_duration - new_length + 1)"
      // unconditionally, which is a modulo by zero (undefined behavior) as
      // soon as the first video's segments are shorter than new_length.
      // Mirror the guard already used in InternalThreadEntry.
      offsets.push_back(0);
    }
  }
  if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW)
    CHECK(ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
        offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str()));
  else
    CHECK(ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
        offsets, new_height, new_width, new_length, &datum, true, name_pattern_.c_str()));

  // Shape the top blobs and the prefetch buffers from the sample datum.
  const int crop_size  = this->layer_param_.transform_param().crop_size();
  const int batch_size = this->layer_param_.video_data_param().batch_size();
  if (crop_size > 0){
    top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
    this->prefetch_data_.Reshape(batch_size, datum.channels(), crop_size, crop_size);
  } else {
    top[0]->Reshape(batch_size, datum.channels(), datum.height(), datum.width());
    this->prefetch_data_.Reshape(batch_size, datum.channels(), datum.height(), datum.width());
  }
  LOG(INFO) << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width();
  top[1]->Reshape(batch_size, 1, 1, 1);
  this->prefetch_label_.Reshape(batch_size, 1, 1, 1);
  vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
  this->transformed_data_.Reshape(top_shape);
}
template <typename Dtype>
void VideoDataLayer<Dtype>::ShuffleVideos(){
  // Shuffle the video list and the parallel duration list. The two RNGs were
  // seeded with the same value, so both shuffles apply the identical
  // permutation and lines_[i] stays matched with lines_duration_[i].
  caffe::rng_t* rng_videos    = static_cast<caffe::rng_t*>(prefetch_rng_1_->generator());
  caffe::rng_t* rng_durations = static_cast<caffe::rng_t*>(prefetch_rng_2_->generator());
  shuffle(lines_.begin(), lines_.end(), rng_videos);
  shuffle(lines_duration_.begin(), lines_duration_.end(), rng_durations);
}
template <typename Dtype>
void VideoDataLayer<Dtype>::InternalThreadEntry(){
  // Prefetch-thread body: fills prefetch_data_ / prefetch_label_ with one
  // batch of video snippets, advancing lines_id_ through the (optionally
  // reshuffled) video list.
  Datum datum;
  CHECK(this->prefetch_data_.count());
  Dtype* top_data  = this->prefetch_data_.mutable_cpu_data();
  Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
  VideoDataParameter video_data_param = this->layer_param_.video_data_param();
  const int batch_size   = video_data_param.batch_size();
  const int new_height   = video_data_param.new_height();
  const int new_width    = video_data_param.new_width();
  const int new_length   = video_data_param.new_length();
  const int num_segments = video_data_param.num_segments();
  const int lines_size   = lines_.size();
  int num_skipped = 0;  // consecutive unreadable videos (see bug fix below)
  for (int item_id = 0; item_id < batch_size; ++item_id){
    CHECK_GT(lines_size, lines_id_);
    // Pick one starting frame per segment.
    vector<int> offsets;
    int average_duration = (int) lines_duration_[lines_id_] / num_segments;
    for (int i = 0; i < num_segments; ++i){
      if (this->phase_ == TRAIN){
        if (average_duration >= new_length){
          // TRAIN: random offset inside each segment.
          caffe::rng_t* frame_rng = static_cast<caffe::rng_t*>(frame_prefetch_rng_->generator());
          int offset = (*frame_rng)() % (average_duration - new_length + 1);
          offsets.push_back(offset + i * average_duration);
        } else {
          offsets.push_back(0);
        }
      } else {
        // TEST: deterministic, centered offset inside each segment.
        if (average_duration >= new_length)
          offsets.push_back(int((average_duration - new_length + 1) / 2 + i * average_duration));
        else
          offsets.push_back(0);
      }
    }
    bool read_ok;
    if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
      read_ok = ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
          offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str());
    } else {
      read_ok = ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
          offsets, new_height, new_width, new_length, &datum, true, name_pattern_.c_str());
    }
    if (!read_ok) {
      // Bug fix: the original did a bare "continue" here, which neither
      // advanced lines_id_ nor filled this batch slot — one unreadable video
      // left stale data in the buffer and was retried on every later batch.
      // Instead, skip to the next video and retry the same batch slot.
      LOG(ERROR) << "Skipping unreadable video " << lines_[lines_id_].first;
      CHECK_LT(++num_skipped, lines_size) << "No readable videos in the list.";
      lines_id_++;
      if (lines_id_ >= lines_size) {
        lines_id_ = 0;
        if (this->layer_param_.video_data_param().shuffle()){
          ShuffleVideos();
        }
      }
      --item_id;  // retry this slot with the next video
      continue;
    }
    num_skipped = 0;
    // Transform the datum directly into this item's slice of the batch blob.
    int offset1 = this->prefetch_data_.offset(item_id);
    this->transformed_data_.set_cpu_data(top_data + offset1);
    this->data_transformer_->Transform(datum, &(this->transformed_data_));
    top_label[item_id] = lines_[lines_id_].second;
    // Advance to the next video; wrap (and optionally reshuffle) at the end.
    lines_id_++;
    if (lines_id_ >= lines_size) {
      DLOG(INFO) << "Restarting data prefetching from start.";
      lines_id_ = 0;
      if (this->layer_param_.video_data_param().shuffle()){
        ShuffleVideos();
      }
    }
  }
}
INSTANTIATE_CLASS(VideoDataLayer);  // explicit instantiation for float and double
REGISTER_LAYER_CLASS(VideoData);    // registers layer type "VideoData" in the layer factory
}  // namespace caffe
接下来分析其中的三个函数:
DataLayerSetUp,ShuffleVideos,InternalThreadEntry
DataLayerSetUp函数:初始化数据层
ShuffleVideos函数:将视频顺序打乱
InternalThreadEntry函数:预取线程的执行体,负责读取并填充一个batch的数据
函数一开始
const int new_height = this->layer_param_.video_data_param().new_height();
const int new_width = this->layer_param_.video_data_param().new_width();
const int new_length = this->layer_param_.video_data_param().new_length();
const int num_segments = this->layer_param_.video_data_param().num_segments();
const string& source = this->layer_param_.video_data_param().source();
new_height和new_width参数在image中也有,后面的都是不同的:
new_length:一个视频最小片段包含几张图片文件,帧数
num_segments:将整个视频分割为几段
source:训练和test文件所在的txt文档位置
后面几行:
LOG(INFO) << "Opening file: " << source;
std:: ifstream infile(source.c_str());
string filename;
int label;
int length;
while (infile >> filename >> length >> label){
lines_.push_back(std::make_pair(filename,label));
lines_duration_.push_back(length);
}
if (this->layer_param_.video_data_param().shuffle()){
const unsigned int prefectch_rng_seed = caffe_rng_rand();
prefetch_rng_1_.reset(new Caffe::RNG(prefectch_rng_seed));
prefetch_rng_2_.reset(new Caffe::RNG(prefectch_rng_seed));
ShuffleVideos();
}
LOG(INFO) << "A total of " << lines_.size() << " videos.";
lines_id_ = 0;
和image的基本保持一致,
这里最大的区别就是对头文件里面的几个参数的赋值
const unsigned int prefectch_rng_seed = caffe_rng_rand();
prefetch_rng_1_.reset(new Caffe::RNG(prefectch_rng_seed));
prefetch_rng_2_.reset(new Caffe::RNG(prefectch_rng_seed));
然后就是对输入文件的组织顺序,文件名|视频帧数|类别
LOG(INFO) << "A total of " << lines_.size() << " videos.";
从这个可见参数lines_保存的是样本文件名
接下来自己定义输入参数名字匹配方法:
//check name pattern
if (this->layer_param_.video_data_param().name_pattern() == ""){
if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_RGB){
name_pattern_ = "image_%04d.jpg";
}else if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
name_pattern_ = "flow_%c_%04d.jpg";
}
}else{
name_pattern_ = this->layer_param_.video_data_param().name_pattern();
}
不多说。继续分析
其实这里就是对自己在caffe的prototxt里面定义的参数的解析,原来的image中还解析了参数rand_skip(随机跳过若干样本):
// Check if we would need to randomly skip a few data points
if (this->layer_param_.image_data_param().rand_skip()) {
unsigned int skip = caffe_rng_rand() %
this->layer_param_.image_data_param().rand_skip();
LOG(INFO) << "Skipping first " << skip << " data points.";
CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
lines_id_ = skip;
}
我们这里没有此参数,因此不需要。
Datum datum; //定义存储数据
干货来了,将图片数据读取到结构体中:
//读取文件到datum中,ReadSegmentFlowToDatum等函数定义在io.cpp中
if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW)
CHECK(ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str()));
else
CHECK(ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
offsets, new_height, new_width, new_length, &datum, true, name_pattern_.c_str()));
以函数ReadSegmentRGBToDatum为例,定义在io.cpp中:
// Reads a multi-segment video snippet into a single Datum.
// For each entry of `offsets` (one per segment) it loads `length` consecutive
// frame images named by `name_pattern` under directory `filename`, optionally
// resizing each to width x height, and appends every frame's pixels to the
// Datum's data buffer, so channels = num_channels * length * offsets.size().
// Returns false (after logging an error) as soon as any frame fails to load.
// NOTE(review): `tmp` is a fixed 30-byte sprintf buffer — a long name_pattern
// or very large frame index could overflow it; confirm patterns stay short.
bool ReadSegmentRGBToDatum(const string& filename, const int label,
const vector<int> offsets, const int height, const int width, const int length, Datum* datum, bool is_color,
const char* name_pattern ){
cv::Mat cv_img;
string* datum_string;
char tmp[30];
// Decode as 3-channel color or single-channel grayscale.
int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
CV_LOAD_IMAGE_GRAYSCALE);
for (int i = 0; i < offsets.size(); ++i){
int offset = offsets[i];
for (int file_id = 1; file_id < length+1; ++file_id){
// Frame numbering is 1-based: offset 0 + file_id 1 -> e.g. image_0001.jpg.
sprintf(tmp, name_pattern, int(file_id+offset));
string filename_t = filename + "/" + tmp;
cv::Mat cv_img_origin = cv::imread(filename_t, cv_read_flag);
if (!cv_img_origin.data){
LOG(ERROR) << "Could not load file " << filename_t;
return false;
}
if (height > 0 && width > 0){
cv::resize(cv_img_origin, cv_img, cv::Size(width, height));
}else{
cv_img = cv_img_origin;
}
int num_channels = (is_color ? 3 : 1);
// On the very first frame only: set the Datum metadata and grab the
// byte buffer that all subsequent frames are appended to.
if (file_id==1 && i==0){
datum->set_channels(num_channels*length*offsets.size());
datum->set_height(cv_img.rows);
datum->set_width(cv_img.cols);
datum->set_label(label);
datum->clear_data();
datum->clear_float_data();
datum_string = datum->mutable_data();
}
// Append pixels in channel-major (c, h, w) order to match Caffe's
// Datum memory layout.
if (is_color) {
for (int c = 0; c < num_channels; ++c) {
for (int h = 0; h < cv_img.rows; ++h) {
for (int w = 0; w < cv_img.cols; ++w) {
datum_string->push_back(
static_cast<char>(cv_img.at<cv::Vec3b>(h, w)[c]));
}
}
}
} else { // Faster than repeatedly testing is_color for each pixel w/i loop
for (int h = 0; h < cv_img.rows; ++h) {
for (int w = 0; w < cv_img.cols; ++w) {
datum_string->push_back(
static_cast<char>(cv_img.at<uchar>(h, w)));
}
}
}
}
}
return true;
}
看看该文件中还包含什么:
可以加载图片到数据库的很多底层函数都在这里定义。
#include "caffe/util/io.hpp"
从这可以知道我们需要在io.hpp文件中增加ReadSegmentRGBToDatum函数的定义,然后在io.cpp中去实现它。
注意新版caffe的hpp文件目录是:/home/xl/caffe/include/caffe
而cpp所在目录是:/home/xl/caffe/src/caffe
io.hpp文件内容:
bool ReadSegDataToDatum(const string& img_filename, const string& label_filename,
Datum* datum_data, Datum* datum_label, bool is_color);
bool ReadSegmentFlowToDatum(const string& filename, const int label,
const vector<int> offsets, const int height, const int width, const int length, Datum* datum, const char* name_pattern);
bool ReadSegmentRGBToDatum(const string& filename, const int label,
const vector<int> offsets, const int height, const int width, const int length, Datum* datum, bool is_color,
const char* name_pattern);
在io.cpp中新增ReadSegmentRGBToDatum函数详解:
cv::Mat cv_img;
string* datum_string;
char tmp[30];
int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
CV_LOAD_IMAGE_GRAYSCALE);
for (int i = 0; i < offsets.size(); ++i){
int offset = offsets[i];
for (int file_id = 1; file_id < length+1; ++file_id){
sprintf(tmp, name_pattern, int(file_id+offset));
string filename_t = filename + "/" + tmp;
cv::Mat cv_img_origin = cv::imread(filename_t, cv_read_flag);
if (!cv_img_origin.data){
LOG(ERROR) << "Could not load file " << filename_t;
return false;
}
if (height > 0 && width > 0){
cv::resize(cv_img_origin, cv_img, cv::Size(width, height));
}else{
cv_img = cv_img_origin;
} //读取图片完毕
这几行在前面的官方默认读取函数中,都能找到参考,真正改动大的是下面几行:需要注意的是循环体结构:
for (int i = 0; i < offsets.size(); ++i){ // offsets.size()=段数
int offset = offsets[i];
for (int file_id = 1; file_id < length+1; ++file_id){ //每段里面包含几张图片
最终整个视频分段后所有的数据结构都保存在datum和datum_string中。
说完io之后继续分析video_data_layers.cpp
:
下面需要实现ShuffleVideos函数,功能就是打乱列表顺序
非常简单:
template <typename Dtype>
void VideoDataLayer<Dtype>::ShuffleVideos(){
caffe::rng_t* prefetch_rng1 = static_cast<caffe::rng_t*>(prefetch_rng_1_->generator());
caffe::rng_t* prefetch_rng2 = static_cast<caffe::rng_t*>(prefetch_rng_2_->generator());
shuffle(lines_.begin(), lines_.end(), prefetch_rng1);
shuffle(lines_duration_.begin(), lines_duration_.end(),prefetch_rng2);
}
只比image多了两行,image的该函数如下:
template <typename Dtype>
void ImageDataLayer<Dtype>::ShuffleImages() {
caffe::rng_t* prefetch_rng =
static_cast<caffe::rng_t*>(prefetch_rng_->generator());
shuffle(lines_.begin(), lines_.end(), prefetch_rng);
}
下面介绍另一个非常重要的函数InternalThreadEntry,它是预取线程的执行体:每次执行就循环读取并处理一个batch_size大小的数据,见其中的主循环体:
for (int item_id = 0; item_id < batch_size; ++item_id){
在image_data中也是同样的处理方法,只是这里我们循环的时候每次处理的是一个视频,而不仅仅是一帧图片。
最后的这两行十分重要:INSTANTIATE_CLASS对模板类进行显式实例化(float/double两个版本),REGISTER_LAYER_CLASS把该层注册到层工厂中
INSTANTIATE_CLASS(VideoDataLayer);
REGISTER_LAYER_CLASS(VideoData);
这样我们的数据层函数就构造完了,接下来要对其在prototxt中进行注册和测试。
接下篇:
caffe增加自己的layer实战(下-续1)–caffe学习(13)