An interface for the units of computation which can be composed into a Net.
Layer&s must implement a Forward function, in which they take their input (bottom) Blob&s (if any) and compute their output Blob&s (if any). They may also implement a Backward function, in which they compute the error gradients with respect to their input Blob&s, given the error gradients with their output Blob&s.
按照我们对一般卷积神经网络的模型来理解:一个网络(net)包含很多层(layer),而每层里面的东西无外乎数据(前馈的数据和反馈的误差),这些数据在caffe里面已经用blob类来实现了,那么应该可以想到layer中应该包含了很多blob类,或者说是一个blob类型的vector。
1 layer中的数据有哪些?
protected:
/** The protobuf that stores the layer parameters */
LayerParameter layer_param_;
/** The vector that stores the learnable parameters as a set of blobs. */
vector<shared_ptr<Blob<Dtype> > > blobs_;
/** Vector indicating whether to compute the diff of each param blob. */
vector<bool> param_propagate_down_;
/** The vector that indicates whether each top blob has a non-zero weight in
* the objective function. */
vector<Dtype> loss_;
在源码中可以看到,确实有一个vector来存储很多blob的变量blobs_;
param_propagate_down_在注释中其实说明了,指示本层中的blob是否需要计算diff;为什么会需要这个东西呢?这里就需要再次强调一下了:net > layer > blob。而一个layer中可能包含多个blob,例如有多个bottom, 多个top。
那么loss_是干嘛的呢?暂时也看不明白。
至于LayerParameter定义在caffe.proto中:
message LayerParameter {
repeated string bottom = 2; // the name of the bottom blobs
repeated string top = 3; // the name of the top blobs
optional string name = 4; // the layer name
// Rules controlling whether and when a layer is included in the network,
// based on the current NetState. You may specify a non-zero number of rules
// to include OR exclude, but not both. If no include or exclude rules are
// specified, the layer is always included. If the current NetState meets
// ANY (i.e., one or more) of the specified rules, the layer is
// included/excluded.
repeated NetStateRule include = 32;
repeated NetStateRule exclude = 33;
// NOTE
// Add new LayerTypes to the enum below in lexicographical order (other than
// starting with NONE), starting with the next available ID in the comment
// line above the enum. Update the next available ID when you add a new
// LayerType.
//
// LayerType next available ID: 38 (last added: CONTRASTIVE_LOSS)
enum LayerType {
// "NONE" layer type is 0th enum element so that we don't cause confusion
// by defaulting to an existent LayerType (instead, should usually error if
// the type is unspecified).
NONE = 0;
ABSVAL = 35;
ACCURACY = 1;
ARGMAX = 30;
BNLL = 2;
CONCAT = 3;
CONTRASTIVE_LOSS = 37;
CONVOLUTION = 4;
DATA = 5;
DROPOUT = 6;
DUMMY_DATA = 32;
EUCLIDEAN_LOSS = 7;
ELTWISE = 25;
FLATTEN = 8;
HDF5_DATA = 9;
HDF5_OUTPUT = 10;
HINGE_LOSS = 28;
IM2COL = 11;
IMAGE_DATA = 12;
INFOGAIN_LOSS = 13;
INNER_PRODUCT = 14;
LRN = 15;
MEMORY_DATA = 29;
MULTINOMIAL_LOGISTIC_LOSS = 16;
MVN = 34;
POOLING = 17;
POWER = 26;
RELU = 18;
SIGMOID = 19;
SIGMOID_CROSS_ENTROPY_LOSS = 27;
SILENCE = 36;
SOFTMAX = 20;
SOFTMAX_LOSS = 21;
SPLIT = 22;
SLICE = 33;
TANH = 23;
WINDOW_DATA = 24;
THRESHOLD = 31;
}
optional LayerType type = 5; // the layer type from the enum above
// The blobs containing the numeric parameters of the layer
repeated BlobProto blobs = 6;
// The names of the parameter blobs -- useful for sharing parameters among
// layers (but never required).
repeated string param = 1001;
// Whether to require shared weights to have the same shape, or just the same
// count -- defaults to STRICT if unspecified.
repeated DimCheckMode blob_share_mode = 1002;
enum DimCheckMode {
// Neil: Disabled for windows
// // STRICT (default) requires that num, channels, height, width each match.
// STRICT = 0;
// PERMISSIVE requires only the count (num*channels*height*width) to match.
PERMISSIVE = 1;
}
// The ratio that is multiplied on the global learning rate. If you want to
// set the learning ratio for one blob, you need to set it for all blobs.
repeated float blobs_lr = 7;
// The weight decay that is multiplied on the global weight decay.
repeated float weight_decay = 8;
// The amount of weight to assign each top blob in the objective.
// Each layer assigns a default value, usually of either 0 or 1,
// to each top blob.
repeated float loss_weight = 35;
optional AccuracyParameter accuracy_param = 27;
optional ArgMaxParameter argmax_param = 23;
optional ConcatParameter concat_param = 9;
optional ContrastiveLossParameter contrastive_loss_param = 40;
optional ConvolutionParameter convolution_param = 10;
optional DataParameter data_param = 11;
optional DropoutParameter dropout_param = 12;
optional DummyDataParameter dummy_data_param = 26;
optional EltwiseParameter eltwise_param = 24;
optional HDF5DataParameter hdf5_data_param = 13;
optional HDF5OutputParameter hdf5_output_param = 14;
optional HingeLossParameter hinge_loss_param = 29;
optional ImageDataParameter image_data_param = 15;
optional InfogainLossParameter infogain_loss_param = 16;
optional InnerProductParameter inner_product_param = 17;
optional LRNParameter lrn_param = 18;
optional MemoryDataParameter memory_data_param = 22;
optional MVNParameter mvn_param = 34;
optional PoolingParameter pooling_param = 19;
optional PowerParameter power_param = 21;
optional ReLUParameter relu_param = 30;
optional SigmoidParameter sigmoid_param = 38;
optional SoftmaxParameter softmax_param = 39;
optional SliceParameter slice_param = 31;
optional TanHParameter tanh_param = 37;
optional ThresholdParameter threshold_param = 25;
optional WindowDataParameter window_data_param = 20;
// Parameters for data pre-processing.
optional TransformationParameter transform_param = 36;
// Note: certain layers may have more than one computational engine
// for their implementation. These layers include an Engine type and
// engine parameter for selecting the implementation.
// The default for the engine is set by the ENGINE switch at compile-time.
// DEPRECATED: The layer parameters specified as a V0LayerParameter.
// This should never be used by any code except to upgrade to the new
// LayerParameter specification.
optional V0LayerParameter layer = 1;
}
相当于是对层中各个参数的相信说明,例如该层的名字,该层的上一层下一层是什么,以及学习率等等参数。
继续再看hpp会发现各种虚函数,先来看看protected里面的方法:
2 前馈,反馈函数:
/** @brief Using the CPU device, compute the layer output. */
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) = 0;
/**
* @brief Using the GPU device, compute the layer output.
* Fall back to Forward_cpu() if unavailable.
*/
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
// LOG(WARNING) << "Using CPU code as backup.";
return Forward_cpu(bottom, top);
}
/**
* @brief Using the CPU device, compute the gradients for any parameters and
* for the bottom blobs if propagate_down is true.
*/
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
vector<Blob<Dtype>*>* bottom) = 0;
/**
* @brief Using the GPU device, compute the gradients for any parameters and
* for the bottom blobs if propagate_down is true.
* Fall back to Backward_cpu() if unavailable.
*/
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
vector<Blob<Dtype>*>* bottom) {
// LOG(WARNING) << "Using CPU code as backup.";
Backward_cpu(top, propagate_down, bottom);
}
这几个函数理解起来应该很容易。重点是那几个XX_gpu(),为啥里面调用的是XX_cpu()啊??!!如果是所谓的在GPU不可用的情况下,调用XX_cpu()还可以理解,但是什么都没有做直接调用XX_cpu了?不过这是虚函数哈,不要着急,在详细的实现里面应该能够明白所以然。
回答这里的问题(参见:caffe源码简单解析——Layer层),在layer中主要的接口是前馈和反馈函数:
inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom, vector<Blob<Dtype>*>* top)
inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom)
反馈中的参数propagate_down表明是否需要反馈传递参数。而两个函数会根据caffe: mode来判断在cpu中执行还是在gpu中执行。
注意:有些layer并没有GPU计算的实现,所以封装时加入了CPU的计算作为后备
3 检测blobs的数量是否正确:
/**
* Called by the parent Layer's SetUp to check that the number of bottom
* and top Blobs provided as input match the expected numbers specified by
* the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions.
*/
virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
if (ExactNumBottomBlobs() >= 0) {
CHECK_EQ(ExactNumBottomBlobs(), bottom.size())
<< type_name() << " Layer takes " << ExactNumBottomBlobs()
<< " bottom blob(s) as input.";
}
if (MinBottomBlobs() >= 0) {
CHECK_LE(MinBottomBlobs(), bottom.size())
<< type_name() << " Layer takes at least " << MinBottomBlobs()
<< " bottom blob(s) as input.";
}
if (MaxBottomBlobs() >= 0) {
CHECK_GE(MaxBottomBlobs(), bottom.size())
<< type_name() << " Layer takes at most " << MaxBottomBlobs()
<< " bottom blob(s) as input.";
}
if (ExactNumTopBlobs() >= 0) {
CHECK_EQ(ExactNumTopBlobs(), top.size())
<< type_name() << " Layer produces " << ExactNumTopBlobs()
<< " top blob(s) as output.";
}
if (MinTopBlobs() >= 0) {
CHECK_LE(MinTopBlobs(), top.size())
<< type_name() << " Layer produces at least " << MinTopBlobs()
<< " top blob(s) as output.";
}
if (MaxTopBlobs() >= 0) {
CHECK_GE(MaxTopBlobs(), top.size())
<< type_name() << " Layer produces at most " << MaxTopBlobs()
<< " top blob(s) as output.";
}
if (EqualNumBottomTopBlobs()) {
CHECK_EQ(bottom.size(), top.size())
<< type_name() << " Layer produces one top blob as output for each "
<< "bottom blob input.";
}
}
这里涉及到了有些奇怪的函数,例如:ExactNumBottomBlobs(),但并不会影响阅读,CheckBlobCounts()实现的功能就是检测输入的bottom blob和输出的top blob是否在指定的范围内,既然是指定的范围,自然就会联想到在什么地方指定这个范围的?带着疑问继续阅读源码,在某个地方,总会明白的。
4 最后一个protected函数,SetLossWeight():
/**
* Called by SetUp to initialize the weights associated with any top blobs in
* the loss function. Store non-zero loss weights in the diff blob.
*/
inline void SetLossWeights(vector<Blob<Dtype>*>* top) {
const int num_loss_weights = layer_param_.loss_weight_size();
if (num_loss_weights) {
CHECK_EQ(top->size(), num_loss_weights) << "loss_weight must be "
"unspecified or specified once per top blob.";
for (int top_id = 0; top_id < top->size(); ++top_id) {
const Dtype loss_weight = layer_param_.loss_weight(top_id);
if (loss_weight == Dtype(0)) { continue; }
this->set_loss(top_id, loss_weight);
const int count = (*top)[top_id]->count();
Dtype* loss_multiplier = (*top)[top_id]->mutable_cpu_diff();
caffe_set(count, loss_weight, loss_multiplier);
}
}
}
这里是设置损失值的,一般也只有在损失函数的那层才会计算损失值的,所以这个函数主要还是用在最后一层。大概相当于是把损失值复制过来,所以可以大胆猜测,这是反馈的第一步吧。
接着来看public中的非虚函数部分:
5 构造函数:
/**
* You should not implement your own constructor. Any set up code should go
* to SetUp(), where the dimensions of the bottom blobs are provided to the
* layer.
*/
explicit Layer(const LayerParameter& param)
: layer_param_(param) {
// The only thing we do is to copy blobs if there are any.
if (layer_param_.blobs_size() > 0) {
blobs_.resize(layer_param_.blobs_size());
for (int i = 0; i < layer_param_.blobs_size(); ++i) {
blobs_[i].reset(new Blob<Dtype>());
blobs_[i]->FromProto(layer_param_.blobs(i));
}
}
}
这个构造函数,从注释上可以看到了,似乎基本都没有直接调用这个构造函数,因为在caffe中有各种各样的层。而不同的层都有自己的 SetUp()。那么这里的 Layer()做了什么事情呢?首先我们可以确认的一点是这里的layer.hpp是各种各样的层的一个抽象,也就是说其它的层都会继承这个。从这个构造函数的代码中可以看到,其实它只做了将blobs拷贝过来,但是具体的其它参数,以及方法都是空白的。
6 前馈和反馈函数:
/**
* @brief Given the bottom blobs, compute the top blobs and the loss.
*
* @param bottom
* the input blobs, whose data fields store the input data for this layer
* @param top
* the preshaped output blobs, whose data fields will store this layers'
* outputs
* \return The total loss from the layer.
*
* The Forward wrapper calls the relevant device wrapper function
* (Forward_cpu or Forward_gpu) to compute the top blob values given the
* bottom blobs. If the layer has any non-zero loss_weights, the wrapper
* then computes and returns the loss.
*
* Your layer should implement Forward_cpu and (optionally) Forward_gpu.
*/
inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
/**
* @brief Given the top blob error gradients, compute the bottom blob error
* gradients.
*
* @param top
* the output blobs, whose diff fields store the gradient of the error
* with respect to themselves
* @param propagate_down
* a vector with equal length to bottom, with each index indicating
* whether to propagate the error gradients down to the bottom blob at
* the corresponding index
* @param bottom
* the input blobs, whose diff fields will store the gradient of the error
* with respect to themselves after Backward is run
*
* The Backward wrapper calls the relevant device wrapper function
* (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the
* top blob diffs.
*
* Your layer should implement Forward_cpu and (optionally) Forward_gpu.
*/
inline void Backward(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
vector<Blob<Dtype>*>* bottom);
这里的注释很详细,具体的实现也在layer.hpp中:
// Forward and backward wrappers. You should implement the cpu and
// gpu specific implementations instead, and should not change these
// functions.
template <typename Dtype>
inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
Dtype loss = 0;
switch (Caffe::mode()) {
case Caffe::CPU:
Forward_cpu(bottom, top);
for (int top_id = 0; top_id < top->size(); ++top_id) {
if (!this->loss(top_id)) { continue; }
const int count = (*top)[top_id]->count();
const Dtype* data = (*top)[top_id]->cpu_data();
const Dtype* loss_weights = (*top)[top_id]->cpu_diff();
loss += caffe_cpu_dot(count, data, loss_weights);
}
break;
case Caffe::GPU:
Forward_gpu(bottom, top);
#ifndef CPU_ONLY
for (int top_id = 0; top_id < top->size(); ++top_id) {
if (!this->loss(top_id)) { continue; }
const int count = (*top)[top_id]->count();
const Dtype* data = (*top)[top_id]->gpu_data();
const Dtype* loss_weights = (*top)[top_id]->gpu_diff();
Dtype blob_loss = 0;
caffe_gpu_dot(count, data, loss_weights, &blob_loss);
loss += blob_loss;
}
#endif
break;
default:
LOG(FATAL) << "Unknown caffe mode.";
}
return loss;
}
template <typename Dtype>
inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
vector<Blob<Dtype>*>* bottom) {
switch (Caffe::mode()) {
case Caffe::CPU:
Backward_cpu(top, propagate_down, bottom);
break;
case Caffe::GPU:
Backward_gpu(top, propagate_down, bottom);
break;
default:
LOG(FATAL) << "Unknown caffe mode.";
}
}
其中具体的计算方式,可以先看看深度学习基础教程。在该代码中会根据不同的caffe::mode来选择不同的函数实现。
每种层的具体实现后面再慢慢分析~