caffe源码解读
Caffe组成结构
caffe主要由下列4个类组成:Blob;layer;Net;Solver。主要功能如下:
caffe.proto 网络超参数的定义
caffe.proto定义了多个message结构体,它们包含了网络需要传递的参数。其中required字段是必须定义的,optional字段是可选的(后面可以用[default = ...]指定默认值),repeated字段可以出现多次。
这些message有的属于Blob:BlobProto, BlobProtoVector, Datum;有的属于Net:NetParameter, SolverParameter, SolverState, NetState, NetStateRule, ParamSpec;也有的属于layer:FillerParameter, LayerParameter, ArgMaxParameter, TransformationParameter, LossParameter, AccuracyParameter, ConcatParameter等。
下列为部分message的解释,更多的见(https://blog.csdn.net/weixin_39970417/article/details/80825601)
syntax = "proto2";

package caffe;  // serves as the namespace of the generated code

// Specifies the shape (dimensions) of a Blob.
message BlobShape {
  // One entry per axis; a 4-D blob is conventionally laid out as
  // Num * Channel * Height * Width.
  repeated int64 dim = 1 [packed = true];
}

// A Blob: its shape, its data and its diff (gradient).
message BlobProto {
  // Shape of the Blob (comparable to the shape of a numpy array).
  optional BlobShape shape = 7;
  // Data values.
  repeated float data = 5 [packed = true];
  // Diff (gradient) values.
  repeated float diff = 6 [packed = true];
  // Data values, double precision.
  repeated double double_data = 8 [packed = true];
  // Diff (gradient) values, double precision.
  repeated double double_diff = 9 [packed = true];

  // 4D dimensions -- deprecated.  Use "shape" instead.
  // Number of items in the Blob (e.g. the number of convolution kernels).
  optional int32 num = 1 [default = 0];
  // Number of channels.
  optional int32 channels = 2 [default = 0];
  // Height.
  optional int32 height = 3 [default = 0];
  // Width.
  optional int32 width = 4 [default = 0];
}

// ... (other messages omitted in this excerpt) ...

// Parameters of a convolution layer.
message ConvolutionParameter {
  optional uint32 num_output = 1;  // The number of outputs for the layer
  optional bool bias_term = 2 [default = true];  // whether to have bias terms

  // Pad, kernel size, and stride are all given as a single value for equal
  // dimensions in all spatial dimensions, or once per spatial dimension.
  repeated uint32 pad = 3;  // The padding size; defaults to 0
  repeated uint32 kernel_size = 4;  // The kernel size
  repeated uint32 stride = 6;  // The stride; defaults to 1
  // Factor used to dilate the kernel, (implicitly) zero-filling the resulting
  // holes. (Kernel dilation is sometimes referred to by its use in the
  // algorithme à trous from Holschneider et al. 1987.)
  // In other words, the kernel skips a fixed number of pixels between taps.
  repeated uint32 dilation = 18;  // The dilation; defaults to 1

  // For 2D convolution only, the *_h and *_w versions may also be used to
  // specify both spatial dimensions.
  optional uint32 pad_h = 9 [default = 0];  // The padding height (2D only)
  optional uint32 pad_w = 10 [default = 0];  // The padding width (2D only)
  optional uint32 kernel_h = 11;  // The kernel height (2D only)
  optional uint32 kernel_w = 12;  // The kernel width (2D only)
  optional uint32 stride_h = 13;  // The stride height (2D only)
  optional uint32 stride_w = 14;  // The stride width (2D only)

  // Grouped convolution (the article attributes it to the AlexNet paper).
  optional uint32 group = 5 [default = 1];  // The group size for group conv

  optional FillerParameter weight_filler = 7;  // The filler for the weight
  optional FillerParameter bias_filler = 8;  // The filler for the bias

  enum Engine {
    DEFAULT = 0;
    CAFFE = 1;
    CUDNN = 2;
  }
  // Selects the convolution implementation.
  // NOTE(review): presumably CAFFE is the built-in matrix-multiplication path
  // and CUDNN the cuDNN-backed path, with DEFAULT letting Caffe choose --
  // confirm against the layer factory.
  optional Engine engine = 15 [default = DEFAULT];

  // The axis to interpret as "channels" when performing convolution.
  // Preceding dimensions are treated as independent inputs;
  // succeeding dimensions are treated as "spatial".
  // With (N, C, H, W) inputs, and axis == 1 (the default), we perform
  // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for
  // groups g>1) filters across the spatial axes (H, W) of the input.
  // With (N, C, D, H, W) inputs, and axis == 1, we perform
  // N independent 3D convolutions, sliding (C/g)-channels
  // filters across the spatial axes (D, H, W) of the input.
  optional int32 axis = 16 [default = 1];

  // Whether to force use of the general ND convolution, even if a specific
  // implementation for blobs of the appropriate number of spatial dimensions
  // is available. (Currently, there is only a 2D-specific convolution
  // implementation; for input blobs with num_axes != 2, this option is
  // ignored and the ND implementation will be used.)
  optional bool force_nd_im2col = 17 [default = false];
}
Blob模块
Blob类是caffe数据的载体。它相当于一个4维数组(n,c,h,w)。它的成员变量如下:
protected: shared_ptr<SyncedMemory> data_; //存储前向传递数据 shared_ptr<SyncedMemory> diff_; //存储反向传递梯度 shared_ptr<SyncedMemory> shape_data_;// 参数维度old version vector<int> shape_; //参数维度 int count_; //Blob存储的元素个数(shape_所有元素乘积) int capacity_;//当前Blob的元素个数(控制动态分配)
其中SyncedMemory类将在后面讲解。
// Changes the blob's dimensions to `shape`.
//
// shape_ and the shape buffer are always rewritten. The data and diff
// buffers are reallocated only when the new element count exceeds the
// current capacity; otherwise the existing storage is reused as is.
// Fails a CHECK when there are more than kMaxBlobAxes axes, when an extent
// is negative, or when the element count would exceed INT_MAX.
template <typename Dtype>
void Blob<Dtype>::Reshape(const vector<int>& shape) {
  CHECK_LE(shape.size(), kMaxBlobAxes);
  count_ = 1;
  shape_.resize(shape.size());

  // The shape buffer is (re)created when it is missing or too small.
  const size_t shape_bytes = shape.size() * sizeof(int);
  const bool shape_buffer_ok =
      shape_data_ && shape_data_->size() >= shape_bytes;
  if (!shape_buffer_ok) {
    shape_data_.reset(new SyncedMemory(shape_bytes));
  }

  int* const dims = static_cast<int*>(shape_data_->mutable_cpu_data());
  for (int i = 0; i < shape.size(); ++i) {
    CHECK_GE(shape[i], 0);
    // Guard the multiplication below against overflow; a zero count cannot
    // overflow and must not be used as a divisor.
    if (count_ != 0) {
      CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
    }
    count_ *= shape[i];
    shape_[i] = shape[i];
    dims[i] = shape[i];
  }

  // Existing storage is large enough: nothing more to do.
  if (count_ <= capacity_) {
    return;
  }
  capacity_ = count_;
  data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
  diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
}
获取Blob cpu上的数据(也有Gpu的)
// Read-only access to the blob's data through its CPU pointer.
// Fails a CHECK if the data buffer has not been created.
template <typename Dtype>
const Dtype* Blob<Dtype>::cpu_data() const {
  CHECK(data_);
  return static_cast<const Dtype*>(data_->cpu_data());
}

// Writable access to the blob's data through its CPU pointer.
// Fails a CHECK if the data buffer has not been created.
template <typename Dtype>
Dtype* Blob<Dtype>::mutable_cpu_data() {
  CHECK(data_);
  return static_cast<Dtype*>(data_->mutable_cpu_data());
}
有2种读取数据的方式:一种是只读的(cpu_data),例如在读取上一层bottom的输出data时,我们只能读取而不能修改它;另一种是可写的(mutable_cpu_data),例如在把输出写入top数据传给下一层时,我们需要修改数据。
SyncedMemory类
SyncedMemory类是负责Blob的内存管理,如内存的释放与分配。它的主要成员变量如下:
// Main member variables of SyncedMemory, the class that manages the memory
// behind a Blob (excerpt: member functions are not shown here).
class SyncedMemory {
private:
void* cpu_ptr_;// pointer to the host (CPU) memory
void* gpu_ptr_;// pointer to the device (GPU) memory
size_t size_; // size of the data
SyncedHead head_;// current state of the data (see SyncedHead)
// Set to true by to_cpu() after it allocates cpu_ptr_.
bool own_cpu_data_;
// Written by CaffeMallocHost() when the host buffer is allocated.
bool cpu_malloc_use_cuda_;
// NOTE(review): presumably the GPU-side counterpart of own_cpu_data_ -- confirm.
bool own_gpu_data_;
// NOTE(review): presumably the id of the GPU that owns gpu_ptr_ -- confirm.
int gpu_device_;
};
SyncedMemory屏蔽了代码对不同硬件设备的内存分配的感知,同时隐藏了CPU和GPU之间的同步过程。
SyncedMemory采用“lazy”的模式,就是内存的实际申请时机是在第一次使用时进行的(通过枚举状态),其代码如下(syncedmem.hpp syncedmem.cpp)
// State of a SyncedMemory buffer, used by to_cpu() to decide what to do:
// UNINITIALIZED triggers allocation, HEAD_AT_GPU triggers a GPU-to-CPU copy,
// HEAD_AT_CPU and SYNCED require no work.
enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
// Makes sure the CPU copy of the data exists and is current.
// Host memory is allocated on first use here rather than at construction.
inline void SyncedMemory::to_cpu() {
check_device();
switch (head_) {
case UNINITIALIZED:
CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);// allocate host memory (per the article, malloc/free are used inside)
caffe_memset(size_, 0, cpu_ptr_);// zero-fill the new buffer
head_ = HEAD_AT_CPU;// update the state
own_cpu_data_ = true;
break;
case HEAD_AT_GPU:
#ifndef CPU_ONLY
// Allocate the host buffer on demand, then copy the GPU data into it.
if (cpu_ptr_ == NULL) {
CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
own_cpu_data_ = true;
}
caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
head_ = SYNCED;
#else
NO_GPU;
#endif
break;
// The CPU copy is already current in these states: nothing to do.
case HEAD_AT_CPU:
case SYNCED:
break;
}
}
layer类
layer类主要在编写网络结构的prototxt文件时用到。例如:
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  # ParamSpec is declared inside LayerParameter; it mainly carries the
  # learning-rate and weight-decay multipliers.
  # NOTE(review): presumably the first param block applies to the weights
  # and the second to the bias -- confirm.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  # convolution_param is also declared inside LayerParameter.
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    pad: 2
    # Declared in ConvolutionParameter as
    # "optional FillerParameter weight_filler"; FillerParameter in turn
    # defines type, standard deviation, min/max values and so on.
    weight_filler {
      type: "gaussian"
      std: 0.01  # standard deviation: distribution with stdev 0.01 (default mean: 0)
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer里面的所有参数都是layer类的成员变量,其中layer类里面的LayerParameter定义了大量的参数变量(在caffe.proto文件里 message LayerParameter{…})。
layer类主要成员变量如下
// Main member variables of the Layer base class (excerpt: member functions
// are not shown here).
template <typename Dtype>
class Layer {
 protected:
  // Layer parameters stored in the protobuf file, read from the
  // protocol-buffers network description; initialized in the constructor.
  LayerParameter layer_param_;
  // Phase of the layer: whether it takes part in training or in testing.
  Phase phase_;
  // Learnable parameters (weights and biases). A vector is used because the
  // weights and the bias are kept in two separate blobs. Initialized in the
  // base class only when the description file defines them.
  vector<shared_ptr<Blob<Dtype> > > blobs_;
  // For each learnable parameter blob, whether its gradient has to be
  // computed during the backward pass.
  vector<bool> param_propagate_down_;
  // Zero for non-loss layers; for a loss layer, the weight of the loss
  // computed for each top blob.
  vector<Dtype> loss_;

 private:
  /** Whether this layer is actually shared by other nets*/
  bool is_shared_;
  // When the layer is shared, this mutex keeps the forward pass running
  // correctly.
  shared_ptr<boost::mutex> forward_mutex_;
};
layer类的主要成员函数如下,也是最核心的东西,包括网络cpu、gpu的forward与backward函数
// Core member functions of the layer class (declarations only): the forward
// and backward entry points plus the CPU hooks that subclasses implement.
template <typename Dtype>
class layer {
 protected:
  // NOTE(review): the out-of-class definition in this article is spelled
  // Layer<Dtype>::Backward, and these two entry points may need to be public
  // so that a net can call them -- confirm against the Caffe headers.
  inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Given the gradient with respect to the top (output) blobs, computes the
  // gradient with respect to the inputs and passes it on to the bottom
  // blobs. A layer with parameters also has to compute the gradient with
  // respect to each parameter and store it internally.
  inline void Backward(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down,
      const vector<Blob<Dtype>*>& bottom);
  // Pure virtual CPU implementations; every concrete layer must provide them.
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) = 0;
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down,
      const vector<Blob<Dtype>*>& bottom) = 0;
};  // member function declarations inside the class
由于其是纯虚函数,必须要被其继承类实现
// Definition of the member function.
// Runs the backward pass on the device selected by Caffe::mode(): the call
// is forwarded unchanged to Backward_cpu or Backward_gpu. Any other mode is
// reported as a fatal error.
template <typename Dtype>
inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  switch (Caffe::mode()) {
    case Caffe::GPU:
      Backward_gpu(top, propagate_down, bottom);
      return;
    case Caffe::CPU:
      Backward_cpu(top, propagate_down, bottom);
      return;
    default:
      break;
  }
  LOG(FATAL) << "Unknown caffe mode.";
}
Backward函数根据Caffe::mode()分派到Backward_cpu或Backward_gpu;Forward函数同理,分派到Forward_cpu或Forward_gpu。
layer主要包括以下几层:
输入层DataLayer
Datalayer
里面用了高效的数据读取方式(LMDB、LevelDB),并利用多线程加速solver对数据的读取,还包括一些简单的数据预处理(resize、crop、mirror、mean subtraction等,由单独的data_transformer.cpp实现,并在data_layer中被调用)。其中重要的参数定义在文章最前面介绍的caffe.proto的message结构体中。DataLayer类的继承关系如下。
Datalayer里的load_batch函数实现对LMDB数据的batch读取。
下面为prototxt中的datalayer,里面的超参数都可以在DataLayer的cpp里面找到,存储在LayerParameter里。
# Data layer named "mnist": it has no bottom and produces two tops,
# "data" and "label".
layer {
name: "mnist"
type: "Data"
top: "data"
top: "label"
# The layer is only included in the TRAIN phase.
include {
phase: TRAIN
}
# 0.00390625 == 1/256.
# NOTE(review): presumably each input value is multiplied by this factor -- confirm.
transform_param {
scale: 0.00390625
}
# Where the data comes from: database path, batch size and backend type.
data_param {
source: "/home/xy/caffe-master/examples/mnist/mnist_train_lmdb"
batch_size: 64
backend: LMDB
}
}
BlockingQueue
BlockingQueue是线程安全的队列。
InternalThread
InternalThread封装了boost::thread,主要用于建立多线程对数据的读取
网络层ReLULayer
ReLULayer等网络层继承于NeuronLayer,而NeuronLayer继承于我们前面提到的Layer。
// ReLU layer declaration; it derives from NeuronLayer and reports the type
// name "ReLU".
template <typename Dtype>
class ReLULayer : public NeuronLayer<Dtype> {
 public:
  explicit ReLULayer(const LayerParameter& param)
      : NeuronLayer<Dtype>(param) {}
  virtual inline const char* type() const { return "ReLU"; }

 protected:
  // CPU implementations. Per the article, the GPU counterparts are declared
  // here as well; they are omitted from this excerpt.
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
};
// ReLU layer implementation: forward pass on the CPU.
//
// For every element x of bottom[0] writes
//   max(x, 0) + negative_slope * min(x, 0)
// to the matching element of top[0]. With negative_slope == 0 this is the
// plain ReLU; a non-zero slope scales the negative inputs instead of
// clamping them to zero.
template <typename Dtype>
void ReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  const int count = bottom[0]->count();
  const Dtype negative_slope =
      this->layer_param_.relu_param().negative_slope();
  for (int i = 0; i < count; ++i) {
    top_data[i] = std::max(bottom_data[i], Dtype(0))
        + negative_slope * std::min(bottom_data[i], Dtype(0));
  }
}
// Backward pass of the ReLU layer on the CPU.
//
// When propagate_down[0] is set, every element of bottom[0]'s diff becomes
// the matching top diff multiplied by the local derivative: 1 where the
// input was positive, negative_slope elsewhere. Nothing is written when
// propagate_down[0] is false.
template <typename Dtype>
void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* bottom_data = bottom[0]->cpu_data();
    const Dtype* top_diff = top[0]->cpu_diff();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    const int count = bottom[0]->count();
    const Dtype negative_slope =
        this->layer_param_.relu_param().negative_slope();
    for (int i = 0; i < count; ++i) {
      // The two comparisons are mutually exclusive, so exactly one term
      // contributes for each element.
      bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
          + negative_slope * (bottom_data[i] <= 0));
    }
  }
}
layer_factory
caffe里面有许多网络层,这些网络层由layer_factory连接到Net中。layer_factory定义如下。这里有2个类,并用了2个宏定义,用来注册layer。
// Registry that maps a layer type name to the function creating that layer.
// The class is never instantiated; it is used only through its static
// member functions.
template <typename Dtype>
class LayerRegistry {
 public:
  // Function pointer type of a creator: builds a layer from its parameters
  // and returns it as a shared_ptr<Layer<Dtype> >.
  typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
  typedef std::map<string, Creator> CreatorRegistry;

  // Returns the single registry map, created on first use. The map is
  // heap-allocated and never deleted.
  static CreatorRegistry& Registry() {
    static CreatorRegistry* g_registry_ = new CreatorRegistry();
    return *g_registry_;
  }

  // Adds a creator. An entry already stored under `type` is replaced.
  static void AddCreator(const string& type, Creator creator) {
    CreatorRegistry& registry = Registry();
    LOG(INFO) << "AddCreator: " << type;
    registry[type] = creator;
  }

  // Get a layer using a LayerParameter. Fails a CHECK, naming the offending
  // type, when no creator is registered for param.type().
  static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
    const string& type = param.type();
    CreatorRegistry& registry = Registry();
    CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type;
    return registry[type](param);
  }

 private:
  // Layer registry should never be instantiated
  // everything is done with its static variables.
  LayerRegistry() {}
};
/**********************************************************/
template <typename Dtype>
class LayerRegisterer {
public:
LayerRegisterer(const string& type,
shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
// LOG(INFO) << "Registering layer type: " << type;
LayerRegistry<Dtype>::AddCreator(type, creator);
}
};
// Registers creator<float> and creator<double> under the layer type name
// `type` by defining two file-static LayerRegisterer objects. The last line
// carries no continuation backslash, so the macro ends there and the next
// #define starts a new directive.
#define REGISTER_LAYER_CREATOR(type, creator)                                  \
  static LayerRegisterer<float> g_creator_f_##type(#type, creator<float>);     \
  static LayerRegisterer<double> g_creator_d_##type(#type, creator<double>)

// Defines a creator function template that builds a type##Layer<Dtype> from
// a LayerParameter, then registers it through REGISTER_LAYER_CREATOR.
#define REGISTER_LAYER_CLASS(type)                                             \
  template <typename Dtype>                                                    \
  shared_ptr<Layer<Dtype> > Creator_##type##Layer(const LayerParameter& param) \
  {                                                                            \
    return shared_ptr<Layer<Dtype> >(new type##Layer<Dtype>(param));           \
  }                                                                            \
  REGISTER_LAYER_CREATOR(type, Creator_##type##Layer)
} // namespace caffe
最后在每个layer的结尾都有
// Example taken from the end of a layer's source file.
INSTANTIATE_CLASS(AbsValLayer);// explicitly instantiates the class template
// Registers the layer with the layer factory under the type name AbsVal.
REGISTER_LAYER_CLASS(AbsVal);
其中INSTANTIATE_CLASS定义在common.h中
// Instantiate a class with float and double specifications.
// Expands to explicit instantiations of classname<float> and
// classname<double>, preceded by a char variable named after the class.
// The last line has no trailing semicolon; the use site supplies it.
// NOTE(review): the purpose of the gInstantiationGuard variable is not
// visible here -- presumably it flags duplicate instantiations; confirm.
#define INSTANTIATE_CLASS(classname) \
char gInstantiationGuard##classname; \
template class classname<float>; \
template class classname<double>
Util工具
IO操作
IO.cpp的功能主要是读取prototxt文件。
// Parses the text-format protobuf file `filename` into `proto`.
//
// Fails a CHECK when the file cannot be opened. Returns the result of
// google::protobuf::TextFormat::Parse: true on success, false otherwise.
bool ReadProtoFromTextFile(const char* filename, Message* proto) {
  const int fd = open(filename, O_RDONLY);
  CHECK_NE(fd, -1) << "File not found: " << filename;
  bool success = false;
  {
    // Stack-allocated so the stream is always released, and scoped so it is
    // destroyed before the descriptor it reads from is closed.
    FileInputStream input(fd);
    success = google::protobuf::TextFormat::Parse(&input, proto);
  }
  close(fd);
  return success;
}