Caffe源代码之Softmax前后向传播

最新推荐文章于 2018-08-31 11:59:35 发布

Charel_CHEN

最新推荐文章于 2018-08-31 11:59:35 发布

阅读量611

点赞数

分类专栏：深度学习与计算机视觉 caffe源代码

本文链接：https://blog.csdn.net/Charel_CHEN/article/details/81293755

版权

深度学习与计算机视觉同时被 2 个专栏收录

32 篇文章 0 订阅

订阅专栏

caffe源代码

13 篇文章 0 订阅

订阅专栏

Caffe源代码之Softmax前后向传播

之前的几篇博客介绍了Caffe在网络训练过程中数据块如何存储、层如何搭建、网络如何管理层和数据，以及网络如何进行优化。接下来的几篇博客将聚焦于具体的层，例如Softmax层、卷积层、反卷积层、池化层、BN层以及SoftmaxWithLoss层的相关代码。

今天分享Softmax层的代码。在之前的一篇博客里，笔者推导了Softmax的求导公式，那么在Caffe中，对应的代码是怎样的呢？为了方便解释代码，这里以图像分割为例进行说明。Softmax层的输入为 $32 \times 21 \times 224 \times 224$ 的数据块（维度索引从0开始），需要对每一个像素点进行21分类，也就是沿第1维（大小为21的通道维）进行Softmax，求出 $224 \times 224$ 图像中每个像素属于21个类别的概率值。接下来，就以这个例子解释SoftmaxLayer的工作机理。

caffe.proto配置文件

在看正式代码之前，我们先看一下caffe.proto配置文件中SoftmaxLayer需要哪些参数。

// Configuration message for the Softmax layer.
message SoftmaxParameter {
  // Implementation selector for the layer.
  // NOTE(review): presumably DEFAULT lets the framework choose between the
  // CAFFE and CUDNN implementations -- confirm against the layer factory.
  enum Engine {
    DEFAULT = 0;
    CAFFE = 1;
    CUDNN = 2;
  }
  optional Engine engine = 1 [default = DEFAULT];

  // The axis along which to perform the softmax -- may be negative to index
  // from the end (e.g., -1 for the last axis).
  // Any other axes will be evaluated as independent softmaxes.
  // Defaults to axis 1. A negative value is converted to a non-negative index
  // in SoftmaxLayer::Reshape via Blob::CanonicalAxisIndex.
  optional int32 axis = 2 [default = 1];
}

Softmax头文件

Caffe中，各个层都有一个共同的基类Layer，SoftmaxLayer继承于Layer。

#ifndef CAFFE_SOFTMAX_LAYER_HPP_
#define CAFFE_SOFTMAX_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief Computes the softmax function.
 *
 * The softmax is taken along a single axis (softmax_axis_); every other
 * position of the input is treated as an independent softmax. The layer takes
 * exactly one bottom blob and produces exactly one top blob of the same shape.
 *
 * The shape examples in the member comments below refer to the article's
 * running example: a 32 x 21 x 224 x 224 input with the softmax on axis 1.
 *
 * TODO(dox): thorough documentation for Forward, Backward, and proto params.
 */
template <typename Dtype>
class SoftmaxLayer : public Layer<Dtype> {
 public:
  // Explicit constructor; forwards the layer parameters to the Layer base.
  explicit SoftmaxLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  // Sizes the top blob and the internal helper blobs (sum_multiplier_,
  // scale_) to match the bottom blob, and caches the axis bookkeeping.
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Type name of this layer.
  virtual inline const char* type() const { return "Softmax"; }
  // The layer requires exactly one input blob and one output blob.
  virtual inline int ExactNumBottomBlobs() const { return 1; }
  virtual inline int ExactNumTopBlobs() const { return 1; }

 protected:
  // Forward pass: writes the softmax of bottom into top.
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Backward pass: writes the gradient with respect to bottom, computed from
  // the top gradient and the forward output.
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  // Product of the dimensions before the softmax axis.
  // In the running example, outer_num_ = 32.
  int outer_num_;

  // Product of the dimensions after the softmax axis.
  // In the running example, inner_num_ = 224 * 224.
  int inner_num_;

  // Non-negative index of the axis the softmax is taken over.
  // In the running example, softmax_axis_ = 1.
  int softmax_axis_;
  /// sum_multiplier is used to carry out sum using BLAS
  // It is a vector of ones with one entry per channel; multiplying by it
  // turns a sum over channels into a matrix product.
  Blob<Dtype> sum_multiplier_;
  /// scale is an intermediate Blob to hold temporary results.
  // Same shape as the input except that the softmax axis has size 1.
  Blob<Dtype> scale_;
};

}  // namespace caffe

#endif  // CAFFE_SOFTMAX_LAYER_HPP_

SoftmaxLayer源文件

在SoftmaxLayer的源文件中，主要关注构造函数、Reshape、Forward_cpu、Backward_cpu这几个函数。下面继续以本文开篇的例子进行说明。

#include <algorithm>
#include <vector>

#include "caffe/layers/softmax_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

/**
 * Sizes the output blob and the internal scratch blobs for the current input.
 *
 * Shape examples refer to a 32 x 21 x 224 x 224 input with the softmax on
 * axis 1.
 *
 * @param bottom input blobs; only bottom[0] is read.
 * @param top    output blobs; top[0] is reshaped to match bottom[0].
 */
template <typename Dtype>
void SoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // The configured axis may be negative (counting from the last axis);
  // CanonicalAxisIndex turns it into a non-negative index (1 in the example).
  const int configured_axis = this->layer_param_.softmax_param().axis();
  softmax_axis_ = bottom[0]->CanonicalAxisIndex(configured_axis);

  // Output has exactly the input's shape (32 x 21 x 224 x 224).
  top[0]->ReshapeLike(*bottom[0]);

  // sum_multiplier_ is a vector of ones with one entry per channel (21),
  // used to express sums over the softmax axis as BLAS products.
  const vector<int> ones_shape(1, bottom[0]->shape(softmax_axis_));
  sum_multiplier_.Reshape(ones_shape);
  Dtype* ones = sum_multiplier_.mutable_cpu_data();
  caffe_set(sum_multiplier_.count(), Dtype(1), ones);

  // Element counts before and after the softmax axis (32 and 224 * 224).
  outer_num_ = bottom[0]->count(0, softmax_axis_);
  inner_num_ = bottom[0]->count(softmax_axis_ + 1);

  // scale_ mirrors the input shape with the softmax axis collapsed to 1
  // (32 x 1 x 224 x 224).
  vector<int> collapsed_shape = bottom[0]->shape();
  collapsed_shape[softmax_axis_] = 1;
  scale_.Reshape(collapsed_shape);
}

/**
 * CPU forward pass: top = softmax(bottom) along softmax_axis_.
 *
 * For each of the outer_num_ samples the per-position maximum over the
 * channels is subtracted before exponentiation (to avoid numerical issues),
 * then every channel plane is divided by the per-position sum of exponentials.
 *
 * Shape examples refer to a 32 x 21 x 224 x 224 input with the softmax on
 * axis 1 (channels = 21, inner_num_ = 224 * 224, outer_num_ = 32).
 *
 * @param bottom input blobs; bottom[0] holds the raw scores (read-only).
 * @param top    output blobs; top[0] receives the normalized values.
 */
template <typename Dtype>
void SoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  // Read-only view of the input.
  const Dtype* bottom_data = bottom[0]->cpu_data();
  // Writable view of the output.
  Dtype* top_data = top[0]->mutable_cpu_data();
  // Writable scratch buffer: first the per-position max, then the
  // per-position sum of exponentials.
  Dtype* scale_data = scale_.mutable_cpu_data();
  const int channels = bottom[0]->shape(softmax_axis_);  // 21
  // Elements per outer sample (21 * 224 * 224). Computed as a product rather
  // than count() / outer_num_ so an empty leading dimension cannot trigger a
  // division by zero.
  const int dim = channels * inner_num_;
  // Start from a copy of the input; everything below works in place on top.
  caffe_copy(bottom[0]->count(), bottom_data, top_data);
  // We need to subtract the max to avoid numerical issues, compute the exp,
  // and then normalize.
  for (int i = 0; i < outer_num_; ++i) {
    const Dtype* bottom_sample = bottom_data + i * dim;
    Dtype* top_sample = top_data + i * dim;
    // Initialize scale_data to the first channel plane, then fold the
    // remaining planes in to obtain the per-position max over channels.
    caffe_copy(inner_num_, bottom_sample, scale_data);
    for (int j = 1; j < channels; j++) {
      for (int k = 0; k < inner_num_; k++) {
        scale_data[k] = std::max(scale_data[k],
            bottom_sample[j * inner_num_ + k]);
      }
    }
    // Subtraction, expressed as a rank-1 update:
    // top(21 x inner) += -1 * sum_multiplier_(21 x 1) * scale_(1 x inner).
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, inner_num_,
        1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_sample);
    // Exponentiation, in place.
    caffe_exp<Dtype>(dim, top_sample, top_sample);
    // Sum after exp: scale_(inner) = top^T(inner x 21) * sum_multiplier_(21).
    caffe_cpu_gemv<Dtype>(CblasTrans, channels, inner_num_, 1.,
        top_sample, sum_multiplier_.cpu_data(), 0., scale_data);
    // Division: normalize each channel plane by the per-position sum.
    for (int j = 0; j < channels; j++) {
      Dtype* plane = top_sample + j * inner_num_;
      caffe_div(inner_num_, plane, scale_data, plane);
    }
  }
}

//反向传播，在softmaxLayer中，没有可学习的的参量，因此反向传播只需要求取bottom
//的梯度即可
/**
 * CPU backward pass for the softmax.
 *
 * Implements, independently for every position outside the softmax axis,
 *   bottom_diff = top_data * (top_diff - dot(top_diff, top_data))
 * where the dot product runs over the channels.
 *
 * Shape examples refer to a 32 x 21 x 224 x 224 blob with the softmax on
 * axis 1 (channels = 21, inner_num_ = 224 * 224, outer_num_ = 32).
 *
 * @param top            top[0] supplies the incoming gradient (diff) and the
 *                       forward output (data); both are read-only here.
 * @param propagate_down not consulted by this implementation.
 * @param bottom         bottom[0]'s diff receives the computed gradient.
 */
template <typename Dtype>
void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  // Gradient arriving from the layer above (read-only).
  const Dtype* top_diff = top[0]->cpu_diff();
  // Forward output of this layer, i.e. the probabilities (read-only).
  const Dtype* top_data = top[0]->cpu_data();
  // Gradient to be written for the layer below.
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  // Scratch buffer holding one dot product per inner position.
  Dtype* scale_data = scale_.mutable_cpu_data();
  const int channels = top[0]->shape(softmax_axis_);  // 21
  // Elements per outer sample (21 * 224 * 224). Computed as a product rather
  // than count() / outer_num_ so an empty leading dimension cannot trigger a
  // division by zero.
  const int dim = channels * inner_num_;
  // Work in place on a copy of the incoming gradient.
  caffe_copy(top[0]->count(), top_diff, bottom_diff);
  for (int i = 0; i < outer_num_; ++i) {
    Dtype* diff_sample = bottom_diff + i * dim;
    const Dtype* prob_sample = top_data + i * dim;
    // Compute dot(top_diff, top_data) over the channels for every inner
    // position; consecutive channels are inner_num_ elements apart.
    for (int k = 0; k < inner_num_; ++k) {
      scale_data[k] = caffe_cpu_strided_dot<Dtype>(channels,
          diff_sample + k, inner_num_,
          prob_sample + k, inner_num_);
    }
    // Subtraction, expressed as a rank-1 update:
    // diff(21 x inner) += -1 * sum_multiplier_(21 x 1) * scale_(1 x inner),
    // which yields top_diff - dot(top_diff, top_data).
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, inner_num_, 1,
        -1., sum_multiplier_.cpu_data(), scale_data, 1., diff_sample);
  }
  // Elementwise multiplication by the probabilities completes the formula.
  caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
}


// This file defines only the CPU forward/backward paths.
// NOTE(review): in CPU_ONLY builds STUB_GPU presumably supplies placeholder
// Forward_gpu/Backward_gpu definitions -- confirm against the macro.
#ifdef CPU_ONLY
STUB_GPU(SoftmaxLayer);
#endif

// NOTE(review): presumably emits explicit instantiations of the class
// template for the supported Dtype types -- confirm against the macro.
INSTANTIATE_CLASS(SoftmaxLayer);

}  // namespace caffe

Charel_CHEN

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Caffe源代码之Softmax前后向传播

Caffe源代码之Softmax前后向传播之前的几个博客介绍了Caffe中，网络训练过程中，数据块怎么存储的、层怎么搭建的、网络怎么进行管理层和数据的、网络怎么进行优化的，接下来几篇博客就集中到某些层上面了，比如说，Softmax层、卷积层、反卷积层、池化层、BN层以及SoftmaxWithLoss层的相关代码了。今天分享Softmax层的代码，在之前的一个博客里面，笔者推到了Softma...
复制链接

扫一扫

专栏目录