Caffe源码 - RoI Pooling 层

最新推荐文章于 2020-11-21 01:55:20 发布

AIHGF

最新推荐文章于 2020-11-21 01:55:20 发布

阅读量4.7k

点赞数

本文链接：https://blog.csdn.net/zziahgf/article/details/78330085

版权

Caffe 同时被 3 个专栏收录

37 篇文章 2 订阅

订阅专栏

Caffe

30 篇文章 5 订阅

订阅专栏

目标检测

26 篇文章 5 订阅

订阅专栏

RoI Pooling 层

caffe prototxt 定义：

# ROI pooling layer: takes the conv5 feature map plus the ROI list and emits
# a fixed 6x6 pooled map per ROI.
layer {
  name: "roi_pool5"
  type: "ROIPooling"
  bottom: "conv5"  # bottom[0]: feature map that is pooled over
  bottom: "rois"   # bottom[1]: ROI rows
  top: "pool5"
  roi_pooling_param {
    pooled_w: 6  # pooled output width
    pooled_h: 6  # pooled output height
    spatial_scale: 0.0625 # 1/16
  }
}

caffe caffe.proto ROI Pooling 层参数说明：

 optional ROIPoolingParameter roi_pooling_param = 43;

 message ROIPoolingParameter {
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs.
// NOTE(review): this message declares no pad/kernel/stride fields; the
// sentence above does not describe anything visible here.
  optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
  optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
  // Multiplicative spatial scale factor to translate ROI coords from their
  // input scale to the scale used when pooling
  // (ROI coordinates are multiplied by this factor before pooling.)
  optional float spatial_scale = 3 [default = 1];
}

根据 prototxt 定义可以看出，roi_pool5 的输入有两个：bottom[0] 是 conv5 卷积层输出的 feature map，由于前面经过了若干 pool 层，conv5 的 feature map 的 height 和 width 分别是原图尺寸的 1/16；bottom[1] 是 rois blob，其类似于一个 $num\_rois \times 5$ 的二维矩阵，行数 num_rois 为 bottom[1]->num()，列数为 5，其定义为：

# Network input that carries the ROIs, one 5-element row per ROI.
input: "rois"
input_shape {
  dim: 1 # to be changed on-the-fly to num ROIs
  dim: 5 # [batch_index, x1, y1, x2, y2] zero-based indexing
}

batch_index 为该 RoI 所属图像在 bottom[0] 第一个维度（num）上的下标，用于计算在 bottom[0] 中的偏移量；[x1, y1, x2, y2] 是 RoI 左上角和右下角的坐标，以输入图像的尺度表示，需要乘以 spatial_scale 才能映射到 feature map 上.

在 feature map 中，RoI Pooling层首先计算定义的 rois 在 conv feature map 上所映射的两个坐标 —— (x1 * spatial_scale, y1 * spatial_scale)，(x2 * spatial_scale, y2 * spatial_scale)，对应的点为 (top-left, bottom-right)，即在 feature map 中确定一个区域.

对于确定的一个区域，进行 pooled_h * pooled_w (这里是 6*6) 划分，得到 36 个大小近似相同的子区域, 区域大小为 bin_h = roi_h / pooled_h, bin_w = roi_w / pooled_w；由于子区域的起点向下取整 (floor)、终点向上取整 (ceil)，相邻子区域之间可能有重叠；

对于每个子区域，采用 max 操作找出对应 feature map 的最大值，即为输出 top blob 的对应值.

对于 bottom[0] 的每个 channel 进行相同操作.

roi_pool5 有一个输出 top[0]，其尺寸为 (bottom[1]->num(), bottom[0]->channels, pooled_h, pooled_w)，其中，pooled_h 和 pooled_w 是固定定义的，其值这里为 6.

根据其源码 roi_pooling_layer.cpp 分析，

RoI Pooling 层 LayerSetUp —— 参数读取

// Reads the ROI pooling hyper-parameters from the layer definition.
//
// Validates that both pooled output dimensions are strictly positive, then
// stores the pooled height/width and the spatial scale in the layer members
// and logs the spatial scale. `bottom` and `top` are not used here.
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const ROIPoolingParameter& pooling_cfg =
      this->layer_param_.roi_pooling_param();

  // Both output dimensions must be validated before anything is stored.
  CHECK_GT(pooling_cfg.pooled_h(), 0) << "pooled_h must be > 0";
  CHECK_GT(pooling_cfg.pooled_w(), 0) << "pooled_w must be > 0";

  pooled_height_ = pooling_cfg.pooled_h();
  pooled_width_ = pooling_cfg.pooled_w();
  spatial_scale_ = pooling_cfg.spatial_scale();

  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}

RoI Pooling 层 Reshape

// Sizes the output blob and the argmax buffer for the current inputs.
//
// Caches the channel count, height and width of bottom[0], then reshapes both
// top[0] and max_idx_ to
// (bottom[1]->num(), channels_, pooled_height_, pooled_width_),
// i.e. one pooled map per ROI row and per input channel.
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  Blob<Dtype>* const features = bottom[0];
  channels_ = features->channels();
  height_ = features->height();
  width_ = features->width();

  // One output slice per ROI row in bottom[1].
  const int roi_count = bottom[1]->num();
  top[0]->Reshape(roi_count, channels_, pooled_height_, pooled_width_);
  max_idx_.Reshape(roi_count, channels_, pooled_height_, pooled_width_);
}

RoI Pooling 层的输出Blob - top[0] 的channels 与 bottom[0] (即 conv5) 相同； top[0] 的 num 与 RoI 的 num 相同，将 RoI 对应在 conv5 的 feature map 进行操作.

RoI Pooling 层 Forward_cpu 实现

// CPU forward pass of ROI max pooling.
//
// bottom[0] holds the feature map, bottom[1] holds the ROIs as rows of
// [batch_index, x1, y1, x2, y2]. For every ROI and every channel, the ROI
// (scaled by spatial_scale_) is split into pooled_height_ x pooled_width_
// bins; each output element receives the maximum feature value inside its
// bin, and max_idx_ records the index (h * width_ + w) of that maximum
// within the channel plane. Bins that end up empty after clipping to the
// feature map produce 0 with an argmax of -1.
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* feature_map = bottom[0]->cpu_data();
  const Dtype* roi = bottom[1]->cpu_data();
  const int roi_count = bottom[1]->num();
  const int images_in_batch = bottom[0]->num();
  const int output_count = top[0]->count();

  // Seed every output with the lowest sentinel value and every argmax with
  // -1 so that any real feature value wins the first comparison.
  Dtype* pooled = top[0]->mutable_cpu_data();
  caffe_set(output_count, Dtype(-FLT_MAX), pooled);
  int* argmax = max_idx_.mutable_cpu_data();
  caffe_set(output_count, -1, argmax);

  // Walk the ROI rows; the row pointer advances by bottom[1]->offset(1)
  // after each ROI has been pooled.
  for (int roi_idx = 0; roi_idx < roi_count;
       ++roi_idx, roi += bottom[1]->offset(1)) {
    // The ROI must refer to an image that exists in the batch.
    const int batch_index = static_cast<int>(roi[0]);
    CHECK_GE(batch_index, 0);
    CHECK_LT(batch_index, images_in_batch);

    // Scale the ROI corners and round them to integer coordinates.
    const int roi_left = static_cast<int>(round(roi[1] * spatial_scale_));
    const int roi_top = static_cast<int>(round(roi[2] * spatial_scale_));
    const int roi_right = static_cast<int>(round(roi[3] * spatial_scale_));
    const int roi_bottom = static_cast<int>(round(roi[4] * spatial_scale_));

    // Force malformed or degenerate ROIs to cover at least one cell.
    const int roi_rows = max(roi_bottom - roi_top + 1, 1);
    const int roi_cols = max(roi_right - roi_left + 1, 1);

    // Extent of a single output bin, measured in scaled-ROI cells
    // (scaled ROI extent divided by the number of output bins).
    const Dtype rows_per_bin = static_cast<Dtype>(roi_rows)
                               / static_cast<Dtype>(pooled_height_);
    const Dtype cols_per_bin = static_cast<Dtype>(roi_cols)
                               / static_cast<Dtype>(pooled_width_);

    // First channel plane of the image this ROI belongs to.
    const Dtype* channel_in = feature_map + bottom[0]->offset(batch_index);

    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        // Row range of this bin: start (inclusive) is floored, end
        // (exclusive) is ceiled, then both are shifted to the ROI origin
        // and clipped to [0, height_].
        const int row_first = static_cast<int>(
            floor(static_cast<Dtype>(ph) * rows_per_bin));
        const int row_last = static_cast<int>(
            ceil(static_cast<Dtype>(ph + 1) * rows_per_bin));
        const int row_begin = min(max(row_first + roi_top, 0), height_);
        const int row_end = min(max(row_last + roi_top, 0), height_);

        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Column range, computed the same way and clipped to [0, width_].
          const int col_first = static_cast<int>(
              floor(static_cast<Dtype>(pw) * cols_per_bin));
          const int col_last = static_cast<int>(
              ceil(static_cast<Dtype>(pw + 1) * cols_per_bin));
          const int col_begin = min(max(col_first + roi_left, 0), width_);
          const int col_end = min(max(col_last + roi_left, 0), width_);

          const int out_idx = ph * pooled_width_ + pw;
          Dtype& best_value = pooled[out_idx];
          int& best_index = argmax[out_idx];

          // An empty bin yields 0 and no argmax; nothing to scan.
          if (row_end <= row_begin || col_end <= col_begin) {
            best_value = 0;
            best_index = -1;
            continue;
          }

          // Scan the bin for its largest value and remember where it was.
          for (int h = row_begin; h < row_end; ++h) {
            for (int w = col_begin; w < col_end; ++w) {
              const int in_idx = h * width_ + w;
              if (channel_in[in_idx] > best_value) {
                best_value = channel_in[in_idx];
                best_index = in_idx;
              }
            }
          }
        }
      }
      // Channels are pooled independently: step the input, output and
      // argmax pointers forward by one channel plane each.
      channel_in += bottom[0]->offset(0, 1);
      pooled += top[0]->offset(0, 1);
      argmax += max_idx_.offset(0, 1);
    }
  }
}