A Brief Analysis of the Faster R-CNN roi_pooling_layer

After reading through the R-CNN series of papers, I started on the source code. This post briefly records my understanding of the roi_pooling_layer source.
The author first adds the layer's parameter definition to caffe.proto; there are three fields in total:
optional ROIPoolingParameter roi_pooling_param = 43;

message ROIPoolingParameter {
  // Pad, kernel size, and stride are all given as a single value for equal
  // dimensions in height and width or as Y, X pairs.
  optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
  optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
  // Multiplicative spatial scale factor to translate ROI coords from their
  // input scale to the scale used when pooling
  optional float spatial_scale = 3 [default = 1];
}
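For reference, a typical prototxt declaration of the layer then looks like the sketch below. It follows the style of py-faster-rcnn's VGG16 model, where conv5_3 has an accumulated stride of 16, hence spatial_scale = 1/16; the blob names come from that model, not from this layer's source:

layer {
  name: "roi_pool5"
  type: "ROIPooling"
  bottom: "conv5_3"
  bottom: "rois"
  top: "pool5"
  roi_pooling_param {
    pooled_w: 7
    pooled_h: 7
    spatial_scale: 0.0625 # 1/16
  }
}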
LayerSetUp
template <typename Dtype>
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  ROIPoolingParameter roi_pool_param = this->layer_param_.roi_pooling_param();
  CHECK_GT(roi_pool_param.pooled_h(), 0)
      << "pooled_h must be > 0";
  CHECK_GT(roi_pool_param.pooled_w(), 0)
      << "pooled_w must be > 0";
  pooled_height_ = roi_pool_param.pooled_h();
  pooled_width_ = roi_pool_param.pooled_w();
  spatial_scale_ = roi_pool_param.spatial_scale();
  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}
This simply reads, validates, and stores the layer parameters.
Reshape
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  // top[0] has the same number of channels as bottom[0]; after all, this is
  // just a pooling operation.
  // top[0]'s num equals the number of ROIs: each ROI is mapped onto conv_5
  // independently.
  top[0]->Reshape(bottom[1]->num(), channels_, pooled_height_,  // num of ROIs
      pooled_width_);
  max_idx_.Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
}
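For example, with 128 ROIs and a 512-channel conv_5 pooled to 7×7, both top[0] and max_idx_ are reshaped to 128 × 512 × 7 × 7. max_idx_ records, for every output element, the index of the input element that won the max; the backward pass needs exactly this information.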
Forward_cpu
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  // bottom_data is the full image forward-propagated through the conv
  // layers, i.e. conv_5
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  // bottom_rois holds the ROI information: a batch_index plus the
  // coordinates of two corner points per ROI
  // Number of ROIs
  int num_rois = bottom[1]->num();    // number of ROIs
  int batch_size = bottom[0]->num();  // number of conv_5 maps in the batch
  int top_count = top[0]->count();
  Dtype* top_data = top[0]->mutable_cpu_data();
  caffe_set(top_count, Dtype(-FLT_MAX), top_data);
  // fill all of top_data with the minimum value (count, value, destination)
  int* argmax_data = max_idx_.mutable_cpu_data();
  caffe_set(top_count, -1, argmax_data);

  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
  for (int n = 0; n < num_rois; ++n) {
    int roi_batch_ind = bottom_rois[0];  // batch index
    int roi_start_w = round(bottom_rois[1] * spatial_scale_);
    int roi_start_h = round(bottom_rois[2] * spatial_scale_);
    int roi_end_w = round(bottom_rois[3] * spatial_scale_);
    int roi_end_h = round(bottom_rois[4] * spatial_scale_);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);

    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
        / static_cast<Dtype>(pooled_height_);
    // ratio of the ROI size to the pooled output size; i.e. how many ROI
    // pixels one pooled pixel stands for
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
        / static_cast<Dtype>(pooled_width_);

    const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);
    // locate the conv_5 data belonging to this ROI's image

    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Compute pooling region for this output unit:
          //  start (included) = floor(ph * roi_height / pooled_height_)
          //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
          int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
              * bin_size_h));
          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
              * bin_size_w));
          int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
              * bin_size_h));
          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
              * bin_size_w));

          hstart = min(max(hstart + roi_start_h, 0), height_);  // height_ is the conv_5 height
          hend = min(max(hend + roi_start_h, 0), height_);
          wstart = min(max(wstart + roi_start_w, 0), width_);
          wend = min(max(wend + roi_start_w, 0), width_);
          // why add roi_start_h? because the ROI is taken from the source
          // image, so its top-left corner does not start at (0, 0)
          bool is_empty = (hend <= hstart) || (wend <= wstart);

          const int pool_index = ph * pooled_width_ + pw;
          if (is_empty) {
            top_data[pool_index] = 0;
            argmax_data[pool_index] = -1;
          }

          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = h * width_ + w;
              if (batch_data[index] > top_data[pool_index]) {
                top_data[pool_index] = batch_data[index];  // the conv_5 pixel is carried into the output
                argmax_data[pool_index] = index;
              }
            }
          }
        }
      }
      // Increment all data pointers by one channel
      batch_data += bottom[0]->offset(0, 1);
      top_data += top[0]->offset(0, 1);
      argmax_data += max_idx_.offset(0, 1);
    }
    // Increment ROI data pointer
    bottom_rois += bottom[1]->offset(1);
  }
}

template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  NOT_IMPLEMENTED;
}

In summary, the layer works as follows:
First, the ROI coordinates are mapped onto the feature map, i.e. the original coordinates are multiplied by spatial_scale (which equals one over the product of all the strides). Then each output unit is computed: every output point represents a region of the ROI, of size bin_h = roi_height / pooled_height and bin_w = roi_width / pooled_width. For every top point, we traverse the feature-map region it maps back to, take the maximum, and record the position of that maximum. A standalone sketch of this forward pass is given below.
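To make the procedure concrete, here is a minimal self-contained sketch of the same forward logic for a single ROI on a single-channel feature map. The function name roi_pool_single, the flat std::vector layout, and the float types are my own illustration, not the Caffe source:

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

// Max-pool one ROI over a single-channel feature map stored row-major in
// `feat` (height x width). (x1, y1, x2, y2) are inclusive corner points in
// input-image coordinates.
void roi_pool_single(const std::vector<float>& feat, int height, int width,
                     float x1, float y1, float x2, float y2,
                     float spatial_scale, int pooled_h, int pooled_w,
                     std::vector<float>* top, std::vector<int>* argmax) {
  // Map the ROI from input-image coordinates onto the feature map.
  const int roi_start_w = static_cast<int>(std::round(x1 * spatial_scale));
  const int roi_start_h = static_cast<int>(std::round(y1 * spatial_scale));
  const int roi_end_w = static_cast<int>(std::round(x2 * spatial_scale));
  const int roi_end_h = static_cast<int>(std::round(y2 * spatial_scale));
  // Force malformed ROIs to be at least 1x1.
  const int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
  const int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
  const float bin_size_h = static_cast<float>(roi_height) / pooled_h;
  const float bin_size_w = static_cast<float>(roi_width) / pooled_w;

  top->assign(pooled_h * pooled_w, -FLT_MAX);
  argmax->assign(pooled_h * pooled_w, -1);

  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      // Bin (ph, pw) covers [hstart, hend) x [wstart, wend), clamped to
      // the feature map.
      int hstart = std::min(std::max(roi_start_h +
          static_cast<int>(std::floor(ph * bin_size_h)), 0), height);
      int hend = std::min(std::max(roi_start_h +
          static_cast<int>(std::ceil((ph + 1) * bin_size_h)), 0), height);
      int wstart = std::min(std::max(roi_start_w +
          static_cast<int>(std::floor(pw * bin_size_w)), 0), width);
      int wend = std::min(std::max(roi_start_w +
          static_cast<int>(std::ceil((pw + 1) * bin_size_w)), 0), width);
      const int pool_index = ph * pooled_w + pw;
      if (hend <= hstart || wend <= wstart) {  // empty bin after clamping
        (*top)[pool_index] = 0;
        continue;  // argmax stays -1
      }
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
          const int index = h * width + w;
          if (feat[index] > (*top)[pool_index]) {
            (*top)[pool_index] = feat[index];  // running max for this bin
            (*argmax)[pool_index] = index;     // remember where it came from
          }
        }
      }
    }
  }
}

With a 7×7 output this reproduces what the Caffe layer does per channel; the real layer just repeats it over all channels and all ROIs, with max_idx_ playing the role of argmax.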

Backward_gpu

The author implements the backward pass on the GPU:

template <typename Dtype>
__global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff,
    const int* argmax_data, const int num_rois, const Dtype spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, Dtype* bottom_diff,
    const Dtype* bottom_rois) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // (n, c, h, w) coords in bottom data
    // i.e. iterate over bottom[0], the conv_5 feature map
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;
    Dtype gradient = 0;
    // Accumulate gradient over all ROIs that pooled this element
    for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
      const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
      int roi_batch_ind = offset_bottom_rois[0];
      // Skip if ROI's batch index doesn't match n
      // (the ROI must belong to the same conv_5 map as this element)
      if (n != roi_batch_ind) {
        continue;
      }

      int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);  // scale the ROI onto the feature map
      int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
      int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
      int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

      // Skip if ROI doesn't include (h, w)
      const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                           h >= roi_start_h && h <= roi_end_h);
      if (!in_roi) {
        continue;
      }

      int offset = (roi_n * channels + c) * pooled_height * pooled_width;
      // offset of this ROI's pooled output relative to the start of top
      const Dtype* offset_top_diff = top_diff + offset;  // address of this ROI's top diff
      const int* offset_argmax_data = argmax_data + offset;
      // addresses of the conv_5 positions each pooled value was taken from

      // Compute feasible set of pooled units that could have pooled
      // this bottom unit

      // Force malformed ROIs to be 1x1
      int roi_width = max(roi_end_w - roi_start_w + 1, 1);
      int roi_height = max(roi_end_h - roi_start_h + 1, 1);

      Dtype bin_size_h = static_cast<Dtype>(roi_height)
                         / static_cast<Dtype>(pooled_height);
      Dtype bin_size_w = static_cast<Dtype>(roi_width)
                         / static_cast<Dtype>(pooled_width);

      int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
      int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
      int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
      int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);

      phstart = min(max(phstart, 0), pooled_height);
      phend = min(max(phend, 0), pooled_height);
      pwstart = min(max(pwstart, 0), pooled_width);
      pwend = min(max(pwend, 0), pooled_width);

      for (int ph = phstart; ph < phend; ++ph) {
        for (int pw = pwstart; pw < pwend; ++pw) {
          if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) {
            // this conv_5 location may contribute to several pooled
            // outputs; accumulate
            gradient += offset_top_diff[ph * pooled_width + pw];
          }
        }
      }
    }
    bottom_diff[index] = gradient;
  }
}
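A note on phstart/phend: the forward pass sends pooled bin ph to feature-map rows [roi_start_h + floor(ph * bin_size_h), roi_start_h + ceil((ph + 1) * bin_size_h)). Inverting this, a feature-map row h can only have been pooled by bins with ph in the range [floor((h - roi_start_h) / bin_size_h), ceil((h - roi_start_h + 1) / bin_size_h)), which is exactly the feasible range the kernel scans (likewise for pw); the argmax comparison then picks out the bins where this element actually was the maximum.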
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  const Dtype* bottom_rois = bottom[1]->gpu_data();
  const Dtype* top_diff = top[0]->gpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
  const int count = bottom[0]->count();
  caffe_gpu_set(count, Dtype(0.), bottom_diff);
  const int* argmax_data = max_idx_.gpu_data();
  // NOLINT_NEXT_LINE(whitespace/operators)
  ROIPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, top_diff, argmax_data, top[0]->num(), spatial_scale_, channels_,
      height_, width_, pooled_height_, pooled_width_, bottom_diff, bottom_rois);
  CUDA_POST_KERNEL_CHECK;
}
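Note the parallelization choice: the kernel is launched with one thread per element of bottom[0] (count threads in total), and each thread accumulates the complete gradient for its own (n, c, h, w) location before writing bottom_diff[index] once. Looping "from the bottom side" like this avoids the atomic additions that a per-ROI parallelization would need wherever ROIs overlap.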

To sum up:
The kernel iterates over the feature map, recording n, c, h, w in preparation for writing bottom_diff, and then computes each ROI's coordinates mapped onto the feature map. At first I thought there was a small problem here, but the author's intent is simply that if (h, w) is not inside the ROI, we can continue immediately. This is not hard to understand: a point inside an ROI may contribute to that ROI's top output (by being the maximum in some bin), while a point outside the region certainly contributes nothing to it. And since one point may contribute to several regions, when the loss flows back, the gradients arriving at the same point are accumulated.
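For example, if the feature-map point (h, w) was the argmax of bin (2, 3) of ROI 0 and also of bin (0, 0) of ROI 5, then bottom_diff at (h, w) receives the sum of the two corresponding top_diff entries.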

Reference blogs

http://blog.csdn.net/xyy19920105/article/details/50420779
http://blog.csdn.net/iamzhangzhuping/article/details/51500162
