Fast R-CNN中提出了Roi pooling层的概念
对于 ROI Pooling 层的介绍就不多说了,网上已经有很多介绍,其作用如下:
在目标检测算法中,region proposal 产生的 ROI 大小不一,而分类网络要求固定尺寸的输入,所以 ROI Pooling 起到一个连接作用,使网络可以端到端(end-to-end)训练。
下图为一个特征图,黑色框为产生的 ROI 区域,需要把该区域通过 ROI Pooling 操作输出为 2x2 大小的维度。
对每个分割出的区域(bin)做 max pooling 操作,即得到对应的输出值。
前向传播的代码
// CPU forward pass: max-pools every ROI into a pooled_height_ x pooled_width_
// grid for each channel.
//
// bottom[0] supplies the feature maps and bottom[1] the ROIs; each ROI is read
// as five values [batch_index x1 y1 x2 y2]. The coordinates are multiplied by
// spatial_scale_ and rounded to obtain feature-map positions. top[0] receives
// the pooled maxima, and max_idx_ receives the within-channel offset
// (h * width_ + w) of each maximum, or -1 for a bin that covers no cell.
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* feature_maps = bottom[0]->cpu_data();
  const Dtype* roi = bottom[1]->cpu_data();
  const int roi_count = bottom[1]->num();
  const int image_count = bottom[0]->num();
  const int output_count = top[0]->count();

  // Seed the outputs with the lowest value and "no argmax" (-1); a bin's
  // value is replaced by the first strictly larger sample found in it.
  Dtype* pooled = top[0]->mutable_cpu_data();
  caffe_set(output_count, Dtype(-FLT_MAX), pooled);
  int* winners = max_idx_.mutable_cpu_data();
  caffe_set(output_count, -1, winners);

  for (int roi_idx = 0; roi_idx < roi_count; ++roi_idx) {
    // First value selects the image inside the batch; the other four are the
    // ROI corners, scaled onto the feature map.
    const int image_idx = static_cast<int>(roi[0]);
    const int roi_x0 = static_cast<int>(round(roi[1] * spatial_scale_));
    const int roi_y0 = static_cast<int>(round(roi[2] * spatial_scale_));
    const int roi_x1 = static_cast<int>(round(roi[3] * spatial_scale_));
    const int roi_y1 = static_cast<int>(round(roi[4] * spatial_scale_));
    CHECK_GE(image_idx, 0);
    CHECK_LT(image_idx, image_count);

    // ROI extent in feature-map cells; degenerate ROIs are forced to 1x1.
    const int span_h = max(roi_y1 - roi_y0 + 1, 1);
    const int span_w = max(roi_x1 - roi_x0 + 1, 1);

    // Fractional size of one output bin, in feature-map cells.
    const Dtype bin_h = static_cast<Dtype>(span_h)
        / static_cast<Dtype>(pooled_height_);
    const Dtype bin_w = static_cast<Dtype>(span_w)
        / static_cast<Dtype>(pooled_width_);

    // Start of the first channel of the image this ROI belongs to.
    const Dtype* channel_data = feature_maps + bottom[0]->offset(image_idx);

    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        // Rows covered by bin row ph: start is floor(ph * bin_h) (included),
        // end is ceil((ph + 1) * bin_h) (excluded), shifted by the ROI origin
        // and clamped to [0, height_].
        int row_lo = static_cast<int>(floor(static_cast<Dtype>(ph) * bin_h));
        int row_hi = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
            * bin_h));
        row_lo = min(max(row_lo + roi_y0, 0), height_);
        row_hi = min(max(row_hi + roi_y0, 0), height_);

        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Same computation for the columns covered by bin column pw.
          int col_lo = static_cast<int>(floor(static_cast<Dtype>(pw)
              * bin_w));
          int col_hi = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
              * bin_w));
          col_lo = min(max(col_lo + roi_x0, 0), width_);
          col_hi = min(max(col_hi + roi_x0, 0), width_);

          const int out_idx = ph * pooled_width_ + pw;

          // A bin clamped down to nothing outputs 0 with no argmax.
          if (row_hi <= row_lo || col_hi <= col_lo) {
            pooled[out_idx] = 0;
            winners[out_idx] = -1;
            continue;
          }

          // Max over the bin; remember where the maximum came from.
          for (int h = row_lo; h < row_hi; ++h) {
            for (int w = col_lo; w < col_hi; ++w) {
              const int src = h * width_ + w;
              if (channel_data[src] > pooled[out_idx]) {
                pooled[out_idx] = channel_data[src];
                winners[out_idx] = src;
              }
            }
          }
        }
      }
      // Step input, output and argmax pointers to the next channel.
      channel_data += bottom[0]->offset(0, 1);
      pooled += top[0]->offset(0, 1);
      winners += max_idx_.offset(0, 1);
    }
    // Step to the next ROI record.
    roi += bottom[1]->offset(1);
  }
}
反向传播代码:源码中没有实现 CPU 版本的反向传播,只实现了 GPU 版本的反向传播。
// Backward kernel for ROI max pooling: computes the gradient of one bottom
// (feature-map) element for each index supplied by CUDA_KERNEL_LOOP.
//
// The flat index is decoded as (n, c, h, w) with w varying fastest. For every
// ROI whose batch index equals n and whose scaled rectangle contains (h, w),
// the kernel visits the pooled bins that could have covered (h, w) and adds
// the top_diff of each bin whose stored argmax equals h * width + w. The sum
// is written to bottom_diff[index].
//
// bottom_rois is read as num_rois groups of 5 values
// [batch_index x1 y1 x2 y2]; top_diff and argmax_data are indexed as
// ((roi * channels + c) * pooled_height + ph) * pooled_width + pw.
template <typename Dtype>
__global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff,
    const int* argmax_data, const int num_rois, const Dtype spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, Dtype* bottom_diff,
    const Dtype* bottom_rois) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Decode the flat index into (image, chan, row, col).
    const int col = index % width;
    const int row = (index / width) % height;
    const int chan = (index / width / height) % channels;
    const int image = index / width / height / channels;
    // Value the forward pass stores in argmax_data when this cell wins a bin.
    const int cell = row * width + col;

    Dtype grad_sum = 0;
    // Accumulate over every ROI that pooled this element.
    for (int r = 0; r < num_rois; ++r) {
      const Dtype* roi = bottom_rois + r * 5;

      // ROIs belonging to a different image contribute nothing.
      const int roi_image = roi[0];
      if (roi_image != image) {
        continue;
      }

      // ROI corners scaled onto the feature map.
      const int x0 = round(roi[1] * spatial_scale);
      const int y0 = round(roi[2] * spatial_scale);
      const int x1 = round(roi[3] * spatial_scale);
      const int y1 = round(roi[4] * spatial_scale);

      // ROIs that do not contain (row, col) contribute nothing.
      const bool outside = (col < x0 || col > x1 || row < y0 || row > y1);
      if (outside) {
        continue;
      }

      // Slice of top_diff / argmax_data belonging to (ROI r, channel chan).
      const int plane = (r * channels + chan) * pooled_height * pooled_width;
      const Dtype* roi_top_diff = top_diff + plane;
      const int* roi_argmax = argmax_data + plane;

      // Bin geometry, computed as in the forward pass; malformed ROIs are
      // forced to be 1x1.
      const int span_w = max(x1 - x0 + 1, 1);
      const int span_h = max(y1 - y0 + 1, 1);
      const Dtype bin_h = static_cast<Dtype>(span_h)
          / static_cast<Dtype>(pooled_height);
      const Dtype bin_w = static_cast<Dtype>(span_w)
          / static_cast<Dtype>(pooled_width);

      // Feasible range of pooled bins that could have covered this cell:
      // floor for the first candidate, ceil for one-past-the-last, then
      // clamped to the pooled grid.
      int ph_lo = floor(static_cast<Dtype>(row - y0) / bin_h);
      int ph_hi = ceil(static_cast<Dtype>(row - y0 + 1) / bin_h);
      int pw_lo = floor(static_cast<Dtype>(col - x0) / bin_w);
      int pw_hi = ceil(static_cast<Dtype>(col - x0 + 1) / bin_w);
      ph_lo = min(max(ph_lo, 0), pooled_height);
      ph_hi = min(max(ph_hi, 0), pooled_height);
      pw_lo = min(max(pw_lo, 0), pooled_width);
      pw_hi = min(max(pw_hi, 0), pooled_width);

      // Only bins whose recorded argmax is this cell pass gradient back.
      for (int ph = ph_lo; ph < ph_hi; ++ph) {
        for (int pw = pw_lo; pw < pw_hi; ++pw) {
          const int bin = ph * pooled_width + pw;
          if (roi_argmax[bin] != cell) {
            continue;
          }
          grad_sum += roi_top_diff[bin];
        }
      }
    }
    // Store the accumulated gradient for this bottom element.
    bottom_diff[index] = grad_sum;
  }
}
// GPU backward pass: propagates top[0]'s diff to bottom[0]'s diff.
//
// Returns immediately when propagate_down[0] is false. Otherwise it zeroes
// bottom[0]'s diff and launches ROIPoolBackward with one index per element of
// bottom[0], passing top[0]->num() as the number of ROIs and max_idx_ as the
// argmax table. Nothing is written to bottom[1]'s diff here.
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }

  const Dtype* rois = bottom[1]->gpu_data();
  const Dtype* out_grad = top[0]->gpu_diff();
  Dtype* in_grad = bottom[0]->mutable_gpu_diff();
  const int bottom_count = bottom[0]->count();

  // Clear the destination before the kernel fills it in.
  caffe_gpu_set(bottom_count, Dtype(0.), in_grad);
  const int* argmax = max_idx_.gpu_data();

  // NOLINT_NEXT_LINE(whitespace/operators)
  ROIPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(bottom_count),
                           CAFFE_CUDA_NUM_THREADS>>>(
      bottom_count, out_grad, argmax, top[0]->num(), spatial_scale_,
      channels_, height_, width_, pooled_height_, pooled_width_, in_grad,
      rois);
  CUDA_POST_KERNEL_CHECK;
}
INSTANTIATE_LAYER_GPU_FUNCS(ROIPoolingLayer);
} // namespace caffe