RoI Pooling 层
caffe prototxt 定义:
# ROI pooling layer definition: consumes a feature map plus a list of ROIs
# and emits one fixed-size pooled map per ROI.
layer {
name: "roi_pool5"
type: "ROIPooling"
# First input: the conv5 feature map.
bottom: "conv5"
# Second input: the ROI blob, five values per ROI.
bottom: "rois"
top: "pool5"
roi_pooling_param {
# Width and height of the pooled output for every ROI.
pooled_w: 6
pooled_h: 6
# Factor applied to ROI coordinates before pooling.
spatial_scale: 0.0625 # 1/16
}
}
caffe caffe.proto ROI Pooling 层参数说明:
optional ROIPoolingParameter roi_pooling_param = 43;
// Parameters of the ROI pooling layer.
message ROIPoolingParameter {
// NOTE(review): the two lines below mention pad, kernel size and stride, none
// of which are fields of this message; they look carried over from another
// parameter message -- confirm before relying on them.
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs.
optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
// Multiplicative spatial scale factor to translate ROI coords from their
// input scale to the scale used when pooling
optional float spatial_scale = 3 [default = 1];
}
根据 prototxt 定义可以看出,roi_pool5 的输入有两个:bottom[0] 是 conv5 卷积层输出的 feature map,由于前面经过了 pool 层的下采样,conv5 的 feature map 的 height 和 width 分别是原图尺寸的 1/16;bottom[1] 是 rois blob,其类似于一个 num_rois×5 的二维矩阵,行数 num_rois 为 bottom[1]->num(),列数为 5,其定义为:
# Network input holding the ROIs: one row of five values per ROI.
input: "rois"
input_shape {
dim: 1 # to be changed on-the-fly to num ROIs
dim: 5 # [batch_index, x1, y1, x2, y2] zero-based indexing
}
batch_index 为该 RoI 所属图像在 bottom[0] 第一个维度 (num) 上的下标,用于计算在 bottom[0] 中的偏移量;[x1, y1, x2, y2] 是 RoI 在输入尺度下的左上角和右下角坐标,需要乘以 spatial_scale 才能映射到 feature map 上.
在 feature map 中,RoI Pooling层首先计算定义的 rois 在 conv feature map 上所映射的两个坐标 —— (x1 * spatial_scale, y1 * spatial_scale),(x2 * spatial_scale, y2 * spatial_scale),对应的点为 (top-left, bottom-right),即在 feature map 中确定一个区域.
对于确定的一个区域,进行 pooled_h * pooled_w (这里是 6*6) 等分,划分为 36 个子区域,每个子区域的大小为 bin_h = roi_h / pooled_h, bin_w = roi_w / pooled_w (实现中子区域的起始位置向下取整、结束位置向上取整,因此相邻子区域可能有重叠);
对于每个子区域,采用 max 操作找出对应 feature map 的最大值,即为输出 top blob 的对应值.
对于 bottom[0] 的每个 channel 进行相同操作.
roi_pool5 有一个输出 top[0],其尺寸为 (bottom[1]->num(), bottom[0]->channels(), pooled_h, pooled_w),其中,pooled_h 和 pooled_w 是固定定义的,其值这里为 6.
根据其源码 roi_pooling_layer.cpp 分析,
RoI Pooling 层 LayerSetUp —— 参数读取
// Loads the layer's ROI pooling settings from its roi_pooling_param: the
// pooled output height and width (both checked to be greater than zero) and
// the spatial scale, which is also written to the INFO log.
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const ROIPoolingParameter& roi_pool_param =
      this->layer_param_.roi_pooling_param();

  // Validate each output dimension, then store it.
  CHECK_GT(roi_pool_param.pooled_h(), 0) << "pooled_h must be > 0";
  pooled_height_ = roi_pool_param.pooled_h();

  CHECK_GT(roi_pool_param.pooled_w(), 0) << "pooled_w must be > 0";
  pooled_width_ = roi_pool_param.pooled_w();

  spatial_scale_ = roi_pool_param.spatial_scale();
  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}
RoI Pooling 层 Reshape
// Caches the feature map's channel count, height and width, then sizes both
// the output blob and the argmax buffer to
// (number of ROIs, channels, pooled height, pooled width).
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Blob<Dtype>* feature_map = bottom[0];
  channels_ = feature_map->channels();
  height_ = feature_map->height();
  width_ = feature_map->width();

  // One pooled map per ROI; max_idx_ mirrors the output shape.
  const Blob<Dtype>* rois = bottom[1];
  top[0]->Reshape(rois->num(), channels_, pooled_height_, pooled_width_);
  max_idx_.Reshape(rois->num(), channels_, pooled_height_, pooled_width_);
}
RoI Pooling 层的输出Blob - top[0] 的channels 与 bottom[0] (即 conv5) 相同; top[0] 的 num 与 RoI 的 num 相同,将 RoI 对应在 conv5 的 feature map 进行操作.
RoI Pooling 层 Forward_cpu 实现
// CPU forward pass: max-pools every ROI over the input feature map.
//
// bottom[0] is the feature map; bottom[1] holds the ROIs, five values each,
// laid out as [batch_index, x1, y1, x2, y2]. For every ROI and every channel
// the scaled ROI rectangle is split into pooled_height_ x pooled_width_ bins.
// The maximum of each bin is written to top[0], and the flat position
// (h * width_ + w) of that maximum is stored in max_idx_. A bin that covers
// no feature-map cell yields 0 with an argmax of -1.
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* feature_maps = bottom[0]->cpu_data();
  const Dtype* roi = bottom[1]->cpu_data();
  const int roi_count = bottom[1]->num();
  const int batch_size = bottom[0]->num();
  const int output_count = top[0]->count();

  // Seed every output with -FLT_MAX and every argmax with -1 so the first
  // feature-map value greater than -FLT_MAX replaces the seed.
  Dtype* pooled = top[0]->mutable_cpu_data();
  caffe_set(output_count, Dtype(-FLT_MAX), pooled);
  int* winners = max_idx_.mutable_cpu_data();
  caffe_set(output_count, -1, winners);

  for (int r = 0; r < roi_count; ++r) {
    // ROI corners after scaling by spatial_scale_ and rounding.
    const int roi_batch_ind = roi[0];
    const int left = round(roi[1] * spatial_scale_);
    const int upper = round(roi[2] * spatial_scale_);
    const int right = round(roi[3] * spatial_scale_);
    const int lower = round(roi[4] * spatial_scale_);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);

    // The ROI extent is clamped to at least 1 x 1.
    const int roi_height = max(lower - upper + 1, 1);
    const int roi_width = max(right - left + 1, 1);

    // Size of one bin, in feature-map cells per pooled output cell.
    const Dtype bin_h =
        static_cast<Dtype>(roi_height) / static_cast<Dtype>(pooled_height_);
    const Dtype bin_w =
        static_cast<Dtype>(roi_width) / static_cast<Dtype>(pooled_width_);

    // First channel plane of the image this ROI belongs to.
    const Dtype* plane = feature_maps + bottom[0]->offset(roi_batch_ind);

    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        // Rows of this bin: start (included) = floor(ph * bin_h),
        // end (excluded) = ceil((ph + 1) * bin_h), shifted to the ROI origin
        // and clamped to [0, height_].
        const int hstart = min(max(
            static_cast<int>(floor(static_cast<Dtype>(ph) * bin_h)) + upper,
            0), height_);
        const int hend = min(max(
            static_cast<int>(ceil(static_cast<Dtype>(ph + 1) * bin_h)) + upper,
            0), height_);

        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Columns of this bin, computed the same way and clamped to
          // [0, width_].
          const int wstart = min(max(
              static_cast<int>(floor(static_cast<Dtype>(pw) * bin_w)) + left,
              0), width_);
          const int wend = min(max(
              static_cast<int>(ceil(static_cast<Dtype>(pw + 1) * bin_w)) + left,
              0), width_);

          const int out = ph * pooled_width_ + pw;

          // Empty bin: emit 0 and mark that there is no argmax.
          if (hend <= hstart || wend <= wstart) {
            pooled[out] = 0;
            winners[out] = -1;
            continue;
          }

          // Scan the bin for its maximum and remember where it was found.
          for (int h = hstart; h < hend; ++h) {
            const int row_base = h * width_;
            for (int w = wstart; w < wend; ++w) {
              const int cell = row_base + w;
              const Dtype value = plane[cell];
              if (value > pooled[out]) {
                pooled[out] = value;
                winners[out] = cell;
              }
            }
          }
        }
      }
      // Channels are pooled independently: step all pointers one channel on.
      plane += bottom[0]->offset(0, 1);
      pooled += top[0]->offset(0, 1);
      winners += max_idx_.offset(0, 1);
    }
    // Advance to the next ROI record in bottom[1].
    roi += bottom[1]->offset(1);
  }
}