先来说一下 ROI pooling 在做什么。对于图中猫或者狗所在的区域,我们希望能够提取该区域的特征;但是这两块区域大小不一样,而我们又希望得到长度相同的特征,这样方便后面接全连接层(全连接层要求输入的大小固定)。于是就有了这样一个需求:对于大小不同的区域(我们称之为 rois),得到大小相同的特征。
做法:将 roi 区域划分成 N*N(一般为 7*7)个方格,在每个方格内取最大值,用它来代表该方格的特征;这样无论 roi 多大,最终都得到 N*N 个特征值。
于是就有了这样一个接口
// Interface of ROI pooling for a single region.
//
// This is a declaration only: the previous comment-only body let a non-void
// function run off its end without returning a value (undefined behaviour),
// and a definition with the same signature appears further down in this file.
//
// rois : coordinates of the region of interest inside `imgs`.
// imgs : the image / feature map to pool from.
// Returns the fixed-size pooled block for the region (7x7 bins in this
// walkthrough).
// NOTE(review): the original note describes the result as a 7*7*3 block, but
// cv::Mat2d stores two double channels per element -- confirm the intended
// channel count.
cv::Mat2d roi_pooling(vector<int> &rois,cv::Mat2d &imgs);
但是实际中我们是在特征图上做 ROI pooling,并且每次传入一个 batch 的 rois,即一个形状为 B*5 的 tensor,每一行为 (batch_id, x1, y1, x2, y2)。
这样我们便可以得到 roi 区域的特征图(下面的简化示例只处理单个 roi,代码中 rois 按 (x1, x2, y1, y2) 的顺序读取):
// ROI max pooling of one region on the CPU (reference implementation).
//
// rois : the region as {x1, x2, y1, y2} in pixel coordinates of `imgs`, where
//        x is the column and y is the row. The region spans [x1, x2) x [y1, y2).
//        At least four entries are required (checked with CV_Assert).
// imgs : feature map to pool from. cv::Mat2d stores two double channels per
//        pixel; each channel is pooled independently.
// Returns a 7x7 map in which element (row j, column i) is the per-channel
// maximum over the bin covering
//   x in [floor(x1 + i*w_bin), ceil(x1 + (i+1)*w_bin))
//   y in [floor(y1 + j*h_bin), ceil(y1 + (j+1)*h_bin))
// clipped to the image. A bin that covers no pixel is left at zero.
cv::Mat2d roi_pooling(vector<int> &rois,cv::Mat2d &imgs){
    // The four ROI coordinates are read unconditionally below.
    CV_Assert(rois.size() >= 4u);

    const int kBins = 7;  // the output is kBins x kBins
    const float w_bin = static_cast<float>(rois[1] - rois[0]) / kBins;
    const float h_bin = static_cast<float>(rois[3] - rois[2]) / kBins;

    // Zero-filled so that empty bins have a well-defined value.
    cv::Mat2d feature(kBins, kBins, cv::Vec2d(0.0, 0.0));

    for (int i = 0; i < kBins; ++i) {
        // Column range of bin i, clipped so we never read outside the image.
        const int x_begin = std::max(
            static_cast<int>(std::floor(rois[0] + i * w_bin)), 0);
        const int x_end = std::min(
            static_cast<int>(std::ceil(rois[0] + (i + 1) * w_bin)), imgs.cols);

        for (int j = 0; j < kBins; ++j) {
            // Row range of bin j, clipped the same way.
            const int y_begin = std::max(
                static_cast<int>(std::floor(rois[2] + j * h_bin)), 0);
            const int y_end = std::min(
                static_cast<int>(std::ceil(rois[2] + (j + 1) * h_bin)), imgs.rows);

            if (x_begin >= x_end || y_begin >= y_end) {
                continue;  // empty bin: keep the zero already stored
            }

            // Seed with a real sample instead of 0 so that all-negative
            // activations still yield their true maximum.
            cv::Vec2d best = imgs(y_begin, x_begin);
            for (int y = y_begin; y < y_end; ++y) {
                for (int x = x_begin; x < x_end; ++x) {
                    const cv::Vec2d &px = imgs(y, x);  // Mat_ access is (row, col)
                    best[0] = std::max(best[0], px[0]);
                    best[1] = std::max(best[1], px[1]);
                }
            }
            feature(j, i) = best;
        }
    }
    return feature;
}
上述过程我们便完成了特征图 roi pooling 的基本实现。但是在实际过程中,处理会略有不同,主要体现在三点:
1. 特征不再是 1 维(单通道),而是 C 维(C 个通道),这样外层循环需要多套一层。
2. 使用 CUDA 加速:外面的三层循环(即上述代码中 7*7 的两层循环,以及第 1 点提到的 C 维度)都分配给不同的 CUDA 线程执行。
3. 不再是对 Mat 操作,而是直接对裸数据(一维数组)操作,需要自己计算数据偏移。
下面是对应的 CUDA forward 代码:
// Forward pass of ROI max pooling.
//
// The `${...}` tokens are placeholders that are expected to be substituted
// with concrete types / constants before this text is compiled.
//
// bottom_data : input feature maps, indexed as
//               ((batch * channels + c) * height + h) * width + w.
// bottom_rois : 5 values per ROI: [batch index, x1, y1, x2, y2]; the
//               coordinates are multiplied by spatial_scale and rounded.
// top_data    : pooled output, one value per flat output index, where the
//               index decomposes as (roi, channel, pooled row, pooled col).
// argmax_data : for each output, h * width + w of the selected input element
//               inside its channel plane, or -1 if the bin was empty.
//
// NOTE(review): CUDA_KERNEL_LOOP is defined outside this snippet; presumably
// it iterates the named variable over the output indices below nthreads that
// are assigned to the current thread -- confirm against its definition.
extern "C"
__global__ void roi_pooling2d_forward_kernel(
    const ${Dtype}* bottom_data, const ${Dtype}* bottom_rois,
    ${Dtype}* top_data, ${Dtype_ind}* argmax_data) {
  CUDA_KERNEL_LOOP(out_idx, ${nthreads}) {
    // Split the flat output index into (roi, channel, pooled row, pooled col).
    const int out_col = out_idx % ${pooled_width};
    const int out_row = (out_idx / ${pooled_width}) % ${pooled_height};
    const int channel = (out_idx / ${pooled_width} / ${pooled_height}) % ${channels};
    const int roi_idx = out_idx / ${pooled_width} / ${pooled_height} / ${channels};

    // The five values describing this ROI.
    const ${Dtype}* roi = bottom_rois + roi_idx * 5;
    const int batch = roi[0];
    const int x_first = round(roi[1] * ${spatial_scale});
    const int y_first = round(roi[2] * ${spatial_scale});
    const int x_last = round(roi[3] * ${spatial_scale});
    const int y_last = round(roi[4] * ${spatial_scale});

    // Both ends are inclusive; a malformed ROI is forced to be 1x1.
    const int roi_w = max(x_last - x_first + 1, 1);
    const int roi_h = max(y_last - y_first + 1, 1);

    // Fractional size of one pooling bin.
    const float bin_h = static_cast<float>(roi_h)
                        / static_cast<float>(${pooled_height});
    const float bin_w = static_cast<float>(roi_w)
                        / static_cast<float>(${pooled_width});

    // Bin extent: floor / ceil relative to the ROI origin, then shifted by the
    // ROI offset and clipped to the input boundaries.
    const int y_begin = min(max(
        static_cast<int>(floor(static_cast<float>(out_row) * bin_h)) + y_first,
        0), ${height});
    const int y_end = min(max(
        static_cast<int>(ceil(static_cast<float>(out_row + 1) * bin_h)) + y_first,
        0), ${height});
    const int x_begin = min(max(
        static_cast<int>(floor(static_cast<float>(out_col) * bin_w)) + x_first,
        0), ${width});
    const int x_end = min(max(
        static_cast<int>(ceil(static_cast<float>(out_col + 1) * bin_w)) + x_first,
        0), ${width});

    // An empty bin outputs zero; otherwise start from a very small value so
    // that any input element replaces it.
    const bool has_pixels = (y_end > y_begin) && (x_end > x_begin);
    float best = has_pixels ? -1E+37 : 0;
    // -1 means "no element selected" (per the original note, nothing is then
    // back-propagated for this output).
    int best_pos = -1;

    // Start of the (batch, channel) plane inside bottom_data.
    const int plane_offset = (batch * ${channels} + channel) * ${height} * ${width};
    for (int y = y_begin; y < y_end; ++y) {
      const int row_base = y * ${width};
      for (int x = x_begin; x < x_end; ++x) {
        const int pos = row_base + x;
        const ${Dtype} value = bottom_data[plane_offset + pos];
        if (value > best) {
          best = value;
          best_pos = pos;
        }
      }
    }

    top_data[out_idx] = best;
    argmax_data[out_idx] = best_pos;
  }
}