概要:因为某些原因要修改riroi_align的前向传播和反向传播,作者的代码是用c++写的,之前并未使用过c++为pytorch写拓展库,故研究一番。
先贴出文章和代码地址:
文章:https://openaccess.thecvf.com/content/CVPR2021/papers/Han_ReDet_A_Rotation-Equivariant_Detector_for_Aerial_Object_Detection_CVPR_2021_paper.pdf
代码:https://github.com/csuhan/ReDet
基础知识:RIRoI是在RRoI基础上改进的,而RRoI是在RoI基础上改进的,所以需要懂RoI+RRoI!
RIRoI Align对齐包括两部分,分别是:
(1)空间对齐
(2)方向对齐
空间对齐比较容易理解,方向对齐是该文的创新之处,也是难懂之处。
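不是很懂 r的表达式为什么是这样!(结合下面的代码 `ind_float = theta * nOrientation / (2 * PI)` 来理解:N 个方向通道把 2π 等分,每个通道对应 2π/N 的角度,所以 θ ÷ (2π/N) = θ·N/(2π) 表示 θ 落在第几个方向通道上;其整数部分是通道索引,小数部分是相邻两个通道之间的插值权重。)
根据论文,r应该是Cn的index,即选择哪个Cn。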
cu文件是针对GPU,由NVCC编译的。接下来看代码:
#include <ATen/ATen.h>
#include <THC/THCAtomics.cuh>//这两个模块是干啥的?
#include <math.h>
#define PI 3.141592653
// Loop header that lets one thread process several indices: each thread starts
// at its flattened 1-D global index (blockIdx.x * blockDim.x + threadIdx.x) and
// advances by the total number of launched threads (blockDim.x * gridDim.x)
// until i reaches n, so all n items are visited even when n is larger than the
// number of threads in the launch.
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
// Number of threads per block used by the kernel launches in this file.
#define THREADS_PER_BLOCK 1024
/* Returns the number of blocks to launch for N work items.
 *
 * The result is ceil(N / THREADS_PER_BLOCK), i.e. the smallest number of
 * THREADS_PER_BLOCK-sized blocks that covers N items. With
 * THREADS_PER_BLOCK = 1024:
 *   N = 1024 -> 1 block   (exact multiple)
 *   N = 1025 -> 2 blocks  (one extra item needs one extra block)
 *   N = 2047 -> 2 blocks
 * The result is capped at 65000 blocks; kernels that iterate with
 * CUDA_1D_KERNEL_LOOP still visit every item when the cap applies.
 *
 * At least one block is always returned: a launch with a zero-sized grid is
 * rejected as an invalid configuration, whereas a single block whose loop
 * body never runs is harmless.
 */
inline int GET_BLOCKS(const int N) {
  if (N <= 0) {
    return 1;
  }
  const int max_block_num = 65000;
  // Ceil-divide as quotient + (remainder != 0) so that no intermediate value
  // (such as N + THREADS_PER_BLOCK - 1) can overflow int for very large N.
  const int optimal_block_num =
      N / THREADS_PER_BLOCK + ((N % THREADS_PER_BLOCK) != 0 ? 1 : 0);
  return optimal_block_num < max_block_num ? optimal_block_num : max_block_num;
}
/**
 * Bilinearly samples one feature-map plane at the real-valued point (y, x).
 *
 * scalar_t is the template type parameter; the caller selects the element type.
 *
 * @param bottom_data Pointer to a single plane, read as a row-major array
 *                    indexed by y * width + x.
 * @param height      Number of rows in the plane.
 * @param width       Number of columns in the plane.
 * @param y           Row coordinate of the sample (by value; clamped locally).
 * @param x           Column coordinate of the sample (by value; clamped locally).
 * @return The interpolated value, or 0 when the point is below -1 or above
 *         height / width on the respective axis.
 */
template <typename scalar_t>
__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
                                         const int height, const int width,
                                         scalar_t y, scalar_t x) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    return 0;
  }

  // Points in [-1, 0] are snapped onto the first row / column.
  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  // On the last row / column both neighbours collapse onto the border cell so
  // that no read goes past the plane.
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (scalar_t)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (scalar_t)x_low;
  } else {
    x_high = x_low + 1;
  }

  // Fractional distance to the low corner (ly, lx) and its complement (hy, hx).
  scalar_t ly = y - y_low;
  scalar_t lx = x - x_low;
  scalar_t hy = 1. - ly;
  scalar_t hx = 1. - lx;

  // do bilinear interpolation
  scalar_t lt = bottom_data[y_low * width + x_low];
  scalar_t rt = bottom_data[y_low * width + x_high];
  scalar_t lb = bottom_data[y_high * width + x_low];
  scalar_t rb = bottom_data[y_high * width + x_high];
  scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);
  return val;
}
/**
 * RiRoI Align forward kernel.
 *
 * Each loop iteration produces one output element: the average, over a grid of
 * sample points inside one rotated RoI bin, of a linear blend of two adjacent
 * orientation planes of the input feature map.
 *
 * @param nthreads      Number of output elements to compute (the launcher in
 *                      this file passes num_rois * pooled_height *
 *                      pooled_width * channels * nOrientation).
 * @param bottom_data   Input features, indexed as
 *                      ((batch * channels + c) * nOrientation + o) * height * width.
 * @param bottom_rois   RoIs, 6 values each:
 *                      [batch_index, center_x, center_y, width, height, theta].
 * @param spatial_scale Factor applied to the RoI center and size before
 *                      sampling the feature map.
 * @param sample_num    Sample points per bin along each axis; when <= 0,
 *                      ceil(roi_size / pooled_size) is used instead.
 * @param channels      Number of channel groups (each holds nOrientation planes).
 * @param height        Feature-map height.
 * @param width         Feature-map width.
 * @param pooled_height Output bins along y.
 * @param pooled_width  Output bins along x.
 * @param nOrientation  Number of orientation planes per channel group.
 * @param top_data      Output, written at the flat index whose decomposition is
 *                      (n, c, o, ph, pw).
 */
template <typename scalar_t>
__global__ void RiROIAlignForward(const int nthreads, const scalar_t *bottom_data,
                                  const scalar_t *bottom_rois,
                                  const scalar_t spatial_scale,
                                  const int sample_num, const int channels,
                                  const int height, const int width,
                                  const int pooled_height, const int pooled_width,
                                  const int nOrientation,
                                  scalar_t *top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat output index into (n, c, o, ph, pw):
    //   pw: bin index along x      ph: bin index along y
    //   o : orientation index      c : channel-group index
    //   n : RoI index
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int o = (index / pooled_width / pooled_height) % nOrientation;
    int c = (index / pooled_width / pooled_height / nOrientation) % channels;
    int n = index / pooled_width / pooled_height / nOrientation / channels;

    // Each RoI occupies 6 consecutive values; point at the n-th RoI so that it
    // can be indexed from 0: [batch_index, center_x, center_y, width, height, theta].
    const scalar_t* offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not using rounding; this implementation detail is critical
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    // theta is handed unchanged to cos()/sin() and divided by 2 * PI below,
    // i.e. it is used as an angle in radians.
    scalar_t theta = offset_bottom_rois[5];

    // Force malformed ROIs to be 1x1
    roi_width = max(roi_width, (scalar_t)1.);
    roi_height = max(roi_height, (scalar_t)1.);
    // Extent of the RoI that maps onto a single output bin.
    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) / static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w = static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    // find aligned index
    // theta expressed in units of one orientation step (2 * PI / nOrientation).
    // NOTE(review): the original annotation relates this to "r" and the
    // interpolation weight of Eq. 9 in the ReDet paper - confirm against the paper.
    scalar_t ind_float = theta * nOrientation / (2 * PI);
    int ind = floor(ind_float);
    // Fractional part weights the next plane, its complement the current one.
    scalar_t l_var = ind_float - (scalar_t)ind;
    scalar_t r_var = 1.0 - l_var;
    // correct start channel (adding nOrientation first keeps the result
    // non-negative for ind in [-nOrientation, 0))
    ind = (ind + nOrientation) % nOrientation;
    // rotated channel: the two adjacent orientation planes that are blended
    int ind_rot = (o - ind + nOrientation) % nOrientation;
    int ind_rot_plus = (ind_rot + 1 + nOrientation) % nOrientation;

    // Start of the two source planes:
    //   roi_batch_ind * channels * nOrientation -> image in the batch
    //   c * nOrientation                        -> channel group
    //   ind_rot / ind_rot_plus                  -> orientation plane
    const scalar_t* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels * nOrientation + c * nOrientation + ind_rot) * height * width;
    const scalar_t* offset_bottom_data_plus =
        bottom_data + (roi_batch_ind * channels * nOrientation + c * nOrientation + ind_rot_plus) * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sample_num > 0)
                             ? sample_num
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosscalar_theta = cos(theta);
    scalar_t sinscalar_theta = sin(theta);

    // Average pooling inside the bin; count is the number of sample points.
    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
    scalar_t output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1,
      // y of the sample relative to the RoI center: RoI top edge + bin offset
      // + offset of the sample inside the bin (bin size / number of samples).
      const scalar_t yy = roi_start_h + ph * bin_size_h +
                          static_cast<scalar_t>(iy + .5f) * bin_size_h /
                              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const scalar_t xx = roi_start_w + pw * bin_size_w +
                            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        // scalar_t x = xx * cosscalar_theta + yy * sinscalar_theta + roi_center_w;
        // scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;
        scalar_t x = xx * cosscalar_theta - yy * sinscalar_theta + roi_center_w;
        scalar_t y = xx * sinscalar_theta + yy * cosscalar_theta + roi_center_h;

        scalar_t val = bilinear_interpolate<scalar_t>(
            offset_bottom_data, height, width, y, x);
        scalar_t val_plus = bilinear_interpolate<scalar_t>(
            offset_bottom_data_plus, height, width, y, x);
        // Blend the two adjacent orientation planes.
        output_val += r_var * val + l_var * val_plus;
      }
    }
    // The bin value is the mean over its sample points.
    output_val /= count;

    top_data[index] = output_val;
  }
}
x y 计算详解:已知 RoI区域的宽高分别为roi_width 、roi_height,中心点O坐标为(roi_center_w,roi_center_h) ,设A点坐标为(x1,y1),求P点坐标
解析:对于A点的y1,y1+roi_height/2=roi_center_h ->y1=roi_center_h -roi_height/2
x1同理 ->P点的y坐标yp=y1+|AP|=roi_center_h -roi_height/2+|AP|
代码里是先减去roi_height/2,再加上roi_center_h!
scalar_t roi_start_h = -roi_height / 2.0;
······
const scalar_t yy = roi_start_h + ph * bin_size_h +static_cast<scalar_t>(iy + .5f) *bin_size_h /static_cast<scalar_t>(roi_bin_grid_h); //先减去roi_height/2
······
scalar_t x = xx * cosscalar_theta - yy * sinscalar_theta + roi_center_w;//x方向:再加上roi_center_w
scalar_t y = xx * sinscalar_theta + yy * cosscalar_theta + roi_center_h;//y方向:再加上roi_center_h
后面部分
/**
 * Launches the RiRoI Align forward kernel.
 *
 * One output element is computed per index in
 * [0, num_rois * pooled_height * pooled_width * channels * nOrientation).
 *
 * @param features      Input feature tensor; its data pointer is handed to the
 *                      kernel as bottom_data (assumed contiguous CUDA memory -
 *                      TODO confirm at the call site).
 * @param rois          RoI tensor; the kernel reads 6 values per RoI.
 * @param spatial_scale Scale applied to RoI coordinates; converted to the
 *                      dispatched scalar type before the launch.
 * @param sample_num    Samples per bin along each axis (<= 0 lets the kernel
 *                      derive it from the RoI size).
 * @param channels      Number of channel groups.
 * @param height        Feature-map height.
 * @param width         Feature-map width.
 * @param num_rois      Number of RoIs.
 * @param pooled_height Output bins along y.
 * @param pooled_width  Output bins along x.
 * @param nOrientation  Orientation planes per channel group.
 * @param output        Tensor whose data pointer receives the result.
 * @return Always 1; CUDA launch errors are reported through THCudaCheck.
 */
int RiROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
                             const float spatial_scale, const int sample_num,
                             const int channels, const int height,
                             const int width, const int num_rois,
                             const int pooled_height, const int pooled_width,
                             const int nOrientation,
                             at::Tensor output) {
  const int output_size = num_rois * pooled_height * pooled_width * channels * nOrientation;

  // Nothing to compute (e.g. no RoIs). Skipping the launch also avoids asking
  // for a zero-block grid, which CUDA rejects as an invalid configuration.
  if (output_size == 0) {
    return 1;
  }

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.type(), "RiROIAlignLaucherForward", ([&] {
        const scalar_t *bottom_data = features.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *top_data = output.data<scalar_t>();

        RiROIAlignForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                sample_num, channels, height, width, pooled_height,
                pooled_width, nOrientation, top_data);
      }));
  // Surfaces launch-configuration errors of the kernel above.
  THCudaCheck(cudaGetLastError());
  return 1;
}
/**
 * Computes the bilinear weights and the integer corner coordinates that the
 * point (y, x) touches on a height x width plane.
 *
 * @param height  Number of rows in the plane.
 * @param width   Number of columns in the plane.
 * @param y       Row coordinate of the sample (by value; clamped locally).
 * @param x       Column coordinate of the sample (by value; clamped locally).
 * @param w1      Out: weight of corner (y_low,  x_low).
 * @param w2      Out: weight of corner (y_low,  x_high).
 * @param w3      Out: weight of corner (y_high, x_low).
 * @param w4      Out: weight of corner (y_high, x_high).
 * @param x_low   Out: left column index.
 * @param x_high  Out: right column index.
 * @param y_low   Out: upper row index.
 * @param y_high  Out: lower row index.
 *
 * When the point is below -1 or above height / width on its axis, all weights
 * are set to 0 and all four indices to -1.
 */
template <typename scalar_t>
__device__ void bilinear_interpolate_gradient(const int height, const int width,
                                              scalar_t y, scalar_t x,
                                              scalar_t &w1, scalar_t &w2,
                                              scalar_t &w3, scalar_t &w4,
                                              int &x_low, int &x_high,
                                              int &y_low, int &y_high) {
  // Out-of-range sample: report "no contribution" through the -1 sentinels.
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    w1 = 0.;
    w2 = 0.;
    w3 = 0.;
    w4 = 0.;
    x_low = -1;
    x_high = -1;
    y_low = -1;
    y_high = -1;
    return;
  }

  // Snap coordinates in [-1, 0] onto the first column / row.
  if (x <= 0) x = 0;
  if (y <= 0) y = 0;

  // Column neighbours; on the last column both collapse onto the border.
  x_low = static_cast<int>(x);
  x_high = x_low + 1;
  if (x_low >= width - 1) {
    x_low = width - 1;
    x_high = x_low;
    x = static_cast<scalar_t>(x_low);
  }

  // Row neighbours; same treatment for the last row.
  y_low = static_cast<int>(y);
  y_high = y_low + 1;
  if (y_low >= height - 1) {
    y_low = height - 1;
    y_high = y_low;
    y = static_cast<scalar_t>(y_low);
  }

  // Fractional offsets from the low corner and their complements.
  const scalar_t frac_y = y - y_low;
  const scalar_t frac_x = x - x_low;
  const scalar_t rest_y = 1. - frac_y;
  const scalar_t rest_x = 1. - frac_x;

  w1 = rest_y * rest_x;
  w2 = rest_y * frac_x;
  w3 = frac_y * rest_x;
  w4 = frac_y * frac_x;
}
/**
 * RiRoI Align backward kernel.
 *
 * For every element of the pooled gradient, revisits the sample points of its
 * rotated RoI bin and atomically adds the bilinear-weighted share of that
 * gradient to the two adjacent orientation planes of bottom_diff.
 *
 * @param nthreads      Number of pooled-gradient elements to process.
 * @param top_diff      Gradient w.r.t. the pooled output, laid out as
 *                      (n, c, o, ph, pw).
 * @param bottom_rois   RoIs, 6 values each:
 *                      [batch_index, center_x, center_y, width, height, theta].
 * @param spatial_scale Factor applied to the RoI center and size.
 * @param sample_num    Sample points per bin along each axis; when <= 0,
 *                      ceil(roi_size / pooled_size) is used instead.
 * @param channels      Number of channel groups.
 * @param height        Feature-map height.
 * @param width         Feature-map width.
 * @param pooled_height Output bins along y.
 * @param pooled_width  Output bins along x.
 * @param nOrientation  Orientation planes per channel group.
 * @param bottom_diff   Gradient w.r.t. the input features; only accumulated
 *                      into (presumably zero-filled by the caller - verify).
 */
template <typename scalar_t>
__global__ void RiROIAlignBackward(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int sample_num, const int channels,
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int nOrientation, scalar_t *bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Peel the flat index apart, fastest-varying dimension first:
    // (n, c, o, ph, pw) is an element in the pooled output.
    int rest = index;
    const int pw = rest % pooled_width;
    rest /= pooled_width;
    const int ph = rest % pooled_height;
    rest /= pooled_height;
    const int o = rest % nOrientation;
    rest /= nOrientation;
    const int c = rest % channels;
    const int n = rest / channels;

    // The n-th RoI record; values are used unrounded.
    const scalar_t *roi = bottom_rois + n * 6;
    const int roi_batch_ind = roi[0];
    const scalar_t roi_center_w = roi[1] * spatial_scale;
    const scalar_t roi_center_h = roi[2] * spatial_scale;
    const scalar_t scaled_w = roi[3] * spatial_scale;
    const scalar_t scaled_h = roi[4] * spatial_scale;
    // scalar_t theta = roi[5] * M_PI / 180.0;
    const scalar_t theta = roi[5];

    // Malformed RoIs are forced to be at least 1x1.
    const scalar_t roi_width = max(scaled_w, (scalar_t)1.);
    const scalar_t roi_height = max(scaled_h, (scalar_t)1.);
    const scalar_t bin_size_h = roi_height / static_cast<scalar_t>(pooled_height);
    const scalar_t bin_size_w = roi_width / static_cast<scalar_t>(pooled_width);

    // Position of theta on the orientation axis: integer part selects the
    // starting plane, fractional part splits the gradient between neighbours.
    const scalar_t orient_pos = theta * nOrientation / (2 * PI);
    int start_plane = floor(orient_pos);
    const scalar_t weight_next = orient_pos - (scalar_t)start_plane;
    const scalar_t weight_curr = 1.0 - weight_next;
    start_plane = (start_plane + nOrientation) % nOrientation;

    // The two adjacent (rotated) orientation planes that receive gradient.
    const int plane_curr = (o - start_plane + nOrientation) % nOrientation;
    const int plane_next = (plane_curr + 1 + nOrientation) % nOrientation;

    const int group_base = roi_batch_ind * channels * nOrientation + c * nOrientation;
    scalar_t *grad_curr = bottom_diff + (group_base + plane_curr) * height * width;
    scalar_t *grad_next = bottom_diff + (group_base + plane_next) * height * width;

    // Incoming gradient of this pooled bin.
    const int top_offset =
        (n * channels * nOrientation + c * nOrientation + o) * pooled_height * pooled_width;
    const scalar_t top_diff_this_bin = top_diff[top_offset + ph * pooled_width + pw];

    // Sampling grid inside the bin (mimics the integral over the bin).
    const int roi_bin_grid_h =
        (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height);  // e.g., = 2
    const int roi_bin_grid_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

    // Offsets are measured from the RoI center; the translation is applied
    // after the rotation below.
    const scalar_t roi_start_h = -roi_height / 2.0;
    const scalar_t roi_start_w = -roi_width / 2.0;
    const scalar_t cosTheta = cos(theta);
    const scalar_t sinTheta = sin(theta);

    // Average pooling: every sample carries 1 / count of the bin gradient.
    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    for (int gy = 0; gy < roi_bin_grid_h; gy++) {  // e.g., gy = 0, 1
      const scalar_t yy = roi_start_h + ph * bin_size_h +
                          static_cast<scalar_t>(gy + .5f) * bin_size_h /
                              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int gx = 0; gx < roi_bin_grid_w; gx++) {
        const scalar_t xx = roi_start_w + pw * bin_size_w +
                            static_cast<scalar_t>(gx + .5f) * bin_size_w /
                                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta around the center and translate.
        const scalar_t x = xx * cosTheta - yy * sinTheta + roi_center_w;
        const scalar_t y = xx * sinTheta + yy * cosTheta + roi_center_h;

        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient<scalar_t>(height, width, y, x,
                                                w1, w2, w3, w4,
                                                x_low, x_high, y_low, y_high);

        // A negative index marks a sample that fell outside the feature map.
        if (x_low < 0 || x_high < 0 || y_low < 0 || y_high < 0) {
          continue;
        }

        const scalar_t g1 = top_diff_this_bin * w1 / count;
        const scalar_t g2 = top_diff_this_bin * w2 / count;
        const scalar_t g3 = top_diff_this_bin * w3 / count;
        const scalar_t g4 = top_diff_this_bin * w4 / count;

        const int at_lt = y_low * width + x_low;
        const int at_rt = y_low * width + x_high;
        const int at_lb = y_high * width + x_low;
        const int at_rb = y_high * width + x_high;

        // Several pooled elements can hit the same input cell, hence atomics.
        atomicAdd(grad_curr + at_lt, g1 * weight_curr);
        atomicAdd(grad_curr + at_rt, g2 * weight_curr);
        atomicAdd(grad_curr + at_lb, g3 * weight_curr);
        atomicAdd(grad_curr + at_rb, g4 * weight_curr);

        atomicAdd(grad_next + at_lt, g1 * weight_next);
        atomicAdd(grad_next + at_rt, g2 * weight_next);
        atomicAdd(grad_next + at_lb, g3 * weight_next);
        atomicAdd(grad_next + at_rb, g4 * weight_next);
      }  // gx
    }  // gy
  }  // CUDA_1D_KERNEL_LOOP
}  // RiROIAlignBackward
/**
 * Launches the RiRoI Align backward kernel.
 *
 * One pooled-gradient element is processed per index in
 * [0, num_rois * pooled_height * pooled_width * channels * nOrientation).
 *
 * @param top_grad      Gradient w.r.t. the pooled output; also selects the
 *                      dispatched scalar type.
 * @param rois          RoI tensor; the kernel reads 6 values per RoI.
 * @param spatial_scale Scale applied to RoI coordinates; converted to the
 *                      dispatched scalar type before the launch.
 * @param sample_num    Samples per bin along each axis (<= 0 lets the kernel
 *                      derive it from the RoI size).
 * @param channels      Number of channel groups.
 * @param height        Feature-map height.
 * @param width         Feature-map width.
 * @param num_rois      Number of RoIs.
 * @param pooled_height Output bins along y.
 * @param pooled_width  Output bins along x.
 * @param nOrientation  Orientation planes per channel group.
 * @param bottom_grad   Tensor the kernel accumulates the input gradient into
 *                      (presumably zero-filled by the caller - verify).
 * @return Always 1; CUDA launch errors are reported through THCudaCheck.
 *
 * Raises an ATen error (instead of terminating the process) when the
 * dispatched type is double, which this implementation does not support.
 */
int RiROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                              const float spatial_scale, const int sample_num,
                              const int channels, const int height,
                              const int width, const int num_rois,
                              const int pooled_height, const int pooled_width,
                              const int nOrientation,
                              at::Tensor bottom_grad) {
  const int output_size = num_rois * pooled_height * pooled_width * channels * nOrientation;

  // No gradient to propagate (e.g. no RoIs). Skipping the launch also avoids
  // asking for a zero-block grid, which CUDA rejects as an invalid
  // configuration.
  if (output_size == 0) {
    return 1;
  }

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.type(), "RiROIAlignLaucherBackward", ([&] {
        // Report the unsupported type as an exception so the caller (e.g. the
        // Python interpreter hosting this extension) is not killed by exit().
        if (sizeof(scalar_t) == sizeof(double)) {
          AT_ERROR("double is not supported");
        }

        const scalar_t *top_diff = top_grad.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();

        RiROIAlignBackward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, top_diff, rois_data, scalar_t(spatial_scale),
                sample_num, channels, height, width, pooled_height,
                pooled_width, nOrientation, bottom_diff);
      }));
  // Surfaces launch-configuration errors of the kernel above.
  THCudaCheck(cudaGetLastError());
  return 1;
}
仅供参考,若有错误,欢迎指正!!!
inline关键字参考:https://blog.csdn.net/u010853261/article/details/84940716
template学习:https://blog.csdn.net/lianhunqianr1/article/details/79966911
__global__学习:https://blog.csdn.net/heiheiya/article/details/82019309
代码解析参考:https://zhuanlan.zhihu.com/p/75171514
https://blog.csdn.net/xiaoxu1025/article/details/103703930
https://blog.csdn.net/qq_43088746/article/details/99637170
https://blog.csdn.net/u011622208/article/details/91355191
https://blog.csdn.net/liyuan02/article/details/6750828