RROI Align CUDA 源码阅读

最新推荐文章于 2024-05-02 13:20:56 发布

农夫山泉2号

最新推荐文章于 2024-05-02 13:20:56 发布

阅读量1.4k

点赞数 3

本文链接：https://blog.csdn.net/u011622208/article/details/91355191

版权

PYTHON 同时被 3 个专栏收录

105 篇文章 4 订阅

订阅专栏

深度学习

84 篇文章 6 订阅

订阅专栏

cuda

24 篇文章 0 订阅

订阅专栏

文章目录

原理

RROI Align 就是在 RoI Align 的基础上加上了旋转操作。但是整个过程的原理理解起来还是比较难的，因为涉及到图像旋转。所以强烈推荐下面这篇文章：《图像旋转算法原理——旋转矩阵》，RROI Align 的前向传播过程主要参考了该文的推导。即通过给定 align 后的坐标点（X, Y）映射回原图的（X0, Y0），这里的（X0, Y0）大概率是浮点数而非整数，然后通过双线性插值求解 f(X0, Y0)。

在这里插入图片描述

cuda程序——前向传播

这里还是写出整体流程，并在程序相应位置标上注释。同时这里会用到一个变换矩阵，具体推导还是看上面推荐的博客，只是 CUDA 程序中还加了比例缩放。
在这里插入图片描述

整体流程：

提取基础数据
根据 align 后的坐标点，反向映射回原图的坐标点，求相应的变换矩阵
求出原图中倾斜roi的4个坐标，8个值
求roi的中心，这里在一个bin中只用了一个采样值，就是roi的中心
对roi的中心求f(x,y),利用双线性插值求f(x,y)

不知道这里为什么要用原子操作：每个线程只处理自己的 index，所以不管是 top_data 还是 con_idx_x/con_idx_y，读取和写入应该都不会冲突才是。

// Grid-stride loop header: declares `int i`, starts it at this thread's global
// 1D index (blockIdx.x * blockDim.x + threadIdx.x) and advances it by the total
// number of launched threads (blockDim.x * gridDim.x) while i < n.
// `n` is parenthesized so that arguments containing operators with lower
// precedence than `<` (e.g. `a & b`, `c ? x : y`) are compared as a whole.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Forward pass of rotated RoI Align.
//
// Each loop iteration handles one output element `index`, decomposed as
// (n, c, ph, pw) with pw fastest: n = RoI index, c = channel, (ph, pw) = bin.
// For that bin the kernel maps the bin's four corners back into the input
// feature map through a scale+rotate+translate matrix, takes the centre of the
// corners' (rounded, image-clamped) bounding box as the single sample point,
// and bilinearly interpolates the input there.
//
// Parameters:
//   nthreads       number of output elements to produce
//   bottom_data    input features, indexed as
//                  ((batch * channels + c) * height + y) * width + x
//   spatial_scale  factor applied to RoI centre and size to reach feature-map
//                  coordinates
//   height, width, channels   dimensions of bottom_data per batch item
//   pooled_height, pooled_width   output bins per RoI
//   bottom_rois    6 values per RoI: [batch_index, cx, cy, h, w, angle];
//                  angle is in degrees (converted to radians below)
//   top_data       output, one value per `index`
//   con_idx_x/y    output, the sample point (x, y) used for each `index`;
//                  the backward kernel reads these back
//
// Every `index` is written by exactly one thread, so the outputs are written
// with plain stores; the buffers do not need to be zero-initialised.
template <typename T>
__global__ void RROIAlignForward(
    const int nthreads,
    const T* bottom_data,
    const T spatial_scale,
    int height,
    int width,
    int channels,
    const int pooled_height,
    const int pooled_width,
    const T* bottom_rois,
    T* top_data,
    float* con_idx_x,
    float* con_idx_y)
{

    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {
        // 1. Decompose the flat output index into (n, c, ph, pw).
        int n = index;
        int pw = n % pooled_width;
        n /= pooled_width;
        int ph = n % pooled_height;
        n /= pooled_height;
        int c = n % channels;
        n /= channels;

        // RoI record: [batch_index, cx, cy, h, w, angle_in_degrees].
        const T* offset_bottom_rois = bottom_rois + n * 6;

        int roi_batch_ind = offset_bottom_rois[0];
        T cx = offset_bottom_rois[1];
        T cy = offset_bottom_rois[2];
        T h = offset_bottom_rois[3];
        T w = offset_bottom_rois[4];
        T angle = offset_bottom_rois[5]/180.0*3.1415926535;         // degrees -> radians

        // 2. Build the 2x3 matrix that maps pooled-grid coordinates back to
        //    feature-map coordinates: shift the grid so its centre is the
        //    origin (dx, dy), scale one bin to its size on the feature map
        //    (Sx, Sy), rotate by `angle`, then translate to the RoI centre.
        T dx = -pooled_width/2.0;
        T dy = -pooled_height/2.0;
        T Sx = w*spatial_scale/pooled_width;
        T Sy = h*spatial_scale/pooled_height;
        T Alpha = cos(angle);
        T Beta = sin(angle);
        T Dx = cx*spatial_scale;
        T Dy = cy*spatial_scale;

        T M[2][3];
        M[0][0] = Alpha*Sx;
        M[0][1] = Beta*Sy;
        M[0][2] = Alpha*Sx*dx+Beta*Sy*dy+Dx;
        M[1][0] = -Beta*Sx;
        M[1][1] = Alpha*Sy;
        M[1][2] = -Beta*Sx*dx+Alpha*Sy*dy+Dy;

        // 3. Map the four corners of bin (ph, pw) into the feature map.
        //    P holds (x, y) pairs for (pw, ph), (pw, ph+1), (pw+1, ph),
        //    (pw+1, ph+1) in that order.
        T P[8];
        P[0] = M[0][0]*pw+M[0][1]*ph+M[0][2];
        P[1] = M[1][0]*pw+M[1][1]*ph+M[1][2];
        P[2] = M[0][0]*pw+M[0][1]*(ph+1)+M[0][2];
        P[3] = M[1][0]*pw+M[1][1]*(ph+1)+M[1][2];
        P[4] = M[0][0]*(pw+1)+M[0][1]*ph+M[0][2];
        P[5] = M[1][0]*(pw+1)+M[1][1]*ph+M[1][2];
        P[6] = M[0][0]*(pw+1)+M[0][1]*(ph+1)+M[0][2];
        P[7] = M[1][0]*(pw+1)+M[1][1]*(ph+1)+M[1][2];

        // 4. Axis-aligned bounding box of the rotated bin, rounded to integers.
        //    The min side is clamped to >= 0 and the max side to <= size - 1;
        //    for a bin lying outside the image the box can be inverted, which
        //    is why the sample point is still range-checked below.
        T leftMost = (max(round(min(min(P[0],P[2]),min(P[4],P[6]))),0.0));
        T rightMost= (min(round(max(max(P[0],P[2]),max(P[4],P[6]))),width-1.0));
        T topMost= (max(round(min(min(P[1],P[3]),min(P[5],P[7]))),0.0));
        T bottomMost= (min(round(max(max(P[1],P[3]),max(P[5],P[7]))),height-1.0));

        const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;

        // Single sample point per bin: the centre of the bounding box.
        float bin_cx = (leftMost + rightMost) / 2.0;
        float bin_cy = (topMost + bottomMost) / 2.0;

        // 5. Bilinear interpolation at (bin_cx, bin_cy).
        int bin_l = (int)floor(bin_cx);
        int bin_r = (int)ceil(bin_cx);
        int bin_t = (int)floor(bin_cy);
        int bin_b = (int)ceil(bin_cy);

        // Neighbours failing the range test contribute 0.
        // NOTE(review): the lower bound is strict (> 0), so row 0 and column 0
        // are treated as outside the image. The backward kernel in this file
        // uses the same strict lower bound; change both together if row/col 0
        // should be sampled.
        T lt_value = 0.0;
        if (bin_t > 0 && bin_l > 0 && bin_t < height && bin_l < width)
            lt_value = offset_bottom_data[bin_t * width + bin_l];
        T rt_value = 0.0;
        if (bin_t > 0 && bin_r > 0 && bin_t < height && bin_r < width)
            rt_value = offset_bottom_data[bin_t * width + bin_r];
        T lb_value = 0.0;
        if (bin_b > 0 && bin_l > 0 && bin_b < height && bin_l < width)
            lb_value = offset_bottom_data[bin_b * width + bin_l];
        T rb_value = 0.0;
        if (bin_b > 0 && bin_r > 0 && bin_b < height && bin_r < width)
            rb_value = offset_bottom_data[bin_b * width + bin_r];

        // Fractional offsets of the sample point inside its pixel cell.
        T rx = bin_cx - floor(bin_cx);
        T ry = bin_cy - floor(bin_cy);

        T wlt = (1.0 - rx) * (1.0 - ry);
        T wrt = rx * (1.0 - ry);
        T wrb = rx * ry;
        T wlb = (1.0 - rx) * ry;

        T inter_val = 0.0;

        inter_val += lt_value * wlt;
        inter_val += rt_value * wrt;
        inter_val += rb_value * wrb;
        inter_val += lb_value * wlb;

        // One thread owns each `index`, so no atomics are needed. A plain store
        // also avoids accumulating onto stale buffer contents.
        top_data[index] = inter_val;
        con_idx_x[index] = bin_cx;
        con_idx_y[index] = bin_cy;

    }
}

cuda程序——反向传播

整体流程为：

根据前向传播时保留的信息，获得每个bin的中心
求出双线性插值的系数
求出双线性插值的4个点的坐标
求出每个点的梯度，并写入全局内存

// Backward pass of rotated RoI Align.
//
// Each loop iteration handles one pooled output element `index` (same
// (n, c, ph, pw) flattening as the forward kernel). It reads the sample point
// that the forward pass stored in con_idx_x/con_idx_y, recomputes the four
// bilinear weights, and scatters top_diff[index] * weight onto the four
// neighbouring input pixels in bottom_diff.
//
// Parameters:
//   nthreads       number of pooled output elements
//   top_diff       gradient w.r.t. the pooled output, one value per `index`
//   con_idx_x/y    sample point (x, y) recorded per `index` by the forward pass
//   num_rois, spatial_scale   not used by this kernel
//   height, width, channels   dimensions of bottom_diff per batch item
//   pooled_height, pooled_width   output bins per RoI (used only to decompose
//                  `index`)
//   bottom_diff    gradient w.r.t. the input, accumulated with atomicAdd;
//                  this kernel only adds to it, so the caller is expected to
//                  have initialised it
//   bottom_rois    6 values per RoI; only element 0 (batch index) is read
template <typename T>
__global__ void RROIAlignBackward(
            const int nthreads,
            const T* top_diff,
            const float* con_idx_x,
            const float* con_idx_y,
            const int num_rois,
            const float spatial_scale,
            const int height,
            const int width,
            const int channels,
            const int pooled_height,
            const int pooled_width,
            T* bottom_diff,
            const T* bottom_rois) {
    CUDA_1D_KERNEL_LOOP(index, nthreads)
    {

        // Decompose the flat index; only the channel and RoI index are needed.
        int n = index;
        n /= pooled_width;
        n /= pooled_height;
        int c = n % channels;
        n /= channels;

        const T* offset_bottom_rois = bottom_rois + n * 6;
        int roi_batch_ind = offset_bottom_rois[0];
        T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;

        // 1. Sample point of this bin, as recorded by the forward pass.
        float bw = con_idx_x[index];
        float bh = con_idx_y[index];

        // 2. Integer coordinates of the four neighbouring pixels.
        int min_x = (int)floor(bw);
        int max_x = (int)ceil(bw);
        int min_y = (int)floor(bh);
        int max_y = (int)ceil(bh);

        // 3. Bilinear weights from the fractional part of the sample point.
        float rx = bw - float(min_x);
        float ry = bh - float(min_y);

        T wlt = (1.0 - rx) * (1.0 - ry);
        T wrt = rx * (1.0 - ry);
        T wrb = rx * ry;
        T wlb = (1.0 - rx) * ry;

        T top_diff_of_bin = top_diff[index];        // gradient of this bin

        // 4. Gradient contribution for each neighbour = weight * bin gradient.
        T v1 = wlt * top_diff_of_bin;
        T v2 = wrt * top_diff_of_bin;
        T v3 = wrb * top_diff_of_bin;
        T v4 = wlb * top_diff_of_bin;

        // A neighbour receives gradient exactly when the forward kernel's
        // sampling test would have read it: 0 < coord < size. (The previous
        // `< size - 1` upper bound dropped gradient for the last row/column,
        // which the forward pass does read.)
        // NOTE(review): the strict lower bound excludes row 0 / column 0, as
        // in the forward kernel; change both together if that is unintended.
        const bool x_lo_ok = min_x > 0 && min_x < width;
        const bool x_hi_ok = max_x > 0 && max_x < width;
        const bool y_lo_ok = min_y > 0 && min_y < height;
        const bool y_hi_ok = max_y > 0 && max_y < height;

        // Several bins can map to the same input pixel, so accumulate
        // atomically.
        if (y_lo_ok && x_lo_ok)
            atomicAdd(offset_bottom_diff + min_y * width + min_x, static_cast<T>(v1));
        if (y_lo_ok && x_hi_ok)
            atomicAdd(offset_bottom_diff + min_y * width + max_x, static_cast<T>(v2));
        if (y_hi_ok && x_hi_ok)
            atomicAdd(offset_bottom_diff + max_y * width + max_x, static_cast<T>(v3));
        if (y_hi_ok && x_lo_ok)
            atomicAdd(offset_bottom_diff + max_y * width + min_x, static_cast<T>(v4));

  }
}

总结

前向传播的时候原理还是比较难的，所以最好是理解之后，按照公式一步一步来就好了。
涉及到图像旋转的操作都是比较绕的。

农夫山泉2号

关注

3
点赞
踩
7

收藏

觉得还不错? 一键收藏
2
评论
RROI Aglin cuda源码阅读

文章目录原理cuda程序——前向传播cuda程序——反向传播总结原理RROI Aglin就是在roi aglin的基础上加上了旋转操作。但是整个过程的原理理解还是很难的，因为涉及到图像旋转。所以强烈推荐下面这篇文章:图像旋转算法原理-- 旋转矩阵，其中rroi aglin的前向传播过程主要参考了。即通过给定aglin后的坐标点（X，Y）映射回原图的（X0,Y0），这里的（X0,Y0）是浮点数，...
复制链接

扫一扫