riroi_align_kernel.cu代码笔记

最新推荐文章于 2023-05-31 09:54:28 发布

Coding-Prince

最新推荐文章于 2023-05-31 09:54:28 发布

阅读量748

点赞数 1

文章标签：人工智能 c++ pytorch

本文链接：https://blog.csdn.net/qq_42362891/article/details/118407866

版权

概要：因为某些原因要修改riroi_align的前向传播和反向传播，作者的代码是用c++写的，之前并未使用过c++为pytorch写拓展库，故研究一番。
先贴出文章和代码地址：
文章：https://openaccess.thecvf.com/content/CVPR2021/papers/Han_ReDet_A_Rotation-Equivariant_Detector_for_Aerial_Object_Detection_CVPR_2021_paper.pdf
代码：https://github.com/csuhan/ReDet

基础知识：RIRoI是在RRoI基础上改进的，而RRoI是在RoI基础上改进的，所以需要懂RoI+RRoI！

RIRoI Align对齐包括两部分，分别是：
（1）空间对齐
（2）方向对齐
空间对齐比较容易理解，方向对齐是该文的创新之处，也是难懂之处。

不是很懂 r的表达式为什么是这样！

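根据论文，r 应该是 $C_N$ 的 index，即选择从哪个方向通道开始读取：$r = \lfloor \theta N / (2\pi) \rfloor$。相邻两个方向之间的夹角是 $2\pi/N$，用 θ 除以这个间隔再向下取整，就得到 θ 落在第几个方向上；代码中对应 `ind = floor(theta * nOrientation / (2 * PI))`。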

cu文件是针对GPU，由NVCC编译的。接下来看代码：

#include <ATen/ATen.h>
#include <THC/THCAtomics.cuh>//这两个模块是干啥的？
#include <math.h>

// Approximation of pi used by the kernels when mapping an angle to an
// orientation index.
#define PI 3.141592653
// Loop header: each thread starts at its global 1D index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// launched threads (blockDim.x * gridDim.x) until i reaches n, so all n items
// are processed even when n exceeds the number of launched threads.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)
// Number of threads per block used for the kernel launches in this file.
#define THREADS_PER_BLOCK 1024

/*
 * Returns the number of thread blocks to launch for N work items when each
 * block has THREADS_PER_BLOCK threads.
 *
 * The wanted value is ceil(N / THREADS_PER_BLOCK). Integer division truncates
 * (e.g. 3.6 -> 3), so the ceiling is computed as (N - 1) / T + 1 for N > 0:
 *   - N = a * T exactly           -> (a*T - 1) / T + 1 = (a - 1) + 1 = a
 *   - N = a * T + b, 0 < b < T    -> (a*T + b - 1) / T + 1 = a + 1
 * This is the same value as (N + T - 1) / T but cannot overflow int when N is
 * close to INT_MAX.
 *
 * The result is capped at 65000 blocks; kernels that iterate with
 * CUDA_1D_KERNEL_LOOP still cover all N items with fewer threads.
 *
 * The result is never less than 1: a launch with zero (or a negative number
 * of) blocks is rejected as an invalid configuration, whereas one block with
 * N <= 0 simply performs no loop iterations.
 *
 * In this file N is the number of output elements,
 *   num_rois * pooled_height * pooled_width * channels * nOrientation.
 */
inline int GET_BLOCKS(const int N) {
    const int max_block_num = 65000;
    if (N <= 0) {
        return 1;
    }
    const int optimal_block_num = (N - 1) / THREADS_PER_BLOCK + 1;
    return optimal_block_num < max_block_num ? optimal_block_num : max_block_num;
}

/*
 * Bilinearly interpolates a single feature plane at the real-valued point
 * (y, x).
 *
 * scalar_t is a template type parameter; the concrete element type is chosen
 * by the caller at instantiation.
 *
 * Parameters:
 *   bottom_data    pointer to one plane, read as a flat array indexed by
 *                  row * width + column.
 *   height, width  size of that plane.
 *   y, x           sample coordinates (row, column).
 *
 * Returns 0 when the point lies more than one pixel before the first
 * row/column or beyond height/width; otherwise the weighted sum of the four
 * neighbouring values. Coordinates in [-1, 0] are clamped to 0, and points on
 * or past the last row/column use that row/column for both neighbours.
 */
template <typename scalar_t>
__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
                                         const int height, const int width,
                                         scalar_t y, scalar_t x) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    return 0;
  }

  // Clamp slightly negative coordinates onto the first row / column.
  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  // On the last row the upper and lower neighbours coincide; y is snapped to
  // the integer row so the fractional weight below becomes 0.
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (scalar_t)y_low;
  } else {
    y_high = y_low + 1;
  }

  // Same handling for the last column.
  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (scalar_t)x_low;
  } else {
    x_high = x_low + 1;
  }

  // Fractional distances to the low neighbours and their complements.
  scalar_t ly = y - y_low;
  scalar_t lx = x - x_low;
  scalar_t hy = 1. - ly;
  scalar_t hx = 1. - lx;
  // do bilinear interpolation
  scalar_t lt = bottom_data[y_low * width + x_low];
  scalar_t rt = bottom_data[y_low * width + x_high];
  scalar_t lb = bottom_data[y_high * width + x_low];
  scalar_t rb = bottom_data[y_high * width + x_high];
  scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);

  return val;
}

/*
 * RiRoI Align forward kernel. Each loop iteration computes one output element.
 *
 * Parameters:
 *   nthreads       number of output elements to compute (the loop bound).
 *   bottom_data    input features, read as planes of height * width values;
 *                  plane index = (batch * channels + c) * nOrientation + orientation.
 *   bottom_rois    RoI array with 6 values per RoI:
 *                  [batch_index, center_x, center_y, width, height, theta].
 *                  Entries 1-4 are multiplied by spatial_scale. theta is used
 *                  unchanged: it is passed to cos()/sin() and divided by
 *                  2 * PI, i.e. it is treated as radians.
 *   spatial_scale  factor applied to the RoI center and size.
 *   sample_num     samples per bin along each axis; when <= 0,
 *                  ceil(roi_size / pooled_size) is used instead.
 *   channels       number of orientation groups; each group holds
 *                  nOrientation consecutive planes.
 *   height, width  size of one feature plane.
 *   pooled_height, pooled_width
 *                  number of output bins per RoI along each axis.
 *   nOrientation   number of orientation planes per group.
 *   top_data       output; element `index` corresponds to (n, c, o, ph, pw).
 */
template <typename scalar_t>
__global__ void RiROIAlignForward(const int nthreads, const scalar_t *bottom_data,
                                const scalar_t *bottom_rois,
                                const scalar_t spatial_scale,
                                const int sample_num, const int channels,
                                const int height, const int width,
                                const int pooled_height, const int pooled_width,
                                const int nOrientation,
                                scalar_t *top_data) {
    CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, o, ph, pw) is an element in the pooled output:
    //   pw: bin index along x
    //   ph: bin index along y
    //   o : orientation index inside a group of nOrientation planes
    //   c : group (channel) index
    //   n : RoI index
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int o = (index / pooled_width / pooled_height) % nOrientation;
    int c = (index / pooled_width / pooled_height / nOrientation) % channels;
    int n = index / pooled_width / pooled_height / nOrientation / channels;

    // Each RoI occupies 6 consecutive values; point at the n-th RoI and read
    // its fields by offset.
    const scalar_t* offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];  // index of the image in the batch
    // Do not using rounding; this implementation detail is critical
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;  // center x
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;  // center y
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    scalar_t theta = offset_bottom_rois[5];  // no degree->radian conversion is applied

    // Force malformed ROIs to be 1x1
    roi_width = max(roi_width, (scalar_t)1.);
    roi_height = max(roi_height, (scalar_t)1.);
    // Size of the RoI region that maps to one output bin.
    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) / static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w = static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    // find aligned index
    // theta divided by the angular step 2*PI / nOrientation: the integer part
    // selects the starting orientation plane, the fractional part is the
    // blending weight towards the next plane.
    scalar_t ind_float = theta * nOrientation / (2 * PI);
    int ind =  floor(ind_float);
    scalar_t l_var = ind_float - (scalar_t)ind;  // weight of the next plane
    scalar_t r_var = 1.0 - l_var;                // weight of the current plane
    // correct start channel
    // floor() of a negative theta gives a negative ind and % keeps the sign,
    // hence nOrientation is added before taking the remainder.
    ind = (ind + nOrientation) % nOrientation;
    // rotated channel: the two adjacent orientation planes that are blended
    int ind_rot = (o - ind + nOrientation) % nOrientation;
    int ind_rot_plus = (ind_rot + 1 + nOrientation) % nOrientation; 
    // Locate the two source planes in the flat input:
    //   roi_batch_ind * channels * nOrientation -> start of the image
    //   c * nOrientation                         -> start of the group
    //   ind_rot / ind_rot_plus                   -> plane inside the group
    const scalar_t* offset_bottom_data =
        bottom_data + (roi_batch_ind * channels * nOrientation + c * nOrientation + ind_rot) * height * width;

    const scalar_t* offset_bottom_data_plus =
        bottom_data + (roi_batch_ind * channels * nOrientation + c * nOrientation + ind_rot_plus) * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sample_num > 0)
        ? sample_num
        : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosscalar_theta = cos(theta);
    scalar_t sinscalar_theta = sin(theta);

    // We do average (integral) pooling inside a bin; count is the number of
    // sample points per bin.
    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    scalar_t output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
        // Sample y in RoI-centered coordinates: RoI top edge + bin offset +
        // position of the sample inside the bin.
        const scalar_t yy = roi_start_h + ph * bin_size_h +
            static_cast<scalar_t>(iy + .5f) * bin_size_h /
                static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const scalar_t xx = roi_start_w + pw * bin_size_w +
            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        // scalar_t x = xx * cosscalar_theta + yy * sinscalar_theta + roi_center_w;
        // scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;
        scalar_t x = xx * cosscalar_theta - yy * sinscalar_theta + roi_center_w;
        scalar_t y = xx * sinscalar_theta + yy * cosscalar_theta + roi_center_h;

        scalar_t val = bilinear_interpolate<scalar_t>(
            offset_bottom_data, height, width, y, x);
        scalar_t val_plus = bilinear_interpolate<scalar_t>(
            offset_bottom_data_plus, height, width, y, x);
        // Blend the two adjacent orientation planes.
        output_val += r_var * val + l_var * val_plus;
        }
    }
    output_val /= count;  // mean over the sample points of the bin

    top_data[index] = output_val;
    }
}

x y 计算详解：已知 RoI区域的宽高分别为roi_width 、roi_height，中心点O坐标为（roi_center_w，roi_center_h），设A点坐标为（x1，y1），求P点坐标
解析：对于A点的y1，y1+roi_height/2=roi_center_h ->y1=roi_center_h -roi_height/2
x1同理 ->P点的y坐标yp=y1+|AP|=roi_center_h -roi_height/2+|AP|
代码里是先减去roi_height/2（得到以RoI中心为原点的局部坐标yy），绕中心旋转theta之后，再加上roi_center_h！

scalar_t roi_start_h = -roi_height / 2.0;
······
const scalar_t yy = roi_start_h + ph * bin_size_h +static_cast<scalar_t>(iy + .5f) *bin_size_h /static_cast<scalar_t>(roi_bin_grid_h); //先减去roi_height/2
······
scalar_t y = xx * sinscalar_theta + yy * cosscalar_theta + roi_center_h;//再加上roi_center_h

后面部分

// Host-side launcher for RiROIAlignForward.
// Dispatches on the element type of `features`, launches the kernel with one
// work item per output element, checks the launch status and returns 1.
int RiROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
                                const float spatial_scale, const int sample_num,
                                const int channels, const int height,
                                const int width, const int num_rois,
                                const int pooled_height, const int pooled_width,
                                const int nOrientation,
                                at::Tensor output) {
    // Total number of output elements handed to the kernel as its loop bound.
    const int total_elems =
        num_rois * pooled_height * pooled_width * channels * nOrientation;
    const int grid_size = GET_BLOCKS(total_elems);

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.type(), "RiROIAlignLaucherForward", ([&] {
            RiROIAlignForward<scalar_t><<<grid_size, THREADS_PER_BLOCK>>>(
                total_elems,
                features.data<scalar_t>(),
                rois.data<scalar_t>(),
                scalar_t(spatial_scale),
                sample_num, channels, height, width,
                pooled_height, pooled_width, nOrientation,
                output.data<scalar_t>());
        }));

    // Surface launch-configuration errors from the kernel launch above.
    THCudaCheck(cudaGetLastError());
    return 1;
}

/*
 * Computes, for the sample point (y, x), the four bilinear weights and the
 * integer coordinates of the four neighbours they apply to.
 *
 * Outputs (by reference):
 *   w1..w4          weights for (y_low, x_low), (y_low, x_high),
 *                   (y_high, x_low), (y_high, x_high) respectively.
 *   x_low, x_high,
 *   y_low, y_high   neighbour coordinates.
 *
 * When the point lies more than one pixel before the first row/column or
 * beyond height/width, all weights are set to 0 and all coordinates to -1.
 */
template <typename scalar_t>
__device__ void bilinear_interpolate_gradient(const int height, const int width,
                                              scalar_t y, scalar_t x,
                                              scalar_t &w1, scalar_t &w2,
                                              scalar_t &w3, scalar_t &w4,
                                              int &x_low, int &x_high,
                                              int &y_low, int &y_high) {
  // Out-of-range sample: report "no contribution" and stop.
  const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
  if (outside) {
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  // Clamp slightly negative coordinates onto the first row / column.
  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  // Rows: on the last row both neighbours coincide and y snaps to that row.
  y_low = (int)y;
  const bool on_last_row = y_low >= height - 1;
  if (on_last_row) {
    y_low = height - 1;
    y_high = y_low;
    y = (scalar_t)y_low;
  } else {
    y_high = y_low + 1;
  }

  // Columns: same handling for the last column.
  x_low = (int)x;
  const bool on_last_col = x_low >= width - 1;
  if (on_last_col) {
    x_low = width - 1;
    x_high = x_low;
    x = (scalar_t)x_low;
  } else {
    x_high = x_low + 1;
  }

  // Fractional offsets from the low neighbours and their complements.
  const scalar_t frac_y = y - y_low;
  const scalar_t frac_x = x - x_low;
  const scalar_t rest_y = 1. - frac_y;
  const scalar_t rest_x = 1. - frac_x;

  w1 = rest_y * rest_x;
  w2 = rest_y * frac_x;
  w3 = frac_y * rest_x;
  w4 = frac_y * frac_x;
}

/*
 * RiRoI Align backward kernel. Each loop iteration takes the gradient of one
 * pooled output element and scatters it, with atomicAdd, onto the input
 * gradient positions that contributed to that element in the forward pass.
 *
 * Parameters:
 *   nthreads       number of pooled output elements (the loop bound).
 *   top_diff       gradient w.r.t. the pooled output, indexed by
 *                  (n, c, o, ph, pw).
 *   bottom_rois    RoI array with 6 values per RoI:
 *                  [batch_index, center_x, center_y, width, height, theta].
 *   spatial_scale  factor applied to the RoI center and size.
 *   sample_num     samples per bin along each axis; when <= 0,
 *                  ceil(roi_size / pooled_size) is used instead.
 *   channels, nOrientation
 *                  number of groups and orientation planes per group.
 *   height, width  size of one feature plane.
 *   pooled_height, pooled_width
 *                  number of bins per RoI along each axis.
 *   bottom_diff    gradient w.r.t. the input features; accumulated into.
 */
template <typename scalar_t>
__global__ void RiROIAlignBackward(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int sample_num, const int channels,
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int nOrientation, scalar_t *bottom_diff) {

    CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Peel the flat index apart, fastest-varying dimension first:
    // (n, c, o, ph, pw) is an element in the pooled output.
    int rest = index;
    const int pw = rest % pooled_width;
    rest /= pooled_width;
    const int ph = rest % pooled_height;
    rest /= pooled_height;
    const int o = rest % nOrientation;
    rest /= nOrientation;
    const int c = rest % channels;
    const int n = rest / channels;

    // The n-th RoI record (6 values).
    const scalar_t* roi = bottom_rois + n * 6;
    const int batch_idx = roi[0];

    // Scale center and size; no rounding is applied.
    scalar_t center_x = roi[1] * spatial_scale;
    scalar_t center_y = roi[2] * spatial_scale;
    scalar_t roi_w = roi[3] * spatial_scale;
    scalar_t roi_h = roi[4] * spatial_scale;
    scalar_t theta = roi[5];

    // Malformed RoIs are forced to at least 1x1.
    roi_w = max(roi_w, (scalar_t)1.);
    roi_h = max(roi_h, (scalar_t)1.);
    scalar_t bin_h = static_cast<scalar_t>(roi_h) / static_cast<scalar_t>(pooled_height);
    scalar_t bin_w = static_cast<scalar_t>(roi_w) / static_cast<scalar_t>(pooled_width);

    // Orientation alignment: integer part picks the starting plane, the
    // fractional part is the weight of the following plane.
    scalar_t orient_pos = theta * nOrientation / (2 * PI);
    int orient_idx =  floor(orient_pos);
    scalar_t w_next = orient_pos - (scalar_t)orient_idx;
    scalar_t w_curr = 1.0 - w_next;
    orient_idx = (orient_idx + nOrientation) % nOrientation;
    int plane = (o - orient_idx + nOrientation) % nOrientation;
    int plane_next = (plane + 1 + nOrientation) % nOrientation;

    // Input-gradient planes that receive the contribution.
    scalar_t* grad_plane =
        bottom_diff + (batch_idx * channels * nOrientation + c * nOrientation + plane) * height * width;
    scalar_t* grad_plane_next =
        bottom_diff + (batch_idx * channels * nOrientation + c * nOrientation + plane_next) * height * width;

    // Gradient of the pooled element handled by this iteration.
    int top_offset = (n * channels * nOrientation + c * nOrientation + o) * pooled_height * pooled_width;
    const scalar_t* top_plane = top_diff + top_offset;
    const scalar_t grad_out = top_plane[ph * pooled_width + pw];

    // Sampling grid inside one bin.
    int grid_h = (sample_num > 0)
        ? sample_num
        : ceil(roi_h / pooled_height);
    int grid_w =
        (sample_num > 0) ? sample_num : ceil(roi_w / pooled_width);

    // Start offsets are relative to the RoI center; the translation to the
    // center is applied after the rotation.
    scalar_t start_y = -roi_h / 2.0;
    scalar_t start_x = -roi_w / 2.0;
    scalar_t cos_t = cos(theta);
    scalar_t sin_t = sin(theta);

    // Average pooling: every sample carries 1 / count of the gradient.
    const scalar_t count = grid_h * grid_w;

    for (int iy = 0; iy < grid_h; iy++) {
      const scalar_t yy = start_y + ph * bin_h +
          static_cast<scalar_t>(iy + .5f) * bin_h /
              static_cast<scalar_t>(grid_h);
      for (int ix = 0; ix < grid_w; ix++) {
        const scalar_t xx = start_x + pw * bin_w +
            static_cast<scalar_t>(ix + .5f) * bin_w /
                static_cast<scalar_t>(grid_w);

        // Rotate by theta around the center and translate.
        scalar_t x = xx * cos_t - yy * sin_t + center_x;
        scalar_t y = xx * sin_t + yy * cos_t + center_y;

        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient<scalar_t>(
            height, width, y, x, w1, w2, w3, w4,
            x_low, x_high, y_low, y_high);

        // Samples outside the feature plane are reported with -1 indices
        // and contribute nothing.
        if (x_low < 0 || x_high < 0 || y_low < 0 || y_high < 0) {
          continue;
        }

        scalar_t g1 = grad_out * w1 / count;
        scalar_t g2 = grad_out * w2 / count;
        scalar_t g3 = grad_out * w3 / count;
        scalar_t g4 = grad_out * w4 / count;

        const int pos_lt = y_low * width + x_low;
        const int pos_rt = y_low * width + x_high;
        const int pos_lb = y_high * width + x_low;
        const int pos_rb = y_high * width + x_high;

        // Current orientation plane.
        atomicAdd(grad_plane + pos_lt, g1*w_curr);
        atomicAdd(grad_plane + pos_rt, g2*w_curr);
        atomicAdd(grad_plane + pos_lb, g3*w_curr);
        atomicAdd(grad_plane + pos_rb, g4*w_curr);

        // Following orientation plane.
        atomicAdd(grad_plane_next + pos_lt, g1*w_next);
        atomicAdd(grad_plane_next + pos_rt, g2*w_next);
        atomicAdd(grad_plane_next + pos_lb, g3*w_next);
        atomicAdd(grad_plane_next + pos_rb, g4*w_next);
      }  // ix
    }  // iy
  }  // CUDA_1D_KERNEL_LOOP
}  // RiROIAlignBackward

// Host-side launcher for RiROIAlignBackward.
// Dispatches on the element type of `top_grad`, launches the kernel with one
// work item per pooled output element, checks the launch status and returns 1.
// Double precision is rejected: an error is raised (instead of terminating
// the process) so the caller can handle it.
int RiROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
    const float spatial_scale, const int sample_num,
    const int channels, const int height,
    const int width, const int num_rois,
    const int pooled_height, const int pooled_width,
    const int nOrientation,
    at::Tensor bottom_grad) {
        const int output_size = num_rois * pooled_height * pooled_width * channels * nOrientation;
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            top_grad.type(), "RiROIAlignLaucherBackward", ([&] {
              // Reject double before touching any data.
              if (sizeof(scalar_t) == sizeof(double)) {
                AT_ERROR("RiROIAlignBackwardLaucher: double is not supported");
              }
              const scalar_t *top_diff = top_grad.data<scalar_t>();
              const scalar_t *rois_data = rois.data<scalar_t>();
              scalar_t *bottom_diff = bottom_grad.data<scalar_t>();

              // spatial_scale is converted explicitly to the kernel's element
              // type, matching the forward launcher.
              RiROIAlignBackward<scalar_t>
                  <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                      output_size, top_diff, rois_data, scalar_t(spatial_scale),
                      sample_num, channels, height, width, pooled_height,
                      pooled_width, nOrientation, bottom_diff);
            }));
        // Surface launch-configuration errors from the kernel launch above.
        THCudaCheck(cudaGetLastError());
        return 1;

    }

仅供参考，若有错误，欢迎指正！！！

inline关键字参考：https://blog.csdn.net/u010853261/article/details/84940716
template学习：https://blog.csdn.net/lianhunqianr1/article/details/79966911
__global__学习：https://blog.csdn.net/heiheiya/article/details/82019309

代码解析参考:https://zhuanlan.zhihu.com/p/75171514
https://blog.csdn.net/xiaoxu1025/article/details/103703930
https://blog.csdn.net/qq_43088746/article/details/99637170
https://blog.csdn.net/u011622208/article/details/91355191
https://blog.csdn.net/liyuan02/article/details/6750828

、