CUDA 并行加速基础之 Reduce 和 Scan 的实现

最新推荐文章于 2024-07-11 00:10:24 发布

wangbowj123

最新推荐文章于 2024-07-11 00:10:24 发布

阅读量2.7k

点赞数 5

分类专栏： CUDA编程文章标签： cuda 并行加速并行算法

本文链接：https://blog.csdn.net/wangbowj123/article/details/105521175

版权

CUDA编程专栏收录该内容

4 篇文章 1 订阅

订阅专栏

前言

我们知道硬件拥有其独特的并行性。为了发挥这一特色，我们要将平时串行执行的程序用并行算法重新改写，才能充分发挥 GPU 的优势。

实例：做求和：1+2+3+4+···

为了加速这样一个累加求和，有两种简单的实现方法：用 Reduce 进行归约（二分），或者用 Scan 通过控制步长进行扫描求和。

Reduce

在这里插入图片描述
如上图所示为了并行执行累加，我们要构造出一些线程，每个线程并行工作，从而达到加速的目的。
Reduce 的原理如上图所示。我们构建 1024 个线程块，每个线程块中含有 1024 个线程；每个线程块内的线程并行地计算求和，且每次迭代参与计算的线程数只有上一次迭代的一半。

求和规则是：每次迭代中，左半边的线程把右半边对应位置的数加到自己负责的数上，即只有一半的线程在工作；如此循环往复，最终 0 号线程得到的结果就是该线程块所有数的和。最后再启动一个线程块，把之前各线程块得到的和当作输入数据，进行最后一次并行归约，从而得到最终的总和。

实现代码如下，通过两种访存方式进行实现：

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * Per-block sum reduction carried out directly in global memory.
 *
 * Launch layout: 1D grid of 1D blocks; thread t of block b owns element
 * d_in[b * blockDim.x + t].  Each pass folds the upper half of the still
 * active range onto the lower half, so the block total ends up in the
 * block's first element.
 *
 *   d_out  receives one partial sum per block (d_out[blockIdx.x]).
 *   d_in   input values; NOTE it is used as scratch space and is
 *          overwritten with intermediate sums.
 *
 * The halving only visits every element when blockDim.x is a power of
 * two; there is no bounds check, so the grid must cover exactly the
 * number of input elements.
 */
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    const int lane   = threadIdx.x;
    const int gindex = blockDim.x * blockIdx.x + lane;

    // Tree reduction: the active width is halved on every pass.
    unsigned int half = blockDim.x / 2;
    while (half > 0)
    {
        if (lane < half)
        {
            d_in[gindex] = d_in[gindex] + d_in[gindex + half];
        }
        // Every add of this pass must land before the next pass reads it.
        __syncthreads();
        half >>= 1;
    }

    // Only the first thread of the block publishes the block total.
    if (lane != 0)
    {
        return;
    }
    d_out[blockIdx.x] = d_in[gindex];
}

/*
 * Per-block sum reduction staged through dynamically sized shared memory.
 *
 * Launch layout: 1D grid of 1D blocks, with blockDim.x * sizeof(float)
 * bytes of dynamic shared memory passed as the third launch argument.
 *
 *   d_out  receives one partial sum per block (d_out[blockIdx.x]).
 *   d_in   input values; read once and left untouched.
 *
 * The halving only visits every element when blockDim.x is a power of
 * two; there is no bounds check, so the grid must cover exactly the
 * number of input elements.
 */
__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // Backed by the byte count given at launch: <<<blocks, threads, bytes>>>.
    extern __shared__ float sdata[];

    const int lane = threadIdx.x;

    // Stage this block's slice of the input in shared memory.
    sdata[lane] = d_in[blockDim.x * blockIdx.x + lane];
    __syncthreads();    // the whole tile must be loaded before anyone reads it

    // Tree reduction: the active width is halved on every pass.
    unsigned int half = blockDim.x / 2;
    while (half > 0)
    {
        if (lane < half)
        {
            sdata[lane] = sdata[lane] + sdata[lane + half];
        }
        // Every add of this pass must land before the next pass reads it.
        __syncthreads();
        half >>= 1;
    }

    // Only the first thread of the block publishes the block total.
    if (lane != 0)
    {
        return;
    }
    d_out[blockIdx.x] = sdata[0];
}

/*
 * Sum `size` floats on the device in two kernel passes.
 *
 * Pass 1 launches size/1024 blocks of 1024 threads and writes one partial
 * sum per block into d_intermediate.  Pass 2 launches a single block with
 * one thread per partial sum and writes the grand total to d_out[0].
 *
 *   d_out             device buffer, at least 1 float.
 *   d_intermediate    device buffer, at least size/1024 floats.  It is
 *                     overwritten, and is also used as scratch by pass 2
 *                     when usesSharedMemory is false.
 *   d_in              device buffer with `size` floats.  When
 *                     usesSharedMemory is false the global-memory kernel
 *                     reduces in place and overwrites it.
 *   size              element count.  Must be a positive multiple of 1024
 *                     with size/1024 a power of two no larger than 1024:
 *                     pass 2 uses size/1024 threads in one block, and the
 *                     halving reduction drops elements otherwise.
 *   usesSharedMemory  selects shmem_reduce_kernel (true) or
 *                     global_reduce_kernel (false).
 *
 * The kernels are launched asynchronously on the default stream.  If the
 * size is unsupported or a launch is rejected, a message is printed to
 * stderr and the function returns without producing a result.
 */
void reduce(float * d_out, float * d_intermediate, float * d_in, 
            int size, bool usesSharedMemory)
{
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock;

    // Enforce the preconditions instead of silently computing a wrong sum
    // (non power-of-two block count) or issuing an invalid launch (0 blocks).
    if (size <= 0 || size % maxThreadsPerBlock != 0 ||
        blocks > maxThreadsPerBlock || (blocks & (blocks - 1)) != 0)
    {
        fprintf(stderr,
                "error: reduce: unsupported size %d (need a multiple of %d "
                "whose block count is a power of two <= %d)\n",
                size, maxThreadsPerBlock, maxThreadsPerBlock);
        return;
    }

    // Pass 1: one partial sum per block.
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
    }
    cudaError_t status = cudaGetLastError();    // launch-configuration errors
    if (status != cudaSuccess)
    {
        fprintf(stderr, "error: reduce: first pass launch failed: %s\n",
                cudaGetErrorString(status));
        return;
    }

    // Pass 2: a single block folds the per-block partial sums.
    threads = blocks; // launch one thread for each block in prev step
    blocks = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
    }
    status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "error: reduce: second pass launch failed: %s\n",
                cudaGetErrorString(status));
    }
}

/* Print a readable message and terminate when a CUDA runtime call fails. */
static void reduceCheckCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "error: %s failed: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Benchmark driver for the two reduce variants.
 *
 * Usage: prog [whichKernel]
 *   0 (default)  time the global-memory reduction
 *   1            time the shared-memory reduction
 *
 * Fills 2^20 floats with pseudo-random values in [-1, 1], runs reduce()
 * 100 times and prints the average time per run as measured with CUDA
 * events (milliseconds).
 */
int main(int argc, char **argv)
{
    // Initialised so a failing query cannot leave it indeterminate.
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    reduceCheckCuda(cudaSetDevice(dev), "cudaSetDevice");

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == cudaSuccess)
    {
        printf("Using device %d:\n", dev);
        // totalGlobalMem is a size_t; casting it to int overflows on any
        // device with 2 GiB or more of memory.
        printf("%s; global mem: %zuB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, devProps.totalGlobalMem, 
               (int)devProps.major, (int)devProps.minor, 
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE = 1 << 20;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    const int NUM_TRIALS = 100;

    // generate the input array on the host
    // (4 MiB: heap-allocated, because that is too large to place safely on
    // the stack)
    float *h_in = (float *)malloc(ARRAY_BYTES);
    if (h_in == NULL) {
        fprintf(stderr, "error: could not allocate host input array\n");
        exit(EXIT_FAILURE);
    }
    // Host-side reference sum.  NOTE: this program does not report it.
    float sum = 0.0f;
    for(int i = 0; i < ARRAY_SIZE; i++) {
        // generate random float in [-1.0f, 1.0f]
        // rand() is the generator whose range RAND_MAX actually describes.
        h_in[i] = -1.0f + (float)rand()/((float)RAND_MAX/2.0f);
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in, * d_intermediate, * d_out;

    // allocate GPU memory
    reduceCheckCuda(cudaMalloc((void **) &d_in, ARRAY_BYTES),
                    "cudaMalloc(d_in)");
    reduceCheckCuda(cudaMalloc((void **) &d_intermediate, ARRAY_BYTES),
                    "cudaMalloc(d_intermediate)"); // overallocated
    reduceCheckCuda(cudaMalloc((void **) &d_out, sizeof(float)),
                    "cudaMalloc(d_out)");

    // transfer the input array to the GPU
    reduceCheckCuda(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
                    "cudaMemcpy(host to device)");

    int whichKernel = 0;
    if (argc == 2) {
        whichKernel = atoi(argv[1]);
    }

    // Pick the variant up front so the timing loop exists only once.
    bool useSharedMemory = false;
    switch(whichKernel) {
    case 0:
        printf("Running global reduce\n");
        useSharedMemory = false;
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        useSharedMemory = true;
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }

    cudaEvent_t start, stop;
    reduceCheckCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    reduceCheckCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // launch the kernel
    // NOTE: the global-memory kernel reduces in place and overwrites d_in,
    // so with whichKernel == 0 every trial after the first works on already
    // reduced data and the final d_out is not the sum of the original input.
    reduceCheckCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    for (int i = 0; i < NUM_TRIALS; i++)
    {
        reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, useSharedMemory);
    }
    reduceCheckCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");

    // Waiting on the stop event also surfaces faults raised while the
    // kernels were executing.
    reduceCheckCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
    float elapsedTime;
    reduceCheckCuda(cudaEventElapsedTime(&elapsedTime, start, stop),
                    "cudaEventElapsedTime");
    elapsedTime /= (float)NUM_TRIALS;

    // copy back the sum from GPU (NOTE: fetched but not reported)
    float h_out;
    reduceCheckCuda(cudaMemcpy(&h_out, d_out, sizeof(float),
                               cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");

    printf("average time elapsed: %f\n", elapsedTime);

    // release events, GPU memory and the host buffer
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);
    free(h_in);

    return 0;
}

Scan

Scan 算法便是通过控制每次迭代的步长，从而达到并行求和，原理如下图所示：
在这里插入图片描述
即，第一次迭代的求和步长是 1，第二次是 2，第三次是 4……每次迭代步长翻倍。

实现代码如下（也是通过两种访存方式实现）：

#include <stdio.h>

/*
 * Inclusive prefix sum of one block's worth of floats, computed in place
 * in global memory: on return d_out[i] = d_in[0] + ... + d_in[i].
 *
 * Launch layout: exactly ONE 1D block with one thread per element
 * (<<<1, n>>>).  Elements are indexed by threadIdx.x only, so extra
 * blocks would all work on the same elements.
 *
 *   d_out  result buffer, blockDim.x floats.
 *   d_in   input buffer, blockDim.x floats (only read).
 *
 * On each pass every element adds the value `stride` positions to its
 * left, and the stride doubles until it reaches the block size.
 */
__global__ void global_scan(float* d_out,float* d_in){
  const int idx = threadIdx.x;
  // The element count is the block size.  (sizeof(d_in) would be the size
  // of a pointer and has nothing to do with the array length.)
  const int count = (int)blockDim.x;

  d_out[idx] = d_in[idx];
  __syncthreads();

  for(int stride=1;stride<count;stride*=2){
    const bool active = (idx >= stride);
    float partial = 0.0f;
    if(active){
      partial = d_out[idx]+d_out[idx-stride];
    }
    // All reads of this pass must finish before any element is overwritten.
    __syncthreads();
    if(active){
      d_out[idx] = partial;
    }
    // All writes of this pass must be visible before the next pass reads.
    __syncthreads();
  }
}

/*
 * Inclusive prefix sum performed in shared memory, one independent scan
 * per block: thread t of block b ends up writing
 * d_in[b*blockDim.x] + ... + d_in[b*blockDim.x + t] to
 * d_out[b*blockDim.x + t].
 *
 * Launch layout: 1D grid of 1D blocks with at most 1024 threads per block
 * (the size of the static shared tile); no dynamic shared memory is
 * needed.  There is no bounds check, so the grid must cover exactly the
 * number of elements.
 *
 *   d_out  result buffer, gridDim.x * blockDim.x floats.
 *   d_in   input buffer of the same size (only read).
 *
 * On each pass every element adds the value `stride` positions to its
 * left, and the stride doubles until it reaches the block size.
 */
__global__ void shared_scan(float *d_out, float *d_in)
{
    // One slot per thread; 1024 matches the largest block this kernel accepts.
    __shared__ float sdata[1024];

    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int myid = threadIdx.x;
    // The element count is the block size.  (sizeof(sdata) would be a byte
    // count, not an element count.)
    const int count = (int)blockDim.x;

    // Stage this block's slice of the input in shared memory.
    sdata[myid] = d_in[idx];
    __syncthreads();

    for (int stride = 1; stride < count; stride *= 2)
    {
        const bool active = (myid >= stride);
        float partial = 0.0f;
        if (active)
        {
            partial = sdata[myid] + sdata[myid - stride];
        }
        // All reads of this pass must finish before any slot is overwritten.
        __syncthreads();

        if (active)
        {
            sdata[myid] = partial;
        }
        // All writes of this pass must be visible before the next pass reads.
        __syncthreads();
    }

    // Publish the scanned tile.
    d_out[idx] = sdata[myid];
}
/* Report a failed CUDA runtime call on stderr; returns true on success. */
static bool scanCudaOk(cudaError_t status, const char* what){
  if(status == cudaSuccess){
    return true;
  }
  fprintf(stderr,"error: %s failed: %s\n",what,cudaGetErrorString(status));
  return false;
}

/*
 * Scan demo driver: uploads the values 0..7, runs shared_scan on a single
 * block of 8 threads and prints the 8 results, four per line.
 * Returns 0 on success and 1 if any CUDA call fails (nothing is printed
 * to stdout in that case).
 */
int main(int argc,char** argv){
  const int ARRAY_SIZE = 8;
  const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

  // generate the input array on the host
  float h_in[ARRAY_SIZE];
  for(int i=0;i<ARRAY_SIZE;i++){
    h_in[i] = float(i);
  }
  float h_out[ARRAY_SIZE];

  // declare GPU memory pointers
  // (NULL-initialised so the cleanup below is safe after a failed allocation)
  float* d_in = NULL;
  float* d_out = NULL;

  // allocate GPU memory and transfer the array to GPU
  bool ok = scanCudaOk(cudaMalloc((void**) &d_in,ARRAY_BYTES),"cudaMalloc(d_in)")
         && scanCudaOk(cudaMalloc((void**) &d_out,ARRAY_BYTES),"cudaMalloc(d_out)")
         && scanCudaOk(cudaMemcpy(d_in,h_in,ARRAY_BYTES,cudaMemcpyHostToDevice),
                       "cudaMemcpy(host to device)");

  if(ok){
    // launch the kernel
    //global_scan<<<1,ARRAY_SIZE>>>(d_out,d_in);
    shared_scan<<<1, ARRAY_SIZE>>>(d_out, d_in);

    // A rejected launch is only visible through cudaGetLastError(); faults
    // raised while the kernel runs surface in the blocking copy below.
    // copy back the result array to the host
    ok = scanCudaOk(cudaGetLastError(),"shared_scan launch")
      && scanCudaOk(cudaMemcpy(h_out,d_out,ARRAY_BYTES,cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
  }

  if(ok){
    // print out the resulting array
    for(int i=0;i<ARRAY_SIZE;i++){
      printf("%f",h_out[i]);
      printf(((i%4) != 3) ? "\t" : "\n");
    }
  }

  // free GPU memory allocation (cudaFree accepts a null pointer)
  cudaFree(d_in);
  cudaFree(d_out);

  return ok ? 0 : 1;
}

wangbowj123

关注

5
点赞
踩
10

收藏

觉得还不错? 一键收藏
打赏
0
评论
CUDA 并行加速基础之 Reduce 和 Scan 的实现

前言我们知道硬件拥有其独特的并行性，为了发挥这一特色。我们要将平时串行执行的程序用并行性算法重新改写才能充分发挥 GPU 的优势。实例：做求和：1+2+3+4+···为了做这样一个累加和的加速，有两种简单的实现方法，分别是 Reduce 进行归约（二分），或者是用 Scan 通过控制步长进行扫描求和。Reduce如上图所示为了并行执行累加，我们要构造出一些线程，每个线程并行工作，从...
复制链接

扫一扫