0.引言
1.code
#include <stdio.h>
// Hillis-Steele inclusive prefix sum computed in place in global memory.
// Expects a single-block launch with blockDim.x == number of elements;
// on return d_out[i] == d_in[0] + ... + d_in[i].
__global__ void global_scan(float* d_out, float* d_in){
    int idx = threadIdx.x;
    float out = 0.00f;
    d_out[idx] = d_in[idx];
    __syncthreads();
    // Bound the doubling stride by the element count (blockDim.x).
    // The original used sizeof(d_in), which is the size of a POINTER
    // (8 bytes) and only coincidentally matched the 8-element array.
    for(int stride = 1; stride < (int)blockDim.x; stride *= 2){
        if(idx - stride >= 0){
            out = d_out[idx] + d_out[idx - stride];
        }
        // All reads of this round must finish before anyone writes.
        __syncthreads();
        if(idx - stride >= 0){
            d_out[idx] = out;
            out = 0.00f;
        }
        // Make this round's writes visible before the next round reads.
        // (Missing in the original: a write/read race across iterations.)
        __syncthreads();
    }
}
// Hillis-Steele inclusive prefix sum staged through shared memory.
// Expects a single-block launch with blockDim.x == number of elements
// and blockDim.x <= kMaxBlock; on return d_out[i] == sum of d_in[0..i].
__global__ void shmem_scan(float* d_out, float* d_in){
    // Static shared memory needs a compile-time size. 1024 is the maximum
    // 1-D block size, so any legal launch fits (4 KB of shared memory).
    // The original sized this with sizeof(d_in) -- the size of a pointer,
    // 8 bytes -- which only happened to match the 8-element test array.
    const int kMaxBlock = 1024;
    __shared__ float sh_arr[kMaxBlock];
    int idx = threadIdx.x;
    float out = 0.00f;
    // Each thread copies one element from global to shared memory.
    sh_arr[idx] = d_in[idx];
    // Barrier: synchronizes the threads of this block only; ensures all
    // writes to shared memory have completed before anyone reads.
    __syncthreads();
    for(int stride = 1; stride < (int)blockDim.x; stride *= 2){
        if(idx - stride >= 0){
            out = sh_arr[idx] + sh_arr[idx - stride];
        }
        __syncthreads();
        if(idx - stride >= 0){
            sh_arr[idx] = out;
            out = 0.00f;
        }
        __syncthreads();
    }
    // Write the full result once at the end. The original only stored
    // d_out inside the (idx - stride >= 0) branch, so thread 0 never
    // wrote d_out[0] and the first output element was left uninitialized.
    d_out[idx] = sh_arr[idx];
}
// Host driver: builds [0,1,...,7], runs the shared-memory scan on one
// block of 8 threads, and prints the inclusive prefix sums.
int main(int argc, char** argv){
    const int ARRAY_SIZE = 8;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    for(int i = 0; i < ARRAY_SIZE; i++){
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];
    // declare GPU memory pointers
    float* d_in = NULL;
    float* d_out = NULL;
    // allocate GPU memory -- every CUDA API call returns a status that
    // must be checked; the original silently ignored all of them.
    cudaError_t err;
    err = cudaMalloc((void**) &d_in, ARRAY_BYTES);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMalloc d_in failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    err = cudaMalloc((void**) &d_out, ARRAY_BYTES);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMalloc d_out failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        return 1;
    }
    // transfer the input array to the GPU
    err = cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMemcpy H2D failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        cudaFree(d_out);
        return 1;
    }
    // launch the kernel: one block, one thread per element
    shmem_scan<<<1, ARRAY_SIZE>>>(d_out, d_in);
    //global_scan<<<1,ARRAY_SIZE>>>(d_out,d_in);
    // a kernel launch reports configuration errors only via cudaGetLastError
    err = cudaGetLastError();
    if(err != cudaSuccess){
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        cudaFree(d_out);
        return 1;
    }
    // copy the result array back to the host (cudaMemcpy is blocking,
    // so it also synchronizes with the kernel)
    err = cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMemcpy D2H failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        cudaFree(d_out);
        return 1;
    }
    // print out the resulting array, four values per line
    for(int i = 0; i < ARRAY_SIZE; i++){
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }
    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
解析:
2.result
程序输出的是逐元素的前缀和(inclusive scan): 第 i 位等于输入前 i+1 个元素之和, 因此最后一位就是全部元素相加的总和.