0.引言
1.code
#include <stdio.h>
// Hillis-Steele inclusive prefix sum computed in place in global memory.
// Expects a single-block launch with blockDim.x == number of elements;
// on return d_out[i] == d_in[0] + ... + d_in[i].
__global__ void global_scan(float* d_out, float* d_in){
    int idx = threadIdx.x;
    float out = 0.00f;
    d_out[idx] = d_in[idx];
    __syncthreads();
    // Bound the doubling stride by the element count (blockDim.x).
    // The original used sizeof(d_in), which is the size of a POINTER
    // (8 bytes) and only coincidentally matched the 8-element array.
    for(int stride = 1; stride < (int)blockDim.x; stride *= 2){
        if(idx - stride >= 0){
            out = d_out[idx] + d_out[idx - stride];
        }
        // All reads of this round must finish before anyone writes.
        __syncthreads();
        if(idx - stride >= 0){
            d_out[idx] = out;
            out = 0.00f;
        }
        // Make this round's writes visible before the next round reads.
        // (Missing in the original: a write/read race across iterations.)
        __syncthreads();
    }
}
// Hillis-Steele inclusive prefix sum staged through shared memory.
// Expects a single-block launch with blockDim.x == number of elements
// and blockDim.x <= kMaxBlock; on return d_out[i] == sum of d_in[0..i].
__global__ void shmem_scan(float* d_out, float* d_in){
    // Static shared memory needs a compile-time size. 1024 is the maximum
    // 1-D block size, so any legal launch fits (4 KB of shared memory).
    // The original sized this with sizeof(d_in) -- the size of a pointer,
    // 8 bytes -- which only happened to match the 8-element test array.
    const int kMaxBlock = 1024;
    __shared__ float sh_arr[kMaxBlock];
    int idx = threadIdx.x;
    float out = 0.00f;
    // Each thread copies one element from global to shared memory.
    sh_arr[idx] = d_in[idx];
    // Barrier: synchronizes the threads of this block only; ensures all
    // writes to shared memory have completed before anyone reads.
    __syncthreads();
    for(int stride = 1; stride < (int)blockDim.x; stride *= 2){
        if(idx - stride >= 0){
            out = sh_arr[idx] + sh_arr[idx - stride];
        }
        __syncthreads();
        if(idx - stride >= 0){
            sh_arr[idx] = out;
            out = 0.00f;
        }
        __syncthreads();
    }
    // Write the full result once at the end. The original only stored
    // d_out inside the (idx - stride >= 0) branch, so thread 0 never
    // wrote d_out[0] and the first output element was left uninitialized.
    d_out[idx] = sh_arr[idx];
}
// Host driver: builds [0,1,...,7], runs the shared-memory scan on one
// block of 8 threads, and prints the inclusive prefix sums.
int main(int argc, char** argv){
    const int ARRAY_SIZE = 8;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    for(int i = 0; i < ARRAY_SIZE; i++){
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];
    // declare GPU memory pointers
    float* d_in = NULL;
    float* d_out = NULL;
    // allocate GPU memory -- every CUDA API call returns a status that
    // must be checked; the original silently ignored all of them.
    cudaError_t err;
    err = cudaMalloc((void**) &d_in, ARRAY_BYTES);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMalloc d_in failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    err = cudaMalloc((void**) &d_out, ARRAY_BYTES);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMalloc d_out failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        return 1;
    }
    // transfer the input array to the GPU
    err = cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMemcpy H2D failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        cudaFree(d_out);
        return 1;
    }
    // launch the kernel: one block, one thread per element
    shmem_scan<<<1, ARRAY_SIZE>>>(d_out, d_in);
    //global_scan<<<1,ARRAY_SIZE>>>(d_out,d_in);
    // a kernel launch reports configuration errors only via cudaGetLastError
    err = cudaGetLastError();
    if(err != cudaSuccess){
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        cudaFree(d_out);
        return 1;
    }
    // copy the result array back to the host (cudaMemcpy is blocking,
    // so it also synchronizes with the kernel)
    err = cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMemcpy D2H failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_in);
        cudaFree(d_out);
        return 1;
    }
    // print out the resulting array, four values per line
    for(int i = 0; i < ARRAY_SIZE; i++){
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }
    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
解析:
2.result
程序输出的是逐元素的前缀和(inclusive scan): 第 i 位等于输入前 i+1 个元素之和, 因此最后一位就是全部元素相加的总和.