CUDA 并行归约求和
https://www.cnblogs.com/viviman/archive/2012/11/21/2780286.html
https://blog.csdn.net/abcjennifer/article/details/43528407
归约求和
归约求和的核函数代码如下（每个线程块把 A 中连续的 128 个元素求和，并将结果写入 B[blockIdx.x]）：
/**
 * RowSum - per-block sum reduction over fixed-length rows.
 *
 * Block `b` sums the 128 consecutive floats A[b*128 .. b*128 + 127] and
 * stores the total in B[b].
 *
 * Launch layout: 1D grid with one block per row, 1D block. The canonical
 * launch is RowSum<<<numRows, 128>>>(A, B); any blockDim.x >= 1 produces the
 * same result because the load and every reduction step are strided by
 * blockDim.x.
 * Shared memory: 128 floats (512 bytes), statically allocated.
 *
 * @param A  device pointer to the input, at least gridDim.x * 128 floats
 * @param B  device pointer to the output, at least gridDim.x floats
 */
__global__ void RowSum(float* A, float* B){
    // Row length handled by one block; must be a power of two so that the
    // halving loop below folds every element exactly once.
    constexpr int kRowLen = 128;

    __shared__ float s_data[kRowLen];

    const int tid = threadIdx.x;
    // Computed in size_t so that large grids cannot overflow a 32-bit int.
    const size_t rowBase = static_cast<size_t>(blockIdx.x) * kRowLen;

    // Stage the row in shared memory. Striding by blockDim.x keeps every
    // slot initialised and in bounds even if the block is not exactly
    // kRowLen threads wide.
    for (int j = tid; j < kRowLen; j += blockDim.x) {
        s_data[j] = A[rowBase + j];
    }
    __syncthreads();  // all loads must be visible before the first fold

    // Tree reduction: each step adds the upper half of the active range
    // onto the lower half, halving the range until s_data[0] holds the sum.
    for (int stride = kRowLen / 2; stride > 0; stride >>= 1) {
        for (int j = tid; j < stride; j += blockDim.x) {
            s_data[j] += s_data[j + stride];
        }
        // The barrier sits outside the per-thread work so that every thread
        // in the block reaches it on every step.
        __syncthreads();
    }

    // A single thread publishes the block's result.
    if (tid == 0) {
        B[blockIdx.x] = s_data[0];
    }
}