在CUDA并行计算中,求最大值与求和的原理相同,都属于归约(reduction)算法。这里求的是复数模长的最大值;注意核函数写入共享内存并参与比较的是模长的平方(x*x + y*y)。
核函数如下:
__global__ void array_max(const Complex* __restrict__ in, float* __restrict__ blockMax, const int in_len)
{
extern __shared__ float sha_partialMax2[];
const int tid = threadIdx.x;
const int gTid = blockIdx.x * (blockDim.x * 2) + tid;
// 复制全局数据到共享内存
Complex in0, in1;
in0 = in[gTid];
sha_partialMax2[tid] = in0.x*in0.x + in0.y*in0.y;
in1 = in[gTid + blockDim.x];
sha_partialMax2[tid + blockDim.x] = in1.x*in1.x + in1.y*in1.y;
// block内前半部分与后半部分对应比较大小
if (blockDim.x > 512)
{
__syncthreads();
if (sha_partialMax2[tid] < sha_partialMax2[tid + 1024])
sha_partialMax2[tid] = sha_partialMax2[tid + 1024];
}
if (blockDim.x > 256 && tid < 512)
{
__syncthreads();
if (sha_partialMax2[tid] < sha_partialMax2[tid + 512])
sha_partialMax2[tid] = sha_partialMax2