CUDA: Reduction
Introduction
Operations over an array, such as computing the sum, the maximum/minimum, or the product, are usually implemented in a single-core program as a for loop that visits every element in turn and accumulates the final result. In a parallel program, however, carrying this serial approach over directly requires atomic operations and performs poorly. Using a merge-style (tree) reduction instead, threads combine partial results in pairs, which exploits the available parallelism much more effectively. In this article we use the sum of all array elements as the example:
\begin{equation}
s = \sum_i^N A[i] \quad\text{where}\quad N \ge 1
\end{equation}
All tests in this article were run on an NVIDIA Tesla V100, with a test array of length $2^{24} = 16777216$ (64 MB of float data). To keep the test environment uniform, all kernels are placed in the same source file.
Code implementation
Atomic operations
__global__ void kernel1(float* arr, int nElem, float* result)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
if(gid < nElem)
{
atomicAdd(result, arr[gid]); // atomic add: every thread accumulates into the same global result
}
}
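Because every thread atomically accumulates into the same single float, result has to be zeroed before the launch. A possible invocation (a sketch; d_arr, d_result and BLOCK_SIZE come from the assumed setup above):

cudaMemset(d_result, 0, sizeof(float));                 // the accumulator must start at 0
kernel1<<<(nElem + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(d_arr, nElem, d_result);
float sum = 0.0f;
cudaMemcpy(&sum, d_result, sizeof(float), cudaMemcpyDeviceToHost);   // single-float result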
Reduction
Reduction: neighbored pairing
__global__ void kernel2(float* arr, int nElem, float* result)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float tmp[BLOCK_SIZE];
if(gid < nElem)
{
tmp[threadIdx.x] = arr[gid];
}
else
{
tmp[threadIdx.x] = 0.0f;
}
__syncthreads();
for(int strip = 1; strip < blockDim.x; strip = strip*2)
{
if(threadIdx.x % (2*strip) == 0)
tmp[threadIdx.x] += tmp[threadIdx.x + strip];
__syncthreads();
}
if(threadIdx.x == 0)
result[blockIdx.x] = tmp[0];
}
Reduction: interleaved pairing
__global__ void kernel3(float* arr, int nElem, float* result)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float tmp[BLOCK_SIZE];
if(gid < nElem)
{
tmp[threadIdx.x] = arr[gid];
}
else
{
tmp[threadIdx.x] = 0.0f;
}
__syncthreads();
for(int strip = blockDim.x / 2; strip > 0; strip = strip / 2)
{
if(threadIdx.x < strip)
tmp[threadIdx.x] += tmp[threadIdx.x + strip];
__syncthreads();
}
if(threadIdx.x == 0)
{
result[blockIdx.x] = tmp[0];
}
}
Reduction: processing two blocks of data
__global__ void kernel4(float* arr, int nElem, float* result)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int gid2 = gid + gridDim.x * blockDim.x;
__shared__ float tmp[BLOCK_SIZE];
tmp[threadIdx.x] = 0.0f;
if(gid < nElem && gid2 < nElem)
{
tmp[threadIdx.x] = arr[gid] + arr[gid2];
}
if(gid < nElem && gid2 >= nElem)
{
tmp[threadIdx.x] = arr[gid];
}
__syncthreads();
for(int strip = blockDim.x / 2; strip > 0; strip = strip / 2)
{
if(threadIdx.x < strip)
tmp[threadIdx.x] += tmp[threadIdx.x + strip];
__syncthreads();
}
if(threadIdx.x == 0)
{
result[blockIdx.x] = tmp[0];
}
}
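Since each thread of kernel4 (and kernel5) loads two elements, one from each half of the covered range, the kernel is meant to be launched with half as many blocks as kernel2/kernel3, and result only needs one slot per block. A possible launch (a sketch with assumed names):

int gridHalf = (nElem + 2 * BLOCK_SIZE - 1) / (2 * BLOCK_SIZE);   // each block now covers 2*BLOCK_SIZE elements
kernel4<<<gridHalf, BLOCK_SIZE>>>(d_arr, nElem, d_result);        // d_result needs gridHalf floats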
Reduction: loop unrolling
__global__ void kernel5(float* arr, int nElem, float* result)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int gid2 = gid + gridDim.x * blockDim.x;
__shared__ float tmp[BLOCK_SIZE];
tmp[threadIdx.x] = 0.0f;
if(gid < nElem && gid2 < nElem)
{
tmp[threadIdx.x] = arr[gid] + arr[gid2];
}
if(gid < nElem && gid2 >= nElem)
{
tmp[threadIdx.x] = arr[gid];
}
__syncthreads();
if(blockDim.x == 1024)
{
if(threadIdx.x >= 512)
return;
tmp[threadIdx.x] += tmp[threadIdx.x + 512];
}
__syncthreads();
if(blockDim.x >= 512)
{
if(threadIdx.x >= 256)
return;
tmp[threadIdx.x] += tmp[threadIdx.x + 256];
}
__syncthreads();
if(blockDim.x >= 256)
{
if(threadIdx.x >= 128)
return;
tmp[threadIdx.x] += tmp[threadIdx.x + 128];
}
__syncthreads();
if(blockDim.x >= 128)
{
if(threadIdx.x >= 64)
return;
tmp[threadIdx.x] += tmp[threadIdx.x + 64];
}
__syncthreads();
if(blockDim.x >= 64)
{
if(threadIdx.x >= 32)
return;
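// only the first warp (threads 0-31) is still active past this point; the remaining additions are fully unrolled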
tmp[threadIdx.x] += tmp[threadIdx.x + 32];
__syncthreads();
tmp[threadIdx.x] += tmp[threadIdx.x + 16];
__syncthreads();
tmp[threadIdx.x] += tmp[threadIdx.x + 8];
__syncthreads();
tmp[threadIdx.x] += tmp[threadIdx.x + 4];
__syncthreads();
tmp[threadIdx.x] += tmp[threadIdx.x + 2];
__syncthreads();
tmp[threadIdx.x] += tmp[threadIdx.x + 1];
}
if(threadIdx.x == 0)
{
result[blockIdx.x] = tmp[0];
}
}
Warp shuffle
__global__ void kernel6(float* arr, int nElem, float* result)
{
__shared__ float tmp[32];
int gid = blockIdx.x * blockDim.x + threadIdx.x;
float data = 0.0f;
if(gid < nElem)
{
data = arr[gid];
}
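// first stage: warp-level reduction with shuffles; after these five steps lane 0 of each warp holds that warp's partial sum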
data += __shfl_down_sync(0xffffffff, data, 16);
data += __shfl_down_sync(0xffffffff, data, 8);
data += __shfl_down_sync(0xffffffff, data, 4);
data += __shfl_down_sync(0xffffffff, data, 2);
data += __shfl_down_sync(0xffffffff, data, 1);
if(threadIdx.x % 32 == 0)
tmp[threadIdx.x / 32] = data;
__syncthreads();
if(threadIdx.x >= 32)
return;
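// second stage: the first warp reduces the per-warp partial sums;
// this assumes blockDim.x == 1024 so that all 32 entries of tmp were written, otherwise tmp would need zero-initialization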
data = tmp[threadIdx.x];
data += __shfl_down_sync(0xffffffff, data, 16);
data += __shfl_down_sync(0xffffffff, data, 8);
data += __shfl_down_sync(0xffffffff, data, 4);
data += __shfl_down_sync(0xffffffff, data, 2);
data += __shfl_down_sync(0xffffffff, data, 1);
if(threadIdx.x == 0)
result[blockIdx.x] = data;
}
Performance analysis
[mmhe@k057 20221012]$ nvprof ./test
==15379== NVPROF is profiling process 15379, command: ./test
nElem = 16777216: sum = -8.364376e+06 theory = -8.388608e+06
nElem = 16777216: sum = -8.388608e+06 theory = -8.388608e+06
nElem = 16777216: sum = -8.388608e+06 theory = -8.388608e+06
nElem = 16777216: sum = -8.388608e+06 theory = -8.388608e+06
nElem = 16777216: sum = -8.388608e+06 theory = -8.388608e+06
nElem = 16777216: sum = -8.388608e+06 theory = -8.388608e+06
==15379== Profiling application: ./test
==15379== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 73.42% 43.314ms 1 43.314ms 43.314ms 43.314ms kernel1(float*, int, float*)
21.22% 12.517ms 1 12.517ms 12.517ms 12.517ms [CUDA memcpy HtoD]
2.65% 1.5608ms 1 1.5608ms 1.5608ms 1.5608ms kernel2(float*, int, float*)
1.06% 623.65us 1 623.65us 623.65us 623.65us kernel3(float*, int, float*)
0.62% 364.23us 1 364.23us 364.23us 364.23us kernel6(float*, int, float*)
0.60% 354.69us 1 354.69us 354.69us 354.69us kernel4(float*, int, float*)
0.39% 228.77us 1 228.77us 228.77us 228.77us kernel5(float*, int, float*)
0.06% 34.689us 6 5.7810us 1.6640us 7.2960us [CUDA memcpy DtoH]
API calls: 88.80% 594.95ms 3 198.32ms 386.71us 594.00ms cudaMalloc
8.85% 59.294ms 7 8.4706ms 261.03us 43.330ms cudaMemcpy
1.32% 8.8224ms 576 15.316us 310ns 610.01us cuDeviceGetAttribute
0.89% 5.9854ms 6 997.57us 988.40us 1.0013ms cuDeviceTotalMem
0.11% 756.00us 6 126.00us 120.92us 134.45us cuDeviceGetName
0.03% 172.00us 6 28.666us 15.323us 79.021us cudaLaunchKernel
0.00% 20.879us 6 3.4790us 2.7580us 5.0700us cuDeviceGetPCIBusId
0.00% 8.4910us 12 707ns 390ns 1.6300us cuDeviceGet
0.00% 3.1300us 3 1.0430us 333ns 1.7900us cuDeviceGetCount
0.00% 2.8750us 6 479ns 427ns 585ns cuDeviceGetUuid
Judging purely by runtime, kernel1 >> kernel2 > kernel3 > kernel6 > kernel4 > kernel5. Below we analyze the sources of the performance improvements one by one.
Reduction
kernel2 vs. kernel1
Compared with the atomic-operation kernel, reduction improves performance dramatically: even the most naive implementation gives almost a 30x speed-up. Note, however, that the reduction is only carried out within each block, so the per-block partial sums still have to be transferred to the host for the final summation. The actual amount of work done on the GPU is therefore slightly smaller than the theoretical total, but since the remaining host-side work is only $\frac{1}{1024}$ of the data, it can be ignored.
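That host-side finish might look like this (a sketch; gridSize is the number of blocks used by the kernel launch and is an assumed name):

float* h_partial = (float*)malloc(gridSize * sizeof(float));
cudaMemcpy(h_partial, d_result, gridSize * sizeof(float), cudaMemcpyDeviceToHost);
float sum = 0.0f;
for (int b = 0; b < gridSize; ++b)     // sum the per-block partial results on the host
    sum += h_partial[b];
free(h_partial);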
This result makes it clear that the compute and memory utilization of the atomic-operation kernel are both very low. The threads within each block have to execute the atomic add serially, so a large fraction of the time is spent waiting, and as one would expect the warp efficiency is correspondingly poor.
We can see that kernel1's warp schedulers have almost no eligible warps to issue on any given cycle. The underlying causes are a large number of Stall LG Throttle and Stall Drain events, together with a noticeable amount of Stall Long Scoreboard.
According to the official documentation:
A high Stall LG Throttle value is caused by very frequent accesses to the local/global memory queue; here it may be related to the locking of the single atomically updated address, although the exact cause is not yet clear. A high Stall Drain value means that memory writes are being drained serially. A high Stall Long Scoreboard value means that an instruction must wait for data, such as a global-memory load, to arrive before the dependent instruction can issue; normally enough resident warps can hide this latency, but as shown above this kernel has very few eligible warps, so the problem becomes especially prominent.
kernel3 vs. kernel2
The neighbored pairing of kernel2 makes the number of active threads within each warp shrink as the iterations proceed, which causes severe warp divergence and, in turn, inefficient shared-memory access.
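To make the divergence argument concrete, the two activity predicates can be compared directly: with neighbored pairing the active threads of an iteration are spread across every warp, while with interleaved pairing they are packed at the front of the block. The small host-side check below (not from the original post; it only mirrors the conditions used in kernel2 and kernel3) counts, for a 1024-thread block, how many warps are partially active, i.e. divergent, in each iteration:

#include <cstdio>

int main()
{
    const int nThreads = 1024;   // threads per block, matching BLOCK_SIZE
    const int warp     = 32;     // threads per warp

    // Neighbored pairing (kernel2): a thread is active when tid % (2*strip) == 0.
    printf("neighbored pairing:\n");
    for (int strip = 1; strip < nThreads; strip *= 2)
    {
        int divergent = 0;
        for (int w = 0; w < nThreads / warp; ++w)
        {
            int active = 0;
            for (int lane = 0; lane < warp; ++lane)
                if ((w * warp + lane) % (2 * strip) == 0) ++active;
            if (active > 0 && active < warp) ++divergent;   // partially active warp => divergence
        }
        printf("  strip=%4d  divergent warps=%2d\n", strip, divergent);
    }

    // Interleaved pairing (kernel3): a thread is active when tid < strip.
    printf("interleaved pairing:\n");
    for (int strip = nThreads / 2; strip > 0; strip /= 2)
    {
        int divergent = 0;
        for (int w = 0; w < nThreads / warp; ++w)
        {
            int active = 0;
            for (int lane = 0; lane < warp; ++lane)
                if (w * warp + lane < strip) ++active;
            if (active > 0 && active < warp) ++divergent;
        }
        printf("  strip=%4d  divergent warps=%2d\n", strip, divergent);
    }
    return 0;
}

Running it shows that with neighbored pairing every warp is partially active during the first five iterations, whereas with interleaved pairing no warp diverges until strip drops below the warp size, which is consistent with kernel3 being noticeably faster than kernel2.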