归约问题由于计算操作的对称性,非常适合并行处理。本文以数组求和为例,通过CUDA先实现基础版本,并在基础版本之上尝试通过不同的优化手段实现几个方案,最后将所有优化手段集成到最终实现中。
其中核函数执行时间是通过Nsight System工具抓取,命令参数如下
nsys profile --stats true a.out 5
基础版本实现
核函数实现
// Baseline tree reduction: each block reduces a contiguous segment of
// 2 * blockDim.x int64_t values and writes its partial sum to
// d_sum[blockIdx.x]. A block-wide barrier separates every halving pass.
__global__ void kReduceWholeWarpSafe(int64_t* d_src, int64_t* d_sum)
{
    // Segment owned by this block: 2 * blockDim.x elements.
    int64_t* block_data = d_src + 2 * blockIdx.x * blockDim.x;

    // Halve the number of active partials each pass; the barrier is outside
    // the divergent branch so every thread in the block reaches it.
    size_t active = blockDim.x;
    while (active >= 1)
    {
        if (threadIdx.x < active)
        {
            block_data[threadIdx.x] += block_data[threadIdx.x + active];
        }
        __syncthreads();
        active /= 2;
    }

    // Thread 0 publishes the per-block result.
    if (threadIdx.x == 0)
    {
        d_sum[blockIdx.x] = block_data[0];
    }
}
执行时间
Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)
Time(%) Total Time Instances Average Minimum Maximum Name
100.0 866378 1 866378.0 866378 866378 kReduceWholeWarpSafe(long*, long*)
8倍循环展开
核函数实现
// 8x-unrolled reduction: each block consumes 8 * blockDim.x contiguous
// int64_t values. The 8 sub-segments are first folded into the leading one,
// then a standard interleaved tree reduction finishes the block.
__global__ void kUnroll8ReduceSafe(int64_t* d_src, int64_t* d_sum)
{
    // Segment owned by this block: 8 * blockDim.x elements.
    int64_t* seg = d_src + blockIdx.x * blockDim.x * 8;

    // Fold sub-segments 1..7 into sub-segment 0. Each thread reads and
    // writes only its own slot, so no barrier is needed inside the loop.
    for (int part = 1; part < 8; ++part)
    {
        seg[threadIdx.x] += seg[threadIdx.x + part * blockDim.x];
    }
    __syncthreads();

    // Tree reduction over the remaining blockDim.x partials.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
        {
            seg[threadIdx.x] += seg[threadIdx.x + s];
        }
        __syncthreads();
    }

    // Thread 0 publishes the per-block result.
    if (threadIdx.x == 0)
    {
        d_sum[blockIdx.x] = seg[0];
    }
}
执行时间
Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)
Time(%) Total Time Instances Average Minimum Maximum Name
100.0 462661 1 462661.0 462661 462661 kUnroll8ReduceSafe(long*, long*)
减少同步
核函数实现
// Tree reduction over 2 * blockDim.x int64_t values per block; the last five
// steps run inside a single warp without block-wide barriers.
//
// Fix versus the previous revision: the warp-level pointer MUST be volatile —
// without it the compiler may cache partials in registers and other lanes
// never see the updates, yielding wrong sums. Additionally, __syncwarp()
// is inserted between the steps: on Volta+ (independent thread scheduling)
// warp lanes are not implicitly lock-stepped, so each += must be ordered
// before the next lane-crossing read. // requires SM70+ for __syncwarp
__global__ void kReduceWholeWarpSingle(int64_t* d_src, int64_t* d_sum)
{
    int64_t* d_local = d_src + 2 * blockIdx.x * blockDim.x;

    // Block-wide passes down to 64 remaining partials (last stride is 32).
    for (size_t stride = blockDim.x; stride >= 32; stride /= 2)
    {
        if (threadIdx.x < stride)
        {
            d_local[threadIdx.x] += d_local[threadIdx.x + stride];
        }
        __syncthreads();
    }

    // Saves 5 __syncthreads() calls: a single warp finishes the reduction.
    if (threadIdx.x < 32)
    {
        volatile int64_t* Mem = d_local;
        Mem[threadIdx.x] += Mem[threadIdx.x + 16]; __syncwarp();
        Mem[threadIdx.x] += Mem[threadIdx.x + 8];  __syncwarp();
        Mem[threadIdx.x] += Mem[threadIdx.x + 4];  __syncwarp();
        Mem[threadIdx.x] += Mem[threadIdx.x + 2];  __syncwarp();
        Mem[threadIdx.x] += Mem[threadIdx.x + 1];  __syncwarp();
    }

    // Thread 0 publishes the per-block result.
    if (threadIdx.x == 0)
    {
        d_sum[blockIdx.x] = d_local[0];
    }
}
执行时间
Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)
Time(%) Total Time Instances Average Minimum Maximum Name
100.0 664617 1 664617.0 664617 664617 kReduceWholeWarpSingle(long*, long*)
最终版本
综合前面介绍的各项优化手段,核函数实现如下:
// Final version: 8x data unroll plus a fully unrolled, template-specialized
// tree reduction. Each block reduces 8 * iblocksize contiguous int64_t
// values into d_sum[blockIdx.x]. Must be launched with blockDim.x == iblocksize.
//
// Fix versus the previous revision: the warp-final section relied on
// implicit warp-synchronous execution, which is invalid on Volta+ under
// independent thread scheduling. __syncwarp() now orders every step
// (volatile alone only forces memory re-reads, it does not order lanes).
// requires SM70+ for __syncwarp
template<unsigned int iblocksize>
__global__ void UnrollAllReduce(int64_t* d_src, int64_t* d_sum)
{
    int64_t* d_local = d_src + iblocksize * blockIdx.x * 8;

    // Fold the 8 sub-segments into the first one; each thread writes only
    // its own slot, so no barrier is needed inside the loop.
    for (size_t i = 1; i < 8; i++)
    {
        d_local[threadIdx.x] += d_local[threadIdx.x + i * iblocksize];
    }
    __syncthreads();

    // iblocksize is a compile-time constant: dead comparisons are removed,
    // fully unrolling the tree reduction for the chosen block size.
    if (iblocksize > 512 && threadIdx.x < 512)
    {
        d_local[threadIdx.x] += d_local[threadIdx.x + 512];
    }
    __syncthreads();
    if (iblocksize > 256 && threadIdx.x < 256)
    {
        d_local[threadIdx.x] += d_local[threadIdx.x + 256];
    }
    __syncthreads();
    if (iblocksize > 128 && threadIdx.x < 128)
    {
        d_local[threadIdx.x] += d_local[threadIdx.x + 128];
    }
    __syncthreads();
    if (iblocksize > 64 && threadIdx.x < 64)
    {
        d_local[threadIdx.x] += d_local[threadIdx.x + 64];
    }
    __syncthreads();

    // Last warp: volatile forces re-reads from memory; __syncwarp() orders
    // the lanes between dependent steps.
    if (threadIdx.x < 32)
    {
        volatile int64_t* warpmem = d_local;
        warpmem[threadIdx.x] += warpmem[threadIdx.x + 32]; __syncwarp();
        warpmem[threadIdx.x] += warpmem[threadIdx.x + 16]; __syncwarp();
        warpmem[threadIdx.x] += warpmem[threadIdx.x + 8];  __syncwarp();
        warpmem[threadIdx.x] += warpmem[threadIdx.x + 4];  __syncwarp();
        warpmem[threadIdx.x] += warpmem[threadIdx.x + 2];  __syncwarp();
        warpmem[threadIdx.x] += warpmem[threadIdx.x + 1];  __syncwarp();
    }

    // Thread 0 publishes the per-block result.
    if (threadIdx.x == 0)
    {
        d_sum[blockIdx.x] = d_local[0];
    }
}
执行时间
Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)
Time(%) Total Time Instances Average Minimum Maximum Name
100.0 387493 1 387493.0 387493 387493 void UnrollAllReduce<512u>(long*, long*)
测试程序如下:
// Sequential host-side reference sum of the first iArrayDim elements of src.
// Returns 0 when iArrayDim <= 0.
int64_t HostSumArray(int64_t* src, int64_t iArrayDim)
{
    int64_t iSum = 0;
    // Use a signed index matching iArrayDim: the previous size_t index made
    // a negative length convert to a huge unsigned bound (out-of-bounds reads).
    for (int64_t i = 0; i < iArrayDim; i++)
    {
        iSum += src[i];
    }
    return iSum;
}
// Runs the reduction kernel selected by FuncIdx on 2^24 int64_t values,
// compares the device result against a host reference sum, and prints
// whether the two agree. All device/host allocations are released on
// every path (the previous revision leaked d_src/array on mismatch and
// leaked d_sum in the WARPSEP branch).
void ReduceTest(int64_t FuncIdx)
{
    enum MyEnum
    {
        WARPSEP = 0,
        WHOLEWARP,
        WHOLEWARPSAFE,
        UNROLL8,
        WARPSINGLE,
        UNROLLALL
    };
    const int64_t ReduceSize = 1 << 24;
    int64_t* array = new int64_t[ReduceSize];
    for (int64_t i = 0; i < ReduceSize; i++)
    {
        array[i] = i;
    }
    int64_t hSum = HostSumArray(array, ReduceSize);
    int64_t* d_src;
    checkCUDARtn(cudaMalloc(&d_src, ReduceSize * sizeof(int64_t)));
    checkCUDARtn(cudaMemcpy(d_src, array, ReduceSize * sizeof(int64_t), cudaMemcpyHostToDevice));
    int64_t iBlkDim = 1024;
    // Ceil-divide by the elements each block actually consumes (2 * blockDim).
    // The old formula ((n - 1 + b) / (b * 2)) was only accidentally right
    // because 2^24 divides evenly.
    int64_t iGrdDim = (ReduceSize + iBlkDim * 2 - 1) / (iBlkDim * 2);
    std::cout << "griddim is " << iGrdDim << std::endl;
    int64_t dSum = 0;
    if (FuncIdx == WARPSEP)
    {
        int64_t* d_sum;
        checkCUDARtn(cudaMalloc(&d_sum, iGrdDim * sizeof(int64_t)));
        checkCUDARtn(cudaMemset(d_sum, 0, iGrdDim * sizeof(int64_t)));
        KReduceWarpSep << <iGrdDim, iBlkDim >> > (d_src, ReduceSize, d_sum);
        checkCUDARtn(cudaDeviceSynchronize());
        int64_t* dSumArray = new int64_t[iGrdDim];
        // Copy the per-block partials themselves — the old call passed
        // &dSumArray/&d_sum and overwrote the host pointer variables.
        checkCUDARtn(cudaMemcpy(dSumArray, d_sum, iGrdDim * sizeof(int64_t), cudaMemcpyDeviceToHost));
        dSum = HostSumArray(dSumArray, iGrdDim);
        delete[] dSumArray;
        checkCUDARtn(cudaFree(d_sum));
    }
    else if (FuncIdx == WHOLEWARP)
    {
        kReduceWholeWarp << <iGrdDim, iBlkDim >> > (d_src, ReduceSize);
        cudaError_t rtn = cudaGetLastError();
        std::string sErr = cudaGetErrorString(rtn);
        std::cout << sErr << std::endl;
        checkCUDARtn(cudaDeviceSynchronize());
        // d_src[0] holds the device result; this is a device-to-HOST copy
        // (the old cudaMemcpyDeviceToDevice never filled dSum).
        checkCUDARtn(cudaMemcpy(&dSum, d_src, sizeof(int64_t), cudaMemcpyDeviceToHost));
    }
    else if (FuncIdx == WHOLEWARPSAFE)
    {
        int64_t* d_sum;
        checkCUDARtn(cudaMalloc(&d_sum, iGrdDim * sizeof(int64_t)));
        checkCUDARtn(cudaMemset(d_sum, 0, iGrdDim * sizeof(int64_t)));
        kReduceWholeWarpSafe << <iGrdDim, iBlkDim >> > (d_src, d_sum);
        checkCUDARtn(cudaDeviceSynchronize());
        int64_t* dSumArray = new int64_t[iGrdDim];
        memset(dSumArray, 0, sizeof(int64_t) * iGrdDim);
        checkCUDARtn(cudaMemcpy(dSumArray, d_sum, iGrdDim * sizeof(int64_t), cudaMemcpyDeviceToHost));
        dSum = HostSumArray(dSumArray, iGrdDim);
        delete[] dSumArray;
        checkCUDARtn(cudaFree(d_sum));
    }
    else if (FuncIdx == UNROLL8)
    {
        int64_t* d_sum;
        // Each block consumes 8 * blockDim elements; proper ceil-div.
        int64_t iLocalGrdDim = (ReduceSize + iBlkDim * 8 - 1) / (iBlkDim * 8);
        checkCUDARtn(cudaMalloc(&d_sum, iLocalGrdDim * sizeof(int64_t)));
        checkCUDARtn(cudaMemset(d_sum, 0, iLocalGrdDim * sizeof(int64_t)));
        kUnroll8ReduceSafe << <iLocalGrdDim, iBlkDim >> > (d_src, d_sum);
        checkCUDARtn(cudaDeviceSynchronize());
        int64_t* dSumArray = new int64_t[iLocalGrdDim];
        memset(dSumArray, 0, sizeof(int64_t) * iLocalGrdDim);
        checkCUDARtn(cudaMemcpy(dSumArray, d_sum, iLocalGrdDim * sizeof(int64_t), cudaMemcpyDeviceToHost));
        dSum = HostSumArray(dSumArray, iLocalGrdDim);
        delete[] dSumArray;
        checkCUDARtn(cudaFree(d_sum));
    }
    else if (FuncIdx == WARPSINGLE)
    {
        int64_t* d_sum;
        checkCUDARtn(cudaMalloc(&d_sum, iGrdDim * sizeof(int64_t)));
        checkCUDARtn(cudaMemset(d_sum, 0, iGrdDim * sizeof(int64_t)));
        kReduceWholeWarpSingle << <iGrdDim, iBlkDim >> > (d_src, d_sum);
        checkCUDARtn(cudaDeviceSynchronize());
        int64_t* dSumArray = new int64_t[iGrdDim];
        memset(dSumArray, 0, sizeof(int64_t) * iGrdDim);
        checkCUDARtn(cudaMemcpy(dSumArray, d_sum, iGrdDim * sizeof(int64_t), cudaMemcpyDeviceToHost));
        dSum = HostSumArray(dSumArray, iGrdDim);
        delete[] dSumArray;
        checkCUDARtn(cudaFree(d_sum));
    }
    else if (FuncIdx == UNROLLALL)
    {
        int64_t* d_sum;
        int ibkd = 256;
        // Each block consumes 8 * ibkd elements; proper ceil-div.
        int gridlocal = (ReduceSize + ibkd * 8 - 1) / (8 * ibkd);
        checkCUDARtn(cudaMalloc(&d_sum, gridlocal * sizeof(int64_t)));
        checkCUDARtn(cudaMemset(d_sum, 0, gridlocal * sizeof(int64_t)));
        UnrollAllReduce<256> << <gridlocal, ibkd >> > (d_src, d_sum);
        cudaError_t rtn = cudaGetLastError();
        std::cout << cudaGetErrorString(rtn) << std::endl;
        checkCUDARtn(cudaDeviceSynchronize());
        int64_t* dSumArray = new int64_t[gridlocal];
        memset(dSumArray, 0, sizeof(int64_t) * gridlocal);
        checkCUDARtn(cudaMemcpy(dSumArray, d_sum, gridlocal * sizeof(int64_t), cudaMemcpyDeviceToHost));
        dSum = HostSumArray(dSumArray, gridlocal);
        delete[] dSumArray;
        checkCUDARtn(cudaFree(d_sum));
    }
    if (dSum != hSum)
    {
        std::cout << "sum not equal (h_d)" << hSum << " " << dSum << std::endl;
    }
    else
    {
        std::cout << "sum equal (h_d)" << hSum << " " << dSum << std::endl;
    }
    // Cleanup runs on every path (the early return used to leak both).
    checkCUDARtn(cudaFree(d_src));
    delete[] array;
}
总结
最终版本综合了循环展开、减少同步、模板等技术,与基础版本相比,最终得到约2.2倍的加速比(866378ns → 387493ns)。