CUDA处理归约问题

归约问题由于计算操作的对称性,非常适合并行处理。本文以数组求和为例,通过CUDA先实现基础版本,并基于基础版本尝试通过不同的优化手段实现几个方案,最后将所有优化手段集成到最终的实现。
其中核函数执行时间是通过Nsight System工具抓取,命令参数如下

nsys profile --stats true a.out 5

基础版本实现

核函数实现

// Baseline in-place tree reduction.
// Each block owns a slice of 2 * blockDim.x consecutive elements of d_src,
// folds it down to one value, and thread 0 publishes the partial sum to
// d_sum[blockIdx.x].
// Assumes blockDim.x is a power of two and d_src holds at least
// gridDim.x * 2 * blockDim.x elements (the caller launches it that way).
__global__ void kReduceWholeWarpSafe(int64_t* d_src, int64_t* d_sum)
{
	// First element of this block's slice.
	int64_t* blockData = d_src + 2 * blockIdx.x * blockDim.x;
	// Halve the number of surviving partials each pass. The barrier sits
	// outside the divergent branch so every thread in the block reaches it.
	for (size_t activeSpan = blockDim.x; activeSpan >= 1; activeSpan >>= 1)
	{
		if (threadIdx.x < activeSpan)
		{
			blockData[threadIdx.x] += blockData[threadIdx.x + activeSpan];
		}
		__syncthreads();
	}
	// Exactly one thread per block writes the per-block result.
	if (threadIdx.x == 0)
	{
		d_sum[blockIdx.x] = blockData[0];
	}
}

执行时间

Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)

Time(%)      Total Time   Instances         Average         Minimum         Maximum  Name                              
                                                                                                                       
  100.0          866378           1        866378.0          866378          866378  kReduceWholeWarpSafe(long*, long*)

8倍循环展开

核函数实现

// 8x-unrolled reduction.
// Each block consumes 8 * blockDim.x consecutive elements of d_src: every
// thread first serially folds 7 strided elements into its own slot, then a
// standard tree reduction collapses the remaining blockDim.x partials.
// Thread 0 writes the block's sum to d_sum[blockIdx.x].
// Assumes blockDim.x is a power of two and d_src holds at least
// gridDim.x * 8 * blockDim.x elements.
__global__ void kUnroll8ReduceSafe(int64_t* d_src, int64_t* d_sum)
{
	int64_t* blockData = d_src + blockIdx.x * blockDim.x * 8;
	// Serial 8-way accumulation. Each thread writes only its own index and
	// reads indices >= blockDim.x that no thread writes here, so no barrier
	// is needed inside this loop.
	for (size_t chunk = 1; chunk < 8; chunk++)
	{
		blockData[threadIdx.x] += blockData[threadIdx.x + chunk * blockDim.x];
	}
	__syncthreads();
	// In-place tree reduction over the blockDim.x remaining partials.
	for (int half = blockDim.x / 2; half >= 1; half >>= 1)
	{
		if (threadIdx.x < half)
		{
			blockData[threadIdx.x] += blockData[threadIdx.x + half];
		}
		__syncthreads();
	}
	if (threadIdx.x == 0)
	{
		d_sum[blockIdx.x] = blockData[0];
	}
}

执行时间

Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)

Time(%)      Total Time   Instances         Average         Minimum         Maximum  Name                             
                                                                                                                      
  100.0          462661           1        462661.0          462661          462661  kUnroll8ReduceSafe(long*, long*) 

减少同步

核函数实现

// Reduction that drops the last five __syncthreads() by finishing inside a
// single warp once only 32 partials remain. Each block reduces
// 2 * blockDim.x consecutive elements of d_src into d_sum[blockIdx.x].
// Assumes blockDim.x is a power of two >= 32.
__global__ void kReduceWholeWarpSingle(int64_t* d_src, int64_t* d_sum)
{
	int64_t* d_local = d_src + 2 * blockIdx.x * blockDim.x;
	// Block-wide tree reduction; the last pass (stride 32) leaves the 32
	// surviving partials in d_local[0..31].
	for (size_t stride = blockDim.x; stride >= 32; stride /= 2)
	{
		if (threadIdx.x < stride)
		{
			d_local[threadIdx.x] += d_local[threadIdx.x + stride];
		}
		__syncthreads();
	}
	// Warp-level finish: saves 5 block-wide barriers.
	// BUG FIX: the original commented out `volatile`, letting the compiler
	// cache loads in registers, and relied on implicit warp lockstep with
	// all 32 lanes writing while lower lanes read the same slots. `volatile`
	// forces each += through memory; the lane guard removes the read/write
	// overlap within a step; __syncwarp() orders the steps — required on
	// Volta+ independent thread scheduling. All 32 lanes of warp 0 reach
	// __syncwarp(), so the default full mask is correct.
	if (threadIdx.x < 32)
	{
		volatile int64_t* Mem = d_local;
		#pragma unroll
		for (int offset = 16; offset >= 1; offset >>= 1)
		{
			if (threadIdx.x < offset)
			{
				Mem[threadIdx.x] += Mem[threadIdx.x + offset];
			}
			__syncwarp();
		}
	}

	if (threadIdx.x == 0)
	{
		d_sum[blockIdx.x] = d_local[0];
	}

}

执行时间

Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)

Time(%)      Total Time   Instances         Average         Minimum         Maximum  Name                                
                                                                                                                         
  100.0          664617           1        664617.0          664617          664617  kReduceWholeWarpSingle(long*, long*)

最终版本

综合前面的优化手段,核函数实现如下:

// Final version: 8x unroll + fully unrolled tree (iblocksize is a
// compile-time constant, so untaken branches are eliminated) + warp-level
// finish. Each block reduces 8 * iblocksize consecutive elements of d_src
// into d_sum[blockIdx.x]. Launch with blockDim.x == iblocksize, a power of
// two in [64, 1024].
template<unsigned int iblocksize>
__global__ void UnrollAllReduce(int64_t* d_src, int64_t* d_sum)
{
	int64_t* d_local = d_src + iblocksize * blockIdx.x * 8;
	// Serial 8-way accumulation; threads touch disjoint slots, no barrier
	// needed inside the loop.
	for (size_t i = 1; i < 8; i++)
	{
		d_local[threadIdx.x] += d_local[threadIdx.x + i * iblocksize];
	}
	__syncthreads();
	// Unrolled tree: each guarded step is compiled in only when iblocksize
	// makes it reachable.
	if (iblocksize > 512 && threadIdx.x < 512)
	{
		d_local[threadIdx.x] += d_local[threadIdx.x + 512];
	}
	__syncthreads();
	if (iblocksize > 256 && threadIdx.x < 256)
	{
		d_local[threadIdx.x] += d_local[threadIdx.x + 256];
	}
	__syncthreads();
	if (iblocksize > 128 && threadIdx.x < 128)
	{
		d_local[threadIdx.x] += d_local[threadIdx.x + 128];
	}
	__syncthreads();
	if (iblocksize > 64 && threadIdx.x < 64)
	{
		d_local[threadIdx.x] += d_local[threadIdx.x + 64];
	}
	__syncthreads();
	// Warp-level finish over the last 64 partials.
	// BUG FIX: the original ran every step with all 32 lanes active and no
	// __syncwarp(); for offsets <= 16 a lane reads a slot another lane is
	// writing in the same step, and Volta+ independent thread scheduling
	// gives no implicit lockstep guarantee. Guarding each step by the
	// active-lane count and ordering steps with __syncwarp() makes it
	// well-defined; the value left in d_local[0] is unchanged.
	if (threadIdx.x < 32)
	{
		volatile int64_t* warpmem = d_local;
		#pragma unroll
		for (int offset = 32; offset >= 1; offset >>= 1)
		{
			if (threadIdx.x < offset)
			{
				warpmem[threadIdx.x] += warpmem[threadIdx.x + offset];
			}
			__syncwarp();
		}
	}
	if (threadIdx.x == 0)
	{
		d_sum[blockIdx.x] = d_local[0];
	}
}

执行时间

Generating CUDA Kernel Statistics...
CUDA Kernel Statistics (nanoseconds)

Time(%)      Total Time   Instances         Average         Minimum         Maximum  Name                                    
                                                                                                                             
  100.0          387493           1        387493.0          387493          387493  void UnrollAllReduce<512u>(long*, long*)

测试程序

测试程序

// CPU reference: sequential sum of src[0 .. iArrayDim-1].
// Returns 0 when iArrayDim <= 0.
int64_t HostSumArray(int64_t* src, int64_t iArrayDim)
{
	int64_t iSum = 0;
	// BUG FIX: the loop index was size_t while the bound is int64_t; the
	// signed/unsigned comparison made a negative iArrayDim iterate (nearly)
	// forever instead of doing nothing.
	for (int64_t i = 0; i < iArrayDim; i++)
	{
		iSum += src[i];
	}
	return iSum;
}
// Validates one reduction kernel (selected by FuncIdx) against the CPU
// reference sum over a 2^24-element ramp array 0,1,2,...
// NOTE(review): KReduceWarpSep, kReduceWholeWarp, the other kernels and
// checkCUDARtn are defined elsewhere in this project.
void ReduceTest(int64_t FuncIdx)
{
	enum MyEnum
	{
		WARPSEP = 0,
		WHOLEWARP,
		WHOLEWARPSAFE,
		UNROLL8,
		WARPSINGLE,
		UNROLLALL
	};
	const int64_t ReduceSize =  1 << 24;
	int64_t* array = new int64_t[ReduceSize];
	for (int64_t i = 0; i < ReduceSize; i++)
	{
		array[i] = i;
	}
	int64_t hSum = HostSumArray(array, ReduceSize);
	int64_t* d_src;

	checkCUDARtn(cudaMalloc(&d_src, ReduceSize*sizeof(int64_t)));
	checkCUDARtn(cudaMemcpy(d_src, array, ReduceSize * sizeof(int64_t), cudaMemcpyHostToDevice));
	int64_t iBlkDim = 1024;
	// BUG FIX: proper ceiling division by the 2*blockDim elements each block
	// consumes; the old ((N - 1 + blk) / (blk*2)) only coincidentally worked
	// for sizes that divide evenly.
	int64_t iGrdDim = (ReduceSize + 2 * iBlkDim - 1) / (2 * iBlkDim);
	std::cout << "griddim is " << iGrdDim << std::endl;
	int64_t dSum = 0;
	if (FuncIdx == WARPSEP)
	{
		int64_t* d_sum;
		checkCUDARtn(cudaMalloc(&d_sum, iGrdDim * sizeof(int64_t)));
		checkCUDARtn(cudaMemset(d_sum, 0, iGrdDim * sizeof(int64_t)));
		KReduceWarpSep << <iGrdDim, iBlkDim >> > (d_src, ReduceSize, d_sum);
		checkCUDARtn(cudaDeviceSynchronize());
		int64_t* dSumArray = new int64_t[iGrdDim];
		// BUG FIX: copy the buffers, not the addresses of the pointer
		// variables (was cudaMemcpy(&dSumArray, &d_sum, ...)).
		checkCUDARtn(cudaMemcpy(dSumArray, d_sum, iGrdDim * sizeof(int64_t), cudaMemcpyDeviceToHost));
		dSum = HostSumArray(dSumArray, iGrdDim);
		delete[] dSumArray;
		// BUG FIX: d_sum was leaked in this branch.
		checkCUDARtn(cudaFree(d_sum));
	}
	else if (FuncIdx == WHOLEWARP)
	{
		kReduceWholeWarp<< < iGrdDim, iBlkDim >> > (d_src, ReduceSize);
		cudaError_t rtn = cudaGetLastError();
		std::string sErr = cudaGetErrorString(rtn);
		std::cout << sErr << std::endl;
		cudaDeviceSynchronize();
		// BUG FIX: the destination is a host variable, so the direction is
		// DeviceToHost (was cudaMemcpyDeviceToDevice).
		cudaMemcpy(&dSum, d_src, sizeof(int64_t), cudaMemcpyDeviceToHost);
	}
	else if (FuncIdx == WHOLEWARPSAFE)
	{
		int64_t* d_sum;
		checkCUDARtn(cudaMalloc(&d_sum, iGrdDim * sizeof(int64_t)));
		checkCUDARtn(cudaMemset(d_sum, 0, iGrdDim * sizeof(int64_t)));
		kReduceWholeWarpSafe << <iGrdDim, iBlkDim >> > (d_src, d_sum);
		checkCUDARtn(cudaDeviceSynchronize());
		int64_t* dSumArray = new int64_t[iGrdDim];
		memset(dSumArray, 0, sizeof(int64_t)* iGrdDim);
		checkCUDARtn(cudaMemcpy(dSumArray, d_sum, iGrdDim * sizeof(int64_t), cudaMemcpyDeviceToHost));
		dSum = HostSumArray(dSumArray, iGrdDim);
		delete[] dSumArray;
		checkCUDARtn(cudaFree(d_sum));
	}
	else if (FuncIdx == UNROLL8)
	{
		int64_t* d_sum;
		// BUG FIX: proper ceiling division by the 8*blockDim elements each
		// block consumes (the old formula under-counted blocks for sizes
		// that do not divide evenly), which also removes the need for the
		// previous "+100" over-allocation band-aid.
		int64_t iLocalGrdDim = (ReduceSize + iBlkDim * 8 - 1) / (iBlkDim * 8);
		checkCUDARtn(cudaMalloc(&d_sum, iLocalGrdDim * sizeof(int64_t)));
		checkCUDARtn(cudaMemset(d_sum, 0, iLocalGrdDim * sizeof(int64_t)));
		kUnroll8ReduceSafe << <iLocalGrdDim, iBlkDim >> > (d_src, d_sum);
		checkCUDARtn(cudaDeviceSynchronize());
		int64_t* dSumArray = new int64_t[iLocalGrdDim];
		memset(dSumArray, 0, sizeof(int64_t)* iLocalGrdDim);
		checkCUDARtn(cudaMemcpy(dSumArray, d_sum, iLocalGrdDim * sizeof(int64_t), cudaMemcpyDeviceToHost));
		dSum = HostSumArray(dSumArray, iLocalGrdDim);
		delete[] dSumArray;
		checkCUDARtn(cudaFree(d_sum));
	}
	else if (FuncIdx == WARPSINGLE)
	{
		int64_t* d_sum;
		checkCUDARtn(cudaMalloc(&d_sum, iGrdDim * sizeof(int64_t)));
		checkCUDARtn(cudaMemset(d_sum, 0, iGrdDim * sizeof(int64_t)));
		kReduceWholeWarpSingle << <iGrdDim, iBlkDim >> > (d_src, d_sum);
		checkCUDARtn(cudaDeviceSynchronize());
		int64_t* dSumArray = new int64_t[iGrdDim];
		memset(dSumArray, 0, sizeof(int64_t)* iGrdDim);
		checkCUDARtn(cudaMemcpy(dSumArray, d_sum, iGrdDim * sizeof(int64_t), cudaMemcpyDeviceToHost));
		dSum = HostSumArray(dSumArray, iGrdDim);
		delete[] dSumArray;
		checkCUDARtn(cudaFree(d_sum));
	}
	else if (FuncIdx == UNROLLALL)
	{
		int64_t* d_sum;
		int ibkd = 256;
		int gridlocal = (ReduceSize + 8 * ibkd - 1) / (8 * ibkd);
		checkCUDARtn(cudaMalloc(&d_sum, gridlocal * sizeof(int64_t)));
		checkCUDARtn(cudaMemset(d_sum, 0, gridlocal * sizeof(int64_t)));
		UnrollAllReduce<256> << <gridlocal, ibkd >> > (d_src, d_sum);
		cudaError_t rtn = cudaGetLastError();
		std::cout << cudaGetErrorString(rtn) << std::endl;
		checkCUDARtn(cudaDeviceSynchronize());
		int64_t* dSumArray = new int64_t[gridlocal];
		memset(dSumArray, 0, sizeof(int64_t)* gridlocal);
		checkCUDARtn(cudaMemcpy(dSumArray, d_sum, gridlocal * sizeof(int64_t), cudaMemcpyDeviceToHost));
		dSum = HostSumArray(dSumArray, gridlocal);
		delete[] dSumArray;
		checkCUDARtn(cudaFree(d_sum));
	}

	// BUG FIX: release d_src and the host array before reporting, so the
	// mismatch path no longer leaks them via the early return.
	checkCUDARtn(cudaFree(d_src));
	delete[] array;

	if (dSum != hSum)
	{
		std::cout << "sum not equal (h_d)" << hSum << " " << dSum << std::endl;
		return;
	}
	std::cout << "sum equal (h_d)" << hSum << " " << dSum << std::endl;
}

总结

最终版本通过循环展开、消除同步、模板等技术,对比基础版本最终得到约 2.2 倍的加速比(866378ns → 387493ns)。

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值