CUDA Parallel Reduction (Interleaved Pairs, Fully Unrolled: Final Version)

In the previous posts we unrolled the iterations with strides of 32 and below (the last warp). In fact, because the block size is capped by hardware (1024 on a GTX 1050 Ti), the number of reduction steps is known in advance, so the loop can be unrolled completely:

that is, the steps for block sizes of 1024, 512, 256, 128, and 64 can all be written out explicitly. The only thing to watch is that the threads must still be synchronized after every step, for the reasons explained in the previous post.
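For reference, here is a minimal sketch of the partially unrolled loop that full unrolling replaces (reconstructed from the description in the earlier posts, not the author's exact code):

	//in-block tree reduction as a loop: one halving step per iteration,
	//stopping at stride 32, where the warp-unrolled tail takes over
	for (int stride = blockDim.x / 2; stride > 32; stride >>= 1)
	{
		if (tid < stride)
			data[tid] += data[tid + stride];
		__syncthreads();	//each step reads what the previous step wrote
	}

Fully unrolling this loop removes the loop-counter arithmetic and the per-iteration branching, at the cost of supporting only a fixed set of block sizes. The full code is given below.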

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include "math.h"
#include "stdlib.h"

//Error-checking macro
#define CHECK(call)									\
{													\
	const cudaError_t status = call;				\
	if (status != cudaSuccess)						\
	{												\
		printf("File: %s, Function: %s, Line: %d\n",\
			__FILE__, __FUNCTION__, __LINE__);		\
		printf("%s\n", cudaGetErrorString(status));	\
		exit(1);									\
	}												\
}

//Kernel
__global__ void Kernel(int *d_data, int *d_local_sum, int N)
{
	int tid = threadIdx.x;
	//each block is responsible for 8 consecutive segments of blockDim.x elements
	int index = 8 * blockIdx.x*blockDim.x + threadIdx.x;
	int *data = d_data + 8 * blockIdx.x*blockDim.x;


	//8-way unrolling: each thread first sums 8 elements spaced blockDim.x apart
	if (index + 7 * blockDim.x < N)
	{
		int a = d_data[index];
		int a1 = d_data[index + blockDim.x];
		int a2 = d_data[index + 2 * blockDim.x];
		int a3 = d_data[index + 3 * blockDim.x];
		int a4 = d_data[index + 4 * blockDim.x];
		int a5 = d_data[index + 5 * blockDim.x];
		int a6 = d_data[index + 6 * blockDim.x];
		int a7 = d_data[index + 7 * blockDim.x];
		d_data[index] = (a + a1 + a2 + a3 + a4 + a5 + a6 + a7);
	}
	
	__syncthreads();

	//fully unrolled tree reduction; __syncthreads() sits outside the if body,
	//so every thread in the block reaches it
	if (blockDim.x >= 1024 && tid < 512)  data[tid] += data[tid + 512]; __syncthreads();
	if (blockDim.x >= 512 && tid < 256)   data[tid] += data[tid + 256]; __syncthreads();
	if (blockDim.x >= 256 && tid < 128)   data[tid] += data[tid + 128]; __syncthreads();
	if (blockDim.x >= 128 && tid < 64)    data[tid] += data[tid + 64];  __syncthreads();


	if (tid < 32)
	{
		//volatile forces every read/write to go to memory rather than being
		//cached in a register; the remaining steps rely on implicit
		//warp-synchronous execution (safe on pre-Volta GPUs such as the GTX 1050 Ti)
		volatile int *vmem = data;
		vmem[tid] += vmem[tid + 32];
		vmem[tid] += vmem[tid + 16];
		vmem[tid] += vmem[tid + 8];
		vmem[tid] += vmem[tid + 4];
		vmem[tid] += vmem[tid + 2];
		vmem[tid] += vmem[tid + 1];
	}

	//thread 0 writes this block's partial sum
	if (tid == 0)
	{
		d_local_sum[blockIdx.x] = data[0];
	}
}

//main function
int main()
{

	//Basic parameters
	CHECK(cudaSetDevice(0));
	const int N = 1 << 29;
	int local_length = 1024;
	long long total_sum = 0;

	dim3 grid(((N + local_length - 1) / local_length), 1);
	dim3 block(local_length, 1);

	int *h_data = nullptr;
	int *h_local_sum = nullptr;
	int *d_data = nullptr;
	int *d_local_sum = nullptr;


	//Host & device memory allocation and array initialization
	h_data = (int*)malloc(N * sizeof(int));
	h_local_sum = (int*)malloc(int(grid.x / 8) * sizeof(int));	//one partial sum per launched block


	CHECK(cudaMalloc((void**)&d_data, N * sizeof(int)));

	CHECK(cudaMalloc((void**)&d_local_sum, int(grid.x / 8) * sizeof(int)));

	long long s = 0;	//host-side reference sum for verifying the GPU result
	for (int i = 0; i < N; i++)
	{
		h_data[i] = int(10 * sin(0.02*3.14*i));	//limit the element values so the final sum stays within range
		s += h_data[i];
	}

	//Copy input data to the device
	CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));

	//Launch the kernel; grid.x / 8 blocks because of the 8-way unrolling
	Kernel << <grid.x / 8, block >> > (d_data, d_local_sum, N);

	//Copy the per-block partial sums back to the host
	CHECK(cudaMemcpy(h_local_sum, d_local_sum, int(grid.x / 8) * sizeof(int),
		cudaMemcpyDeviceToHost));

	//Synchronize, release device memory, and reset the device
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaFree(d_data));
	CHECK(cudaFree(d_local_sum));
	CHECK(cudaDeviceReset());


	//Accumulate the per-block partial sums on the host
	for (int i = 0; i < int(grid.x / 8); i++)
	{
		total_sum += h_local_sum[i];
	}

	printf("GPU sum: %lld, CPU reference: %lld\n", total_sum, s);

	free(h_data);
	free(h_local_sum);

	//getchar();
	return 0;

}
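As an aside, the commented-out repetition loop above hints at timing experiments. Here is a minimal sketch of timing the kernel with CUDA events (the event API is standard CUDA; the 200-iteration count simply mirrors the commented-out loop, and since the kernel overwrites d_data, repeated launches are useful for timing only, not for checking the result):

	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));
	CHECK(cudaEventRecord(start));
	for (int i = 0; i < 200; i++)	//repeat to average out launch overhead
		Kernel << <grid.x / 8, block >> > (d_data, d_local_sum, N);
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));	//wait for the last launch to finish
	float ms = 0.0f;
	CHECK(cudaEventElapsedTime(&ms, start, stop));
	printf("average kernel time: %f ms\n", ms / 200);
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));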



In addition, to allow the block size to be changed within the program, we can use template programming: the kernel is templated on the block size, and a switch statement dispatches to the matching instantiation. Because the template parameter is a compile-time constant, the compiler removes the if branches that can never execute, improving efficiency.
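To see why this helps, consider what the compiler does with a concrete instantiation such as Kernel<256>: each comparison against iblocksize is decided at compile time, so the dead branches disappear and the instantiated reduction conceptually collapses to the following (an illustrative sketch of the surviving code, not actual compiler output):

	//inside Kernel<256>: iblocksize >= 1024 and iblocksize >= 512 are
	//compile-time false, so those two lines are removed entirely
	if (tid < 128)  data[tid] += data[tid + 128]; __syncthreads();
	if (tid < 64)   data[tid] += data[tid + 64];  __syncthreads();
	//...then the warp-unrolled tail for tid < 32 as before

With the non-template kernel, blockDim.x is a runtime value, so all four comparisons must be evaluated in every block. The full final version of the code follows: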


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include "math.h"
#include "stdlib.h"

//Error-checking macro
#define CHECK(call)									\
{													\
	const cudaError_t status = call;				\
	if (status != cudaSuccess)						\
	{												\
		printf("File: %s, Function: %s, Line: %d\n",\
			__FILE__, __FUNCTION__, __LINE__);		\
		printf("%s\n", cudaGetErrorString(status));	\
		exit(1);									\
	}												\
}

//Kernel, templated on the block size
template<unsigned int iblocksize>
__global__ void Kernel(int *d_data, int *d_local_sum, int N)
{
	int tid = threadIdx.x;
	//each block is responsible for 8 consecutive segments of blockDim.x elements
	int index = 8 * blockIdx.x*blockDim.x + threadIdx.x;
	int *data = d_data + 8 * blockIdx.x*blockDim.x;


	//8-way unrolling: each thread first sums 8 elements spaced blockDim.x apart
	if (index + 7 * blockDim.x < N)
	{
		int a = d_data[index];
		int a1 = d_data[index + blockDim.x];
		int a2 = d_data[index + 2 * blockDim.x];
		int a3 = d_data[index + 3 * blockDim.x];
		int a4 = d_data[index + 4 * blockDim.x];
		int a5 = d_data[index + 5 * blockDim.x];
		int a6 = d_data[index + 6 * blockDim.x];
		int a7 = d_data[index + 7 * blockDim.x];
		d_data[index] = (a + a1 + a2 + a3 + a4 + a5 + a6 + a7);
	}

	__syncthreads();

	//fully unrolled tree reduction; iblocksize is a compile-time constant, so
	//branches that cannot execute are removed at compile time, and
	//__syncthreads() sits outside the if body so every thread reaches it
	if (iblocksize >= 1024 && tid < 512)  data[tid] += data[tid + 512]; __syncthreads();
	if (iblocksize >= 512 && tid < 256)   data[tid] += data[tid + 256]; __syncthreads();
	if (iblocksize >= 256 && tid < 128)   data[tid] += data[tid + 128]; __syncthreads();
	if (iblocksize >= 128 && tid < 64)    data[tid] += data[tid + 64];  __syncthreads();


	if (tid < 32)
	{
		//volatile forces every read/write to go to memory rather than being
		//cached in a register; the remaining steps rely on implicit
		//warp-synchronous execution (safe on pre-Volta GPUs such as the GTX 1050 Ti)
		volatile int *vmem = data;
		vmem[tid] += vmem[tid + 32];
		vmem[tid] += vmem[tid + 16];
		vmem[tid] += vmem[tid + 8];
		vmem[tid] += vmem[tid + 4];
		vmem[tid] += vmem[tid + 2];
		vmem[tid] += vmem[tid + 1];
	}

	//thread 0 writes this block's partial sum
	if (tid == 0)
	{
		d_local_sum[blockIdx.x] = data[0];
	}
}

//main function
int main()
{

	//Basic parameters
	CHECK(cudaSetDevice(0));
	const int N = 1 << 29;
	const int local_length = 1024;
	long long total_sum = 0;

	dim3 grid(((N + local_length - 1) / local_length), 1);
	dim3 block(local_length, 1);

	int *h_data = nullptr;
	int *h_local_sum = nullptr;
	int *d_data = nullptr;
	int *d_local_sum = nullptr;


	//Host & device memory allocation and array initialization
	h_data = (int*)malloc(N * sizeof(int));
	h_local_sum = (int*)malloc(int(grid.x / 8) * sizeof(int));	//one partial sum per launched block


	CHECK(cudaMalloc((void**)&d_data, N * sizeof(int)));

	CHECK(cudaMalloc((void**)&d_local_sum, int(grid.x / 8) * sizeof(int)));

	long long s = 0;	//host-side reference sum for verifying the GPU result
	for (int i = 0; i < N; i++)
	{
		h_data[i] = int(10 * sin(0.02*3.14*i));	//limit the element values so the final sum stays within range
		s += h_data[i];
	}

	//Copy input data to the device
	CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));

	//Launch the kernel: the switch selects the template instantiation that
	//matches the runtime block size (the commented-out direct call also
	//compiles here, since local_length is a compile-time constant)
	//Kernel<local_length> << <grid.x / 8, block >> > (d_data, d_local_sum, N);
	switch (block.x)
	{
	case 1024:
		Kernel<1024> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
	case 512:
		Kernel<512> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
	case 256:
		Kernel<256> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
	case 128:
		Kernel<128> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;
	case 64:
		Kernel<64> << <grid.x / 8, block >> > (d_data, d_local_sum, N); break;

	}

	//Copy the per-block partial sums back to the host
	CHECK(cudaMemcpy(h_local_sum, d_local_sum, int(grid.x / 8) * sizeof(int),
		cudaMemcpyDeviceToHost));

	//Synchronize, release device memory, and reset the device
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaFree(d_data));
	CHECK(cudaFree(d_local_sum));
	CHECK(cudaDeviceReset());


	//Accumulate the per-block partial sums on the host
	for (int i = 0; i < int(grid.x / 8); i++)
	{
		total_sum += h_local_sum[i];
	}

	printf("GPU sum: %lld, CPU reference: %lld\n", total_sum, s);

	free(h_data);
	free(h_local_sum);

	//getchar();
	return 0;

}
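One final caveat: the tid < 32 section relies on implicit warp-synchronous execution, which holds on the Pascal-based GTX 1050 Ti but is no longer guaranteed from Volta onward. On newer GPUs the last warp is usually handled with warp shuffles instead; here is a minimal sketch of such a replacement for the volatile block and the final write (using the standard __shfl_down_sync intrinsic, which is not part of the original code):

	if (tid < 32)
	{
		int val = data[tid] + data[tid + 32];	//collapse 64 values to 32 registers
		//each step adds the value held by the lane 'offset' positions higher;
		//the 0xffffffff mask means all 32 lanes participate
		for (int offset = 16; offset > 0; offset >>= 1)
			val += __shfl_down_sync(0xffffffff, val, offset);
		if (tid == 0)
			d_local_sum[blockIdx.x] = val;	//lane 0 holds the block's partial sum
	}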

