CUDA并行归约算法(一)
并行归约(Reduction)是一种并行算法:对于符合结合律的二元操作符,将输入数组划分为更小的数据块,每个线程计算1个数据块的部分结果,最后把所有部分结果合并计算,得出最终结果。
二元操作符可以是求和、取最大、取最小、求乘积、逻辑与/或等。
Example
例如一般对一个数组求和,我们会进行累加计算
int array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
int sum = 0;
// accumulate every element front-to-back
for (int i = 0; i < 8; ++i) {
    sum += array[i];
}
如果改变思路计算一下
考虑到GPU可以每次加法开辟1个线程,因此对于这种求和问题,串行求和不能发挥GPU的功力。
改用并行归约算法。
并行归约算法
总体思想:
将数组分块,每个线程计算数据块的和,以此迭代,最后1次迭代求和就是最终结果。
这里仅用了3轮迭代,就计算出数组的总和。
Example
初始化1个16M(16777216)大小的数组,分成为多个大小为1K(1024)的小数据块。
因此,将block大小设置为(1024,1),每个线程完成1个数据块的求和。
一共需要 16K($16777216 \div 1024 = 16384$)个线程块,所以设置 grid 大小为(16384,1),每个线程块的求和结果都保存在全局变量里,统一传回主机,由主机进行累加求和。
主函数结构:
先初始化,分配内存,运行核函数,最后与CPU结果进行比对。
主代码:
#include<cuda_runtime.h>
#include<stdio.h>
#include<iostream>
#include <sys/time.h>
// Check the cudaError_t returned by a runtime API call; on failure, print
// the location and reason, then abort. Wrapped in do { ... } while (0) so
// the macro behaves as a single statement (safe inside if/else without
// braces); the argument is parenthesized to preserve evaluation order.
#define CHECK(result) \
do { \
    const cudaError_t error = (result); \
    if (error != cudaSuccess) { \
        printf("ERROR: %s:%d,", __FILE__, __LINE__); \
        printf("code:%d, reason:%s\n", error, cudaGetErrorString(error)); \
        exit(1); \
    } \
} while (0)
// Wall-clock time in seconds (microsecond resolution), for host-side timing.
double CpuSeconds()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    double seconds = static_cast<double>(now.tv_sec);
    double micros  = static_cast<double>(now.tv_usec);
    return seconds + micros * 1e-6;
}
// Query and print the name of CUDA device `devNum`, then make it current.
void InitDevice(int devNum) {
    cudaDeviceProp props;
    CHECK(cudaGetDeviceProperties(&props, devNum));
    printf("Using device %d: %s\n", devNum, props.name);
    CHECK(cudaSetDevice(devNum));
}
// Fill `arr` with `size` pseudo-random values in [0, 255]; seeds the C PRNG
// from the current time, so contents differ between runs.
// Returns the number of elements written. (The original declared `int` but
// fell off the end without a return statement — undefined behavior in C++.)
int InitialData(int* arr, int size) {
    time_t t;
    srand((unsigned)time(&t));
    for (int i = 0; i < size; i++) {
        // mask to the low byte so a 16M-element sum stays within int range
        arr[i] = int(rand() & 0xff);
    }
    return size;
}
// Neighbored-pair reduction: each block sums its blockDim.x-element slice of
// g_idata in place (global memory) and writes the partial sum to
// g_odata[blockIdx.x]. Expects a 1D grid/block; the host performs the final
// accumulation over g_odata.
//
// Fixes vs. the original:
//  - the bounds check compared threadIdx.x (< blockDim.x) against the total
//    element count n, so it never filtered anything and gave no protection
//    when n is not a multiple of blockDim.x; we now test the global index.
//  - the early `return` placed __syncthreads() inside divergent control flow
//    (undefined behavior); the guard is now on the accumulation itself so
//    every thread of the block reaches the barrier.
__global__ void ReduceNeighbour(int* g_idata, int* g_odata, unsigned int n)
{
    unsigned int t_id = threadIdx.x;                          // index within the block
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; // global element index
    // pointer to this block's slice of the input
    int *idata = g_idata + blockIdx.x * blockDim.x;
    // in-place reduction in global memory
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        // accumulate only when both this element and its partner are in range
        if ((t_id % (2 * stride)) == 0 && idx + stride < n) {
            idata[t_id] += idata[t_id + stride];
        }
        // synchronize within block — reached by ALL threads, unconditionally
        __syncthreads();
    }
    // thread 0 publishes this block's partial sum
    if (t_id == 0 && idx < n) {
        g_odata[blockIdx.x] = idata[0];
    }
}
// Driver: builds a 16M-element array, reduces it on CPU (reference) and GPU
// (ReduceNeighbour kernel + host-side final pass), times both, and compares.
// Optional argv[1] overrides the block size (default 1024).
int main(int argc, char** argv)
{
    InitDevice(0);
    int size = 1 << 24;  // 16M elements
    std::cout << " array size: " << size << std::endl;
    // execution configuration
    int blocksize = 1024;
    if (argc > 1) {
        blocksize = atoi(argv[1]);  // block size from the command line
    }
    dim3 block(blocksize, 1);
    dim3 grid((size - 1) / block.x + 1, 1);  // ceil-div so the tail is covered
    std::cout << "grid size: " << grid.x << ", block size: " << block.x << std::endl;
    // allocate host memory
    size_t bytes = size * sizeof(int);
    int *idata_host = (int*)malloc(bytes);
    int *odata_host = (int*)malloc(grid.x * sizeof(int));
    int *tmp = (int*)malloc(bytes);  // untouched copy for the CPU reference
    // initialize the array
    InitialData(idata_host, size);
    memcpy(tmp, idata_host, bytes);
    double s_time, e_time;
    int sum_gpu = 0;
    // device memory
    int* idata_dev = nullptr;
    CHECK(cudaMalloc((void**)&idata_dev, bytes));                 // input array
    int* odata_dev = nullptr;
    CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int)));  // one partial sum per block
    // CPU reference reduction
    int sum_cpu = 0;
    s_time = CpuSeconds();
    for (int i = 0; i < size; i++) {
        sum_cpu += tmp[i];
    }
    e_time = 1000 * (CpuSeconds() - s_time);
    std::cout << "CPU sum: " << sum_cpu << std::endl;
    std::cout << "CPU reduction elapsed " << e_time << " ms, CPU sum: " << sum_cpu << std::endl;
    // kernel reduceNeighbored
    CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaDeviceSynchronize());
    s_time = CpuSeconds();
    ReduceNeighbour <<<grid, block >>>(idata_dev, odata_dev, size);
    CHECK(cudaGetLastError());       // catch launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // catch in-kernel execution errors
    CHECK(cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost));
    // host-side final pass over the per-block partial sums
    sum_gpu = 0;
    for (int i = 0; i < grid.x; i++)
        sum_gpu += odata_host[i];
    e_time = 1000*(CpuSeconds() - s_time);
    printf("gpu sum:%d \n", sum_gpu);
    printf("gpu reduceNeighbored elapsed %lf ms <<<grid %d block %d>>>\n",
        e_time, grid.x, block.x);
    // free host memory (tmp was leaked in the original)
    free(idata_host);
    free(odata_host);
    free(tmp);
    CHECK(cudaFree(idata_dev));
    CHECK(cudaFree(odata_dev));
    //reset device
    cudaDeviceReset();
    //check the results
    if (sum_gpu == sum_cpu)
    {
        printf("Test success!\n");
        return EXIT_SUCCESS;
    }
    // report the mismatch instead of silently exiting with success
    printf("Test failed! cpu=%d gpu=%d\n", sum_cpu, sum_gpu);
    return EXIT_FAILURE;
}
结果:
array size: 16777216
grid size: 16384, block size: 1024
CPU sum: 2138910180
CPU reduction elapsed 53.359 ms, CPU sum: 2138910180
gpu sum:2138910180
gpu reduceNeighbored elapsed 2.512932 ms <<<grid 16384 block 1024>>>
Test success!
可以看出来并行归约计算仅用了2.512932ms,而CPU算法用了53.359ms,效率提高了很多倍。
当然本文是朴素并行归约算法,还有改进空间:启动时1个block有1024个线程全部参与,但随着迭代进行,每轮活跃线程数量减半,越来越多的线程处于不活跃状态,造成了资源浪费。