并行规约通常用于处理大输入数据集,将一组输入值规约为一个值。
数据特点:
(1)对于数据集中的元素没有顺序要求。
(2)可将数据分为若干小集合,每个线程处理一个集合。
操作可以是:求最大值(Max)、求最小值(Min)、求和(Sum)、求乘积(Product)。
并行规约求和
规约求和是常见应用,将输入数据求和得到一个值。如下面简单例子所示:
规约求和的最简单思想是:先两两求和得到部分和,然后对部分和继续两两求和,直至得到最后的结果。
核函数代码
// device code
// Naive in-place sum reduction: each pass adds pairs of elements that are
// `stride` apart, doubling the stride each pass until a[0] holds the total.
// NOTE(review): __syncthreads() is block-wide only, so this assumes a
// single-block launch whose thread count covers `width` — confirm at call site.
__global__ void vec_add_device(FLOAT* a, int width)
{
// flatten the (possibly 2d) block into a 1d thread index
int tid = threadIdx.y*blockDim.x + threadIdx.x;
for (int stride = 1; stride < width; stride*=2)
{
// only every (2*stride)-th thread stays active -> heavy warp divergence
if (tid % (2*stride) == 0)
{
// NOTE(review): for a non-power-of-two width, tid + stride can index
// past the end of a — this scheme assumes width is a power of two.
a[tid] += a[tid + stride];
}
// barrier is outside the divergent if, so all threads reach it each pass
__syncthreads();
}
}
代码中,tid 为线程在块内的一维索引,stride 为每轮规约的步长;每轮结束后用 __syncthreads() 做块内同步。
完整代码
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <windows.h>
typedef float FLOAT;
double get_time();
void warm_up();
void vec_add_host(FLOAT* M, int width);
// Prototype now matches the actual two-parameter definition below
// (it previously declared an unrelated four-parameter overload).
__global__ void vec_add_device(FLOAT* M, int width);
// Flat global-thread / block indices for a <2d grid, 1d block> launch.
// (Fixed typos: the built-ins are blockIdx.x / threadIdx.x, not
// blockIdm.x / threadIdm.x — the misspelling would fail to compile at
// any expansion site.)
#define get_tid() ((blockIdx.y*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x)
#define get_bid() (blockIdx.y*gridDim.x + blockIdx.x)
// Wall-clock time in seconds via the Windows high-resolution
// performance counter. The counter frequency is queried once and
// cached in a function-local static on first use.
double get_time()
{
static LARGE_INTEGER freq;
static int freq_ready = 0;
LARGE_INTEGER now;
if (!freq_ready)
{
QueryPerformanceFrequency(&freq);
freq_ready = 1;
}
QueryPerformanceCounter(&now);
return now.QuadPart * 1. / freq.QuadPart;
}
// warm up gpu
__global__ void warmup_knl(void)
{
int i, j;
i = 1;
j = 1;
i = i + j;
}
// Launch the warm-up kernel several times so subsequent timing
// measurements are not skewed by one-time GPU initialization cost.
void warm_up()
{
for (int launch = 0; launch < 8; ++launch)
{
warmup_knl <<<1, 256 >>> ();
}
}
// host code
void vec_add_host(FLOAT* a, int len)
{
FLOAT sum = a[0];
for (int i = 1; i < len; ++i)
{
a[0] += a[i];
}
}
// device code
__global__ void vec_add_device(FLOAT* a, int width)
{
int tid = threadIdx.y*blockDim.x + threadIdx.x;
for (int stride = 1; stride < width; stride*=2)
{
if (tid % (2*stride) == 0)
{
a[tid] += a[tid + stride];
}
__syncthreads();
}
}
// Host driver: sums N = 32*32 ones on both GPU and CPU, times each pass,
// and prints the speedup plus both results as a correctness check.
int main()
{
int M = 32;
int N = M*M;                 // 1024 elements — fits in a single block
int nbytes = N * sizeof(FLOAT);
dim3 dimGrid(1, 1);          // ONE block: the kernel relies on
dim3 dimBlock(1024, 1);      // __syncthreads(), which is block-wide only
FLOAT* dm = NULL, *hm = NULL;
FLOAT* hm2 = NULL;           // declared up front so no `goto Error` jumps
                             // over an initialized local (ill-formed C++)
                             // and free(hm2) never sees garbage
int iter = 1, i;
double th, td;
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed!");
goto Error;
}
warm_up();
/* allocate gpu memory — check this call's status, not a stale one */
cudaStatus = cudaMalloc((void**)&dm, nbytes);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc dm failed!");
goto Error;
}
hm = (FLOAT*)malloc(nbytes);
if (hm == NULL)
{
fprintf(stderr, "malloc hm failed!");
goto Error;
}
/* init: all ones, so the expected sum is N */
for (i = 0; i < N; ++i) hm[i] = 1.0f;
/* copy data to gpu */
cudaStatus = cudaMemcpy(dm, hm, nbytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
warm_up();
// call for gpu — cudaThreadSynchronize() is deprecated;
// cudaDeviceSynchronize() is the supported equivalent
cudaDeviceSynchronize();
td = get_time();
for (i = 0; i < iter; ++i) vec_add_device << <dimGrid, dimBlock >> > (dm, N);
cudaStatus = cudaGetLastError();     // launch-config errors surface here
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "launch kernel failed!");
goto Error;
}
cudaDeviceSynchronize();             // wait so td covers kernel execution
td = get_time() - td;
// call for cpu
th = get_time();
for (i = 0; i < iter; ++i) vec_add_host(hm, N);
th = get_time() - th;
printf("GPU time: %.8f, CPU time: %.6f, Speedup: %g\n", td, th, th / td);
hm2 = (FLOAT*)malloc(nbytes);
if (hm2 == NULL)
{
fprintf(stderr, "malloc hm2 failed!");
goto Error;
}
for (i = 0; i < N; ++i) hm2[i] = 0;
cudaStatus = cudaMemcpy(hm2, dm, nbytes, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
printf("%.1f, %.1f\n", hm[0], hm2[0]);
Error:
// free device and host buffers (cudaFree/free of NULL are no-ops)
cudaFree(dm);
free(hm);
free(hm2);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!");
return -1;
}
return 0;
}
实验结果
GPU time: 0.00007460, CPU time: 0.000003, Speedup: 0.0361931
1024.0, 1024.0
存在问题:
上面的代码是最简单的规约求和方案,没有做任何的优化,数据处理量也不大。
注意:__syncthreads() 是用于块内同步,无法做到块间同步。