reduce1parallelism.cu
#include "error.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(real) * N;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240;
void timing(const real *h_x);
int main(void)
{
real *h_x = (real *) malloc(M);
for (int n = 0; n < N; ++n)
{
h_x[n] = 1.23;
}
real *d_x;
CHECK(cudaMalloc(&d_x, M));
CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
timing(d_x);
free(h_x);
CHECK(cudaFree(d_x));
return 0;
}
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
extern __shared__ real s_y[];
real y = 0.0; // y在核函数中被定义,不加任何限定符,被存于GPU寄存器中
const int stride = blockDim.x * gridDim.x; //stride为跨度,跨度为整个网格
for (int n = bid * blockDim.x + tid; n < N; n += stride) //大跨度保证合并访问
{
y += d_x[n];
}
s_y[tid] = y; //将寄存器变量y复制到共享内存变量s_y[]
__syncthreads();
for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
{
if (tid < offset)
{
s_y[tid] += s_y[tid + offset];
}
__syncthreads();
}
y = s_y[tid];
thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
for (int i = g.size() >> 1; i > 0; i >>= 1)
{
y += g.shfl_down(y, i);
}
if (tid == 0)
{
d_y[bid] = y; //不再使用原子函数atomicAdd(),精度更高
}
}
real reduce(const real *d_x) //包装函数
{
const int ymem = sizeof(real) * GRID_SIZE;
const int smem = sizeof(real) * BLOCK_SIZE;
real h_y[1] = {0};
real *d_y;
CHECK(cudaMalloc(&d_y, ymem));
//两次调用核函数,比使用原子函数atomicAdd()更加精准
reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, N); //把长数组d_x归约到较短数组d_y
reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);
CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
CHECK(cudaFree(d_y));
return h_y[0];
}
void timing(const real *d_x)
{
real sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduce(d_x);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("Time = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("sum = %f.\n", sum);
}
头文件error.cuh中为错误检测宏CHECK函数:
#pragma once
#include <stdio.h>
#define CHECK(call) \
do \
{ \
const cudaError_t error_code = call; \
if (error_code != cudaSuccess) \
{ \
printf("CUDA Error:\n"); \
printf(" File: %s\n", __FILE__); \
printf(" Line: %d\n", __LINE__); \
printf(" Error code: %d\n", error_code); \
printf(" Error text: %s\n", \
cudaGetErrorString(error_code)); \
exit(1); \
} \
} while (0)
编译:
$ nvcc reduce1parallelism.cu -o reduce1parallelism
运行:
$ ./reduce1parallelism
输出结果:
Time = 0.794592 ms.
Time = 0.62736 ms.
Time = 0.610592 ms.
Time = 0.606592 ms.
Time = 0.617856 ms.
Time = 0.618304 ms.
Time = 0.603584 ms.
Time = 0.600512 ms.
Time = 0.605472 ms.
Time = 0.59984 ms.
Time = 0.610464 ms.
Time = 0.60096 ms.
Time = 0.611584 ms.
Time = 0.618656 ms.
Time = 0.67856 ms.
Time = 0.60352 ms.
Time = 0.616448 ms.
Time = 0.615072 ms.
Time = 0.628576 ms.
Time = 0.645376 ms.
Time = 0.775264 ms.
Time = 0.726016 ms.
Time = 0.708512 ms.
Time = 0.617856 ms.
Time = 0.598976 ms.
Time = 0.608256 ms.
Time = 0.596288 ms.
Time = 0.597056 ms.
Time = 0.615584 ms.
Time = 0.609248 ms.
Time = 0.597184 ms.
Time = 0.597024 ms.
Time = 0.597632 ms.
Time = 0.602528 ms.
Time = 0.601056 ms.
Time = 0.742944 ms.
Time = 0.609504 ms.
Time = 0.613248 ms.
Time = 0.6056 ms.
Time = 0.602528 ms.
Time = 0.600928 ms.
Time = 0.614976 ms.
Time = 0.595936 ms.
Time = 0.602272 ms.
Time = 0.598048 ms.
Time = 0.883232 ms.
Time = 1.02723 ms.
Time = 0.78512 ms.
Time = 0.702624 ms.
Time = 0.601696 ms.
Time = 0.613344 ms.
Time = 0.61104 ms.
Time = 0.61872 ms.
Time = 0.599616 ms.
Time = 0.6416 ms.
Time = 0.768032 ms.
Time = 0.614112 ms.
Time = 0.603744 ms.
Time = 0.669184 ms.
Time = 0.625888 ms.
Time = 0.60544 ms.
Time = 0.599456 ms.
Time = 0.604256 ms.
Time = 0.598432 ms.
Time = 0.621664 ms.
Time = 0.618144 ms.
Time = 0.610752 ms.
Time = 0.622144 ms.
Time = 0.604192 ms.
Time = 0.600512 ms.
Time = 0.605088 ms.
Time = 0.601792 ms.
Time = 0.618976 ms.
Time = 0.613472 ms.
Time = 0.61536 ms.
Time = 0.61776 ms.
Time = 0.617344 ms.
Time = 0.611776 ms.
Time = 0.626016 ms.
Time = 0.606464 ms.
Time = 0.612384 ms.
Time = 0.603264 ms.
Time = 0.616128 ms.
Time = 0.620832 ms.
Time = 0.603264 ms.
Time = 0.602784 ms.
Time = 0.61504 ms.
Time = 0.605088 ms.
Time = 0.6032 ms.
Time = 0.596704 ms.
Time = 0.599136 ms.
Time = 0.60208 ms.
Time = 0.607136 ms.
Time = 1.14854 ms.
Time = 0.792576 ms.
Time = 0.724448 ms.
Time = 0.776928 ms.
Time = 0.602656 ms.
Time = 0.63136 ms.
Time = 0.599328 ms.
sum = 123000064.000000.
(base)
两次调用核函数归约结果为:123000064.000000,精确结果为:123000000.0,比调用原子函数atomicAdd()计算结果:123633392.0更精确。