GPU编程 CUDA C++ 数组归约:利用线程协作组和共享内存提高CUDA核心线程利用率

reduce1parallelism.cu

#include "error.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 100;          // timing repetitions
const int N = 100000000;              // number of elements to reduce
const int M = sizeof(real) * N;       // byte size of the input array
const int BLOCK_SIZE = 128;           // threads per block for the first pass
const int GRID_SIZE = 10240;          // blocks in the first pass (= length of the partial-sum array)

// Times the two-pass reduction; the argument is a DEVICE pointer
// (renamed from h_x to d_x for consistency with the definition below).
void timing(const real *d_x);

int main(void)
{
    // Allocate and initialize the host input; every element is 1.23, so the
    // exact sum is 1.23 * N = 123000000.
    real *h_x = (real *) malloc(M);
    if (h_x == NULL)   // malloc can fail for a ~400 MB request; don't dereference NULL
    {
        printf("Error: failed to allocate %d bytes of host memory.\n", M);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
    }
    // Copy the input to the device and run the timed reduction.
    real *d_x;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}

// Sum-reduces the N-element device array d_x into one partial sum per block,
// written to d_y[blockIdx.x].  Launch requirements: dynamic shared memory of
// blockDim.x * sizeof(real) bytes; blockDim.x is assumed to be a power of two
// (the shared-memory loop halves it down to 32, and a 32-lane tile finishes
// the job) — TODO confirm callers never launch with a non-power-of-two block.
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[];   // one slot per thread, sized at launch

    real y = 0.0;    // per-thread accumulator; a plain local, held in a register
    const int stride = blockDim.x * gridDim.x;   // grid-stride: step by the whole grid
    for (int n = bid * blockDim.x + tid; n < N; n += stride)  // large stride keeps the warp's accesses coalesced
    {
        y += d_x[n];
    }
    s_y[tid] = y;    // publish the register partial sum to shared memory
    __syncthreads();

    // Tree reduction in shared memory; stop once only the final 32 values
    // remain, which a single warp can finish without block-wide barriers.
    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();   // outside the if: every thread must reach the barrier
    }

    y = s_y[tid];

    // Finish the last 32 elements with register shuffles inside a 32-thread
    // cooperative-groups tile; the tile handles warp synchronization.
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        d_y[bid] = y;   // one partial sum per block; avoids atomicAdd() for better accuracy
    }
}

// Host wrapper: reduces the length-N device array d_x to a single value.
// Two kernel passes are used instead of atomicAdd(), which gives a
// deterministic summation order and better floating-point accuracy.
real reduce(const real *d_x)
{
    const int ymem = sizeof(real) * GRID_SIZE;    // partial-sum array, one entry per block
    const int smem = sizeof(real) * BLOCK_SIZE;   // dynamic shared memory for pass 1

    real h_y[1] = {0};
    real *d_y;
    CHECK(cudaMalloc(&d_y, ymem));

    // Pass 1: reduce the long array d_x to GRID_SIZE per-block partial sums.
    reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, N);
    CHECK(cudaGetLastError());   // kernel launches don't return errors; surface bad configs here
    // Pass 2: a single 1024-thread block reduces the partial sums in place
    // (each element is read in the grid-stride loop before d_y[0] is written).
    reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);
    CHECK(cudaGetLastError());

    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(d_y));

    return h_y[0];
}

// Runs the full reduction NUM_REPEATS times, printing the elapsed wall time
// of each run (measured with CUDA events) and the final sum once at the end.
void timing(const real *d_x)
{
    real sum = 0;

    for (int i = 0; i < NUM_REPEATS; ++i)
    {
        cudaEvent_t t_begin, t_end;
        CHECK(cudaEventCreate(&t_begin));
        CHECK(cudaEventCreate(&t_end));
        CHECK(cudaEventRecord(t_begin));
        cudaEventQuery(t_begin);   // deliberately unchecked: may legitimately return cudaErrorNotReady

        sum = reduce(d_x);

        CHECK(cudaEventRecord(t_end));
        CHECK(cudaEventSynchronize(t_end));   // block until t_end is reached before reading the timer
        float ms;
        CHECK(cudaEventElapsedTime(&ms, t_begin, t_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(t_begin));
        CHECK(cudaEventDestroy(t_end));
    }

    printf("sum = %f.\n", sum);
}

头文件 error.cuh 中定义了错误检测宏 CHECK:

#pragma once
#include <stdio.h>

// Wraps a CUDA runtime call: on any error, print the location and the
// decoded error text, then abort.  Diagnostics go to stderr (not stdout)
// so they survive output redirection and appear even if stdout is buffered.
#define CHECK(call)                                        \
do                                                         \
{                                                          \
    const cudaError_t error_code = call;                   \
    if (error_code != cudaSuccess)                         \
    {                                                      \
        fprintf(stderr, "CUDA Error:\n");                  \
        fprintf(stderr, "    File:       %s\n", __FILE__); \
        fprintf(stderr, "    Line:       %d\n", __LINE__); \
        fprintf(stderr, "    Error code: %d\n",            \
            error_code);                                   \
        fprintf(stderr, "    Error text: %s\n",            \
            cudaGetErrorString(error_code));               \
        exit(1);                                           \
    }                                                      \
} while (0)

编译:

$ nvcc reduce1parallelism.cu -o reduce1parallelism

运行:

$ ./reduce1parallelism

输出结果:

Time = 0.794592 ms.
Time = 0.62736 ms.
Time = 0.610592 ms.
Time = 0.606592 ms.
Time = 0.617856 ms.
Time = 0.618304 ms.
Time = 0.603584 ms.
Time = 0.600512 ms.
Time = 0.605472 ms.
Time = 0.59984 ms.
Time = 0.610464 ms.
Time = 0.60096 ms.
Time = 0.611584 ms.
Time = 0.618656 ms.
Time = 0.67856 ms.
Time = 0.60352 ms.
Time = 0.616448 ms.
Time = 0.615072 ms.
Time = 0.628576 ms.
Time = 0.645376 ms.
Time = 0.775264 ms.
Time = 0.726016 ms.
Time = 0.708512 ms.
Time = 0.617856 ms.
Time = 0.598976 ms.
Time = 0.608256 ms.
Time = 0.596288 ms.
Time = 0.597056 ms.
Time = 0.615584 ms.
Time = 0.609248 ms.
Time = 0.597184 ms.
Time = 0.597024 ms.
Time = 0.597632 ms.
Time = 0.602528 ms.
Time = 0.601056 ms.
Time = 0.742944 ms.
Time = 0.609504 ms.
Time = 0.613248 ms.
Time = 0.6056 ms.
Time = 0.602528 ms.
Time = 0.600928 ms.
Time = 0.614976 ms.
Time = 0.595936 ms.
Time = 0.602272 ms.
Time = 0.598048 ms.
Time = 0.883232 ms.
Time = 1.02723 ms.
Time = 0.78512 ms.
Time = 0.702624 ms.
Time = 0.601696 ms.
Time = 0.613344 ms.
Time = 0.61104 ms.
Time = 0.61872 ms.
Time = 0.599616 ms.
Time = 0.6416 ms.
Time = 0.768032 ms.
Time = 0.614112 ms.
Time = 0.603744 ms.
Time = 0.669184 ms.
Time = 0.625888 ms.
Time = 0.60544 ms.
Time = 0.599456 ms.
Time = 0.604256 ms.
Time = 0.598432 ms.
Time = 0.621664 ms.
Time = 0.618144 ms.
Time = 0.610752 ms.
Time = 0.622144 ms.
Time = 0.604192 ms.
Time = 0.600512 ms.
Time = 0.605088 ms.
Time = 0.601792 ms.
Time = 0.618976 ms.
Time = 0.613472 ms.
Time = 0.61536 ms.
Time = 0.61776 ms.
Time = 0.617344 ms.
Time = 0.611776 ms.
Time = 0.626016 ms.
Time = 0.606464 ms.
Time = 0.612384 ms.
Time = 0.603264 ms.
Time = 0.616128 ms.
Time = 0.620832 ms.
Time = 0.603264 ms.
Time = 0.602784 ms.
Time = 0.61504 ms.
Time = 0.605088 ms.
Time = 0.6032 ms.
Time = 0.596704 ms.
Time = 0.599136 ms.
Time = 0.60208 ms.
Time = 0.607136 ms.
Time = 1.14854 ms.
Time = 0.792576 ms.
Time = 0.724448 ms.
Time = 0.776928 ms.
Time = 0.602656 ms.
Time = 0.63136 ms.
Time = 0.599328 ms.
sum = 123000064.000000.
(base) 

两次调用核函数归约结果为:123000064.000000,精确结果为:123000000.0,比调用原子函数atomicAdd()计算结果:123633392.0更精确。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

温柔的行子

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值