GPU编程 CUDA C++ 数组归约：利用线程协作组和共享内存提高CUDA核心线程利用率

最新推荐文章于 2024-03-04 23:43:00 发布

温柔的行子

最新推荐文章于 2024-03-04 23:43:00 发布

阅读量119

点赞数 1

文章标签： c++ 开发语言 c语言算法数据挖掘

本文链接：https://blog.csdn.net/qq_42536162/article/details/129915492

版权

reduce1parallelism.cu

#include "error.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(real) * N;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240;

void timing(const real *h_x);

int main(void)
{
    real *h_x = (real *) malloc(M);
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
    }
    real *d_x;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}

void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[];

    real y = 0.0;    // y在核函数中被定义，不加任何限定符，被存于GPU寄存器中
    const int stride = blockDim.x * gridDim.x;   //stride为跨度，跨度为整个网格
    for (int n = bid * blockDim.x + tid; n < N; n += stride)  //大跨度保证合并访问
    {
        y += d_x[n];
    }
    s_y[tid] = y;    //将寄存器变量y复制到共享内存变量s_y[]
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    y = s_y[tid];

    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        d_y[bid] = y;   //不再使用原子函数atomicAdd()，精度更高
    }
}

real reduce(const real *d_x)    //包装函数
{
    const int ymem = sizeof(real) * GRID_SIZE;
    const int smem = sizeof(real) * BLOCK_SIZE;

    real h_y[1] = {0};
    real *d_y;
    CHECK(cudaMalloc(&d_y, ymem));

    //两次调用核函数，比使用原子函数atomicAdd()更加精准
    reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, N);   //把长数组d_x归约到较短数组d_y
    reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);

    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(d_y));

    return h_y[0];
}

void timing(const real *d_x)
{
    real sum = 0;

    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        sum = reduce(d_x); 

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("sum = %f.\n", sum);
}

头文件error.cuh中为错误检测宏CHECK函数：

#pragma once
#include <stdio.h>

#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)

编译：

$ nvcc reduce1parallelism.cu -o reduce1parallelism

运行：

$ ./reduce1parallelism

输出结果：

Time = 0.794592 ms.
Time = 0.62736 ms.
Time = 0.610592 ms.
Time = 0.606592 ms.
Time = 0.617856 ms.
Time = 0.618304 ms.
Time = 0.603584 ms.
Time = 0.600512 ms.
Time = 0.605472 ms.
Time = 0.59984 ms.
Time = 0.610464 ms.
Time = 0.60096 ms.
Time = 0.611584 ms.
Time = 0.618656 ms.
Time = 0.67856 ms.
Time = 0.60352 ms.
Time = 0.616448 ms.
Time = 0.615072 ms.
Time = 0.628576 ms.
Time = 0.645376 ms.
Time = 0.775264 ms.
Time = 0.726016 ms.
Time = 0.708512 ms.
Time = 0.617856 ms.
Time = 0.598976 ms.
Time = 0.608256 ms.
Time = 0.596288 ms.
Time = 0.597056 ms.
Time = 0.615584 ms.
Time = 0.609248 ms.
Time = 0.597184 ms.
Time = 0.597024 ms.
Time = 0.597632 ms.
Time = 0.602528 ms.
Time = 0.601056 ms.
Time = 0.742944 ms.
Time = 0.609504 ms.
Time = 0.613248 ms.
Time = 0.6056 ms.
Time = 0.602528 ms.
Time = 0.600928 ms.
Time = 0.614976 ms.
Time = 0.595936 ms.
Time = 0.602272 ms.
Time = 0.598048 ms.
Time = 0.883232 ms.
Time = 1.02723 ms.
Time = 0.78512 ms.
Time = 0.702624 ms.
Time = 0.601696 ms.
Time = 0.613344 ms.
Time = 0.61104 ms.
Time = 0.61872 ms.
Time = 0.599616 ms.
Time = 0.6416 ms.
Time = 0.768032 ms.
Time = 0.614112 ms.
Time = 0.603744 ms.
Time = 0.669184 ms.
Time = 0.625888 ms.
Time = 0.60544 ms.
Time = 0.599456 ms.
Time = 0.604256 ms.
Time = 0.598432 ms.
Time = 0.621664 ms.
Time = 0.618144 ms.
Time = 0.610752 ms.
Time = 0.622144 ms.
Time = 0.604192 ms.
Time = 0.600512 ms.
Time = 0.605088 ms.
Time = 0.601792 ms.
Time = 0.618976 ms.
Time = 0.613472 ms.
Time = 0.61536 ms.
Time = 0.61776 ms.
Time = 0.617344 ms.
Time = 0.611776 ms.
Time = 0.626016 ms.
Time = 0.606464 ms.
Time = 0.612384 ms.
Time = 0.603264 ms.
Time = 0.616128 ms.
Time = 0.620832 ms.
Time = 0.603264 ms.
Time = 0.602784 ms.
Time = 0.61504 ms.
Time = 0.605088 ms.
Time = 0.6032 ms.
Time = 0.596704 ms.
Time = 0.599136 ms.
Time = 0.60208 ms.
Time = 0.607136 ms.
Time = 1.14854 ms.
Time = 0.792576 ms.
Time = 0.724448 ms.
Time = 0.776928 ms.
Time = 0.602656 ms.
Time = 0.63136 ms.
Time = 0.599328 ms.
sum = 123000064.000000.
(base)

两次调用核函数归约结果为：123000064.000000，精确结果为：123000000.0，比调用原子函数atomicAdd()计算结果：123633392.0更精确。

温柔的行子

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
GPU编程 CUDA C++ 数组归约：利用线程协作组和共享内存提高CUDA核心线程利用率

两次调用核函数归约结果为：123000064.000000，精确结果为：123000000.0，比调用原子函数atomicAdd()计算结果：123633392.0更精确。
复制链接

扫一扫