Static global (device) memory is laid out at compile time and allocated once when the program loads, instead of being allocated and freed repeatedly at run time, so it performs better:
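To make the contrast concrete, here is a minimal standalone sketch; the names LEN, d_buf, and d_static_buf are made up for illustration and do not appear in the program below:

#include <cuda_runtime.h>

#define LEN 1024 // hypothetical length, for illustration only

// Static global (device) memory: defined at file scope and allocated
// once for the lifetime of the program -- no per-call allocation cost.
__device__ float d_static_buf[LEN];

int main(void)
{
    // Dynamic global memory: allocated and freed at run time, every call.
    float *d_buf;
    cudaMalloc(&d_buf, sizeof(float) * LEN);
    cudaFree(d_buf);

    // To pass the static array to a kernel as a pointer argument,
    // host code obtains its device address via cudaGetSymbolAddress():
    float *p;
    cudaGetSymbolAddress((void **)&p, d_static_buf);
    return 0;
}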
reduce2static.cu
#include "error.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups; //使用线程协作组
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(real) * N;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240; //常量
void timing(const real *d_x);
int main(void)
{
    real *h_x = (real *) malloc(M);
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
    }
    real *d_x;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[]; // dynamically sized shared memory

    // Grid-stride loop: each thread accumulates its share of the input.
    real y = 0.0;
    const int stride = blockDim.x * gridDim.x;
    for (int n = bid * blockDim.x + tid; n < N; n += stride)
    {
        y += d_x[n];
    }
    s_y[tid] = y;
    __syncthreads();

    // Binary reduction in shared memory, stopping at warp size.
    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    // Finish the last 32 partial sums with warp-level shuffles
    // inside a 32-thread cooperative-groups tile.
    y = s_y[tid];
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        d_y[bid] = y;
    }
}
__device__ real static_y[GRID_SIZE]; // static global memory, defined at file scope

real reduce(const real *d_x)
{
    real *d_y;
    // cudaGetSymbolAddress() yields a device pointer d_y to the static
    // global variable static_y, so it can be passed to the kernels.
    CHECK(cudaGetSymbolAddress((void **)&d_y, static_y));

    const int smem = sizeof(real) * BLOCK_SIZE;
    reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, N);
    // Reduce the GRID_SIZE block sums in place with a single block.
    reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);

    real h_y[1] = {0};
    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    // CHECK(cudaMemcpyFromSymbol(h_y, static_y, sizeof(real))); // also works

    return h_y[0];
}
void timing(const real *d_x)
{
    real sum = 0;

    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start); // not wrapped in CHECK: may return cudaErrorNotReady

        sum = reduce(d_x);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("sum = %f.\n", sum);
}
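As the commented-out cudaMemcpyFromSymbol() line in reduce() suggests, a static global variable can also be copied by symbol name, without first obtaining a device pointer. A minimal sketch, with an illustrative variable d_value that is not part of the program above:

#include <cuda_runtime.h>
#include <stdio.h>

__device__ float d_value; // static device variable, addressed by symbol

int main(void)
{
    float h_value = 3.14f;
    // Copy host -> device and device -> host by symbol name,
    // without ever holding a raw device pointer.
    cudaMemcpyToSymbol(d_value, &h_value, sizeof(float));
    float h_out = 0.0f;
    cudaMemcpyFromSymbol(&h_out, d_value, sizeof(float));
    printf("h_out = %g\n", h_out); // prints 3.14
    return 0;
}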
The header file error.cuh defines the CHECK error-checking macro:
#pragma once
#include <stdio.h>

#define CHECK(call)                                          \
do                                                           \
{                                                            \
    const cudaError_t error_code = call;                     \
    if (error_code != cudaSuccess)                           \
    {                                                        \
        printf("CUDA Error:\n");                             \
        printf("    File:       %s\n", __FILE__);            \
        printf("    Line:       %d\n", __LINE__);            \
        printf("    Error code: %d\n", error_code);          \
        printf("    Error text: %s\n",                       \
            cudaGetErrorString(error_code));                 \
        exit(1);                                             \
    }                                                        \
} while (0)
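One point worth noting when using CHECK: runtime API calls return a cudaError_t that the macro can inspect, but a kernel launch does not. A small sketch of the usual idiom (the buffer d_p is illustrative, not part of the original program):

#include "error.cuh"
#include <cuda_runtime.h>

int main(void)
{
    float *d_p;
    CHECK(cudaMalloc(&d_p, sizeof(float) * 256)); // on failure, prints file/line and exits

    // A kernel launch returns no cudaError_t, so it cannot be wrapped
    // directly; the usual idiom is to check it afterwards:
    //     my_kernel<<<grid, block>>>(...);
    //     CHECK(cudaGetLastError());      // launch-configuration errors
    //     CHECK(cudaDeviceSynchronize()); // errors raised during execution

    CHECK(cudaFree(d_p));
    return 0;
}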
Compile:
$ nvcc reduce2static.cu -o reduce2static
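Because precision is selected through the USE_DP macro in the source, a double-precision build can presumably be produced by defining it on the command line:

$ nvcc -DUSE_DP reduce2static.cu -o reduce2static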
Run:
$ ./reduce2static
Output:
Time = 0.57488 ms.
Time = 0.481184 ms.
Time = 0.486592 ms.
Time = 0.496288 ms.
Time = 0.490688 ms.
Time = 0.483872 ms.
Time = 0.484064 ms.
Time = 0.481376 ms.
Time = 0.48432 ms.
Time = 0.479776 ms.
Time = 0.477376 ms.
Time = 0.478592 ms.
Time = 0.47968 ms.
Time = 0.481024 ms.
Time = 0.483936 ms.
Time = 0.479744 ms.
Time = 0.481024 ms.
Time = 0.49072 ms.
Time = 0.48128 ms.
Time = 0.488608 ms.
Time = 0.481728 ms.
Time = 0.482176 ms.
Time = 0.484096 ms.
Time = 0.478528 ms.
Time = 0.47712 ms.
Time = 0.477664 ms.
Time = 0.726176 ms.
Time = 0.65888 ms.
Time = 0.734848 ms.
Time = 0.712736 ms.
Time = 0.483712 ms.
Time = 0.479552 ms.
Time = 0.477472 ms.
Time = 0.482432 ms.
Time = 0.482272 ms.
Time = 0.48016 ms.
Time = 0.480896 ms.
Time = 0.481728 ms.
Time = 0.49392 ms.
Time = 0.478144 ms.
Time = 0.478368 ms.
Time = 0.483008 ms.
Time = 0.47776 ms.
Time = 0.479264 ms.
Time = 0.491968 ms.
Time = 0.48112 ms.
Time = 0.480384 ms.
Time = 0.484864 ms.
Time = 0.483936 ms.
Time = 0.482272 ms.
Time = 0.480672 ms.
Time = 0.480832 ms.
Time = 0.477696 ms.
Time = 0.47824 ms.
Time = 0.48128 ms.
Time = 0.490912 ms.
Time = 1.20912 ms.
Time = 0.660928 ms.
Time = 0.73088 ms.
Time = 0.484032 ms.
Time = 0.484032 ms.
Time = 0.480928 ms.
Time = 0.477312 ms.
Time = 0.48592 ms.
Time = 0.489568 ms.
Time = 0.496 ms.
Time = 0.480704 ms.
Time = 0.479328 ms.
Time = 0.483648 ms.
Time = 0.4776 ms.
Time = 0.477696 ms.
Time = 0.482592 ms.
Time = 0.484064 ms.
Time = 0.484096 ms.
Time = 0.47792 ms.
Time = 0.48144 ms.
Time = 0.47856 ms.
Time = 0.493248 ms.
Time = 0.478176 ms.
Time = 0.482272 ms.
Time = 0.724128 ms.
Time = 0.477184 ms.
Time = 0.484832 ms.
Time = 0.49392 ms.
Time = 0.482176 ms.
Time = 0.477824 ms.
Time = 0.477248 ms.
Time = 0.477632 ms.
Time = 0.484608 ms.
Time = 0.476864 ms.
Time = 0.480864 ms.
Time = 0.4808 ms.
Time = 0.4976 ms.
Time = 0.4912 ms.
Time = 0.47808 ms.
Time = 0.493984 ms.
Time = 0.480384 ms.
Time = 0.489184 ms.
Time = 0.48288 ms.
Time = 0.481792 ms.
sum = 123000064.000000.
Compared with the results in my earlier post (GPU编程 CUDA C++ 数组归约:利用线程协作组和共享内存提高CUDA核心线程利用率_温柔的行子的博客-CSDN博客), this version is roughly 20% faster while the accuracy is unchanged: the exact sum is 1.23 × 10^8 = 123000000, and the single-precision result 123000064 deviates by a relative error of about 5 × 10^-7, at the level of float rounding.