Static global (device) memory is laid out at compile time and allocated once when the program loads, instead of being allocated and freed repeatedly at run time, so it performs better:
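To make the contrast concrete, here is a minimal standalone sketch; the names LEN, d_buf, and d_static_buf are made up for illustration and do not appear in the program below:

#include <cuda_runtime.h>

#define LEN 1024 // hypothetical length, for illustration only

// Static global (device) memory: defined at file scope and allocated
// once for the lifetime of the program -- no per-call allocation cost.
__device__ float d_static_buf[LEN];

int main(void)
{
    // Dynamic global memory: allocated and freed at run time, every call.
    float *d_buf;
    cudaMalloc(&d_buf, sizeof(float) * LEN);
    cudaFree(d_buf);

    // To pass the static array to a kernel as a pointer argument,
    // host code obtains its device address via cudaGetSymbolAddress():
    float *p;
    cudaGetSymbolAddress((void **)&p, d_static_buf);
    return 0;
}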
reduce2static.cu
#include "error.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups; //使用线程协作组
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(real) * N;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240; //常量
void timing(const real *d_x);
int main(void)
{
    real *h_x = (real *) malloc(M);
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
    }
    real *d_x;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[]; // dynamically sized shared memory

    // Grid-stride loop: each thread accumulates its share of the input.
    real y = 0.0;
    const int stride = blockDim.x * gridDim.x;
    for (int n = bid * blockDim.x + tid; n < N; n += stride)
    {
        y += d_x[n];
    }
    s_y[tid] = y;
    __syncthreads();

    // Binary reduction in shared memory, stopping at warp size.
    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    // Finish the last 32 partial sums with warp-level shuffles
    // inside a 32-thread cooperative-groups tile.
    y = s_y[tid];
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        d_y[bid] = y;
    }
}
__device__ real static_y[GRID_SIZE]; // static global memory, defined at file scope

real reduce(const real *d_x)
{
    real *d_y;
    // cudaGetSymbolAddress() yields a device pointer d_y to the static
    // global variable static_y, so it can be passed to the kernels.
    CHECK(cudaGetSymbolAddress((void **)&d_y, static_y));

    const int smem = sizeof(real) * BLOCK_SIZE;
    reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, N);
    // Reduce the GRID_SIZE block sums in place with a single block.
    reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);

    real h_y[1] = {0};
    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    // CHECK(cudaMemcpyFromSymbol(h_y, static_y, sizeof(real))); // also works

    return h_y[0];
}
void timing(const real *d_x)
{
    real sum = 0;

    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start); // not wrapped in CHECK: may return cudaErrorNotReady

        sum = reduce(d_x);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("sum = %f.\n", sum);
}
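As the commented-out cudaMemcpyFromSymbol() line in reduce() suggests, a static global variable can also be copied by symbol name, without first obtaining a device pointer. A minimal sketch, with an illustrative variable d_value that is not part of the program above:

#include <cuda_runtime.h>
#include <stdio.h>

__device__ float d_value; // static device variable, addressed by symbol

int main(void)
{
    float h_value = 3.14f;
    // Copy host -> device and device -> host by symbol name,
    // without ever holding a raw device pointer.
    cudaMemcpyToSymbol(d_value, &h_value, sizeof(float));
    float h_out = 0.0f;
    cudaMemcpyFromSymbol(&h_out, d_value, sizeof(float));
    printf("h_out = %g\n", h_out); // prints 3.14
    return 0;
}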
The header file error.cuh defines the CHECK error-checking macro:
#pragma once
#include <stdio.h>

#define CHECK(call)                                          \
do                                                           \
{                                                            \
    const cudaError_t error_code = call;                     \
    if (error_code != cudaSuccess)                           \
    {                                                        \
        printf("CUDA Error:\n");                             \
        printf("    File:       %s\n", __FILE__);            \
        printf("    Line:       %d\n", __LINE__);            \
        printf("    Error code: %d\n", error_code);          \
        printf("    Error text: %s\n",                       \
            cudaGetErrorString(error_code));                 \
        exit(1);                                             \
    }                                                        \
} while (0)
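One point worth noting when using CHECK: runtime API calls return a cudaError_t that the macro can inspect, but a kernel launch does not. A small sketch of the usual idiom (the buffer d_p is illustrative, not part of the original program):

#include "error.cuh"
#include <cuda_runtime.h>

int main(void)
{
    float *d_p;
    CHECK(cudaMalloc(&d_p, sizeof(float) * 256)); // on failure, prints file/line and exits

    // A kernel launch returns no cudaError_t, so it cannot be wrapped
    // directly; the usual idiom is to check it afterwards:
    //     my_kernel<<<grid, block>>>(...);
    //     CHECK(cudaGetLastError());      // launch-configuration errors
    //     CHECK(cudaDeviceSynchronize()); // errors raised during execution

    CHECK(cudaFree(d_p));
    return 0;
}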
Compile:
$ nvcc reduce2static.cu -o reduce2static
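Because precision is selected through the USE_DP macro in the source, a double-precision build can presumably be produced by defining it on the command line:

$ nvcc -DUSE_DP reduce2static.cu -o reduce2static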
Run:
$ ./reduce2static
Output:
Time = 0.57488 ms.
Time = 0.481184 ms.
Time = 0.486592 ms.
Time = 0.496288 ms.
Time = 0.490688 ms.
Time = 0.483872 ms.
Time = 0.484064 ms.
Time = 0.481376 ms.
Time = 0.48432 ms.
Time = 0.479776 ms.
Time = 0.477376 ms.
Time = 0.478592 ms.
Time = 0.47968 ms.
Time = 0.481024 ms.
Time = 0.483936 ms.
Time = 0.479744 ms.
Time = 0.481024 ms.
Time = 0.49072 ms.
Time = 0.48128 ms.
Time = 0.488608 ms.
Time = 0.481728 ms.
Time = 0.482176 ms.
Time = 0.484096 ms.
Time = 0.478528 ms.
Time = 0.47712 ms.
Time = 0.477664 ms.
Time = 0.726176 ms.
Time = 0.65888 ms.
Time = 0.734848 ms.
Time = 0.712736 ms.
Time = 0.483712 ms.
Time = 0.479552 ms.
Time = 0.477472 ms.
Time = 0.482432 ms.
Time = 0.482272 ms.
Time = 0.48016 ms.
Time = 0.480896 ms.
Time = 0.481728 ms.
Time = 0.49392 ms.
Time = 0.478144 ms.
Time = 0.478368 ms.
Time = 0.483008 ms.
Time = 0.47776 ms.
Time = 0.479264 ms.
Time = 0.491968 ms.
Time = 0.48112 ms.
Time = 0.480384 ms.
Time = 0.484864 ms.
Time = 0.483936 ms.
Time = 0.482272 ms.
Time = 0.480672 ms.
Time = 0.480832 ms.
Time = 0.477696 ms.
Time = 0.47824 ms.
Time = 0.48128 ms.
Time = 0.490912 ms.
Time = 1.20912 ms.
Time = 0.660928 ms.
Time = 0.73088 ms.
Time = 0.484032 ms.
Time = 0.484032 ms.
Time = 0.480928 ms.
Time = 0.477312 ms.
Time = 0.48592 ms.
Time = 0.489568 ms.
Time = 0.496 ms.
Time = 0.480704 ms.
Time = 0.479328 ms.
Time = 0.483648 ms.
Time = 0.4776 ms.
Time = 0.477696 ms.
Time = 0.482592 ms.
Time = 0.484064 ms.
Time = 0.484096 ms.
Time = 0.47792 ms.
Time = 0.48144 ms.
Time = 0.47856 ms.
Time = 0.493248 ms.
Time = 0.478176 ms.
Time = 0.482272 ms.
Time = 0.724128 ms.
Time = 0.477184 ms.
Time = 0.484832 ms.
Time = 0.49392 ms.
Time = 0.482176 ms.
Time = 0.477824 ms.
Time = 0.477248 ms.
Time = 0.477632 ms.
Time = 0.484608 ms.
Time = 0.476864 ms.
Time = 0.480864 ms.
Time = 0.4808 ms.
Time = 0.4976 ms.
Time = 0.4912 ms.
Time = 0.47808 ms.
Time = 0.493984 ms.
Time = 0.480384 ms.
Time = 0.489184 ms.
Time = 0.48288 ms.
Time = 0.481792 ms.
sum = 123000064.000000.
Compared with the results in my earlier post (GPU编程 CUDA C++ 数组归约:利用线程协作组和共享内存提高CUDA核心线程利用率_温柔的行子的博客-CSDN博客), this version is roughly 20% faster while the accuracy is unchanged: the exact sum is 1.23 × 10^8 = 123000000, and the single-precision result 123000064 deviates by a relative error of about 5 × 10^-7, at the level of float rounding.