【CUDA】Memory Coalescing（内存合并）

最新推荐文章于 2025-03-12 19:37:01 发布

xll_bit

最新推荐文章于 2025-03-12 19:37:01 发布

阅读量2.2k

点赞数 1

分类专栏： c++ CUDA 性能分析文章标签：算法 linux c++

本文链接：https://blog.csdn.net/xll_bit/article/details/117702476

版权

c++ 同时被 3 个专栏收录

17 篇文章

订阅专栏

性能分析

6 篇文章

订阅专栏

CUDA

5 篇文章

订阅专栏

内存合并的基础是DRAM Burst。DRAM的地址空间被分成不同的Burst段，当处理器从DRAM访问数据时，位于相同段的其他数据也会被传送到处理器，如下图所示：

利用这一特性，当我们从全局内存搬运数据到共享内存或者寄存器时，相邻的线程应该尽量访问相邻的数据，从而达到内存合并的目的，如下图所示：

如下为实现内存合并的一般性规则：

即，访问元素的id以能以 (X + threadIdx.x)的形式表示，其中X是与threadIdx.x无关的量。

如下是两组实验，main函数详见博客：https://blog.csdn.net/xll_bit/article/details/117700829?spm=1001.2014.3001.5501

本文只粘贴核函数的实现：

//单个线程对数组a按列访问，每个线程在同一指令下访问同一行中相邻元素，能进行内存合并访问
__global__ void kernel_globalx(my_type *a, my_type *b, my_type *c,
                              const int M, const int N, const int K){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x < K && y < M){
        my_type tmp = 0;
        for(int n = 0; n < N; n++){
            tmp += a[y * N + n];
        }
        //c[y * K + x] = tmp;
        c[0] = tmp;
    }
}
//单个线程对数组a按行访问，每个线程在同一指令下访问同一列中相邻元素，不能进行内存合并
__global__ void kernel_globaly(my_type *a, my_type *b, my_type *c,
                              const int M, const int N, const int K){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x < K && y < M){
        my_type tmp = 0;
        for(int n = 0; n < N; n++){
            tmp += a[n * K + x];
        }
        //c[y * K + x] = tmp;
        c[0] = tmp;
    }
}

/* 运行结果
gpu globalx:  16.0523
gpu globaly:  18.9172
*/

从运行时间来看，globaly的运行时间长于globalx，和上面理论一致。

另外，在使用tile形式将全局内存搬到共享内存的过程中时，对两个矩阵的元素访问是都可以进行内存合并的

以下为相关的核函数以及结果：

__global__ void kernel_shared1(my_type *a, my_type *b, my_type *c,
                              const int M, const int N, const int K){
    __shared__ my_type s_a[TILE_SIZE][TILE_SIZE];
    __shared__ my_type s_b[TILE_SIZE][TILE_SIZE];
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    my_type tmp = 0;
    for(int n = 0; n < N; n += TILE_SIZE){
        s_a[ty][tx] = 0;
        int x1 = y *N + n + tx;
        int x2 = (n + ty) * K + x;
        if( y < M && n + tx < N){
            s_a[ty][tx] = a[ x1];
        }
        __syncthreads();
        for(int i = 0; i < TILE_SIZE; i ++){
            tmp += s_a[ty][i];
        }
        __syncthreads();
    }
}
__global__ void kernel_shared2(my_type *a, my_type *b, my_type *c,
                              const int M, const int N, const int K){
    __shared__ my_type s_a[TILE_SIZE][TILE_SIZE];
    __shared__ my_type s_b[TILE_SIZE][TILE_SIZE];
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    my_type tmp = 0;
    for(int n = 0; n < N; n += TILE_SIZE){
        s_b[ty][tx] = 0;
        int x1 = y *N + n + tx;
        int x2 = (n + ty) * K + x;
        if (x < K && n + ty < N){
            s_b[ty][tx] = b[ x2 ];
        }
        __syncthreads();
        for(int i = 0; i < TILE_SIZE; i ++){
            tmp += s_b[i][tx];
        }
        __syncthreads();
    }
}

/* 运行结果
gpu shared1:  3.42726
gpu shared2:  4.54283
*/