Shared memory
Configuring shared memory
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    //- Query the shared memory bank size currently configured on the device
    cudaSharedMemConfig pConfig;
    cudaDeviceGetSharedMemConfig(&pConfig);
    switch (pConfig)
    {
        case cudaSharedMemBankSizeDefault:   printf("bank size is default\n"); break;
        case cudaSharedMemBankSizeFourByte:  printf("bank size is 4 byte\n");  break;
        case cudaSharedMemBankSizeEightByte: printf("bank size is 8 byte\n");  break;
    }
    return 0;
}
bank size is 4 byte
That is, the current bank width is 4 bytes (32 bits).
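If 8-byte banks are preferred (for example when the shared data is double precision), the bank size can be changed as well as queried. A minimal sketch, my own addition, using the device-wide runtime call:

//- Request 8-byte shared memory banks for kernels launched afterwards
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);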
Shared memory loads and stores
The following program compares reading and writing SMEM in row-major versus column-major order.
#include <stdio.h>
#include <stdlib.h>

#define BDIMX 32
#define BDIMY 32

__global__ void setRowReadRow(int *out)
{
    //- Static shared memory, written and read in row-major order
    __shared__ int tile[BDIMY][BDIMX];
    int index = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = index;
    __syncthreads();
    out[index] = tile[threadIdx.y][threadIdx.x];
}

__global__ void setColReadCol(int *out)
{
    //- Static shared memory, written and read in column-major order
    __shared__ int tile[BDIMX][BDIMY];
    int index = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.x][threadIdx.y] = index;
    __syncthreads();
    out[index] = tile[threadIdx.x][threadIdx.y];
}
int main(int argc, char **argv)
{
    cudaSetDevice(5);
    int *out;
    int nElem = 1024;
    cudaMalloc((void **)&out, nElem * sizeof(int));
    dim3 block(BDIMX, BDIMY);
    dim3 grid(1);
    setRowReadRow<<<grid, block>>>(out);
    cudaDeviceSynchronize();
    cudaFuncSetSharedMemConfig(setColReadCol, cudaSharedMemBankSizeFourByte);
    setColReadCol<<<grid, block>>>(out);
    cudaDeviceSynchronize();
    cudaDeviceReset();
    return 0;
}
The first kernel writes and reads in row-major order: the 4-byte int elements accessed by the 32 threads of a warp occupy 32 consecutive words and therefore fall into 32 different banks, so there are no bank conflicts and each load or store request is served by a single transaction. The second kernel writes and reads in column-major order: the addresses accessed by the threads of a warp are 32 words apart, so with 4-byte banks they all map to the same bank and the accesses conflict.
==55784== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (5)"
Kernel: setColReadCol(int*)
1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 16.000000 16.000000 16.000000
1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 16.000000 16.000000 16.000000
Kernel: setRowReadRow(int*)
1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.000000 1.000000 1.000000
1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.000000 1.000000 1.000000
What is puzzling is that I explicitly set the second kernel to use 4-byte banks, so a 32-way bank conflict would be expected, i.e. one warp request served by 32 transactions. The measured 16 transactions per request instead corresponds to 8-byte banks.
Removing bank conflicts with padding
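The standard remedy is to pad each row of the shared tile with one extra element, so that threads of a warp walking down a column land in different banks. A minimal sketch along the lines of setColReadCol, my own addition (assuming 4-byte banks, not part of the original test program):

__global__ void setColReadColPad(int *out)
{
    //- The extra column shifts consecutive rows by one bank, so a warp that
    //- walks down a column now touches 32 different banks
    __shared__ int tile[BDIMX][BDIMY + 1];
    int index = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.x][threadIdx.y] = index;
    __syncthreads();
    out[index] = tile[threadIdx.x][threadIdx.y];
}

With the padding in place, shared_load_transactions_per_request and shared_store_transactions_per_request should drop back to 1.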
Accelerating the reduction with shared memory
Shared memory is far faster than global memory, so within each block the data to be processed is first loaded from GMEM into SMEM and the reduction is then carried out there.
#include <stdlib.h>
#include <stdio.h>

#define BLOCK_DIM_X 1024

__global__ void reduceGmem(int *g_idata, int *g_odata, int nElem)
{
    int index_global = blockIdx.x * blockDim.x + threadIdx.x;
    int *ref_idata = g_idata + blockIdx.x * blockDim.x;   //- this block's slice of GMEM
    if (index_global < nElem)
    {
        //- In-place reduction directly in global memory
        if (blockDim.x >= 1024 && threadIdx.x < 512) ref_idata[threadIdx.x] += ref_idata[threadIdx.x + 512];
        __syncthreads();
        if (blockDim.x >= 512 && threadIdx.x < 256) ref_idata[threadIdx.x] += ref_idata[threadIdx.x + 256];
        __syncthreads();
        if (blockDim.x >= 256 && threadIdx.x < 128) ref_idata[threadIdx.x] += ref_idata[threadIdx.x + 128];
        __syncthreads();
        if (blockDim.x >= 128 && threadIdx.x < 64) ref_idata[threadIdx.x] += ref_idata[threadIdx.x + 64];
        __syncthreads();
        //- Unrolled last warp (warp-synchronous; volatile keeps the values out of registers)
        if (threadIdx.x < 32)
        {
            volatile int *vsmem = ref_idata;
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 32];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 16];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 8];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 4];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 2];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 1];
        }
        if (threadIdx.x == 0) g_odata[blockIdx.x] = ref_idata[0];
    }
}
__global__ void reduceSmem(int *g_idata, int *g_odata, int nElem)
{
    __shared__ int smem[BLOCK_DIM_X];
    int index_global = blockIdx.x * blockDim.x + threadIdx.x;
    int *ref_idata = g_idata + blockIdx.x * blockDim.x;
    if (index_global < nElem)
    {
        smem[threadIdx.x] = ref_idata[threadIdx.x]; //- load from GMEM into SMEM
        __syncthreads();                            //- this synchronization is essential
        if (blockDim.x >= 1024 && threadIdx.x < 512) smem[threadIdx.x] += smem[threadIdx.x + 512];
        __syncthreads();
        if (blockDim.x >= 512 && threadIdx.x < 256) smem[threadIdx.x] += smem[threadIdx.x + 256];
        __syncthreads();
        if (blockDim.x >= 256 && threadIdx.x < 128) smem[threadIdx.x] += smem[threadIdx.x + 128];
        __syncthreads();
        if (blockDim.x >= 128 && threadIdx.x < 64) smem[threadIdx.x] += smem[threadIdx.x + 64];
        __syncthreads();
        //- Unrolled last warp, this time entirely in shared memory
        if (threadIdx.x < 32)
        {
            volatile int *vsmem = smem;
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 32];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 16];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 8];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 4];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 2];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 1];
        }
        if (threadIdx.x == 0) g_odata[blockIdx.x] = smem[0];
    }
}
int main(int argc, char **argv)
{
    cudaSetDevice(5);
    int nElem = 1 << 24;
    printf("arr size is %d\n", nElem);
    int *g_idata, *g_odata, *h_idata, *h_odata;
    dim3 block(BLOCK_DIM_X);
    dim3 grid((nElem + block.x - 1) / block.x);
    h_idata = (int *)malloc(nElem * sizeof(int));
    h_odata = (int *)malloc(grid.x * sizeof(int));
    cudaMalloc((void **)&g_idata, nElem * sizeof(int));
    cudaMalloc((void **)&g_odata, grid.x * sizeof(int));
    for (int i = 0; i < nElem; i++)
    {
        h_idata[i] = 1;
    }
    cudaMemcpy(g_idata, h_idata, nElem * sizeof(int), cudaMemcpyHostToDevice);
    reduceSmem<<<grid, block>>>(g_idata, g_odata, nElem);
    cudaDeviceSynchronize();
    reduceGmem<<<grid, block>>>(g_idata, g_odata, nElem);
    cudaDeviceSynchronize();
    cudaMemcpy(h_odata, g_odata, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    int result = 0;
    for (int i = 0; i < grid.x; i++)
    {
        result += h_odata[i];
    }
    if (result == nElem)
        printf("result is correct!\n");
    else
        printf("result is error!\n");
    free(h_idata);
    free(h_odata);
    cudaFree(g_idata);
    cudaFree(g_odata);
    cudaDeviceReset();
    return 0;
}
[mmhe@k231 chapter5]$ nvprof ./test
==31397== NVPROF is profiling process 31397, command: ./test
arr size is 16777216
result is correct!
==31397== Profiling application: ./test
==31397== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 37.43% 15.287ms 1 15.287ms 15.287ms 15.287ms reduceGmem(int*, int*, int)
37.00% 15.109ms 1 15.109ms 15.109ms 15.109ms reduceSmem(int*, int*, int)
25.54% 10.431ms 1 10.431ms 10.431ms 10.431ms [CUDA memcpy HtoD]
0.03% 11.231us 1 11.231us 11.231us 11.231us [CUDA memcpy DtoH]
As the profile shows, the two kernels take essentially the same amount of time.
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (5)"
Kernel: reduceSmem(int*, int*, int)
1 gld_transactions Global Load Transactions 524288 524288 524288
1 gst_transactions Global Store Transactions 16384 16384 16384
Kernel: reduceGmem(int*, int*, int)
1 gld_transactions Global Load Transactions 1277952 1277952 1277952
1 gst_transactions Global Store Transactions 606208 606208 606208
Using SMEM reduces the number of GMEM accesses: reduceGmem reads and writes partial sums in global memory at every step of the reduction, whereas reduceSmem reads each element from GMEM only once and writes back a single partial sum per block.
Following the reduction-optimization logic discussed earlier, one block can be made responsible for several chunks of data.
One block handles 4 chunks of data
__global__ void reduceSmemUnroll(int *g_idata, int *g_odata, int nElem)
{
    __shared__ int smem[BLOCK_DIM_X];
    int index_global = blockIdx.x * blockDim.x + threadIdx.x;
    if (index_global < nElem)
    {
        //- Each thread sums four elements, spaced blockDim.x * gridDim.x apart,
        //- before the block-level reduction starts
        smem[threadIdx.x] = g_idata[index_global]
                          + g_idata[index_global + blockDim.x * gridDim.x]
                          + g_idata[index_global + 2 * blockDim.x * gridDim.x]
                          + g_idata[index_global + 3 * blockDim.x * gridDim.x];
        __syncthreads();
        if (blockDim.x >= 1024 && threadIdx.x < 512) smem[threadIdx.x] += smem[threadIdx.x + 512];
        __syncthreads();
        if (blockDim.x >= 512 && threadIdx.x < 256) smem[threadIdx.x] += smem[threadIdx.x + 256];
        __syncthreads();
        if (blockDim.x >= 256 && threadIdx.x < 128) smem[threadIdx.x] += smem[threadIdx.x + 128];
        __syncthreads();
        if (blockDim.x >= 128 && threadIdx.x < 64) smem[threadIdx.x] += smem[threadIdx.x + 64];
        __syncthreads();
        if (threadIdx.x < 32)
        {
            volatile int *vsmem = smem;
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 32];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 16];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 8];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 4];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 2];
            vsmem[threadIdx.x] += vsmem[threadIdx.x + 1];
        }
        if (threadIdx.x == 0) g_odata[blockIdx.x] = smem[0];
    }
}
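The modified host launch is not shown above. For the indexing in this kernel to cover the whole input, it has to be launched with a quarter of the blocks, presumably along these lines (my assumption, consistent with the gst_transactions count below):

//- Presumed launch: a quarter of the original grid, since each thread now
//- accumulates four elements spaced blockDim.x * gridDim.x apart
reduceSmemUnroll<<<grid.x / 4, block>>>(g_idata, g_odata, nElem);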
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (5)"
Kernel: reduceSmemUnroll(int*, int*, int)
1 gld_transactions Global Load Transactions 524288 524288 524288
1 gst_transactions Global Store Transactions 4096 4096 4096
Kernel: reduceGmem(int*, int*, int)
1 gld_transactions Global Load Transactions 1277952 1277952 1277952
1 gst_transactions Global Store Transactions 606208 606208 606208
Store transactions drop to a quarter while load transactions stay the same: every input element is still read from GMEM exactly once, but with only a quarter as many blocks, only a quarter as many partial sums are written back.
Matrix transpose
__global__ void transposeGmem(int *A, int *B, int row, int col)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < col && y < row)
    {
        B[x * row + y] = A[y * col + x];    //- coalesced read, strided write
    }
}
In this plain GMEM implementation the loads are coalesced, but the stores are scattered.
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (0)"
Kernel: transposeGmem(int*, int*, int, int)
1 gld_transactions Global Load Transactions 8388608 8388608 8388608
1 gst_transactions Global Store Transactions 268435456 268435456 268435456
1 gld_transactions_per_request Global Load Transactions Per Request 1.000000 1.000000 1.000000
1 gst_transactions_per_request Global Store Transactions Per Request 32.000000 32.000000 32.000000
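The natural follow-up, not shown above, is to stage each tile in shared memory so that both the read from A and the write to B are coalesced. A minimal sketch, my own addition, assuming a 32x32 thread block and reusing the padding trick from earlier to avoid bank conflicts:

#define TILE_DIM 32

__global__ void transposeSmem(int *A, int *B, int row, int col)
{
    //- Padded tile: the +1 column keeps the column-wise reads conflict-free
    __shared__ int tile[TILE_DIM][TILE_DIM + 1];
    int x = threadIdx.x + blockIdx.x * TILE_DIM;    //- column index into A
    int y = threadIdx.y + blockIdx.y * TILE_DIM;    //- row index into A
    if (x < col && y < row)
        tile[threadIdx.y][threadIdx.x] = A[y * col + x];        //- coalesced load
    __syncthreads();
    //- Swap the block coordinates so the write to B also walks consecutive addresses
    x = threadIdx.x + blockIdx.y * TILE_DIM;        //- column index into B
    y = threadIdx.y + blockIdx.x * TILE_DIM;        //- row index into B
    if (x < row && y < col)
        B[y * row + x] = tile[threadIdx.x][threadIdx.y];        //- coalesced store
}

With this version both gld_transactions_per_request and gst_transactions_per_request should be close to 1.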
Constant memory
Constant memory performs best when all threads of a warp read the same constant memory address; if the threads of a warp read different constant addresses, the reads are serialized.
Constant memory can only be created and initialized from the host side.
Finite-difference computation
$$
\begin{split}
f'(x) = {}& c_0\left[f(x+4b)-f(x-4b)\right]\\
&+ c_1\left[f(x+3b)-f(x-3b)\right]\\
&- c_2\left[f(x+2b)-f(x-2b)\right]\\
&+ c_3\left[f(x+b)-f(x-b)\right]
\end{split}
$$
#include <stdlib.h>
#include <stdio.h>
#include "../../CodeSamples/common/common.h"

#define RAID 4
#define LENGTH (1<<24)
#define BLOCK_DIM_X 1024

__constant__ float coeff[RAID]; //- constant memory array declared at file scope

__host__ void init(float *arr, int nElem)
{
    for (int i = 0; i < nElem; i++)
    {
        arr[i] = (float)(rand() & 0xFF) / 100.0f;
    }
}

__global__ void stencil_ld(float *A, float *B, int nElem)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float smem[BLOCK_DIM_X + 2 * RAID];
    //- Load this block's data plus the halo regions into SMEM
    smem[threadIdx.x + RAID] = A[index + RAID];
    if (threadIdx.x < RAID)
    {
        smem[threadIdx.x] = A[index];                                           //- left halo
        smem[threadIdx.x + BLOCK_DIM_X + RAID] = A[index + BLOCK_DIM_X + RAID]; //- right halo
    }
    __syncthreads();
    //- Apply the stencil; every thread of a warp reads the same coeff[i],
    //- which is the ideal access pattern for constant memory
    float a0 = coeff[0] * (smem[threadIdx.x + RAID + 4] - smem[threadIdx.x + RAID - 4]);
    float a1 = coeff[1] * (smem[threadIdx.x + RAID + 3] - smem[threadIdx.x + RAID - 3]);
    float a2 = coeff[2] * (smem[threadIdx.x + RAID + 2] - smem[threadIdx.x + RAID - 2]);
    float a3 = coeff[3] * (smem[threadIdx.x + RAID + 1] - smem[threadIdx.x + RAID - 1]);
    B[index] = a0 + a1 + a2 + a3;
}

int main(int argc, char **argv)
{
    int nElem = LENGTH + 2 * RAID;
    float h_coeff[RAID] = {0.8f, -0.2f, 0.03809f, -0.00357f};
    cudaMemcpyToSymbol(coeff, h_coeff, RAID * sizeof(float)); //- initialize constant memory from host code
    float *h_arrA, *h_arrB, *d_arrA, *d_arrB;
    h_arrA = (float *)malloc(nElem * sizeof(float));
    cudaMalloc((void **)&d_arrA, nElem * sizeof(float));
    init(h_arrA, nElem);
    cudaMemcpy(d_arrA, h_arrA, nElem * sizeof(float), cudaMemcpyHostToDevice);
    nElem = nElem - 2 * RAID;
    h_arrB = (float *)malloc(nElem * sizeof(float));
    cudaMalloc((void **)&d_arrB, nElem * sizeof(float));
    dim3 block(BLOCK_DIM_X);
    dim3 grid((nElem + block.x - 1) / block.x);
    printf("grid=(%d %d %d) block=(%d %d %d)\n", grid.x, grid.y, grid.z, block.x, block.y, block.z);
    stencil_ld<<<grid, block>>>(d_arrA, d_arrB, nElem);
    cudaDeviceSynchronize();
    cudaMemcpy(h_arrB, d_arrB, nElem * sizeof(float), cudaMemcpyDeviceToHost);
    free(h_arrA);
    free(h_arrB);
    cudaFree(d_arrA);
    cudaFree(d_arrB);
    cudaDeviceReset();
    return 0;
}
A constant array is first declared at file scope and then initialized by copying the coefficients over from host code with cudaMemcpyToSymbol.
In the kernel, the data (including the halo) is first copied from GMEM into SMEM, and the stencil is then evaluated.
Read-only cache
An array arr is created in GMEM and its loads are routed through the read-only cache, either with pointer qualifiers (const __restrict__) or with the __ldg() intrinsic.
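A minimal sketch of both forms (my own illustration; the kernel and array names are placeholders):

//- The const __restrict__ qualifiers tell the compiler the data is read-only for
//- the lifetime of the kernel, making the loads eligible for the read-only cache
__global__ void copyViaQualifier(const int * __restrict__ in, int *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i];
}

//- The __ldg() intrinsic forces an individual load through the read-only cache
__global__ void copyViaLdg(const int *in, int *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = __ldg(&in[i]);
}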
Differences between the constant cache and the read-only cache
- Both are read-only.
- Constant memory is limited to 64 KB; the read-only cache is 48 KB.
- The constant cache performs better when a warp reads one and the same address (uniform reads); the read-only cache is better suited to scattered reads.