CUDA——kernel发布

最新推荐文章于 2023-08-13 13:59:07 发布

UCAS_HMM

最新推荐文章于 2023-08-13 13:59:07 发布

阅读量282

点赞数 1

分类专栏： CUDA 文章标签： CUDA 矩阵加法性能优化内存带宽 FLOPS

本文链接：https://blog.csdn.net/UCAS_HMM/article/details/127397146

版权

CUDA 专栏收录该内容

10 篇文章 3 订阅

订阅专栏

1D矢量加法

$\begin{equation} {\bf C=A+B} \end{equation}$

问题分析

基本概念：operations per cycle (OPC), operations per second (OPS), floating-point operations per second (FLOPS),
运行设备为Tesla K80，cuda核心的主频为823.5MHz，板载2个GPU，每个GPU包含13个流处理器，每个处理器中包含192个cuda核心，单核单周期单精度计算次数为2。单精度峰值FLOPS=8.73TFLOPS和我自己计算出来的不太一样。

$\frac{823.5MHz\times 2\times 13\times 192\times 2}{1000\times 1000\times 1000\times 1000} = 8.222TFLOPS$

这是因为通过deviceQuery查到的主频是基础主频，而官网上的宣传中提到过一句是：借助 NVIDIA GPU Boost 使单精度浮点运算能力高达 8.73 万亿次。Boost是一种动态调频的技术，类似于CPU超频，对于K80来说，最高的主频可以达到875MHz，代入计算即可得到一致的结果。
内存带宽为480GB/s和官方参数一致，其中显存位宽为384bit（内存位宽是在一个时钟周期内所能传送数据的位数，位数越大则瞬间所能传输的数据量越大，这是内存的重要参数之一。内存位宽是在一个时钟周期内所能传送数据的位数，位数越大则瞬间所能传输的数据量越大，这是内存的重要参数之一），内存时钟频率为2.505GHz，

$\frac{2\times 384bit\times2.505GHz\times2}{8bits/byte} = 480.96GB/s$

因此理论上，指令和字节比为17指令/字节。使用nvprof --metrics inst_executed ./test 1024得到执行的指令数为9437184，而传输的数据量67108864字节，因此属于受内存带宽限制的算例。

代码实现

#include<stdlib.h>
#include<stdio.h>

__host__ void init(int *arr, int nElem)
{
    for(int i = 0; i < nElem; i++)
    {
        arr[i] = i;
    }
}

__global__ void sumArr(int *arrA, int *arrB, int *arrC, int nElem)
{
    int Index = blockIdx.x * blockDim.x + threadIdx.x;
    if (Index < nElem)
    {
        arrC[Index] = arrA[Index] + arrB[Index];
    }
}

__host__ void check(int *arrA, int *arrB, int *arrC, int nElem)
{
    for(int i = 0; i < nElem; i++)
    {
        if(arrC[i] != (arrA[i] + arrB[i]))
        {
            printf("error at %d\n", i);
            return;
        }
    }
    printf("result is correct!\n");
    return;
}

int main(int argc, char **argv)
{
    int nElem = 1<<24;
    int *h_arrA = (int *)malloc(nElem * sizeof(int));
    int *h_arrB = (int *)malloc(nElem * sizeof(int));
    int *h_arrC = (int *)malloc(nElem * sizeof(int));
    init(h_arrA, nElem);
    init(h_arrB, nElem);

    int *d_arrA, *d_arrB, *d_arrC;
    cudaMalloc((void **)&d_arrA, nElem * sizeof(int));
    cudaMalloc((void **)&d_arrB, nElem * sizeof(int));
    cudaMalloc((void **)&d_arrC, nElem * sizeof(int));
    cudaMemcpy(d_arrA, h_arrA, nElem * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_arrB, h_arrB, nElem * sizeof(int), cudaMemcpyHostToDevice);

    dim3 block(atoi(argv[1]));  //- block的值通过命令行输入
    dim3 grid((nElem+block.x-1)/block.x);
    printf("gridDim:(%d,%d,%d) blockDim:(%d,%d,%d)\n", grid.x, grid.y, grid.z, block.x, block.y, block.z);
    sumArr<<<grid, block>>>(d_arrA, d_arrB, d_arrC, nElem);

    cudaMemcpy(h_arrC, d_arrC, nElem * sizeof(int), cudaMemcpyDeviceToHost);
    check(h_arrA, h_arrB, h_arrC, nElem);
    
    free(h_arrA);
    free(h_arrB);
    free(h_arrC);
    cudaFree(d_arrA);
    cudaFree(d_arrB);
    cudaFree(d_arrC);
    cudaDeviceReset();
    return 0;
}

测试线程组织配置对运行时间的影响

//- 下面是在K80上采用nvprof统计的运行时间
//- grid	block	thread	    HToD	sum	    DToH	occupancy
//- 16384	1024	16777216	19.326	1.6645	54.421	0.767550
//- 32768	512	    16777216	26.904	1.6278	42.557	0.780929
//- 65536	256	    16777216	23.01	1.6014	44.85	0.818572
//- 131072	128	    16777216	22.489	1.5926	38.169	0.840699
//- 262144	64	    16777216	22.884	1.9762	47.284	0.415732
//- 524288	32	    16777216	19.236	3.3125	41.2	0.202111

可以发现，随着每个block内的线程数量减小，核函数运行时间先减小后增加。解释：

如果每个block内的线程数很少，意味着每个block中的warp数量很少。对于本3.7版本来说，每个SM最大的常驻block数被限制为16。以最后一个为例，尽管block数量很多，但是由于触及到16个block的限制，导致一个SM上的常驻warp为16。每个SM的最大允许常驻warp数量为64，因此占有率仅为25%，这是造成速度特别慢的原因，参考后两种配置的占有率可以看出来。
如果block内的线程数量很多，以第一个为例，此时一个block中存在32个warp，block总数也达到了16384，为什么不如第二种配置快呢？这一块没有理解。

2D矩阵加法

代码实现

#include<stdlib.h>
#include<stdio.h>

__host__ void init(int *mat, int row, int col)
{
    for(int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            mat[i * col + j] = i;
        }
    }
}

__global__ void sumMat(int *matA, int *matB, int *matC, int row, int col)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (y<row && x<col)
    {
        matC[y * col + x] = matA[y * col + x] + matB[y * col + x];
    }
}

__host__ void check(int *matA, int *matB, int *matC, int row, int col)
{
    for(int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            if (matC[i * col + j] != (matA[i * col + j] + matB[i * col + j]))
            {
                printf("error happend at row %d col %d (%d + %d != %d)\n", i, j, matA[i * col + j], matB[i * col + j], matC[i * col + j]);
                return;
            }
        }
    }
    printf("result is correct\n");
}

int main(int argc, char **argv)
{
    int row = 1<<14;
    int col = 1<<14;
    int nElem = row * col;
    printf("matrix (%d, %d) size is %d, memory size is %0.2fMB\n", row, col, nElem, float(nElem) * sizeof(int)/1024/1024);

    int *h_matA = (int *)malloc(nElem * sizeof(int));
    int *h_matB = (int *)malloc(nElem * sizeof(int));
    int *h_matC = (int *)malloc(nElem * sizeof(int));
    init(h_matA, row, col);
    init(h_matB, row, col);

    int *d_matA, *d_matB, *d_matC;
    cudaMalloc((void**)&d_matA, nElem * sizeof(int));
    cudaMalloc((void**)&d_matB, nElem * sizeof(int));
    cudaMalloc((void**)&d_matC, nElem * sizeof(int));

    cudaMemcpy(d_matA, h_matA, nElem * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_matB, h_matB, nElem * sizeof(int), cudaMemcpyHostToDevice);

    dim3 block(atoi(argv[1]), atoi(argv[2]));
    dim3 grid((col + block.x - 1) / block.x, (row + block.y - 1) / block.y);
    printf("gridDim:(%d,%d,%d) blockDim:(%d,%d,%d)\n", grid.x, grid.y, grid.z, block.x, block.y, block.z);
    sumMat<<<grid, block>>>(d_matA, d_matB, d_matC, row, col);
    cudaMemcpy(h_matC, d_matC, nElem * sizeof(int), cudaMemcpyDeviceToHost);
    check(h_matA, h_matB, h_matC, row, col);
    free(h_matA);
    free(h_matB);
    free(h_matC);
    cudaFree(d_matA);
    cudaFree(d_matB);
    cudaFree(d_matC);
    cudaDeviceReset();

    return 0;
}

性能测试

注意，下面几个测试都是开启了L1缓存机制的，即加载内存事务粒度为128字节。

行主导访问

行主导访问即是以threadIdx.x为最内层的索引来计算全局内存索引。具体的实现体现在核函数的索引计算式上：

matC[y * col + x] = matA[y * col + x] + matB[y * col + x];

在这种模式下，同一个warp的线程访问的全局内存GMEM是连续的，因为一个block中的warp划分也是根据threadIdx.x来进行的。

row		col		grid.x	grid.y	block.x	block.y	HToD	kernel	DToH	gld_transaction	gst_transaction	gld_efficiency	gst_efficiency
16384	16384	512		512		32		32		341.51	28.978	694.76	16777216		8388608			100%			100%
16384	16384	512		1024	32		16		368.87	27.884	632.65	16777216		8388608			100%			100%
16384	16384	1024	1024	16		16		320.09	28.452	730.47	33554432		16777216		50.0%			100%

列主导访问

列主导访问是以threadIdx.y为最内层索引来计算全局索引，如：

matC[x * col + y] = matA[x * col + y] + matB[x * col + y];

此时，同一个warp中的threads访问的GMEM数据位置相互间隔。

row		col		grid.x	grid.y	block.x	block.y	HToD	kernel	DToH	gld_transaction	gst_transaction	gld_efficiency	gst_efficiency
16384	16384	512		512		32		32		347.35	209.57	647.71	536870912		268435456		3.125%			12.50%
16384	16384	512		1024	32		16		330.66	236.29	666.43	536870912		268435456		3.125%			12.50%
16384	16384	1024	1024	16		16		335.05	204.19	662.40	268435456		134217728		6.25%			25.00%

分析

内存加载事务数量：行主导时，一个warp中的32个threads访问的GMEM是连续的，一次计算需要加载A和B两个int类型（4字节）数据，因此一个warp针对A和B需要加载的数据量均为 $32\times4=128$ 字节。当使用L1缓存时，内存事务的粒度是128字节，因此对A和B各自只需要一个内存事务就能完成。因此对于整个计算，总的加载内存事务为： $2\times\frac{16384\times16384\times4}{128}=16777216$ ，加载效率100%，没有带宽浪费。而对于列主导，由于同一个warp的线程访问的数据在物理上都不连续，因此各自都需要一个事务来服务，因此有多少个线程，就需要多少个内存事务，即 $2\times16384\times16384=536870912$ 个，由于一个事务的粒度为128字节，因此一共加载的数据量为 $536870912\times128=64GB$ ，而实际A和B共计内存2GB，因此加载有效率为 $1/32 = 3.125$ 。
内存加载事务：行主导时的最后一种情况（16,16），因为blockDim.x=16，根据块内warp划分的机制，需要首先将二维块转化成一维，然后每32个线程组成一个块，并且转化时，是以threadIdx.x为最内层索引的。因此对于这个二维block来说，相当于每两行的线程归属同一个warp，而本例子中，线程的逻辑索引和数据的GMEM索引是一一对应的，也就是说，每个warp中，前16个thread和后16个thread访问的GMEM数据不是连续的，因此需要两个128字节的事务来服务，利用率为50%。因此访问数据量要翻倍。对于列主导的情况，则刚好相反，由于每个warp中的thread访问的GMEM是两行16个int，也就是说，每一列有两个访问，共16列。由于是列主导，因此同一列元素的内存是连续的，因此只需要16个内存访问事务服务。相比于前两种情况，效率提升了一倍，事务减少一倍。
内存存储事务数量：首先需要明确的是，GMEM的写入是没有经过L1缓存的，因此内存粒度是32字节/段。但是，内存写入事务可以是1、2、4段三种类型。对于行主导的前两种情况，由于一个warp需要执行的写入数据量为 $32\times4=128$ 字节，因此可以采用4段这种写入事务来处理，因此总的写入内存事务就是 $\frac{16384\times16384\times4}{4\times32}=8388608$ ，此时的写入事务效率为100%。对于最后一种情况，由于一个warp中前一半和后一半的数据在物理上是分割的，各自只有 $16\times4=64$ 字节是连续的，因此此时一个warp可以采用2段这种写入事务来处理，因此总的写入内存事务就是 $\frac{16384\times16384\times4}{2\times32}=16777216$ ，但是由于没有浪费，因此利用率依然是100%。对于列主导访问，一个warp中的32个thread需要写入的GMEM都是彼此间隔的，每个都是一个int大小，即4字节，因此此时采用的是1段写入事务，利用率为 $4/32 = 12.5$ ，此时的总的写入事务数量为 $16384\times16384=268435456$ 。对于列主导的最后一种情况，同样可以类似于加载事务一样的分析过程，此时采用的依然是1段写入事务，只不过现在一个事务能够处理两个归属于同一列的两个线程所访问的数据，因此写入事务数量缩小一半，利用率增加一倍。
并行性：可以看到，行主导和列主导的前两个情况表明，blockDim.y从32缩小到16之后，一个性能提升了，另一个性能下降了，原因暂时不清楚。

使用1D grid和1D block来组织矩阵加法线程

性能分析

row		col		grid.x	block.x	HToD	kernel	DToH	gld_transaction	gst_transaction	gld_efficiency	gst_efficiency
16384	16384	262144	1024    359.66	26.477	624.90	16777216		8388608		    100%			100%
16384	16384	524288	512 	320.96	25.871	623.31	16777216		8388608		    100%			100%
16384	16384	1048576	256		414.88	25.532	633.29	16777216		8388608		    100%			100%
16384	16384	2097152	128		313.32	25.351	613.34	16777216		8388608		    100%			100%
16384	16384	4194304	64		349.00	31.032	617.91	16777216		8388608		    100%			100%
16384	16384	8388608	32		313.69	52.573	649.81	16777216		8388608		    100%			100%

执行时间：采用小一点的block可以增加同时处于活跃状态下的warp数量，因此效率更高。但是block太小以至于触及到每个SM限制的常驻block数量上限时（K80是16），对于blockDim.x=64和32来说，SM中的warp分别为32和16，小于SM限制的64warp，因此SM的利用率很小，导致运行时间变长。
内存事务：由于采用的是一维的线程组织，所以每个warp中对应的GMEM都是连续的，因此载入事务效率为100%，粒度为128字节；存储事务均采用4段，一个事务的粒度也是128字节，因此数量可以很容易算出来。

使用2D grid和1D block来组织矩阵加法线程\

row		col		grid.x	grid.y	block.x	HToD	kernel	DToH	gld_transaction	gst_transaction	gld_efficiency	gst_efficiency
16384	16384	16	    16384	1024    359.66	66.994	624.90	16777216		8388608		    100%			100%
16384	16384	32	    16384	512 	320.96	65.074	623.31	16777216		8388608		    100%			100%
16384	16384	64	    16384	256		414.88	64.318	633.29	16777216		8388608		    100%			100%
16384	16384	128	    16384	128		313.32	63.931	613.34	16777216		8388608		    100%			100%
16384	16384	256	    16384	64		349.00	117.51	617.91	16777216		8388608		    100%			100%
16384	16384	512	    16384	32		313.69	230.25	649.81	16777216		8388608		    100%			100%