CUDA矩阵乘法

最新推荐文章于 2021-01-11 14:59:14 发布

wxb_blog

最新推荐文章于 2021-01-11 14:59:14 发布

阅读量2.7k

点赞数 2

分类专栏： CUDA_GPU编程文章标签： cuda nvidia

本文链接：https://blog.csdn.net/wxb1553725576/article/details/78070384

版权

CUDA_GPU编程专栏收录该内容

1 篇文章 0 订阅

订阅专栏

CUDA矩阵乘法

背景

大多数情况下，我们是不需要自己去实现矩阵乘法的，因为Nvidia提供了cuda版的cublas库，我们利用库函数就可以搞定。但是，总会有些情况下，我们需要实现自己的矩阵乘法。这里我们要实现的是自己的cublasSgemm函数。

cublasSgemm介绍

cublasSgemm函数的功能可以用如下的公式表示：

α∗op(A)op(B)+β∗C,α和β是标量，其余是矩阵，op表示转置或者非转置
cublasSgemm函数原型如下：
- 参数解释：
  op(A) 是 m x k 维矩阵，op(B) 是 k x n 维矩阵，C 是 m x n 维矩阵
  注意：如果A需要转置，那么 $A^T$ 是 m x k 维矩阵；如果A不需要转置，那么A是 m x k 维矩阵。B同理。
  lda >= (transa == CUBLAS_OP_N) ? max(1,m) : max(1,k)
  ldb >= (transb == CUBLAS_OP_N) ? max(1,k) : max(1,n)
  ldc >= max(1,m)
  注意： ld 是 leading dimension的缩写，指的是矩阵元素在存储时嵌套在最外面的维度。之所以是 $>=$ 的原因在于可能需要进行内存对齐。

实现

这里我们用两种不同的方法实现：使用优化和不使用优化，完整的程序可以从资料的链接获取。注意：虽然A和B可能需要转置，但是我们不需要进行物理转置，只需要在计算元素地址的时候使用不同的方法就可以。

方法一 ：不使用优化

__global__ void
matrixMul_(cublasHandle_t handle,
       cublasOperation_t transA, cublasOperation_t transB,
       int M, int N, int K,
       const float alpha,
       const float *A, int lda,
       const float *B, int ldb,
       const float beta,
       float *C, int ldc)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    float sum = 0.0;
    for(int i = 0; i < K; i++) {
        int a_addr = (transA == CUBLAS_OP_T) ? _GET_ADDR_T(by, i, lda) : _GET_ADDR_N(by, i, lda);
        int b_addr = (transB == CUBLAS_OP_T) ? _GET_ADDR_T(i, bx, ldb) : _GET_ADDR_N(i, bx, ldb);
        sum += A[a_addr] * B[b_addr];
    }

    int c_addr = _GET_ADDR_N(by, bx, ldc);
    C[c_addr] = alpha * sum + beta * C[c_addr];
}

注意：_GET_ADDR_T 和 _GET_ADDR_N分别计算转置和不转置时的元素地址。

方法二 ：使用优化

// C = alpha * op(A) * op(B) + beta * C
// handle : not used, just keep same with cublasSgemm
template <int BLOCK_SIZE> __global__ void
matrixMul_2(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const float alpha,   // use (float*) seems wrong, why ?
        const float *A, int lda,
        const float *B, int ldb,
        const float beta,    // use (float*) seems wrong, why ?
        float *C, int ldc)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Steps each thread, k is col of op(A) or row of op(B)
    int steps = int((k + BLOCK_SIZE - 1) / (BLOCK_SIZE));

    // Csub is used to shtore the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for(int step = 0; step < steps; ++step)
    {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A and B
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory, each thread loads
        // one element of each matrix
        int a_x = BLOCK_SIZE * step + tx;
        int a_y = BLOCK_SIZE * by   + ty;
        int a_addr = (transa == CUBLAS_OP_T) ? _GET_ADDR_T(a_y, a_x, lda)
                                             : _GET_ADDR_N(a_y, a_x, lda);
        As[ty][tx] = _OUT_OF_RANGE(a_y, m, a_x, k) ? 0.0
                                                   : A[a_addr];

        int b_x = BLOCK_SIZE * bx   + tx;
        int b_y = BLOCK_SIZE * step + ty;
        int b_addr = (transb == CUBLAS_OP_T) ? _GET_ADDR_T(b_y, b_x, ldb)
                                             : _GET_ADDR_N(b_y, b_x, ldb);
        Bs[ty][tx] = _OUT_OF_RANGE(b_y, k, b_x, n) ? 0.0
                                                   : B[b_addr];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrx
        for(int bs = 0; bs < BLOCK_SIZE; ++bs)
        {
            Csub += As[ty][bs] * Bs[bs][tx];
        }

        // Synchronize to make sure that the preceding
        // computation is done befroe laoding two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    int c_x = bx * BLOCK_SIZE + tx;
    int c_y = by * BLOCK_SIZE + ty;
    int c_addr = _GET_ADDR_N(c_y, c_x, ldc);
    if(!_OUT_OF_RANGE(c_y, m, c_x, n)) {
        C[c_addr] = alpha * Csub + beta * C[c_addr];
    }
}

注意： __syncthreads()用于同步一个block内的线程，BLOCK_SIZE用于设置每一小块的大小。

对比分析
- 方法一中没有使用任何优化技巧，每次计算都从GPU的全局内存中取数据，读写速度较慢。计算C中的一个元素，需要k次乘法，2k(只包括A和B)次取数， $\alpha*op(A)op(B)$ 一共需要取2mnk次数， $\alpha * op(A)op(B) + \beta*C$ 一共需要2mnk + mn次读，mn次写。
- 方法二中使用共享内存，每次从全局内存中取一个小块（块大小为BLOCK_SIZE * BLOCK_SIZE）的元素，然后存放在共享内存中，每次计算C中的一个块。每个小块需要从全局内存中取 $2*BLOCK\_SIZE^2$ 次数。为了计算C中一个块的元素，需要从全局内存取 $\lceil{\frac{k}{BLOCK\_SIZE}}\rceil$ 个块，因此计算C中一个块的元素，需要取 $2*BLOCK\_SIZE^2 * \lceil{\frac{k}{BLOCK\_size}\rceil}$ 。 C中共有 $\lceil{\frac{m}{BLOCK\_size}\rceil} * \lceil{\frac{n}{BLOCK\_SIZE}\rceil}$ 个小块，因此需要从全局内存中取 $2*BLOCK\_SIZE^2 * \lceil{\frac{k}{BLOCK\_size}\rceil} * \lceil{\frac{m}{BLOCK\_size}\rceil} * \lceil{\frac{n}{BLOCK\_SIZE}\rceil} \approx \frac{2mnk}{BLOCK\_SIZE}$ 。 $\alpha * op(A)op(B) + \beta*C$ 需要 $\frac{2mnk}{BLOCK\_SIZE}$ + mn次全局内存读，mn次写。
- 通过对比，可以发现，使用共享内存后从全局内存读取A和B的次数变为了原来的 $\frac{1}{BLOCK\_SIZE}$ ，从全局内存读取C的次数保持不变。如果忽略从共享内存中读数据的时间，则方法二的执行时间是方法一的 $\frac{1}{BLOCK\_SIZE}$ 。当然，由于方法二使用了同步函数，而且共享内存的读取也会占用时间，实际加速比会比这个低。方法二可以用下图表示：
行主序vs列主序
- 假设我们有一个3x3的矩阵A = [1 2 3; 4 5 6; 7 8 9]， c/c++是行主序的，在内存中存储的顺序是 [1 2 3 4 5 6 7 8 9]; 但是cuda是列主序，在内存中的存储的顺序是[1 4 7 2 5 8 3 6 9]。