CUDA Samples 之 matrixMul 矩阵乘法详解

OOOrchid

已于 2022-10-11 20:53:36 修改

阅读量800

点赞数 1

分类专栏： CUDA编程文章标签：矩阵算法 cuda

于 2022-10-08 22:39:47 首次发布

本文链接：https://blog.csdn.net/qq_43232373/article/details/127215719

版权

CUDA Samples 之 matrixMul 矩阵乘法详解

文章目录

CUDA Samples 之 matrixMul 矩阵乘法详解

1. 怎么说

CUDA进行矩阵乘法。

这里记录了使用不同 CUDA API 或工具进行矩阵乘法计算的示例。

其中2.1和2.2记录使用 CUDA runtime API 实现的矩阵乘法，2.1基于全局内存实现，2.2基于共享内存实现，对比这两个实现可以看出共享内存对常见代码的优化效果。

2.3、2.4、2.5分别基于动态编译、Driver API、 CUBLAS库实现。2.3、2.4展示的代码都基于2.2展示的共享内存版本代码进行延伸。

2. 怎么做

具体的代码及思路展示部分，代码部分去掉了繁复的各种 check 函数，仅保留核心代码。

2.1 基于CUDA runtime全局内存的矩阵乘法

这里给出基于全局内存的矩阵乘法的简单实现。每个线程读取矩阵 A 的一行和矩阵 B 的一列，并计算输出矩阵 C 的相应元素，具体示意图如下图所示。由下面这张图可以分析出：如果从全局内存中读取 A，共需要读取 B.width 次(每次只读取 A 的一行)；同理，B 共需要读取 A.height 次。
在这里插入图片描述
这里没有官方示例代码，自己依据手册简单实现了下，具体代码：

// System includes
#include <stdio.h>
#include <math.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

#define BLOCK_SIZE 32

/**
 * Naive matrix multiplication (CUDA kernel) on the device: C = A * B.
 *
 * Each thread produces exactly one element of C by accumulating wA products
 * read straight from the A and B pointers. All three matrices are addressed
 * as flat row-major arrays.
 *
 * Launch layout: the x dimension (blockIdx.x / threadIdx.x) selects the row
 * and the y dimension (blockIdx.y / threadIdx.y) selects the column. The
 * kernel performs no bounds checks, so the grid must cover C exactly.
 * The kernel uses no shared memory.
 *
 * @param C   output matrix, row stride hB; must not overlap A or B
 * @param A   left operand, row stride wA (read-only)
 * @param B   right operand, row stride hB (read-only)
 * @param wA  number of columns of A; also the length of every dot product
 * @param hB  number of columns of B and of C (despite its name it is used
 *            as the row stride of both, not as B's row count)
 */
__global__ void MatrixMulCUDA(float* __restrict__ C, const float* __restrict__ A,
                              const float* __restrict__ B, int wA, int hB) {
    // Row of A / C and column of B / C handled by this thread.
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Dot product of row `row` of A with column `col` of B.
    float Cvalue = 0.0f;
    for (int e = 0; e < wA; ++e) {
        Cvalue += A[row * wA + e] * B[e * hB + col];
    }

    // Store the finished element at its position in C.
    C[row * hB + col] = Cvalue;
}

// Fills the first `size` floats of `data` with `val`.
// A zero or negative `size` leaves the buffer untouched.
void ConstantInit(float* data, int size, float val) {
    float* cursor = data;
    for (int remaining = size; remaining > 0; --remaining) {
        *cursor++ = val;
    }
}

// Reports a failed CUDA call on stderr. Returns true when `status` is success.
static bool CudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Run a simple test of matrix multiplication using CUDA.
 *
 * Dimension convention used throughout this function: dims.x is the number
 * of rows and dims.y the number of columns, so A is dimsA.x x dimsA.y,
 * B is dimsB.x x dimsB.y and C is dimsA.x x dimsB.y. All matrices are stored
 * row-major. A is filled with 2.0 and B with 0.5, which makes every element
 * of C equal to dimsA.y * 2.0 * 0.5.
 *
 * @param block_size  edge length of the square thread block; the row and
 *                    column counts of C must be multiples of it because the
 *                    kernel has no bounds checks
 * @param dimsA       shape of A (x = rows, y = columns)
 * @param dimsB       shape of B (x = rows, y = columns); dimsB.x must equal
 *                    dimsA.y
 * @return always 1, exactly as before.
 *         NOTE(review): the caller is not visible here; confirm whether it
 *         expects a status code that distinguishes PASS from FAIL.
 */
int MatrixMultiply(int block_size, const dim3& dimsA, const dim3& dimsB) {
    const unsigned int rows_C = dimsA.x;
    const unsigned int cols_C = dimsB.y;
    const unsigned int inner = dimsA.y;  // columns of A == rows of B

    // Reject configurations the unguarded kernel cannot handle, before
    // anything is allocated.
    if (block_size <= 0 || rows_C == 0 || cols_C == 0 || inner == 0) {
        fprintf(stderr, "MatrixMultiply: block size and matrix dimensions must be positive\n");
        return 1;
    }
    if (dimsB.x != inner) {
        fprintf(stderr, "MatrixMultiply: A has %u columns but B has %u rows\n", inner, dimsB.x);
        return 1;
    }
    const unsigned int tile = static_cast<unsigned int>(block_size);
    if (rows_C % tile != 0 || cols_C % tile != 0) {
        fprintf(stderr, "MatrixMultiply: C is %u x %u, which is not a multiple of block size %u\n",
                rows_C, cols_C, tile);
        return 1;
    }

    // Element counts and byte sizes; size_t avoids 32-bit wrap-around.
    const size_t size_A = static_cast<size_t>(dimsA.x) * dimsA.y;
    const size_t size_B = static_cast<size_t>(dimsB.x) * dimsB.y;
    const size_t size_C = static_cast<size_t>(rows_C) * cols_C;
    const size_t mem_size_A = sizeof(float) * size_A;
    const size_t mem_size_B = sizeof(float) * size_B;
    const size_t mem_size_C = sizeof(float) * size_C;

    const float valA = 2.0f;
    const float valB = 0.5f;

    float* h_A = nullptr;
    float* h_B = nullptr;
    float* h_C = nullptr;
    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    cudaStream_t stream;
    bool have_stream = false;

    // Allocate pinned host memory for A, B and C.
    bool ok = CudaSucceeded(cudaMallocHost(&h_A, mem_size_A), "cudaMallocHost(h_A)") &&
              CudaSucceeded(cudaMallocHost(&h_B, mem_size_B), "cudaMallocHost(h_B)") &&
              CudaSucceeded(cudaMallocHost(&h_C, mem_size_C), "cudaMallocHost(h_C)");

    // Allocate device memory for A, B and C.
    ok = ok &&
         CudaSucceeded(cudaMalloc(reinterpret_cast<void**>(&d_A), mem_size_A), "cudaMalloc(d_A)") &&
         CudaSucceeded(cudaMalloc(reinterpret_cast<void**>(&d_B), mem_size_B), "cudaMalloc(d_B)") &&
         CudaSucceeded(cudaMalloc(reinterpret_cast<void**>(&d_C), mem_size_C), "cudaMalloc(d_C)");

    // Create the stream that orders the copies and the kernel.
    if (ok) {
        have_stream = CudaSucceeded(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking),
                                    "cudaStreamCreateWithFlags");
        ok = have_stream;
    }

    if (ok) {
        // Initialise the host inputs and copy them to the device.
        ConstantInit(h_A, static_cast<int>(size_A), valA);
        ConstantInit(h_B, static_cast<int>(size_B), valB);
        ok = CudaSucceeded(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream),
                           "cudaMemcpyAsync(d_A)") &&
             CudaSucceeded(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream),
                           "cudaMemcpyAsync(d_B)");
    }

    if (ok) {
        // Launch configuration: grid.x spans the rows of C, grid.y its columns.
        dim3 threads(tile, tile);
        dim3 grid(rows_C / threads.x, cols_C / threads.y);

        printf("Computing result using CUDA Kernel...\n");
        // The kernel takes A's row stride (its column count) and the row
        // stride shared by B and C.
        MatrixMulCUDA<<<grid, threads, 0, stream>>>(d_C, d_A, d_B,
                                                    static_cast<int>(inner),
                                                    static_cast<int>(cols_C));
        // A launch does not return a status; query it explicitly, then wait
        // so that execution errors surface here.
        ok = CudaSucceeded(cudaGetLastError(), "MatrixMulCUDA launch") &&
             CudaSucceeded(cudaStreamSynchronize(stream), "MatrixMulCUDA execution");
        printf("done\n");
    }

    if (ok) {
        // Copy the result back and wait until it has arrived on the host.
        ok = CudaSucceeded(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream),
                           "cudaMemcpyAsync(h_C)") &&
             CudaSucceeded(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
    }

    if (ok) {
        // Print every element of C; element (x, y) lives at x * cols_C + y.
        for (int x = 0; x < static_cast<int>(rows_C); x++) {
            for (int y = 0; y < static_cast<int>(cols_C); y++) {
                printf("value of C[%d][%d] is %f: \n", x, y,
                       h_C[static_cast<size_t>(x) * cols_C + y]);
            }
        }

        // Compare every element against the closed-form result.
        printf("Checking computed result for correctness: ");
        bool correct = true;

        const double eps = 1.e-6;  // machine zero
        // Each element is a sum of `inner` products valA * valB.
        const double ref = static_cast<double>(inner) * valA * valB;
        const double dot_length = inner;

        for (int i = 0; i < static_cast<int>(rows_C * cols_C); i++) {
            double abs_err = fabs(h_C[i] - ref);
            double abs_val = fabs(h_C[i]);
            double rel_err = abs_err / abs_val / dot_length;

            if (rel_err > eps) {
                printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
                    i, h_C[i], ref, eps);
                correct = false;
            }
        }
        printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    }

    // Release everything that was acquired, including the stream.
    if (have_stream) {
        cudaStreamDestroy(stream);
    }
    if (h_A != nullptr) cudaFreeHost(h_A);
    if (h_B != nullptr) cudaFreeHost(h_B);
    if (h_C != nullptr) cudaFreeHost(h_C);
    if (d_A != nullptr) cudaFree(d_A);
    if (d_B != nullptr) cudaFree(d_B);
    if (d_C != nullptr) cudaFree(d_C);
    return 1;
}


/**
 * 主程序
 */
int<