矩阵乘(General Matrix-matrix Multiplication,GEMM)是一类很重要的应用,尤其是在大语言模型领域,它是注意力机制中的计算热点。Nvidia官方库在0_Introduction部分包含了四个版本的矩阵乘:matrixMul、matrixMul_nvrtc、matrixMulDrv和matrixMulDynlinkJIT。本文将以matrixMul为基础,逐步展开对系列各个版本矩阵乘的梳理。
【代码笔记】CUDA官方样例库之矩阵乘matrixMul基础系列版本
1 代码仓库
1.1 基本信息
- 仓库:Nvidia的官方样例仓库
- 样例路径:
<your_code_folder>/cuda-samples-master/Samples/0_Introduction/matrixMul*
- 编译配置:需要在Makefile中指明CUDA_PATH(例如:CUDA_PATH ?= /usr/local/cuda-12.1/)
1.2 版本说明
- matrixMul:基础分块GEMM与Shared Memory使用
- matrixMul_nvrtc:区别于nvcc的运行时编译场景
- matrixMulDrv:利用CUDA Driver API
- matrixMulDynlinkJIT:JIT编译ptx
2 基础GEMM系列版本
2.1 matrixMul
- 说明:采取分块的思想,利用Shared Memory优化的基础版GEMM
2.2 matrixMul_nvrtc
- 说明:另一场景下,利用nvrtc的GEMM
- 代码结构区别:对host和device侧代码做了分离。即host写cpp,device的kernel写cu
Makefile新增内容
- 操作系统检查
lsb_release -i -s 2>/dev/null | grep -i ubuntu
- 架构检查(不支持ARMv7)
#This sample is not supported on ARMv7 ifeq ($(TARGET_ARCH),armv7l) $(info >>> WARNING - matrixMul_nvrtc is not supported on ARMv7 - waiving sample <<<) SAMPLE_ENABLED := 0 endif
- 基于目标/Host的操作系统与架构的特化处理
- 目标操作系统特化处理(根据
$(TARGET_OS)
)- 添加库目录、编译选项等
- 目标架构特化处理(根据
$(TARGET_ARCH)
)- 赋值
CUDA_SEARCH_PATH
- 赋值
- HOST侧架构处理(根据
$(HOST_ARCH)
)
# libNVRTC specific libraries ifeq ($(TARGET_OS),darwin) LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA endif ifeq ($(TARGET_OS),darwin) ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA else ifeq ($(TARGET_ARCH),x86_64) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs ifdef TARGET_OVERRIDE CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif ifeq ($(HOST_ARCH),ppc64le) CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs endif CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) SAMPLE_ENABLED := 0 else CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) LIBRARIES += -L$(CUDALIB) -lcuda endif endif
- 目标操作系统特化处理(根据
- 添加include路径与nvrtc库
INCLUDES += -I$(CUDA_PATH)/include LIBRARIES += -lnvrtc
- 准备cooperative_group环境
build: matrixMul_nvrtc $(EXEC) cp "$(CUDA_PATH)/include/cooperative_groups.h" . $(EXEC) cp -r "$(CUDA_PATH)/include/cooperative_groups" .
Host侧实现(cpp)
- main函数
- 相比于基础版matrixMul而言,这里main的变动主要是调用接口函数有所变动
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
- 相比于基础版matrixMul而言,这里main的变动主要是调用接口函数有所变动
- 接口函数(matrixMultiply)
- 流程
- 内存显存分配
- 分配Host侧内存并初始化
- 分配Device侧显存与初始化
- nvrtc相关
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1); CUmodule module = loadCUBIN(cubin, argc, argv);
- Host侧Device侧数据拷贝
- Kernel函数绑定
- 用
cuModuleGetFunction
将Kernel函数绑定到kernel_addr
CUfunction kernel_addr; if (block_size == 16) { checkCudaErrors( cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16")); } else { checkCudaErrors( cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32")); }
- 用
- 启动
- 迭代300次,用
cuLaunchKernel
启动Kernel,每次迭代用cuCtxSynchronize
同步
int nIter = 300; for (int j = 0; j < nIter; j++) { checkCudaErrors( cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */ threads.x, threads.y, threads.z, /* block dim */ 0, 0, /* shared mem, stream */ &arr[0], /* arguments */ 0)); checkCudaErrors(cuCtxSynchronize()); }
- 数据回传与结果校验
- 迭代300次,用
- 完整代码
/** * Run a simple test of matrix multiplication using CUDA */ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB) { // Allocate host memory for matrices A and B unsigned int size_A = dimsA.x * dimsA.y; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = dimsB.x * dimsB.y; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // Initialize host memory const float valB = 0.01f; constantInit(h_A, size_A, 1.0f); constantInit(h_B, size_B, valB); // Allocate device memory CUdeviceptr d_A, d_B, d_C; char *cubin, *kernel_file; size_t cubinSize; kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1); CUmodule module = loadCUBIN(cubin, argc, argv); // Allocate host matrix C dim3 dimsC(dimsB.x, dimsA.y, 1); unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); float *h_C = (float *)malloc(mem_size_C); if (h_C == NULL) { fprintf(stderr, "Failed to allocate host matrix C!\n"); exit(EXIT_FAILURE); } checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); checkCudaErrors(cuMemAlloc(&d_B, mem_size_B)); checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); // copy host memory to device checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A)); checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); // Setup execution parameters dim3 threads(block_size, block_size); dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); // Create and start timer printf("Computing result using CUDA Kernel...\n"); CUfunction kernel_addr; if (block_size == 16) { checkCudaErrors( cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16")); } else { checkCudaErrors( cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32")); } void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x}; // Execute the kernel int nIter = 300; for (int j = 0; j < nIter; j++) { 
checkCudaErrors( cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */ threads.x, threads.y, threads.z, /* block dim */ 0, 0, /* shared mem, stream */ &arr[0], /* arguments */ 0)); checkCudaErrors(cuCtxSynchronize()); } // Copy result from device to host checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C)); printf("Checking computed result for correctness: "); bool correct = true; // test relative error by the formula // |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps double eps = 1.e-6; // machine zero for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) { double abs_err = fabs(h_C[i] - (dimsA.x * valB)); double dot_length = dimsA.x; double abs_val = fabs(h_C[i]); double rel_err = abs_err / abs_val / dot_length; if (rel_err > eps) { printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps); correct = false; } } printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); printf( "\nNOTE: The CUDA Samples are not meant for performance measurements. " "Results may vary when GPU Boost is enabled.\n"); // Clean up memory free(h_A); free(h_B); free(h_C); checkCudaErrors(cuMemFree(d_A)); checkCudaErrors(cuMemFree(d_B)); checkCudaErrors(cuMemFree(d_C)); if (correct) { return EXIT_SUCCESS; } else { return EXIT_FAILURE; } }
- 内存显存分配
- 流程
Device侧实现(cu)
- 增加了协作组的概念:cooperative_groups,并将之前线程同步改为协作组同步
#include <cooperative_groups.h> /* ... */ cooperative_groups::thread_block cta = cooperative_groups::this_thread_block(); /* ... */ cooperative_groups::sync(cta);
- 按block大小细粒度封装,并对外暴露接口
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB) { matrixMulCUDA<16>(C, A, B, wA, wB); } extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB) { matrixMulCUDA<32>(C, A, B, wA, wB); }
2.3 matrixMulDrv
- 说明:利用CUDA Driver API启动的GEMM版本。矩阵信息新增到头文件中
Host侧(cpp)
- 说明:将main内部过程整体封装到runTest中
- runTest函数
- 声明matrixMul并绑定
// initialize CUDA CUfunction matrixMul = NULL; int block_size = 0; initCUDA(argc, argv, &matrixMul, &block_size);
- Device侧数组由基本数据类型指针转为
CUdeviceptr
类型// allocate device memory CUdeviceptr d_A; checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); CUdeviceptr d_B; checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
- 计时器设置
// create and start timer StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); // start the timer sdkStartTimer(&timer); /* ... */ // stop and destroy timer sdkStopTimer(&timer); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer);
- 简易版启动方式
- 简单对args进行填充,并送
cuLaunchKernel
启动
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel // Launching (simplier method) size_t Matrix_Width_A = (size_t)WA; size_t Matrix_Width_B = (size_t)WB; void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B}; // new CUDA 4.0 Driver API Kernel launch call checkCudaErrors(cuLaunchKernel( matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, 2 * block_size * block_size * sizeof(float), NULL, args, NULL));
- 简单对args进行填充,并送
- 高级版启动方式
- 以预定长度argBuffer内部偏移追加的方式充填参数,并进一步封装其他宏定义变量成数组,送cuLaunchKernel启动
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel // Launching (advanced method) int offset = 0; char argBuffer[256]; // pass in launch parameters (not actually de-referencing CUdeviceptr). // CUdeviceptr is storing the value of the parameters *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C; offset += sizeof(d_C); *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A; offset += sizeof(d_A); *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B; offset += sizeof(d_B); size_t Matrix_Width_A = (size_t)WA; size_t Matrix_Width_B = (size_t)WB; *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A; offset += sizeof(Matrix_Width_A); *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B; offset += sizeof(Matrix_Width_B); void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END}; // new CUDA 4.0 Driver API Kernel launch call checkCudaErrors(cuLaunchKernel( matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, 2 * block_size * block_size * sizeof(float), NULL, NULL, reinterpret_cast<void **>(&kernel_launch_config)));
- 声明matrixMul并绑定
Device侧(cu)
- Kernel变动不大,主要是Shared Memory变量的宏和模板多了size_type的占位变量类型
- 宏定义:Shared Memory的As和Bs封了个宏AS和BS
#define AS(i, j) As[i][j] #define BS(i, j) Bs[i][j] /* ... */ for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
- size_type模板参数
template <int block_size, typename size_type> __device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB) { // Block index size_type bx = blockIdx.x; size_type by = blockIdx.y; // Thread index size_type tx = threadIdx.x; size_type ty = threadIdx.y; /* ... */ } // C wrappers around our template kernel extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB) { matrixMul<8, size_t>(C, A, B, wA, wB); } extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB) { matrixMul<16, size_t>(C, A, B, wA, wB); } extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB) { matrixMul<32, size_t>(C, A, B, wA, wB); }
- 宏定义:Shared Memory的As和Bs封了个宏AS和BS
2.4 matrixMulDynlinkJIT
- 说明:利用动态JIT直接编译ptx的GEMM版本,整体流程会比之前复杂些,并且涉及ptx汇编
编译执行流程
- *前置步骤(官方库已完成ptx转c步骤,可略过)
- 准备matrixMul的ptx(官方已准备32和64版本,在extras文件夹下)
- 执行ptx转c和h的脚本(脚本也在extras下)
- 将生成的代码文件拷贝到上层(项目文件夹下),并配置好文件名
- 确认项目文件夹下存在ptx转c的代码文件
- 编译
make
- 执行
make run
- 注意,官方样例的ptx所转的c,与其直接给到的c存在差异,注意辨析
Host侧
- main函数
- 初始化绑定matrixMul
// initialize CUDA CUfunction matrixMul = NULL; int block_size = 0; checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));
- 向下兼容了CUDA 4.0之前版本API的启动方式
#if __CUDA_API_VERSION >= 4000 // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method) /* ... */ #else // __CUDA_API_VERSION <= 3020 // This is the older CUDA Driver API for Kernel Parameter passing and Kernel Launching /* ... */ #endif
- 初始化绑定matrixMul
- initCUDA
- 设置JIT
- 设置JIT大小选项等配置
- 编译加载ptxdump(就是ptx转化的字符数组)到module
// setup JIT compilation options and perform compilation { // in this branch we use compilation with parameters const unsigned int jitNumOptions = 3; CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; void **jitOptVals = new void *[jitNumOptions]; // set up size of compilation log buffer jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; int jitLogBufferSize = 1024; jitOptVals[0] = (void *)(size_t)jitLogBufferSize; // set up pointer to the compilation log buffer jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; char *jitLogBuffer = new char[jitLogBufferSize]; jitOptVals[1] = jitLogBuffer; // set up pointer to set the Maximum # of registers for a particular kernel jitOptions[2] = CU_JIT_MAX_REGISTERS; int jitRegCount = 32; jitOptVals[2] = (void *)(size_t)jitRegCount; // compile with set parameters printf("> Compiling CUDA module\n"); #if defined(_WIN64) || defined(__LP64__) status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); #else status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); #endif printf("> PTX JIT log:\n%s\n", jitLogBuffer); delete [] jitOptions; delete [] jitOptVals; delete [] jitLogBuffer; }
- 按函数名绑定ptx的函数到cuFunction(即matrixMul)
if (CUDA_SUCCESS != status) { printf("Error while compiling PTX\n"); cuCtxDestroy(g_cuContext); exit(EXIT_FAILURE); } // retrieve CUDA function from the compiled module status = cuModuleGetFunction(&cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit"); if (CUDA_SUCCESS != status) { cuCtxDestroy(g_cuContext); exit(EXIT_FAILURE); } *pMatrixMul = cuFunction;
- 设置JIT
Device侧
- matrixMul_kernel_<64/32>.ptx
- 四个函数的函数签名
.entry matrixMul_bs16_32bit ( .param .u64 __cudaparm_matrixMul_bs16_32bit_C, .param .u64 __cudaparm_matrixMul_bs16_32bit_A, .param .u64 __cudaparm_matrixMul_bs16_32bit_B, .param .s32 __cudaparm_matrixMul_bs16_32bit_wA, .param .s32 __cudaparm_matrixMul_bs16_32bit_wB) .entry matrixMul_bs16_64bit ( .param .u64 __cudaparm_matrixMul_bs16_64bit_C, .param .u64 __cudaparm_matrixMul_bs16_64bit_A, .param .u64 __cudaparm_matrixMul_bs16_64bit_B, .param .u64 __cudaparm_matrixMul_bs16_64bit_wA, .param .u64 __cudaparm_matrixMul_bs16_64bit_wB) .entry matrixMul_bs32_32bit ( .param .u64 __cudaparm_matrixMul_bs32_32bit_C, .param .u64 __cudaparm_matrixMul_bs32_32bit_A, .param .u64 __cudaparm_matrixMul_bs32_32bit_B, .param .s32 __cudaparm_matrixMul_bs32_32bit_wA, .param .s32 __cudaparm_matrixMul_bs32_32bit_wB) .entry matrixMul_bs32_64bit ( .param .u64 __cudaparm_matrixMul_bs32_64bit_C, .param .u64 __cudaparm_matrixMul_bs32_64bit_A, .param .u64 __cudaparm_matrixMul_bs32_64bit_B, .param .u64 __cudaparm_matrixMul_bs32_64bit_wA, .param .u64 __cudaparm_matrixMul_bs32_64bit_wB)
- 乘加指令优化
- 在loop循环内,大量使用乘加指令(16个mad),即
%f6 = (%f4 * %f5) + %f1
ld.shared.f32 %f4, [%rd8+0]; ld.shared.f32 %f5, [%rd6+0]; mad.f32 %f6, %f4, %f5, %f1;
- 在loop循环内,大量使用乘加指令(16个mad):
- 四个函数的函数签名
3 个人小结
Nvidia官方库在0_Introduction中共提供了四个版本的基础matrixMul实现,每个版本各有侧重。在实施分块GEMM策略与Shared Memory缓存优化的基础之上,进一步充分利用CUDA的接口、甚至使用ptx编写Kernel会取得更高的性能收益。