矩阵乘(General Matrix-matrix Multiplication,GEMM)是一类很重要的应用,尤其是在大语言模型领域,它是注意力机制中的计算热点。Nvidia官方库在0_Introduction部分包含了四个版本的矩阵乘:matrixMul、matrixMul_nvrtc、matrixMulDrv和matrixMulDynlinkJIT。本文将以matrixMul为基础,逐步展开对系列各个版本矩阵乘的梳理。
【代码笔记】CUDA官方样例库之矩阵乘matrixMul基础系列版本
1 代码仓库
1.1 基本信息
- 仓库:Nvidia的官方样例仓库
- 样例路径:
<your_code_folder>/cuda-samples-master/Samples/0_Introduction/matrixMul*
- 编译配置:需要在Makefile中指明CUDA_PATH(例如:CUDA_PATH ?= /usr/local/cuda-12.1/)
1.2 版本说明
- matrixMul:基础分块GEMM与Shared Memory使用
- matrixMul_nvrtc:区别于nvcc的运行时编译场景
- matrixMulDrv:利用CUDA Driver API
- matrixMulDynlinkJIT:JIT编译ptx
2 基础GEMM系列版本
2.1 matrixMul
- 说明:采取分块的思想,利用Shared Memory优化的基础版GEMM
2.2 matrixMul_nvrtc
- 说明:另一场景下,利用nvrtc的GEMM
- 代码结构区别:对host和device侧代码做了分离。即host写cpp,device的kernel写cu
Makefile新增内容
- 操作系统检查
lsb_release -i -s 2>/dev/null | grep -i ubuntu
- 架构检查(不支持ARMv7)
#This sample is not supported on ARMv7 ifeq ($(TARGET_ARCH),armv7l) $(info >>> WARNING - matrixMul_nvrtc is not supported on ARMv7 - waiving sample <<<) SAMPLE_ENABLED := 0 endif
- 基于目标/Host的操作系统与架构的特化处理
- 目标操作系统特化处理(根据
$(TARGET_OS)
)- 添加库目录、编译选项等
- 目标架构特化处理(根据
$(TARGET_ARCH)
)- 赋值
CUDA_SEARCH_PATH
- 赋值
- HOST侧架构处理(根据
$(HOST_ARCH)
)
# libNVRTC specific libraries ifeq ($(TARGET_OS),darwin) LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA endif ifeq ($(TARGET_OS),darwin) ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA else ifeq ($(TARGET_ARCH),x86_64) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs endif ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs ifdef TARGET_OVERRIDE CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs endif endif ifeq ($(TARGET_ARCH),ppc64le) CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs endif ifeq ($(HOST_ARCH),ppc64le) CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs endif CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) ifeq ("$(CUDALIB)","") $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) SAMPLE_ENABLED := 0 else CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) LIBRARIES += -L$(CUDALIB) -lcuda endif endif
- 目标操作系统特化处理(根据
- 添加include路径与nvrtc库
INCLUDES += -I$(CUDA_PATH)/include LIBRARIES += -lnvrtc
- 准备cooperative_group环境
build: matrixMul_nvrtc $(EXEC) cp "$(CUDA_PATH)/include/cooperative_groups.h" . $(EXEC) cp -r "$(CUDA_PATH)/include/cooperative_groups" .
Host侧实现(cpp)
- main函数
- 相比于基础版matrixMul而言,这里main的变动主要是调用接口函数有所变动
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
- 相比于基础版matrixMul而言,这里main的变动主要是调用接口函数有所变动
- 接口函数(matrixMultiply)
- 流程
- 内存显存分配
- 分配Host侧内存并初始化
- 分配Device侧显存与初始化
- nvrtc相关
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1); CUmodule module = loadCUBIN(cubin, argc, argv);
- Host侧Device侧数据拷贝
- Kernel函数绑定
- 用
cuModuleGetFunction
将Kernel函数绑定到kernel_addr
CUfunction kernel_addr; if (block_size == 16) { checkCudaErrors( cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16")); } else { checkCudaErrors( cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32")); }
- 用
- 启动
- 迭代300次,用
cuLaunchKernel
启动Kernel,每次迭代用cuCtxSynchronize
同步
int nIter = 300; for (int j = 0; j < nIter; j++) { checkCudaErrors( cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */ threads.x, threads.y, threads.z, /* block dim */ 0, 0, /* shared mem, stream */ &arr[0], /* arguments */ 0)); checkCudaErrors(cuCtxSynchronize()); }
- 数据回传与结果校验
- 迭代300次,用
- 完整代码
/** * Run a simple test of matrix multiplication using CUDA */ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB) { // Allocate host memory for matrices A and B unsigned int size_A = dimsA.x * dimsA.y; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = dimsB.x * dimsB.y; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // Initialize host memory const float valB = 0.01f; constantInit(h_A, size_A, 1.0f); constantInit(h_B, size_B, valB); // Allocate device memory CUdeviceptr d_A, d_B, d_C; char *cubin, *kernel_file; size_t cubinSize; kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1); CUmodule module = loadCUBIN(cubin, argc, argv); // Allocate host matrix C dim3 dimsC(dimsB.x, dimsA.y, 1); unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); float *h_C = (float *)malloc(mem_size_C); if (h_C == NULL) { fprintf(stderr, "Failed to allocate host matrix C!\n"); exit(EXIT_FAILURE); } checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); checkCudaErrors(cuMemAlloc(&d_B, mem_size_B)); checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); // copy host memory to device checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A)); checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); // Setup execution parameters dim3 threads(block_size, block_size); dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); // Create and start timer printf("Computing result using CUDA Kernel...\n"); CUfunction kernel_addr; if (block_size == 16) { checkCudaErrors( cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16")); } else { checkCudaErrors( cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32")); } void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x}; // Execute the kernel int nIter = 300; for (int j = 0; j < nIter; j++) { 
checkCudaErrors( cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */ threads.x, threads.y, threads.z, /* block dim */ 0, 0, /* shared mem, stream */ &arr[0], /* arguments */ 0)); checkCudaErrors(cuCtxSynchronize()); } // Copy result from device to host checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C)); printf("Checking computed result for correctness: "); bool correct = true; // test relative error by the formula // |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps double eps = 1.e-6; // machine zero for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) { double abs_err = fabs(h_C[i] - (dimsA.x * valB)); double dot_length = dimsA.x; double abs_val = fabs(h_C[i]); double rel_err = abs_err / abs_val / dot_length; if (rel_err > eps) { printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps); correct = false; } } printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); printf( "\nNOTE: The CUDA Samples are not meant for performance measurements. " "Results may vary when GPU Boost is enabled.\n"); // Clean up memory free(h_A); free(h_B); free(h_C); checkCudaErrors(cuMemFree(d_A)); checkCudaErrors(cuMemFree(d_B)); checkCudaErrors(cuMemFree(d_C)); if (correct) { return EXIT_SUCCESS; } else { return EXIT_FAILURE; } }
- 内存显存分配
- 流程
Device侧实现(cu)
- 增加了协作组的概念:cooperative_groups,并将之前线程同步改为协作组同步
#include <cooperative_groups.h> /* ... */ cooperative_groups::thread_block cta = cooperative_groups::this_thread_block(); /* ... */ cooperative_groups::sync(cta);
- 按block大小细粒度封装,并对外暴露接口
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB) { matrixMulCUDA<16>(C, A, B, wA, wB); } extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB) { matrixMulCUDA<32>(C, A, B, wA, wB); }
2.3 matrixMulDrv
- 说明:利用CUDA Driver API启动的GEMM版本。矩阵信息新增到头文件中
Host侧(cpp)
- 说明:将main内部过程整体封装到runTest中
- runTest函数
- 声明matrixMul并绑定
// initialize CUDA CUfunction matrixMul = NULL; int block_size = 0; initCUDA(argc, argv, &matrixMul, &block_size);
- Device侧数组由基本数据类型指针转为
CUdeviceptr
类型// allocate device memory CUdeviceptr d_A; checkCudaErrors(cuMemAlloc(&d_A, mem_size_A)); CUdeviceptr d_B; checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
- 计时器设置
// create and start timer StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); // start the timer sdkStartTimer(&timer); /* ... */ // stop and destroy timer sdkStopTimer(&timer); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer);
- 简易版启动方式
- 简单对args进行填充,并送
cuLaunchKernel
启动
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel // Launching (simplier method) size_t Matrix_Width_A = (size_t)WA; size_t Matrix_Width_B = (size_t)WB; void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B}; // new CUDA 4.0 Driver API Kernel launch call checkCudaErrors(cuLaunchKernel( matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, 2 * block_size * block_size * sizeof(float), NULL, args, NULL));
- 简单对args进行填充,并送
- 高级版启动方式
- 以预定长度argBuffer内部偏移追加的方式充填参数,并进一步封装其他宏定义变量成数组,送cuLaunchKernel启动
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel // Launching (advanced method) int offset = 0; char argBuffer[256]; // pass in launch parameters (not actually de-referencing CUdeviceptr). // CUdeviceptr is storing the value of the parameters *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C; offset += sizeof(d_C); *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A; offset += sizeof(d_A); *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B; offset += sizeof(d_B); size_t Matrix_Width_A = (size_t)WA; size_t Matrix_Width_B = (size_t)WB; *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A; offset += sizeof(Matrix_Width_A); *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B; offset += sizeof(Matrix_Width_B); void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END}; // new CUDA 4.0 Driver API Kernel launch call checkCudaErrors(cuLaunchKernel( matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, 2 * block_size * block_size * sizeof(float), NULL, NULL, reinterpret_cast<void **>(&kernel_launch_config)));
- 声明matrixMul并绑定
Device侧(cu)
- Kernel变动不大,主要是Shared Memory变量的宏和模板多了size_type的占位变量类型
- 宏定义:Shared Memory的As和Bs封了个宏AS和BS
#define AS(i, j) As[i][j] #define BS(i, j) Bs[i][j] /* ... */ for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
- size_type模板参数
template <int block_size, typename size_type> __device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB) { // Block index size_type bx = blockIdx.x; size_type by = blockIdx.y; // Thread index size_type tx = threadIdx.x; size_type ty = threadIdx.y; /* ... */ } // C wrappers around our template kernel extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB) { matrixMul<8, size_t>(C, A, B, wA, wB); } extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB) { matrixMul<16, size_t>(C, A, B, wA, wB); } extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB) { matrixMul<32, size_t>(C, A, B, wA, wB); }
- 宏定义:Shared Memory的As和Bs封了个宏AS和BS
2.4 matrixMulDynlinkJIT
- 说明:利用动态JIT直接编译ptx的GEMM版本,整体流程会比之前复杂些,并且涉及ptx汇编
编译执行流程
- *前置步骤(官方库已完成ptx转c步骤,可略过)
- 准备matrixMul的ptx(官方已准备32和64版本,在extras文件夹下)
- 执行ptx转c和h的脚本(脚本也在extras下)
- 将生成的代码文件拷贝到上层(项目文件夹下),并配置好文件名
- 确认项目文件夹下存在ptx转c的代码文件
- 编译
make
- 执行
make run
- 注意,官方样例的ptx所转的c,与其直接给到的c存在差异,注意辨析
Host侧
- main函数
- 初始化绑定matrixMul
// initialize CUDA CUfunction matrixMul = NULL; int block_size = 0; checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));
- 向下兼容了CUDA 4.0之前版本API的启动方式
#if __CUDA_API_VERSION >= 4000 // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method) /* ... */ #else // __CUDA_API_VERSION <= 3020 // This is the older CUDA Driver API for Kernel Parameter passing and Kernel Launching /* ... */ #endif
- 初始化绑定matrixMul
- initCUDA
- 设置JIT
- 设置JIT大小选项等配置
- 编译加载ptxdump(就是ptx转化的字符数组)到module
// setup JIT compilation options and perform compilation { // in this branch we use compilation with parameters const unsigned int jitNumOptions = 3; CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; void **jitOptVals = new void *[jitNumOptions]; // set up size of compilation log buffer jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; int jitLogBufferSize = 1024; jitOptVals[0] = (void *)(size_t)jitLogBufferSize; // set up pointer to the compilation log buffer jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; char *jitLogBuffer = new char[jitLogBufferSize]; jitOptVals[1] = jitLogBuffer; // set up pointer to set the Maximum # of registers for a particular kernel jitOptions[2] = CU_JIT_MAX_REGISTERS; int jitRegCount = 32; jitOptVals[2] = (void *)(size_t)jitRegCount; // compile with set parameters printf("> Compiling CUDA module\n"); #if defined(_WIN64) || defined(__LP64__) status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); #else status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); #endif printf("> PTX JIT log:\n%s\n", jitLogBuffer); delete [] jitOptions; delete [] jitOptVals; delete [] jitLogBuffer; }
- 按函数名绑定ptx的函数到cuFunction(即matrixMul)
if (CUDA_SUCCESS != status) { printf("Error while compiling PTX\n"); cuCtxDestroy(g_cuContext); exit(EXIT_FAILURE); } // retrieve CUDA function from the compiled module status = cuModuleGetFunction(&cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit"); if (CUDA_SUCCESS != status) { cuCtxDestroy(g_cuContext); exit(EXIT_FAILURE); } *pMatrixMul = cuFunction;
- 设置JIT
Device侧
- matrixMul_kernel_<64/32>.ptx
- 四个函数的函数签名
.entry matrixMul_bs16_32bit ( .param .u64 __cudaparm_matrixMul_bs16_32bit_C, .param .u64 __cudaparm_matrixMul_bs16_32bit_A, .param .u64 __cudaparm_matrixMul_bs16_32bit_B, .param .s32 __cudaparm_matrixMul_bs16_32bit_wA, .param .s32 __cudaparm_matrixMul_bs16_32bit_wB) .entry matrixMul_bs16_64bit ( .param .u64 __cudaparm_matrixMul_bs16_64bit_C, .param .u64 __cudaparm_matrixMul_bs16_64bit_A, .param .u64 __cudaparm_matrixMul_bs16_64bit_B, .param .u64 __cudaparm_matrixMul_bs16_64bit_wA, .param .u64 __cudaparm_matrixMul_bs16_64bit_wB) .entry matrixMul_bs32_32bit ( .param .u64 __cudaparm_matrixMul_bs32_32bit_C, .param .u64 __cudaparm_matrixMul_bs32_32bit_A, .param .u64 __cudaparm_matrixMul_bs32_32bit_B, .param .s32 __cudaparm_matrixMul_bs32_32bit_wA, .param .s32 __cudaparm_matrixMul_bs32_32bit_wB) .entry matrixMul_bs32_64bit ( .param .u64 __cudaparm_matrixMul_bs32_64bit_C, .param .u64 __cudaparm_matrixMul_bs32_64bit_A, .param .u64 __cudaparm_matrixMul_bs32_64bit_B, .param .u64 __cudaparm_matrixMul_bs32_64bit_wA, .param .u64 __cudaparm_matrixMul_bs32_64bit_wB)
- 乘加指令优化
- 在loop循环内,大量使用乘加指令(16个mad),即
%f6 = (%f4 * %f5) + %f1
ld.shared.f32 %f4, [%rd8+0]; ld.shared.f32 %f5, [%rd6+0]; mad.f32 %f6, %f4, %f5, %f1;
- 在loop循环内,大量使用乘加指令(16个mad):
- 四个函数的函数签名
3 个人小结
Nvidia官方库在0_Introduction中共提供了四个版本的基础matrixMul实现,每个版本各有侧重。在实施分块GEMM策略与Shared Memory缓存优化的基础之上,进一步充分利用CUDA的接口、甚至使用ptx编写Kernel会取得更高的性能收益。