[Code Notes] The matrixMul Basic Series in the Official CUDA Samples Library

General matrix-matrix multiplication (GEMM) is a foundational workload, especially for large language models, where it is the computational hotspot of the attention mechanism (in its general form GEMM computes C = αAB + βC; these samples compute the plain product C = A × B). The official NVIDIA samples repository ships four matrixMul variants under 0_Introduction: matrixMul, matrixMul_nvrtc, matrixMulDrv, and matrixMulDynlinkJIT. Starting from the base matrixMul, this post works through each variant in the series.



1 Code Repository

1.1 Basics

  • Repository: NVIDIA's official samples repository
  • Sample path: <your_code_folder>/cuda-samples-master/Samples/0_Introduction/matrixMul*
  • Build configuration: set CUDA_PATH in the Makefile (e.g. CUDA_PATH ?= /usr/local/cuda-12.1/)

1.2 Variant Overview

  • matrixMul: baseline tiled GEMM with Shared Memory
  • matrixMul_nvrtc: runtime compilation with NVRTC, as opposed to offline compilation with nvcc
  • matrixMulDrv: launching through the CUDA Driver API
  • matrixMulDynlinkJIT: JIT-compiling PTX at load time

2 The Basic GEMM Series

2.1 matrixMul

  • Overview: the baseline GEMM, tiled and optimized with Shared Memory (see the kernel sketch below)
    (Figure: tiled GEMM workflow)
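
    The kernel tiles C into block_size × block_size sub-matrices: each thread block walks over the matching tiles of A and B, stages them in Shared Memory, and accumulates one element of C per thread. A minimal sketch of the scheme, condensed from the sample's MatrixMulCUDA template kernel (matrix widths assumed to be multiples of BLOCK_SIZE):
    template <int BLOCK_SIZE>
    __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
      // Shared-memory tiles of A and B for the current step
      __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
      __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

      int tx = threadIdx.x, ty = threadIdx.y;
      int aBegin = wA * BLOCK_SIZE * blockIdx.y;  // first tile of A for this block
      int aEnd = aBegin + wA - 1;                 // last tile of A
      int aStep = BLOCK_SIZE;                     // step to the next tile of A
      int bBegin = BLOCK_SIZE * blockIdx.x;       // first tile of B
      int bStep = BLOCK_SIZE * wB;                // step to the next tile of B

      float Csub = 0.0f;  // each thread accumulates one element of C
      for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Stage one tile of A and one tile of B in Shared Memory
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];
        __syncthreads();  // tiles must be complete before they are read

        // Inner product over the shared tiles
        for (int k = 0; k < BLOCK_SIZE; ++k) Csub += As[ty][k] * Bs[k][tx];
        __syncthreads();  // finish reading before the tiles are overwritten
      }

      // Each thread writes its single output element
      int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
      C[c + wB * ty + tx] = Csub;
    }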

2.2 matrixMul_nvrtc

  • Overview: the same GEMM in a different scenario, compiled at run time with NVRTC
  • Structural difference: host and device code are split apart; the host code lives in a .cpp file, the device kernel in a .cu file

Makefile additions

  • OS check: lsb_release -i -s 2>/dev/null | grep -i ubuntu
  • Architecture check (ARMv7 is not supported)
    #This sample is not supported on ARMv7
    ifeq ($(TARGET_ARCH),armv7l)
    $(info >>> WARNING - matrixMul_nvrtc is not supported on ARMv7 - waiving sample <<<)
    SAMPLE_ENABLED := 0
    endif
    
  • Specialization on the target/host OS and architecture
    • target-OS specialization (keyed on $(TARGET_OS))
      • adds library directories, compile options, etc.
    • target-architecture specialization (keyed on $(TARGET_ARCH))
      • assigns CUDA_SEARCH_PATH
    • host-architecture handling (keyed on $(HOST_ARCH))
    # libNVRTC specific libraries
    ifeq ($(TARGET_OS),darwin)
     LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA
    endif
    
    ifeq ($(TARGET_OS),darwin)
      ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
    else
      ifeq ($(TARGET_ARCH),x86_64)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
        CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs
        CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
      endif
    
      ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
      endif
    
      ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
      endif
    
      ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
      endif
    
      ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
      endif
    
      ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
      endif
    
      ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
      endif
    
      ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
        ifdef TARGET_OVERRIDE
            CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
        endif
      endif
    
      ifeq ($(TARGET_ARCH),ppc64le)
        CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
      endif
    
      ifeq ($(HOST_ARCH),ppc64le)
        CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
      endif
    
      CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
      ifeq ("$(CUDALIB)","")
        $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
        SAMPLE_ENABLED := 0
      else
        CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
        LIBRARIES += -L$(CUDALIB) -lcuda
      endif
    endif
    
  • Add the include path and the nvrtc library
    INCLUDES += -I$(CUDA_PATH)/include
    LIBRARIES += -lnvrtc
    
  • Prepare the cooperative_groups environment: NVRTC compiles the kernel at run time, so the build copies the cooperative_groups headers next to the executable for that compilation to find
    build: matrixMul_nvrtc
    $(EXEC) cp "$(CUDA_PATH)/include/cooperative_groups.h" .
    $(EXEC) cp -r "$(CUDA_PATH)/include/cooperative_groups" .
    

Host-side implementation (cpp)

  • main function
    • Compared with the base matrixMul, the main change in main() is the interface function it calls
      int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
      
  • Interface function (matrixMultiply)
    • Flow
      1. Memory allocation
        • allocate host memory and initialize it
        • allocate device memory and initialize it
      2. NVRTC-specific steps: locate the kernel source, compile it to a CUBIN, and load it as a module (a sketch of what these helpers do internally follows the full listing below)
        kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
        compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
        
        CUmodule module = loadCUBIN(cubin, argc, argv);
        
      3. Host-to-device data copies
      4. Kernel binding
        • cuModuleGetFunction binds the kernel function to kernel_addr
        CUfunction kernel_addr;
        if (block_size == 16) {
          checkCudaErrors(
              cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
        } else {
          checkCudaErrors(
              cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
        }
        
      5. Launch
        • launch the kernel 300 times with cuLaunchKernel, synchronizing after each iteration with cuCtxSynchronize
          int nIter = 300;

          for (int j = 0; j < nIter; j++) {
            checkCudaErrors(
                cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
                               threads.x, threads.y, threads.z,     /* block dim */
                               0, 0,    /* shared mem, stream */
                               &arr[0], /* arguments */
                               0));

            checkCudaErrors(cuCtxSynchronize());
          }
        
      6. Copy the result back and verify it: every element of C should equal dimsA.x * valB, since A is filled with 1.0f and B with valB
      • Full listing
        /**
         * Run a simple test of matrix multiplication using CUDA
         */
        int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
                           dim3 &dimsB) {
          // Allocate host memory for matrices A and B
          unsigned int size_A = dimsA.x * dimsA.y;
          unsigned int mem_size_A = sizeof(float) * size_A;
          float *h_A = (float *)malloc(mem_size_A);
          unsigned int size_B = dimsB.x * dimsB.y;
          unsigned int mem_size_B = sizeof(float) * size_B;
          float *h_B = (float *)malloc(mem_size_B);
        
          // Initialize host memory
          const float valB = 0.01f;
          constantInit(h_A, size_A, 1.0f);
          constantInit(h_B, size_B, valB);
        
          // Allocate device memory
          CUdeviceptr d_A, d_B, d_C;
        
          char *cubin, *kernel_file;
          size_t cubinSize;
        
          kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
          compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
        
          CUmodule module = loadCUBIN(cubin, argc, argv);
        
          // Allocate host matrix C
          dim3 dimsC(dimsB.x, dimsA.y, 1);
          unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
          float *h_C = (float *)malloc(mem_size_C);
        
          if (h_C == NULL) {
            fprintf(stderr, "Failed to allocate host matrix C!\n");
            exit(EXIT_FAILURE);
          }
        
          checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
          checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
          checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
        
          // copy host memory to device
          checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
          checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
        
          // Setup execution parameters
          dim3 threads(block_size, block_size);
          dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
        
          // Create and start timer
          printf("Computing result using CUDA Kernel...\n");
        
          CUfunction kernel_addr;
          if (block_size == 16) {
            checkCudaErrors(
                cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
          } else {
            checkCudaErrors(
                cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
          }
        
          void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
                         (void *)&dimsB.x};
        
          // Execute the kernel
          int nIter = 300;
        
          for (int j = 0; j < nIter; j++) {
            checkCudaErrors(
                cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
                               threads.x, threads.y, threads.z,     /* block dim */
                               0, 0,    /* shared mem, stream */
                               &arr[0], /* arguments */
                               0));
        
            checkCudaErrors(cuCtxSynchronize());
          }
        
          // Copy result from device to host
          checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
        
          printf("Checking computed result for correctness: ");
        
          bool correct = true;
        
          // test relative error by the formula
          //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
        
          double eps = 1.e-6;  // machine zero
        
          for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
            double abs_err = fabs(h_C[i] - (dimsA.x * valB));
            double dot_length = dimsA.x;
            double abs_val = fabs(h_C[i]);
            double rel_err = abs_err / abs_val / dot_length;
        
            if (rel_err > eps) {
              printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
                     h_C[i], dimsA.x * valB, eps);
              correct = false;
            }
          }
        
          printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
        
          printf(
              "\nNOTE: The CUDA Samples are not meant for performance measurements. "
              "Results may vary when GPU Boost is enabled.\n");
        
          // Clean up memory
          free(h_A);
          free(h_B);
          free(h_C);
        
          checkCudaErrors(cuMemFree(d_A));
          checkCudaErrors(cuMemFree(d_B));
          checkCudaErrors(cuMemFree(d_C));
        
          if (correct) {
            return EXIT_SUCCESS;
          } else {
            return EXIT_FAILURE;
          }
        }
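
      • NVRTC under the hood: compileFileToCUBIN and loadCUBIN are helpers from the samples' common nvrtc helper header. A minimal sketch of the flow they wrap, assuming a single source string, an illustrative sm_80 arch flag, and error handling reduced to asserts (which vanish in NDEBUG builds; CUBIN output also requires a reasonably recent NVRTC):
        #include <cassert>
        #include <string>
        #include <vector>
        #include <cuda.h>
        #include <nvrtc.h>

        // Compile CUDA C++ source text to a CUBIN, then load it as a module.
        // Sketch only: production code should check every result code properly.
        CUmodule compileAndLoad(const std::string &src) {
          nvrtcProgram prog;
          assert(nvrtcCreateProgram(&prog, src.c_str(), "matrixMul_kernel.cu",
                                    0, NULL, NULL) == NVRTC_SUCCESS);

          // Target a concrete architecture; include directories (e.g. for the
          // cooperative_groups headers) are passed the same way as options.
          const char *opts[] = {"--gpu-architecture=sm_80"};
          assert(nvrtcCompileProgram(prog, 1, opts) == NVRTC_SUCCESS);

          size_t cubinSize;
          assert(nvrtcGetCUBINSize(prog, &cubinSize) == NVRTC_SUCCESS);
          std::vector<char> cubin(cubinSize);
          assert(nvrtcGetCUBIN(prog, cubin.data()) == NVRTC_SUCCESS);
          nvrtcDestroyProgram(&prog);

          // Hand the finished CUBIN to the Driver API (what loadCUBIN does).
          CUmodule module;
          assert(cuModuleLoadData(&module, cubin.data()) == CUDA_SUCCESS);
          return module;
        }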
        

Device-side implementation (cu)

  • Introduces cooperative groups (cooperative_groups) and replaces the previous thread synchronization with cooperative-group synchronization
    #include <cooperative_groups.h>
    /* ... */
    cooperative_groups::thread_block cta =
        cooperative_groups::this_thread_block();
    /* ... */
    cooperative_groups::sync(cta);
    
  • Wraps the template kernel per block size and exposes the instantiations with extern "C", so cuModuleGetFunction can resolve them by their unmangled names
    extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
                                                     int wA, int wB) {
      matrixMulCUDA<16>(C, A, B, wA, wB);
    }
    
    extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
                                                     int wA, int wB) {
      matrixMulCUDA<32>(C, A, B, wA, wB);
    }
    

2.3 matrixMulDrv

  • Overview: the GEMM launched via the CUDA Driver API; the matrix dimensions move into a header file

Host side (cpp)

  • Overview: the whole body of main is wrapped into runTest
  • runTest function
    • declare matrixMul and bind it via initCUDA (a sketch of what initCUDA roughly does appears after this list)
      // initialize CUDA
      CUfunction matrixMul = NULL;
      int block_size = 0;
      
      initCUDA(argc, argv, &matrixMul, &block_size);
      
    • Device-side arrays change from plain typed pointers to CUdeviceptr
      // allocate device memory
      CUdeviceptr d_A;
      checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
      CUdeviceptr d_B;
      checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
      
    • Timer setup
      // create and start timer
      StopWatchInterface *timer = NULL;
      sdkCreateTimer(&timer);
      
      // start the timer
      sdkStartTimer(&timer);
      /*	...	*/
      // stop and destroy timer
      sdkStopTimer(&timer);
      printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
      sdkDeleteTimer(&timer);
      
    • Simple launch path
      • simply fill an args array and hand it to cuLaunchKernel
      // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
      // Launching (simpler method)
      size_t Matrix_Width_A = (size_t)WA;
      size_t Matrix_Width_B = (size_t)WB;
      void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
      // new CUDA 4.0 Driver API Kernel launch call
      checkCudaErrors(cuLaunchKernel(
          matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
          2 * block_size * block_size * sizeof(float), NULL, args, NULL));
      
    • Advanced launch path
      • pack the parameters into a fixed-length argBuffer at running byte offsets, wrap the buffer pointer and size into a CU_LAUNCH_PARAM_* config array, and pass that to cuLaunchKernel instead of an args array
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
       // Launching (advanced method)
       int offset = 0;
       char argBuffer[256];
      
       // pass in launch parameters (not actually de-referencing CUdeviceptr).
       // CUdeviceptr is storing the value of the parameters
       *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
       offset += sizeof(d_C);
       *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
       offset += sizeof(d_A);
       *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
       offset += sizeof(d_B);
      
       size_t Matrix_Width_A = (size_t)WA;
       size_t Matrix_Width_B = (size_t)WB;
      
       *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
       offset += sizeof(Matrix_Width_A);
       *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
       offset += sizeof(Matrix_Width_B);
      
       void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                                        CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                                        CU_LAUNCH_PARAM_END};
      
       // new CUDA 4.0 Driver API Kernel launch call
       checkCudaErrors(cuLaunchKernel(
           matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
           2 * block_size * block_size * sizeof(float), NULL, NULL,
           reinterpret_cast<void **>(&kernel_launch_config)));
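
    • initCUDA sketch
      • initCUDA itself is not reproduced in this note. A minimal sketch of what such Driver API initialization typically does, assuming device 0, a placeholder module file name, checkCudaErrors as used throughout the sample, and the entry-point names from this sample's kernel wrappers:
      #include <cuda.h>

      // Sketch: create a context, load the compiled kernel module, and resolve
      // the kernel entry point by its extern "C" name.
      void initCUDA_sketch(CUfunction *pMatrixMul, int block_size) {
        CUdevice device;
        CUcontext context;
        CUmodule module;

        checkCudaErrors(cuInit(0));
        checkCudaErrors(cuDeviceGet(&device, 0));  // device 0 for simplicity
        checkCudaErrors(cuCtxCreate(&context, 0, device));

        // The real sample locates and loads its prebuilt kernel binary here;
        // the file name below is a placeholder.
        checkCudaErrors(cuModuleLoad(&module, "matrixMul_kernel64.fatbin"));

        // Resolve the wrapper that matches the chosen block size.
        checkCudaErrors(cuModuleGetFunction(
            pMatrixMul, module,
            (block_size == 32) ? "matrixMul_bs32_64bit" : "matrixMul_bs16_64bit"));
      }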
      

Device side (cu)

  • The kernel changes little; the main additions are macros over the Shared Memory tiles and a size_type placeholder type in the template
    • Macros: the Shared Memory arrays As and Bs are wrapped in the macros AS and BS
      #define AS(i, j) As[i][j]
      #define BS(i, j) Bs[i][j]
      /*
      	...
      */
      for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
      
    • size_type template parameter: lets the index type be instantiated as size_t, enabling 64-bit indexing
      template <int block_size, typename size_type>
      __device__ void matrixMul(float *C, float *A, float *B, size_type wA,
                                size_type wB) {
        // Block index
        size_type bx = blockIdx.x;
        size_type by = blockIdx.y;

        // Thread index
        size_type tx = threadIdx.x;
        size_type ty = threadIdx.y;
        /* ... */
      }

      // C wrappers around our template kernel
      extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
                                                     size_t wA, size_t wB) {
        matrixMul<8, size_t>(C, A, B, wA, wB);
      }
      extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
                                                      size_t wA, size_t wB) {
        matrixMul<16, size_t>(C, A, B, wA, wB);
      }
      extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
                                                      size_t wA, size_t wB) {
        matrixMul<32, size_t>(C, A, B, wA, wB);
      }
      
      

2.4 matrixMulDynlinkJIT

  • Overview: a GEMM version that JIT-compiles PTX directly via dynamic loading; the overall flow is more involved than the earlier variants and touches PTX assembly

Build and run flow

  1. Prerequisite steps (the official repo has already done the PTX-to-C conversion, so this can be skipped)
    • prepare the matrixMul PTX (32- and 64-bit versions are provided under the extras folder)
    • run the PTX-to-C/H conversion script (also under extras)
    • copy the generated files up into the project folder and adjust the file names
  2. Confirm that the PTX-derived C source files exist in the project folder
  3. make
  4. make run
  • Note: the C generated from the sample's PTX differs from the C files shipped directly in the repo; take care to tell them apart

Host side

  • main function
    • initialize and bind matrixMul
      // initialize CUDA
      CUfunction matrixMul = NULL;
      int block_size = 0;
      checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));
      
    • keeps backward compatibility with kernel launches on pre-CUDA 4.0 APIs (a sketch of that legacy path follows at the end of this section)
      #if __CUDA_API_VERSION >= 4000
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simpler method)
        /* ... */
      #else  // __CUDA_API_VERSION <= 3020
        // This is the older CUDA Driver API for Kernel Parameter passing and
        // Kernel Launching
        /* ... */
      #endif
      
  • initCUDA
    • JIT setup
      • configure the JIT options (log buffer size, log buffer pointer, max register count)
      • compile and load the ptxdump (the PTX text converted to a C character array) into a module
      // setup JIT compilation options and perform compilation
      {
          // in this branch we use compilation with parameters
          const unsigned int jitNumOptions = 3;
          CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
          void **jitOptVals = new void *[jitNumOptions];
      
          // set up size of compilation log buffer
          jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
          int jitLogBufferSize = 1024;
          jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
      
          // set up pointer to the compilation log buffer
          jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
          char *jitLogBuffer = new char[jitLogBufferSize];
          jitOptVals[1] = jitLogBuffer;
      
          // set up pointer to set the Maximum # of registers for a particular kernel
          jitOptions[2] = CU_JIT_MAX_REGISTERS;
          int jitRegCount = 32;
          jitOptVals[2] = (void *)(size_t)jitRegCount;
      
          // compile with set parameters
          printf("> Compiling CUDA module\n");
      
          #if defined(_WIN64) || defined(__LP64__)
          status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump,
                                      jitNumOptions, jitOptions,
                                      (void **)jitOptVals);
          #else
          status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump,
                                      jitNumOptions, jitOptions,
                                      (void **)jitOptVals);
          #endif
      
          printf("> PTX JIT log:\n%s\n", jitLogBuffer);
      
          delete [] jitOptions;
          delete [] jitOptVals;
          delete [] jitLogBuffer;
      }
      
    • bind the PTX function to cuFunction (i.e. matrixMul) by its entry name
      if (CUDA_SUCCESS != status)
      {
          printf("Error while compiling PTX\n");
          cuCtxDestroy(g_cuContext);
          exit(EXIT_FAILURE);
      }
      
      // retrieve CUDA function from the compiled module
      status = cuModuleGetFunction(&cuFunction, cuModule,
                                   (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
      
      if (CUDA_SUCCESS != status)
      {
          cuCtxDestroy(g_cuContext);
          exit(EXIT_FAILURE);
      }
      
      *pMatrixMul = cuFunction;
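
  • Legacy launch path (sketch)
    • For the pre-CUDA 4.0 branch elided above, parameters were pushed one at a time through cuParamSet* and the kernel launched with cuLaunchGrid. A rough sketch of that legacy path: these entry points are deprecated and may be absent from recent toolkits, the matrix-size macros WA/WB/WC/HC are assumed from the sample's matrixMul.h, and parameter alignment is simplified:
      // Legacy (pre-CUDA 4.0) launch: build the argument block by hand.
      int offset = 0;
      void *ptr;

      ptr = (void *)(size_t)d_C;  // device pointers are passed by value
      cuParamSetv(matrixMul, offset, &ptr, sizeof(ptr));
      offset += sizeof(ptr);
      ptr = (void *)(size_t)d_A;
      cuParamSetv(matrixMul, offset, &ptr, sizeof(ptr));
      offset += sizeof(ptr);
      ptr = (void *)(size_t)d_B;
      cuParamSetv(matrixMul, offset, &ptr, sizeof(ptr));
      offset += sizeof(ptr);

      cuParamSeti(matrixMul, offset, WA);  // 32-bit width of A
      offset += sizeof(int);
      cuParamSeti(matrixMul, offset, WB);  // 32-bit width of B
      offset += sizeof(int);
      cuParamSetSize(matrixMul, offset);   // total size of the argument block

      cuFuncSetBlockShape(matrixMul, block_size, block_size, 1);
      cuFuncSetSharedSize(matrixMul,
                          2 * block_size * block_size * sizeof(float));
      cuLaunchGrid(matrixMul, WC / block_size, HC / block_size);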
      

Device side

  • matrixMul_kernel_<64/32>.ptx
    • Signatures of the four entry functions (the 32-bit variants take .s32 width parameters, the 64-bit variants .u64)
      .entry matrixMul_bs16_32bit (
      	.param .u64 __cudaparm_matrixMul_bs16_32bit_C,
      	.param .u64 __cudaparm_matrixMul_bs16_32bit_A,
      	.param .u64 __cudaparm_matrixMul_bs16_32bit_B,
      	.param .s32 __cudaparm_matrixMul_bs16_32bit_wA,
      	.param .s32 __cudaparm_matrixMul_bs16_32bit_wB)
      .entry matrixMul_bs16_64bit (
      	.param .u64 __cudaparm_matrixMul_bs16_64bit_C,
      	.param .u64 __cudaparm_matrixMul_bs16_64bit_A,
      	.param .u64 __cudaparm_matrixMul_bs16_64bit_B,
      	.param .u64 __cudaparm_matrixMul_bs16_64bit_wA,
      	.param .u64 __cudaparm_matrixMul_bs16_64bit_wB)
      .entry matrixMul_bs32_32bit (
      	.param .u64 __cudaparm_matrixMul_bs32_32bit_C,
      	.param .u64 __cudaparm_matrixMul_bs32_32bit_A,
      	.param .u64 __cudaparm_matrixMul_bs32_32bit_B,
      	.param .s32 __cudaparm_matrixMul_bs32_32bit_wA,
      	.param .s32 __cudaparm_matrixMul_bs32_32bit_wB)
      .entry matrixMul_bs32_64bit (
      	.param .u64 __cudaparm_matrixMul_bs32_64bit_C,
      	.param .u64 __cudaparm_matrixMul_bs32_64bit_A,
      	.param .u64 __cudaparm_matrixMul_bs32_64bit_B,
      	.param .u64 __cudaparm_matrixMul_bs32_64bit_wA,
      	.param .u64 __cudaparm_matrixMul_bs32_64bit_wB)
      
    • Multiply-add optimization
      • the main loop is dominated by fused multiply-add instructions (16 mad per unrolled iteration): %f6 = (%f4 * %f5) + %f1
        ld.shared.f32 	%f4, [%rd8+0];
        ld.shared.f32 	%f5, [%rd6+0];
        mad.f32 	%f6, %f4, %f5, %f1;
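
      • Each such load/load/mad triple corresponds to one unrolled term of the tiled inner product in the kernel source; roughly:
        // One step of the shared-tile inner product: two shared-memory loads
        // feed one fused multiply-add (mad.f32, or ffma on newer targets).
        Csub += AS(ty, k) * BS(k, tx);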
        

3 Takeaways

NVIDIA's official samples ship four basic matrixMul implementations under 0_Introduction, each with its own emphasis. On top of the tiled GEMM strategy and Shared Memory caching, exploiting the CUDA APIs more fully, and even writing kernels at the PTX level, can unlock further performance gains.
