Compute the matrix product C = A * B with a hand-written kernel; the main goal is to get familiar with shared memory, streams, and events. The tiled (block) matrix-multiplication strategy works as follows:
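Each thread block is responsible for one BLOCK_SIZE × BLOCK_SIZE tile of C, which it accumulates tile by tile:

C_IJ = Σ_K A_IK · B_KJ

where A_IK, B_KJ, and C_IJ denote BLOCK_SIZE × BLOCK_SIZE sub-matrices. For each K, the block first stages A_IK and B_KJ in shared memory, and every thread then accumulates one element of the partial product.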
The code of the sample file matrix_multiply.cu is as follows:
/**
* Matrix multiplication: C = A * B.
* Host code.
*
* This sample implements matrix multiplication which makes use of shared memory
* to ensure data reuse, the matrix multiplication is done using tiling
* approach.
*
*/
#include <cmath>
#include <cstdio>
#include <iostream>
#include <vector>
// CUDA runtime
#include <cuda_runtime.h>
/**
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
* A_width is A's width and B_width is B's width
*/
template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float *C, float *A, float *B, int A_width,
int B_width) {
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int A_begin = A_width * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int A_end = A_begin + A_width - 1;
// Step size used to iterate through the sub-matrices of A
int A_step = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int B_begin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int B_step = BLOCK_SIZE * B_width;
// C_sub is used to store the element of the block sub-matrix
// that is computed by the thread
float C_sub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = A_begin, b = B_begin; a <= A_end; a += A_step, b += B_step) {
// Declaration of the shared memory array A_sub used to
// store the sub-matrix of A
__shared__ float A_sub[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array B_sub used to
// store the sub-matrix of B
__shared__ float B_sub[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
A_sub[ty][tx] = A[a + A_width * ty + tx];
B_sub[ty][tx] = B[b + B_width * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
C_sub += A_sub[ty][k] * B_sub[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = B_width * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + B_width * ty + tx] = C_sub;
}
/**
* Run a simple test of matrix multiplication using CUDA
*/
int MatrixMultiply(int block_size, const dim3 &A_dim, const dim3 &B_dim) {
// Initialize the host matrices A and B
unsigned A_size = A_dim.x * A_dim.y;
unsigned B_size = B_dim.x * B_dim.y;
unsigned C_size = B_dim.x * A_dim.y;
const float A_val = 1.0f;
const float B_val = 0.01f;
std::vector<float> host_A(A_size, A_val);
std::vector<float> host_B(B_size, B_val);
// Allocate device memory
float *device_A, *device_B, *device_C;
unsigned A_mem_size = sizeof(float) * A_size;
unsigned B_mem_size = sizeof(float) * B_size;
unsigned C_mem_size = sizeof(float) * C_size;
cudaMalloc(reinterpret_cast<void **>(&device_A), A_mem_size);
cudaMalloc(reinterpret_cast<void **>(&device_B), B_mem_size);
cudaMalloc(reinterpret_cast<void **>(&device_C), C_mem_size);
// Use a stream so that the memory copies and kernel launches below can be
// issued asynchronously, which is more efficient
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// copy host memory to device
cudaMemcpyAsync(device_A, host_A.data(), A_mem_size, cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(device_B, host_B.data(), B_mem_size, cudaMemcpyHostToDevice,
stream);
// Setup execution parameters
dim3 threads_per_block(block_size, block_size);
dim3 blocks_per_grid(B_dim.x / threads_per_block.x,
A_dim.y / threads_per_block.y);
// Create and start timer
std::cout << "Computing result using CUDA Kernel...\n";
// Performs warmup operation using MatrixMul CUDA kernel
if (block_size == 16) {
MatrixMulCUDA<16><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
} else {
MatrixMulCUDA<32><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
}
cudaStreamSynchronize(stream);
std::cout << "warmup done\n";
// Free up host memories
host_A.clear();
host_A.shrink_to_fit();
host_B.clear();
host_B.shrink_to_fit();
// Allocate CUDA events that we'll use for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record the start event
cudaEventRecord(start, stream);
// Execute the kernel
int count = 300;
for (int j = 0; j < count; ++j) {
if (block_size == 16) {
MatrixMulCUDA<16><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
} else {
MatrixMulCUDA<32><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
}
}
// Record the stop event
cudaEventRecord(stop, stream);
// Wait for the stop event to complete
cudaEventSynchronize(stop);
float total_ms = 0.0f;
cudaEventElapsedTime(&total_ms, start, stop);
// Compute and print the performance
float ms_per_matrix_mul = total_ms / count;
double flops_per_matrix_mul = 2.0 * static_cast<double>(A_dim.x) *
static_cast<double>(A_dim.y) *
static_cast<double>(B_dim.x);
double giga_flops =
(flops_per_matrix_mul * 1.0e-9f) / (ms_per_matrix_mul / 1000.0f);
printf(
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
" WorkgroupSize= %u threads/block\n",
giga_flops, ms_per_matrix_mul, flops_per_matrix_mul,
threads_per_block.x * threads_per_block.y);
// Copy result from device to host
std::vector<float> host_C(C_size, 0.0f);
cudaMemcpyAsync(host_C.data(), device_C, C_mem_size, cudaMemcpyDeviceToHost,
stream);
cudaStreamSynchronize(stream);
std::cout << "Checking computed result for correctness: ";
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (unsigned i = 0; i < C_size; ++i) {
double abs_err = std::fabs(host_C[i] - (A_dim.x * B_val));
double dot_length = A_dim.x;
double abs_val = std::fabs(host_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
host_C[i], A_dim.x * B_val, eps);
correct = false;
}
}
std::cout << (correct ? "Result = PASS" : "Result = FAIL") << '\n';
// Clean up memory
cudaFree(device_A);
cudaFree(device_B);
cudaFree(device_C);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
if (correct) {
return EXIT_SUCCESS;
} else {
return EXIT_FAILURE;
}
}
int main(int argc, char **argv) {
std::cout << "[Matrix Multiply Using CUDA] - Starting...\n";
int block_size = 32;
dim3 A_dim(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 B_dim(5 * 4 * block_size, 5 * 2 * block_size, 1);
std::cout << "MatrixA(" << A_dim.x << ", " << A_dim.y << "), MatrixB("
<< B_dim.x << ", " << B_dim.y << ")\n";
int matrix_result = MatrixMultiply(block_size, A_dim, B_dim);
return matrix_result;
}
In the code above, shared memory is used inside the kernel to improve efficiency:
__shared__ float A_sub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float B_sub[BLOCK_SIZE][BLOCK_SIZE];
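With BLOCK_SIZE = 32, the two tiles together occupy 2 × 32 × 32 × 4 bytes = 8 KB of shared memory per block, comfortably within the 48 KB static per-block limit of most GPUs. If in doubt, the limit can be queried at runtime; the fragment below is only an illustrative sketch (it is not part of the original sample and reuses the headers already included by matrix_multiply.cu):
// Illustrative check: do the two tiles fit into the per-block shared-memory limit?
int max_shared_per_block = 0;
cudaDeviceGetAttribute(&max_shared_per_block,
                       cudaDevAttrMaxSharedMemoryPerBlock, /*device=*/0);
constexpr int kBlockSize = 32;  // must match the kernel's BLOCK_SIZE
size_t tile_bytes = 2u * kBlockSize * kBlockSize * sizeof(float);  // A_sub + B_sub
if (tile_bytes > static_cast<size_t>(max_shared_per_block)) {
  std::cout << "BLOCK_SIZE is too large for this GPU\n";
}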
In the following code snippet, #pragma unroll tells the compiler to unroll the loop that follows into straight-line statements to improve efficiency.
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
C_sub += A_sub[ty][k] * B_sub[k][tx];
}
If BLOCK_SIZE = 4, the compiler unrolls the loop above into the following statements, eliminating the loop overhead:
C_sub += A_sub[ty][0] * B_sub[0][tx];
C_sub += A_sub[ty][1] * B_sub[1][tx];
C_sub += A_sub[ty][2] * B_sub[2][tx];
C_sub += A_sub[ty][3] * B_sub[3][tx];
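Because BLOCK_SIZE is a template parameter, the trip count is known at compile time and the compiler can unroll the loop completely. nvcc also accepts an explicit unroll factor; for example, the variant below (not used in this sample) only requests partial unrolling by a factor of 8:
#pragma unroll 8
for (int k = 0; k < BLOCK_SIZE; ++k) {
  C_sub += A_sub[ty][k] * B_sub[k][tx];
}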
The stream-related code below:
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
creates a stream so that subsequent calls that use stream become asynchronous. In other words, it turns the following calls that take stream as an argument:
cudaMemcpyAsync(device_A, host_A.data(), A_mem_size, cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(device_B, host_B.data(), B_mem_size, cudaMemcpyHostToDevice,
stream);
// ...
if (block_size == 16) {
MatrixMulCUDA<16><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
} else {
MatrixMulCUDA<32><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
}
into non-blocking, asynchronous calls. Without a stream, the host would have to wait for the first cudaMemcpy to finish before issuing the second one, and only then launch the MatrixMulCUDA kernel. With a stream, each cudaMemcpyAsync is simply an asynchronous task: the call merely enqueues the work for the GPU (exactly when the GPU runs it is up to its scheduler), so the host does not block here but immediately moves on to the next cudaMemcpyAsync and then to the MatrixMulCUDA kernel launch. Used this way, streams can noticeably improve GPU utilization. When the results are needed, calling:
cudaStreamSynchronize(stream);
blocks until all work queued on the stream has finished, after which the results can be used safely.
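One caveat: host_A and host_B are ordinary pageable std::vector buffers. With pageable memory, cudaMemcpyAsync may be staged through an internal buffer and cannot be fully overlapped with kernel execution; true copy/compute overlap requires page-locked (pinned) host memory. A minimal sketch using cudaMallocHost (the name pinned_A is illustrative, it mirrors the sample's copy of host_A, and std::fill_n additionally needs <algorithm>):
float *pinned_A = nullptr;
cudaMallocHost(reinterpret_cast<void **>(&pinned_A), A_mem_size);  // page-locked host buffer
std::fill_n(pinned_A, A_size, A_val);  // same initialization as host_A
cudaMemcpyAsync(device_A, pinned_A, A_mem_size, cudaMemcpyHostToDevice, stream);
// ... after cudaStreamSynchronize(stream), release the pinned buffer
cudaFreeHost(pinned_A);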
The following code snippet:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// Here is the code that is timed.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, start, stop);
std::cout << "Elapsed time: " << elapsed_time << " ms.\n";
cudaEventDestroy(start);
cudaEventDestroy(stop);
is the standard CUDA pattern for timing a section of GPU work. Note that in the sample program above, the events are recorded into stream rather than the default stream 0, so they measure exactly the work queued on that stream.
The CMake configuration file is as follows:
cmake_minimum_required(VERSION 3.0.0)
# Set the project name and its version
project(matrix_multiply VERSION 0.1.0)
# Set the c++17 standard
set(CMAKE_CXX_STANDARD 17)
# Set the -O3 optimization level with debug information
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -g")
# Set policy CMP0074 to NEW so that find_package honors <PackageName>_ROOT
# variables and no longer emits the corresponding warning
cmake_policy(SET CMP0074 NEW)
# Find CUDA
find_package(CUDA REQUIRED)
# These flags embed debugging information for both host and device code
# Note that CUDA code with debugging information will greatly reduce efficiency!
# Do not add the -G option unless debugging is necessary!
set(CUDA_NVCC_FLAGS -G -g)
# Generate a cuda executable file.
cuda_add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)
The purpose of each line in the configuration file above is explained by its accompanying comment.
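For reference, find_package(CUDA) and cuda_add_executable are deprecated in recent CMake releases, which treat CUDA as a first-class language. A minimal sketch of the modern equivalent (assuming CMake 3.18 or newer; the architecture value 75 is only an example and should match your GPU):
cmake_minimum_required(VERSION 3.18)
project(matrix_multiply VERSION 0.1.0 LANGUAGES CXX CUDA)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
# Compute capability to compile for, e.g. 75 for Turing; adjust to your GPU
set(CMAKE_CUDA_ARCHITECTURES 75)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -g")
add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)
# Enable device debug info (-G) only when debugging; it slows kernels down dramatically
# target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-G -g>)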
Build instructions:
- Building with cmake:
mkdir build && cd build
# Generate the Makefile
cmake ..
# Build the target (or simply run: make)
cmake --build .
- To compile directly with nvcc, the command is:
nvcc -O3 -G -g -std=c++17 matrix_multiply.cu -lcublas -o matrix_multiply
The options in this command mean the following: -O3 enables O3-level optimization, -G embeds debugging information in the device code, -g embeds debugging information in the host code, -std=c++17 selects the C++17 standard, and -lcublas links against the cublas library (this particular sample does not call cuBLAS, so the flag can be omitted).
Note: CUDA code compiled with debugging information runs dramatically slower; do not add the -G option unless you need to debug.
- The GCC version must be 9.1 or later (Ubuntu 20.04 releases since 2021 ship GCC 9.3 by default).
The program output is as follows:
./matrix_multiply
[Matrix Multiply Using CUDA] - Starting...
MatrixA(320, 320), MatrixB(640, 320)
Computing result using CUDA Kernel...
warmup done
Performance= 34.14 GFlop/s, Time= 3.839 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS