Compute the matrix product C = A * B with a hand-written kernel; the main goal is to get familiar with shared memory, streams, and events. The tiled (block) matrix-multiplication strategy works as follows:
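Each thread block is responsible for one BLOCK_SIZE × BLOCK_SIZE tile of C, which it accumulates tile by tile:

C_IJ = Σ_K A_IK · B_KJ

where A_IK, B_KJ, and C_IJ denote BLOCK_SIZE × BLOCK_SIZE sub-matrices. For each K, the block first stages A_IK and B_KJ in shared memory, and every thread then accumulates one element of the partial product.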
The code of the sample file matrix_multiply.cu is as follows:
/**
* Matrix multiplication: C = A * B.
* Host code.
*
* This sample implements matrix multiplication which makes use of shared memory
* to ensure data reuse, the matrix multiplication is done using tiling
* approach.
*
*/
#include <cmath>
#include <cstdio>
#include <iostream>
#include <vector>
// CUDA runtime
#include <cuda_runtime.h>
/**
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
* A_width is A's width and B_width is B's width
*/
template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float *C, float *A, float *B, int A_width,
int B_width) {
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int A_begin = A_width * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int A_end = A_begin + A_width - 1;
// Step size used to iterate through the sub-matrices of A
int A_step = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int B_begin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int B_step = BLOCK_SIZE * B_width;
// C_sub is used to store the element of the block sub-matrix
// that is computed by the thread
float C_sub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = A_begin, b = B_begin; a <= A_end; a += A_step, b += B_step) {
// Declaration of the shared memory array A_sub used to
// store the sub-matrix of A
__shared__ float A_sub[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array B_sub used to
// store the sub-matrix of B
__shared__ float B_sub[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
A_sub[ty][tx] = A[a + A_width * ty + tx];
B_sub[ty][tx] = B[b + B_width * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
C_sub += A_sub[ty][k] * B_sub[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = B_width * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + B_width * ty + tx] = C_sub;
}
/**
* Run a simple test of matrix multiplication using CUDA
*/
int MatrixMultiply(int block_size, const dim3 &A_dim, const dim3 &B_dim) {
// Initialize the host matrices A and B
unsigned A_size = A_dim.x * A_dim.y;
unsigned B_size = B_dim.x * B_dim.y;
unsigned C_size = B_dim.x * A_dim.y;
const float A_val = 1.0f;
const float B_val = 0.01f;
std::vector<float> host_A(A_size, A_val);
std::vector<float> host_B(B_size, B_val);
// Allocate device memory
float *device_A, *device_B, *device_C;
unsigned A_mem_size = sizeof(float) * A_size;
unsigned B_mem_size = sizeof(float) * B_size;
unsigned C_mem_size = sizeof(float) * C_size;
cudaMalloc(reinterpret_cast<void **>(&device_A), A_mem_size);
cudaMalloc(reinterpret_cast<void **>(&device_B), B_mem_size);
cudaMalloc(reinterpret_cast<void **>(&device_C), C_mem_size);
// Use a stream so that the memory copies and kernel launches below can be
// issued asynchronously, which is more efficient
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// copy host memory to device
cudaMemcpyAsync(device_A, host_A.data(), A_mem_size, cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(device_B, host_B.data(), B_mem_size, cudaMemcpyHostToDevice,
stream);
// Setup execution parameters
dim3 threads_per_block(block_size, block_size);
dim3 blocks_per_grid(B_dim.x / threads_per_block.x,
A_dim.y / threads_per_block.y);
// Create and start timer
std::cout << "Computing result using CUDA Kernel...\n";
// Performs warmup operation using MatrixMul CUDA kernel
if (block_size == 16) {
MatrixMulCUDA<16><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
} else {
MatrixMulCUDA<32><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
}
cudaStreamSynchronize(stream);
std::cout << "warmup done\n";
// Free up host memories
host_A.clear();
host_A.shrink_to_fit();
host_B.clear();
host_B.shrink_to_fit();
// Allocate CUDA events that we'll use for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record the start event
cudaEventRecord(start, stream);
// Execute the kernel
int count = 300;
for (int j = 0; j < count; ++j) {
if (block_size == 16) {
MatrixMulCUDA<16><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
} else {
MatrixMulCUDA<32><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
}
}
// Record the stop event
cudaEventRecord(stop, stream);
// Wait for the stop event to complete
cudaEventSynchronize(stop);
float total_ms = 0.0f;
cudaEventElapsedTime(&total_ms, start, stop);
// Compute and print the performance
float ms_per_matrix_mul = total_ms / count;
double flops_per_matrix_mul = 2.0 * static_cast<double>(A_dim.x) *
static_cast<double>(A_dim.y) *
static_cast<double>(B_dim.x);
double giga_flops =
(flops_per_matrix_mul * 1.0e-9f) / (ms_per_matrix_mul / 1000.0f);
printf(
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
" WorkgroupSize= %u threads/block\n",
giga_flops, ms_per_matrix_mul, flops_per_matrix_mul,
threads_per_block.x * threads_per_block.y);
// Copy result from device to host
std::vector<float> host_C(C_size, 0.0f);
cudaMemcpyAsync(host_C.data(), device_C, C_mem_size, cudaMemcpyDeviceToHost,
stream);
cudaStreamSynchronize(stream);
std::cout << "Checking computed result for correctness: ";
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (unsigned i = 0; i < C_size; ++i) {
double abs_err = std::fabs(host_C[i] - (A_dim.x * B_val));
double dot_length = A_dim.x;
double abs_val = std::fabs(host_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
host_C[i], A_dim.x * B_val, eps);
correct = false;
}
}
std::cout << (correct ? "Result = PASS" : "Result = FAIL") << '\n';
// Clean up memory
cudaFree(device_A);
cudaFree(device_B);
cudaFree(device_C);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
if (correct) {
return EXIT_SUCCESS;
} else {
return EXIT_FAILURE;
}
}
int main(int argc, char **argv) {
std::cout << "[Matrix Multiply Using CUDA] - Starting...\n";
int block_size = 32;
dim3 A_dim(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 B_dim(5 * 4 * block_size, 5 * 2 * block_size, 1);
std::cout << "MatrixA(" << A_dim.x << ", " << A_dim.y << "), MatrixB("
<< B_dim.x << ", " << B_dim.y << ")\n";
int matrix_result = MatrixMultiply(block_size, A_dim, B_dim);
return matrix_result;
}
In the code above, shared memory is used inside the kernel to improve efficiency:
__shared__ float A_sub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float B_sub[BLOCK_SIZE][BLOCK_SIZE];
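With BLOCK_SIZE = 32, the two tiles together occupy 2 × 32 × 32 × 4 bytes = 8 KB of shared memory per block, comfortably within the 48 KB static per-block limit of most GPUs. If in doubt, the limit can be queried at runtime; the fragment below is only an illustrative sketch (it is not part of the original sample and reuses the headers already included by matrix_multiply.cu):
// Illustrative check: do the two tiles fit into the per-block shared-memory limit?
int max_shared_per_block = 0;
cudaDeviceGetAttribute(&max_shared_per_block,
                       cudaDevAttrMaxSharedMemoryPerBlock, /*device=*/0);
constexpr int kBlockSize = 32;  // must match the kernel's BLOCK_SIZE
size_t tile_bytes = 2u * kBlockSize * kBlockSize * sizeof(float);  // A_sub + B_sub
if (tile_bytes > static_cast<size_t>(max_shared_per_block)) {
  std::cout << "BLOCK_SIZE is too large for this GPU\n";
}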
In the following code snippet, #pragma unroll tells the compiler to unroll the loop that follows into straight-line statements to improve efficiency.
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
C_sub += A_sub[ty][k] * B_sub[k][tx];
}
If BLOCK_SIZE = 4, the compiler unrolls the loop above into the following statements, eliminating the loop overhead:
C_sub += A_sub[ty][0] * B_sub[0][tx];
C_sub += A_sub[ty][1] * B_sub[1][tx];
C_sub += A_sub[ty][2] * B_sub[2][tx];
C_sub += A_sub[ty][3] * B_sub[3][tx];
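Because BLOCK_SIZE is a template parameter, the trip count is known at compile time and the compiler can unroll the loop completely. nvcc also accepts an explicit unroll factor; for example, the variant below (not used in this sample) only requests partial unrolling by a factor of 8:
#pragma unroll 8
for (int k = 0; k < BLOCK_SIZE; ++k) {
  C_sub += A_sub[ty][k] * B_sub[k][tx];
}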
The stream-related code below:
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
creates a stream so that subsequent calls that use stream become asynchronous. In other words, it turns the following calls that take stream as an argument:
cudaMemcpyAsync(device_A, host_A.data(), A_mem_size, cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(device_B, host_B.data(), B_mem_size, cudaMemcpyHostToDevice,
stream);
// ...
if (block_size == 16) {
MatrixMulCUDA<16><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
} else {
MatrixMulCUDA<32><<<blocks_per_grid, threads_per_block, 0, stream>>>(
device_C, device_A, device_B, A_dim.x, B_dim.x);
}
into non-blocking, asynchronous calls. Without a stream, the host would have to wait for the first cudaMemcpy to finish before issuing the second one, and only then launch the MatrixMulCUDA kernel. With a stream, each cudaMemcpyAsync is simply an asynchronous task: the call merely enqueues the work for the GPU (exactly when the GPU runs it is up to its scheduler), so the host does not block here but immediately moves on to the next cudaMemcpyAsync and then to the MatrixMulCUDA kernel launch. Used this way, streams can noticeably improve GPU utilization. When the results are needed, calling:
cudaStreamSynchronize(stream);
blocks until all work queued on the stream has finished, after which the results can be used safely.
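One caveat: host_A and host_B are ordinary pageable std::vector buffers. With pageable memory, cudaMemcpyAsync may be staged through an internal buffer and cannot be fully overlapped with kernel execution; true copy/compute overlap requires page-locked (pinned) host memory. A minimal sketch using cudaMallocHost (the name pinned_A is illustrative, it mirrors the sample's copy of host_A, and std::fill_n additionally needs <algorithm>):
float *pinned_A = nullptr;
cudaMallocHost(reinterpret_cast<void **>(&pinned_A), A_mem_size);  // page-locked host buffer
std::fill_n(pinned_A, A_size, A_val);  // same initialization as host_A
cudaMemcpyAsync(device_A, pinned_A, A_mem_size, cudaMemcpyHostToDevice, stream);
// ... after cudaStreamSynchronize(stream), release the pinned buffer
cudaFreeHost(pinned_A);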
The following code snippet:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// Here is the code that is timed.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, start, stop);
std::cout << "Elapsed time: " << elapsed_time << " ms.\n";
cudaEventDestroy(start);
cudaEventDestroy(stop);
is the standard CUDA pattern for timing a section of GPU work. Note that in the sample program above, the events are recorded into stream rather than the default stream 0, so they measure exactly the work queued on that stream.
The CMake configuration file is as follows:
cmake_minimum_required(VERSION 3.0.0)
# Set the project name and its version
project(matrix_multiply VERSION 0.1.0)
# Set the c++17 standard
set(CMAKE_CXX_STANDARD 17)
# Set the -O3 optimization level with debug information
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -g")
# Set policy CMP0074 to NEW so that find_package honors <PackageName>_ROOT
# variables and no longer emits the corresponding warning
cmake_policy(SET CMP0074 NEW)
# Find CUDA
find_package(CUDA REQUIRED)
# These flags embed debugging information for both host and device code
# Note that CUDA code with debugging information will greatly reduce efficiency!
# Do not add the -G option unless debugging is necessary!
set(CUDA_NVCC_FLAGS -G -g)
# Generate a cuda executable file.
cuda_add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)
The purpose of each line in the configuration file above is explained by its accompanying comment.
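For reference, find_package(CUDA) and cuda_add_executable are deprecated in recent CMake releases, which treat CUDA as a first-class language. A minimal sketch of the modern equivalent (assuming CMake 3.18 or newer; the architecture value 75 is only an example and should match your GPU):
cmake_minimum_required(VERSION 3.18)
project(matrix_multiply VERSION 0.1.0 LANGUAGES CXX CUDA)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
# Compute capability to compile for, e.g. 75 for Turing; adjust to your GPU
set(CMAKE_CUDA_ARCHITECTURES 75)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -g")
add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)
# Enable device debug info (-G) only when debugging; it slows kernels down dramatically
# target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-G -g>)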
Build instructions:
- Building with cmake:
mkdir build && cd build
# Generate the Makefile
cmake ..
# Build the target (or simply run: make)
cmake --build .
- To compile directly with nvcc, the command is:
nvcc -O3 -G -g -std=c++17 matrix_multiply.cu -lcublas -o matrix_multiply
The options in this command mean the following: -O3 enables O3-level optimization, -G embeds debugging information in the device code, -g embeds debugging information in the host code, -std=c++17 selects the C++17 standard, and -lcublas links against the cublas library (this particular sample does not call cuBLAS, so the flag can be omitted).
Note: CUDA code compiled with debugging information runs dramatically slower; do not add the -G option unless you need to debug.
- The GCC version must be 9.1 or later (Ubuntu 20.04 releases since 2021 ship GCC 9.3 by default).
The program output is as follows:
./matrix_multiply
[Matrix Multiply Using CUDA] - Starting...
MatrixA(320, 320), MatrixB(640, 320)
Computing result using CUDA Kernel...
warmup done
Performance= 34.14 GFlop/s, Time= 3.839 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS