Learning Roadmap:
Section 1: Intro to Parallel Programming & MUSA
- Deep Learning Ecosystem (Moore Threads domestic GPU MUSA parallel programming study notes, 2024/11/30, CSDN blog)
- Ubuntu + Driver + Toolkit + conda + PyTorch + torch_musa environment setup (2024/11/24 - Ubuntu/Windows dual-boot installation | 2024/11/30 - GPU driver & MUSA Toolkit installation)
- C/C++ Review (Moore Threads domestic GPU MUSA parallel programming study notes, 2024/11/22, CSDN blog)
- GPU Intros (Moore Threads domestic GPU MUSA parallel programming study notes, 2024/11/25, CSDN blog)
- GPU Hardware Architecture (Moore Threads domestic GPU MUSA parallel programming study notes, 2024/11/26, CSDN blog)
- Write First Kernels (Here) (2024/11/27 - Thread hierarchy | 2024/11/28 - First MUSA Kernel to Count Threads | 2024/12/02 - Vector addition | 2024/12/03 - Vector addition (3D) | 2024/12/04 - Matrix multiplication)
- MUSA API
- Faster Matrix Multiplication
- Triton
- PyTorch Extensions (Moore Threads domestic GPU MUSA parallel programming study notes, 2024/11/21, CSDN blog)
- MNIST Multilayer Perceptron
Section 2: Parallel Programming & MUSA in Depth
- Analyzing Parallel Program Performance on a Quad-Core CPU
- Scheduling Task Graphs on a Multi-Core CPU
- A Simple Renderer in MUSA
- Optimizing DNN Performance on DNN Accelerator Hardware
- llm.c
Ref: 摩尔学院 (Moore Threads Academy) | High-Performance Computing with GPUs | Stanford CS149 - Video | Stanford CS149 - Syllabus
Stream
Ref: High-Performance Computing with GPUs Chapter 5
Definition
A CUDA stream is a task queue: a sequence of CUDA operations (for example, memory transfers and kernel launches) can be enqueued into the same stream, and those operations execute strictly in the order in which they were enqueued. MUSA streams work the same way.
Why Use Streams
Overlapping computation with data transfer:
By placing work in different streams, one stream can perform data transfers (for example, host-to-device copies) while another stream runs a kernel. The waiting time of the transfer is hidden behind the computation, improving overall performance (a minimal sketch follows this list).
Parallel execution:
Work in different streams is independent, so it can run concurrently and make fuller use of the GPU's multiple execution queues.
Resource utilization:
Distributing work sensibly across streams lets the GPU keep its compute engines and its copy engines busy at the same time, improving hardware utilization.
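Below is a minimal sketch of that copy/compute overlap, assuming a kernel named process and pre-allocated host/device buffers (the names pipelined, process, h_in and d_in are illustrative, not from these notes): the data is split into chunks, the copy and the kernel for each chunk go into the same stream, and alternating between two streams lets the copy of one chunk overlap with the kernel working on the previous chunk. As with CUDA, the asynchronous copies only truly overlap when the host buffer is page-locked (pinned) rather than plain malloc memory.

#include <musa_runtime.h>

// Placeholder kernel standing in for real per-chunk work
__global__ void process(float *data, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) data[i] = data[i] * 2.0f;
}

void pipelined(float *h_in, float *d_in, int numElements, int numChunks) {
    musaStream_t streams[2];
    for (int s = 0; s < 2; ++s) {
        musaStreamCreate(&streams[s]);
    }
    int chunk = numElements / numChunks;  // assume numChunks divides numElements evenly
    for (int c = 0; c < numChunks; ++c) {
        musaStream_t s = streams[c % 2];
        int offset = c * chunk;
        // Enqueue the copy of chunk c and the kernel that consumes it in the same stream;
        // the next chunk goes to the other stream, so its copy can overlap with this kernel.
        musaMemcpyAsync(d_in + offset, h_in + offset, chunk * sizeof(float),
                        musaMemcpyHostToDevice, s);
        process<<<(chunk + 255) / 256, 256, 0, s>>>(d_in + offset, chunk);
    }
    for (int s = 0; s < 2; ++s) {
        musaStreamSynchronize(streams[s]);
        musaStreamDestroy(streams[s]);
    }
}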
Common Stream Operations (shown with the CUDA runtime API; the MUSA runtime mirrors these calls with a musa prefix, as in the example below)
// 1. Create a stream
cudaStream_t stream;
cudaStreamCreate(&stream);
// 2. Asynchronously copy data from host to device
cudaMemcpyAsync(deviceInput, hostInput, dataSize, cudaMemcpyHostToDevice, stream);
// 3. Launch a kernel in the same stream to process the data
kernel<<<gridDim, blockDim, 0, stream>>>(deviceInput, deviceOutput);
// 4. Asynchronously copy the result from device back to host
cudaMemcpyAsync(hostOutput, deviceOutput, dataSize, cudaMemcpyDeviceToHost, stream);
// 5. Wait for all work in this stream to complete
cudaStreamSynchronize(stream);
// 6. Destroy the stream
cudaStreamDestroy(stream);
Example
In the code example below, two streams are created and musaMemcpyAsync is used to copy arrays A and B from host to device at the same time, which makes better use of the host-device bandwidth and reduces the overall run time. Because the kernel launched in stream1 also reads B, the host synchronizes stream2 before the launch.
Code link
Code
#include <musa_runtime.h>
#include <stdio.h>
#include <stdlib.h>  // rand(), malloc(), free(), exit()
#include <math.h>    // fabs()
#define CHECK_MUSA_ERROR(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err != musaSuccess) {
        fprintf(stderr, "MUSA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
                static_cast<unsigned int>(err), musaGetErrorString(err), func);
        exit(EXIT_FAILURE);
    }
}
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}
int main(void) {
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;
    musaStream_t stream1, stream2;

    // Allocate host memory
    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);

    // Initialize host arrays
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    CHECK_MUSA_ERROR(musaMalloc((void **)&d_A, size));
    CHECK_MUSA_ERROR(musaMalloc((void **)&d_B, size));
    CHECK_MUSA_ERROR(musaMalloc((void **)&d_C, size));

    // Create streams
    CHECK_MUSA_ERROR(musaStreamCreate(&stream1));
    CHECK_MUSA_ERROR(musaStreamCreate(&stream2));
    // Copy inputs to device asynchronously: A in stream1, B in stream2,
    // so the two host-to-device transfers can overlap
    CHECK_MUSA_ERROR(musaMemcpyAsync(d_A, h_A, size, musaMemcpyHostToDevice, stream1));
    CHECK_MUSA_ERROR(musaMemcpyAsync(d_B, h_B, size, musaMemcpyHostToDevice, stream2));

    // The kernel runs in stream1 but also reads d_B, which is copied in stream2;
    // wait for stream2 here so the launch does not race with that copy
    CHECK_MUSA_ERROR(musaStreamSynchronize(stream2));

    // Launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(d_A, d_B, d_C, numElements);
    // Copy result back to host asynchronously
    CHECK_MUSA_ERROR(musaMemcpyAsync(h_C, d_C, size, musaMemcpyDeviceToHost, stream1));

    // Synchronize streams
    CHECK_MUSA_ERROR(musaStreamSynchronize(stream1));
    CHECK_MUSA_ERROR(musaStreamSynchronize(stream2));

    // Verify result
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");

    // Clean up
    CHECK_MUSA_ERROR(musaFree(d_A));
    CHECK_MUSA_ERROR(musaFree(d_B));
    CHECK_MUSA_ERROR(musaFree(d_C));
    CHECK_MUSA_ERROR(musaStreamDestroy(stream1));
    CHECK_MUSA_ERROR(musaStreamDestroy(stream2));
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
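In the code above the host blocks on musaStreamSynchronize(stream2) before the kernel launch. A lighter-weight way to express the "kernel in stream1 must wait for the d_B copy in stream2" dependency is a stream-wait event, which keeps the waiting on the GPU. The sketch below uses the CUDA runtime names from the Common Stream Operations section (cudaEventCreate, cudaEventRecord, cudaStreamWaitEvent); it is assumed, but not verified here, that the MUSA runtime provides musa-prefixed equivalents.

// Sketch: make stream1 wait for stream2's copy of d_B without blocking the host
cudaEvent_t bReady;
cudaEventCreate(&bReady);

cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream2);
cudaEventRecord(bReady, stream2);         // marks the point where d_B is fully copied
cudaStreamWaitEvent(stream1, bReady, 0);  // stream1 pauses here; the host does not

vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(d_A, d_B, d_C, numElements);
cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream1);
cudaStreamSynchronize(stream1);           // the host only waits once, for the final result

cudaEventDestroy(bReady);

Independently of how the dependency is expressed, note that h_A, h_B and h_C are allocated with malloc in this example; asynchronous copies from pageable host memory typically fall back to a synchronous path, so for the transfers to really overlap the host buffers should be page-locked.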
Compile
mcc 01_stream_basics.mu -o 01_stream_basics -mtgpu -O2 -lmusart
./01_stream_basics
Output
Result verification passes and the program prints "Test PASSED".