目录
前言
CUDA流:一系列将在GPU上按照顺序执行的操作。可以将一个流看做是GPU上的一个任务,不同任务可以并行执行。如下图所示,利用三个流,同一个流上的任务顺序执行,不同流上的任务可以同时执行,从而实现并发操作。
实例代码如下:
// cuda runtime 头文件
#include <cuda_runtime.h>
#include <stdio.h>
#include <string.h>
// Wraps a CUDA runtime call so failures are reported with the call text,
// source file, and line number of the call site.
#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)
// Checks a CUDA runtime status code.
//   code - status returned by the CUDA runtime call
//   op   - stringified source text of the call (from the macro)
//   file / line - call-site location (from the macro)
// Returns true on cudaSuccess; otherwise prints the error name and message
// and returns false. Execution is NOT aborted on failure.
bool __check_cuda_runtime(cudaError_t code, const char *op, const char *file, int line)
{
    if (code == cudaSuccess)
        return true;
    printf("%s:%d %s failed. \n code = %s, message = %s\n", file, line, op,
           cudaGetErrorName(code), cudaGetErrorString(code));
    return false;
}
// Demonstrates asynchronous copies on a CUDA stream:
// host (pageable) -> device -> host (page-locked), with a read before and
// after cudaStreamSynchronize to show when the data actually arrives.
int main()
{
    // Select the GPU all subsequent CUDA calls operate on.
    int device_id = 0;
    checkRuntime(cudaSetDevice(device_id));

    // Create a non-default stream; operations queued on it run in order
    // relative to each other but asynchronously to the host thread.
    cudaStream_t stream = nullptr;
    checkRuntime(cudaStreamCreate(&stream));

    // Pointer itself lives on the CPU; its value is a GPU address.
    float *memory_device = nullptr;
    // allocate memory on device(GPU)
    checkRuntime(cudaMalloc(&memory_device, 100 * sizeof(float)));
    printf("memory_device = %p\n", memory_device); // print the GPU address

    // Value-initialize the pageable host buffer so we never copy
    // uninitialized bytes to the device (original left 99 floats undefined).
    float *memory_host = new float[100]();
    memory_host[2] = 520.25f; // float literal avoids a double->float conversion

    // copy host memory data to device memory async using our stream.
    // NOTE: memory_host is pageable (plain new[]), so the runtime stages this
    // copy through an internal pinned buffer and it does not fully overlap
    // with host work; allocate with cudaMallocHost for a truly async H2D copy.
    checkRuntime(cudaMemcpyAsync(memory_device, memory_host, sizeof(float) * 100, cudaMemcpyHostToDevice, stream));
    printf("memory_device = %p\n", memory_device);

    // allocate page-locked (pinned) memory on host(CPU) — required for a
    // genuinely asynchronous device-to-host copy.
    float *memory_page_locked = nullptr;
    checkRuntime(cudaMallocHost(&memory_page_locked, 100 * sizeof(float)));

    // copy device memory data to host memory async using our stream
    checkRuntime(cudaMemcpyAsync(memory_page_locked, memory_device, sizeof(float) * 100, cudaMemcpyDeviceToHost, stream));
    // This read races with the in-flight D2H copy: the transfer has (very
    // likely) not completed yet, so it typically prints 0.000000.
    printf("memory_page_locked[2] = %f\n", memory_page_locked[2]);

    // Block until every operation queued on the stream has finished.
    checkRuntime(cudaStreamSynchronize(stream));
    printf("memory_page_locked[2] = %f\n", memory_page_locked[2]); // now 520.250000

    // Release all resources: pinned host memory, device memory, the stream,
    // and finally the pageable host buffer.
    checkRuntime(cudaFreeHost(memory_page_locked));
    checkRuntime(cudaFree(memory_device));
    checkRuntime(cudaStreamDestroy(stream));
    delete[] memory_host;
    return 0;
}
代码运行结果:
memory_device = 0x7f6506800000
memory_device = 0x7f6506800000
memory_page_locked[2] = 0.000000
memory_page_locked[2] = 520.250000
上述代码展示用cuda流完成异步拷贝,cpu数据拷贝到gpu,再从gpu拷贝到锁页内存。
在同步checkRuntime(cudaStreamSynchronize(stream))前,拷贝尚未完成,该次读取与传输存在竞争,因此通常打印为0(此结果并非保证值);同步之后拷贝必定完成,打印为 520.250000。