在qtcreator中的cuda-runtime编程

Good@dz

已于 2022-04-09 16:25:41 修改

阅读量992

点赞数

分类专栏： tensorrt 文章标签： visual studio c语言 c++

于 2022-02-21 22:33:27 首次发布

本文链接：https://blog.csdn.net/qq_42178122/article/details/123057083

版权

tensorrt 专栏收录该内容

8 篇文章 4 订阅

订阅专栏

1. cuda-runtime的初始化

在 Qt 工程的 .pro 文件中添加：

LIBS += -lcudart    # 调用cuda_runtime函数，需要用到cudart.so；cudart中的rt就是runtime的缩写
LIBS += -lcuda      # 下面的示例还直接调用了驱动API（cuCtxGetCurrent），因此还需要链接libcuda.so


// CUDA运行时头文件
#include <cuda_runtime.h>

// CUDA驱动头文件
#include <cuda.h>
#include <stdio.h>
#include <string.h>

// Wraps a CUDA runtime call: forwards its result, the call's source text,
// and the call site's file/line to __check_cuda_runtime.
#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

/**
 * Reports a failed CUDA runtime call on stdout.
 *
 * @param code  Status returned by the runtime call.
 * @param op    Source text of the call (stringified by checkRuntime).
 * @param file  Source file of the call site.
 * @param line  Line number of the call site.
 * @return true when code is cudaSuccess; false otherwise, after printing
 *         the call site, the call text, and the error name and message.
 */
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    // Nothing to report on success.
    if(code == cudaSuccess){
        return true;
    }

    printf("runtime error %s:%d  %s failed. \n  code = %s, message = %s\n",
           file, line, op, cudaGetErrorName(code), cudaGetErrorString(code));
    return false;
}

int main(){

    CUcontext context = nullptr;
    cuCtxGetCurrent(&context);//cuCtxGetCurrent（）可用于检索初始化期间创建的上下文。 该上下文可以被后续的驱动程序API调用使用
    printf("Current context = %p，当前无context\n", context);

    // cuda runtime是以cuda为基准开发的运行时库
    // cuda runtime所使用的CUcontext是基于cuDevicePrimaryCtxRetain函数获取的
    // 即，cuDevicePrimaryCtxRetain会为每个设备关联一个context，通过cuDevicePrimaryCtxRetain函数可以获取到
    // 而context初始化的时机是懒加载模式，即当你调用一个runtime api时，会触发创建动作
    // 也因此，避免了cu驱动级别的init和destroy操作。使得api的调用更加容易
    int device_count = 0;
    checkRuntime(cudaGetDeviceCount(&device_count));
    printf("device_count = %d\n", device_count);

    // 取而代之，是使用setdevice来控制当前上下文，当你要使用不同设备时,可通过设置不同的device id来实现
    int device_id = 0;
    printf("set current device to : %d，这个API依赖CUcontext，触发创建并设置\n", device_id);
    checkRuntime(cudaSetDevice(device_id));

    // 注意，是由于set device函数是“第一个执行的需要context的函数”，所以他会执行cuDevicePrimaryCtxRetain
    // 并设置当前context，这一切都是默认执行的。注意：cudaGetDeviceCount是一个不需要context的函数
    // 你可以认为绝大部分runtime api都是需要context的，所以第一个执行的cuda runtime函数，会创建context并设置上下文
    cuCtxGetCurrent(&context);
    printf("SetDevice after, Current context = %p，获取当前context\n", context);

    int current_device = 0;
    checkRuntime(cudaGetDevice(&current_device));//获取当前设备I,并返回给current_device
    printf("current_device = %d\n", current_device);
    return 0;
}

2. GPU的内存分配

// CUDA运行时头文件
#include <cuda_runtime.h>

#include <stdio.h>
#include <string.h>

// Evaluates a CUDA runtime call and hands its status, together with the
// call's source text and location, to __check_cuda_runtime.
#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

/**
 * Checks the status of a CUDA runtime call and prints a diagnostic on failure.
 *
 * @param code  Status returned by the runtime call.
 * @param op    Source text of the call (stringified by checkRuntime).
 * @param file  Source file of the call site.
 * @param line  Line number of the call site.
 * @return true if code equals cudaSuccess, false otherwise.
 */
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    const bool succeeded = (code == cudaSuccess);

    if(!succeeded){
        // Look up the symbolic name and the readable description of the error.
        const char* name = cudaGetErrorName(code);
        const char* text = cudaGetErrorString(code);
        printf("runtime error %s:%d  %s failed. \n  code = %s, message = %s\n", file, line, op, name, text);
    }

    return succeeded;
}

/**
 * Round-trips a small float array host -> device -> page-locked host memory
 * and prints element 2 of the result.
 *
 * @return 0 when every CUDA call succeeded, 1 otherwise. All buffers that were
 *         allocated are released on both paths.
 */
int main(){

    const int device_id = 0;
    const size_t element_count = 100;
    const size_t byte_count = element_count * sizeof(float);

    // Select the GPU to use.
    if(!checkRuntime(cudaSetDevice(device_id))){
        return 1;
    }

    float* memory_device = nullptr;       // device buffer (cudaMalloc)
    float* memory_page_locked = nullptr;  // host buffer from cudaMallocHost

    // Ordinary host buffer. The trailing () value-initialises it to zero so the
    // whole-array copy below does not read indeterminate values.
    float* memory_host = new float[element_count]();
    memory_host[2] = 520.25f;

    // Each step runs only if the previous one succeeded; checkRuntime prints a
    // diagnostic for the step that failed.
    bool ok =
        checkRuntime(cudaMalloc(&memory_device, byte_count)) &&
        checkRuntime(cudaMemcpy(memory_device, memory_host, byte_count, cudaMemcpyHostToDevice)) &&
        checkRuntime(cudaMallocHost(&memory_page_locked, byte_count)) &&
        checkRuntime(cudaMemcpy(memory_page_locked, memory_device, byte_count, cudaMemcpyDeviceToHost));

    // Only dereference the result buffer when the whole round trip succeeded.
    if(ok){
        printf("%f\n", memory_page_locked[2]);
    }

    // Release whatever was allocated, on the success and failure paths alike.
    if(memory_page_locked != nullptr){
        ok = checkRuntime(cudaFreeHost(memory_page_locked)) && ok;
    }
    if(memory_device != nullptr){
        ok = checkRuntime(cudaFree(memory_device)) && ok;
    }
    delete [] memory_host;
    return ok ? 0 : 1;
}

3. cudastream流管理,函数的异步控制

无论是块并行还是线程并行，运行的核函数都是相同的（代码一样，传递参数也一样）。流（stream）则可以在一个设备上同时运行多个核函数：既可以执行不同的核函数，也可以对同一个核函数传递不同的参数，从而实现任务级别的并行。

//基本函数
cudaStream_t stream//定义流
cudaStreamCreate(cudaStream_t * s)//创建流
cudaStreamDestroy(cudaStream_t s)//销毁流
//显性同步
cudaStreamSynchronize()//同步单个流：等待该流上的命令都完成
cudaDeviceSynchronize()//同步所有流：等待整个设备上所有流的任务都完成
cudaStreamWaitEvent()//通过某个事件：等待某个事件结束后执行该流上的命令
cudaStreamQuery()//查询一个流任务是否完成
//回调
cudaStreamAddCallback()//在任何点插入回调函数
//优先级
cudaStreamCreateWithPriority()
cudaDeviceGetStreamPriorityRange()


// CUDA运行时头文件
#include <cuda_runtime.h>

#include <stdio.h>
#include <string.h>

// Runs a CUDA runtime call and passes its status to __check_cuda_runtime along
// with the stringified call and the file/line where it appears.
#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

/**
 * Validates the status of a CUDA runtime call.
 *
 * On failure, prints the call site, the call text, and the error's name and
 * message to stdout.
 *
 * @param code  Status returned by the runtime call.
 * @param op    Source text of the call (stringified by checkRuntime).
 * @param file  Source file of the call site.
 * @param line  Line number of the call site.
 * @return true on cudaSuccess, false for any other status.
 */
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    if(code == cudaSuccess)
        return true;

    const char* const symbolic_name = cudaGetErrorName(code);
    const char* const description = cudaGetErrorString(code);
    printf("runtime error %s:%d  %s failed. \n  code = %s, message = %s\n",
           file, line, op, symbolic_name, description);
    return false;
}

/**
 * Same host -> device -> page-locked host round trip as a plain cudaMemcpy
 * demo, but issued on an explicitly created stream with cudaMemcpyAsync and
 * completed with cudaStreamSynchronize. Prints element 2 of the result.
 *
 * @return 0 when every CUDA call succeeded, 1 otherwise. The stream and all
 *         buffers that were created are released on both paths.
 */
int main(){
    const int device_id = 0;
    const size_t element_count = 100;
    const size_t byte_count = element_count * sizeof(float);

    if(!checkRuntime(cudaSetDevice(device_id))){
        return 1;
    }

    cudaStream_t stream = nullptr;        // stream handle, created below
    float* memory_device = nullptr;       // device buffer (cudaMalloc)
    float* memory_page_locked = nullptr;  // host buffer from cudaMallocHost

    // Ordinary host buffer. The trailing () value-initialises it to zero so the
    // whole-array copy below does not read indeterminate values.
    // NOTE(review): this buffer comes from new[], not cudaMallocHost; per the
    // CUDA documentation, copies from pageable memory may not overlap with host
    // execution -- confirm if true asynchrony matters here.
    float* memory_host = new float[element_count]();
    memory_host[2] = 520.25f;

    // Each step runs only if the previous one succeeded; checkRuntime prints a
    // diagnostic for the step that failed. Both copies are enqueued on `stream`,
    // and the final step waits for that stream to finish.
    bool ok =
        checkRuntime(cudaStreamCreate(&stream)) &&
        checkRuntime(cudaMalloc(&memory_device, byte_count)) &&
        checkRuntime(cudaMemcpyAsync(memory_device, memory_host, byte_count, cudaMemcpyHostToDevice, stream)) &&
        checkRuntime(cudaMallocHost(&memory_page_locked, byte_count)) &&
        checkRuntime(cudaMemcpyAsync(memory_page_locked, memory_device, byte_count, cudaMemcpyDeviceToHost, stream)) &&
        checkRuntime(cudaStreamSynchronize(stream));

    if(ok){
        // The stream has been synchronised, so the result buffer can be read.
        printf("%f\n", memory_page_locked[2]);
    }else if(stream != nullptr){
        // Best effort on the failure path: wait for anything already enqueued
        // before the buffers it refers to are released below.
        (void)cudaStreamSynchronize(stream);
    }

    // Release only the handles that were actually set.
    if(memory_page_locked != nullptr){
        ok = checkRuntime(cudaFreeHost(memory_page_locked)) && ok;
    }
    if(memory_device != nullptr){
        ok = checkRuntime(cudaFree(memory_device)) && ok;
    }
    if(stream != nullptr){
        ok = checkRuntime(cudaStreamDestroy(stream)) && ok;
    }
    delete [] memory_host;
    return ok ? 0 : 1;
}

参考链接：https://blog.csdn.net/mounty_fsc/article/details/51092933

Good@dz

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
在qtcreator中的cuda-runtime编程

1. cuda-runtime的初始化// CUDA运行时头文件#include <cuda_runtime.h>// CUDA驱动头文件#include <cuda.h>#include <stdio.h>#include <string.h>#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)bool __check_cuda_ru
复制链接

扫一扫

专栏目录