Tensorrt的使用

石江浩

已于 2024-01-29 09:20:53 修改

阅读量474

点赞数 9

文章标签：算法深度学习

于 2024-01-16 22:54:24 首次发布

本文链接：https://blog.csdn.net/weixin_48136861/article/details/135590227

版权

TensorRT(简写为TRT)是英伟达推出的、用于在英伟达显卡上加速深度学习模型部署的工具。使用时通常包含两个主要步骤：一是构建(或加载)模型，二是执行推理。部分数据的前后处理也可以考虑放在TRT模块中，以利用显卡的加速能力，后续文章将会逐步展开。

TensorRT依赖CUDA和cuDNN。在配置运行环境时，不建议一律使用最新的CUDA版本，而是根据硬件的发售时期，选择发售当时推出的CUDA版本或者其下一个版本。

完整代码见：learn: 储存相关demo

使用TRT基本代码需要包含头文件

#include <cuda_runtime.h>

#include <NvInfer.h>

使用TensorRT首先需要创建自己的日志类，继承nvinfer1::ILogger并重写其log函数。最基本的打印如下，也可自行添加日志级别过滤、写入日志文件等功能：

/// Minimal TensorRT logger: prints every message to stdout, prefixed with the
/// numeric severity level. No filtering is applied.
class JLogger : public nvinfer1::ILogger
{
public:
    /// Called by TensorRT for each log message.
    /// @param severity severity level of the message
    /// @param msg      NUL-terminated message text
    void log(Severity severity, const char* msg) noexcept override {
        // Severity is a scoped enum; cast explicitly so the argument really is
        // an int when it is read back through printf's "%d".
        printf("LOG(%d):%s\n", static_cast<int>(severity), msg);
    }
};

接下来创建网络。以下代码手动构建一个只含3*3卷积的网络，并将编译得到的引擎序列化保存；为方便理解，每行都加了注释。实际项目中更常用的做法是用ONNX解析器(nvonnxparser)直接解析onnx模型来构建网络。

void buildModel()
{
    JLogger logger;//实例化logger，用作日志保存
    auto builder = nvinfer1::createInferBuilder(logger);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1U);//创建网络
    auto config  = builder->createBuilderConfig(); //创建编译配置
    builder->setMaxBatchSize(1);//配置batchsize
    config->setMaxWorkspaceSize(1 << 30); //分配工作空间
    //这个工作空间为cuda自己管理的内存复用空间，优化时可调小
    float kernel_weight[] = { //准备卷积核
        1, 0, 0,
        0, 1, 0,
        0, 0, 1
    };
    nvinfer1::Weights conv1_weight; //初始化卷积核权重
    nvinfer1::Weights conv1_no_bias;//初始化卷积核偏置
    conv1_weight.count = sizeof(kernel_weight) / sizeof(kernel_weight[0]);//设定权重数量
    conv1_weight.values= kernel_weight;//设定权重数据
    conv1_weight.type  = nvinfer1::DataType::kFLOAT;//设定数据类型    
    conv1_no_bias.count = 0;//设定偏置项数量
    conv1_no_bias.values= nullptr;//设定偏置数据
    conv1_no_bias.type = nvinfer1::DataType::kFLOAT;//设定偏置数据类型
    auto input =network->addInput(//网络添加输入节点
        "Image",//设定输入节点的名称
        nvinfer1::DataType::kFLOAT,//设定输入节点的数据类型
        nvinfer1::Dims4(1, 1, 3, 3));//设定输入节点的shape
    auto conv1 = network->addConvolution(
        *input,//输入节点的数据，需使用引用
        1,//输出的通道数
        nvinfer1::DimsHW(3, 3),//卷积核大小
        conv1_weight,//指定卷积权重
        conv1_no_bias//指定偏置项
    );
    //添加激活层
    auto relu1 = network->addActivation(
        *conv1->getOutput(0),   //使用卷积层的输出
        nvinfer1::ActivationType::kRELU// 使用ReLU
    );
    auto output = relu1->getOutput(0);//获取ReLU的输出
    output->setName("Predict");//设定输出节点名称为Predict
    network->markOutput(*output);//将output标记为网络输出

    auto engine = builder->buildEngineWithConfig(*network, *config);//构建引擎
    auto host_memory = engine->serialize();//获取引擎指针
    const char model_save_path[] = "cnn.engine";
    save_to_file(model_save_path, host_memory->data(), host_memory->size());
    host_memory->destroy();
    engine->destroy();
    config->destroy();
    network->destroy();
    builder->destroy();
    cout << "save to " << model_save_path << endl;
}

接下来可以加载上一步构建并保存的引擎，执行推理：

void model_inference()
{
    JLogger logger;//建立日志类
    cudaStream_t stream = nullptr;//创建流
    cudaStreamCreate(&stream);
    cudaSetDevice(0);//选择显卡
    const char engine_path[] = "/data/learn/learn/Tensorrt/workspace/cnn.engine";
    auto model_data = load_from_file(engine_path);//加载引擎文件
    if (model_data.empty()){
        printf("model %s load failed.", engine_path);
    }
    auto runtime = nvinfer1::createInferRuntime(logger);//创建运行时对象
    auto engine = runtime->deserializeCudaEngine(model_data.data(), model_data.size());//反序列化引擎
    auto context = engine->createExecutionContext();//创建上下文
    //获取输入输出节点相关信息
    // int nbindings = engine->getNbBindings();//获取输入输出节点总数
    // for (int i=0; i<nbindings; i++)
    // {
    //     auto dims = engine->getBindingDimensions(i);//获取节点shape
    //     auto name = engine->getBindingName(i);//获取节点名称
    //     auto type = engine->bindingIsInput(i) ? "input" : "output";//判断是否为输入节点
    //     cout << "binging" << name << i << "dim is" << format_dim(dims) << " type is " << type << endl;
    // }
    //准备输入输出数据及内存
    float* output_data_device = nullptr;//准备输出数据内存
    cudaMalloc(&output_data_device, sizeof(float));
    float output_data[1];
    float input_data[] = {//准备输入数据
        1,2,3,
        4,5,6,
        7,8,9
    };
    size_t input_size = sizeof(input_data);//获取输入数据大小
    float* input_data_device = nullptr;
    cudaMalloc(&input_data_device, input_size);//分配输入数据的显存
    cudaMemcpyAsync(input_data_device, input_data, input_size, cudaMemcpyHostToDevice, stream);//输入数据异步拷贝
    void* bingdings_device_pointer[] = {input_data_device, output_data_device};//将输入输出的地址按顺序储存
    //执行推理
    bool finished = context->enqueueV2( bingdings_device_pointer, stream, nullptr);
    if(!finished)
    {
        cout << "Enqueue failed." << endl;
    }
    cudaMemcpyAsync(output_data, output_data_device, sizeof(float), cudaMemcpyDeviceToHost, stream);//将执行结果传回cpu
    cudaStreamSynchronize(stream);//等待数据传输完毕
    cout << "output data size" << sizeof(output_data) << endl;
    cout << "Enqueue succeed, result is " << output_data[0] << endl;
    //释放内存
    cudaFree(input_data_device);
    cudaFree(output_data_device);
    context->destroy();
    engine->destroy();
    runtime->destroy();
    cudaStreamDestroy(stream);
}

动态输入大小：

// `builder` and `config` are the IBuilder* / IBuilderConfig* pointers created
// as in buildModel(), hence the `->` access.
auto profile = builder->createOptimizationProfile();

// "foo" is the name of the dynamic input tensor. Set kOPT and kMAX with
// further setDimensions() calls in the same way before adding the profile.
profile->setDimensions("foo", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(n, c, h, w));

config->addOptimizationProfile(profile);

其中OptProfileSelector::kMIN表示最小输入尺寸，kOPT为最常用(优化目标)输入尺寸，kMAX为最大输入尺寸，三者都需要设置，可根据实际使用情况做调整；推理时输入尺寸只要落在kMIN与kMAX之间即可运行。

石江浩

关注

9
点赞
踩
7

收藏

觉得还不错? 一键收藏
1
评论
Tensorrt的使用

TensorRT(简写为TRT)是英伟达推出的针对英伟达显卡上深度学习模型部署加速的工具，使用时通常包含两个主要步骤，一是加载模型，二是执行推理，数据的前后处理有的也可以考虑放在trt模块中以提供显卡的加速能力，后续文章将会逐步展开。首先创建网络，以下代码为手动构建一个3*3的卷积网络，并保存，为方便理解每行都加了注释，最常用的做法还是直接调用onnx模型进行反序列化。接下来可以调用上一步构建的网络，执行推理，使用TRT基本代码需要包含头文件。
复制链接

扫一扫