jetson orin nano super AI模型部署之路（八）tensorrt C++ api介绍-CSDN博客

本文链接：https://blog.csdn.net/bornfree5511/article/details/147705277

我们基于tensorrt-cpp-api这个仓库介绍。这个仓库的代码是一个非常不错的tensorrt的cpp api实现，可基于此开发自己的项目。

我们从src/main.cpp开始按顺序说明。

一、首先是声明我们创建tensorrt model的参数。

 // Specify our GPU inference configuration options
    Options options;
    // Specify what precision to use for inference
    // FP16 is approximately twice as fast as FP32.
    options.precision = Precision::FP16;
    // If using INT8 precision, must specify path to directory containing
    // calibration data.
    options.calibrationDataDirectoryPath = "";
    // Specify the batch size to optimize for.
    options.optBatchSize = 1;
    // Specify the maximum batch size we plan on running.
    options.maxBatchSize = 1;
    // Specify the directory where you want the model engine model file saved.
    options.engineFileDir = ".";

这里Options是在src/engine.h中声明的一个结构体


// 网络的选项配置
struct Options {
    // 用于 GPU 推理的精度模式
    // 可选值：FP32（单精度浮点）、FP16（半精度浮点）、INT8（8位量化）
    Precision precision = Precision::FP16;

    // 如果选择 INT8 精度，必须提供校准数据集的目录路径
    std::string calibrationDataDirectoryPath;

    // 用于计算 INT8 校准数据的批量大小
    // 应设置为 GPU 能支持的最大批量大小
    int32_t calibrationBatchSize = 128;

    // 优化的批量大小
    // 表示引擎在构建时会针对该批量大小进行优化
    int32_t optBatchSize = 1;

    // 最大允许的批量大小
    // 表示引擎支持的最大批量大小
    int32_t maxBatchSize = 16;

    // GPU 设备索引
    // 用于指定在哪个 GPU 上运行推理
    int deviceIndex = 0;

    // 引擎文件保存的目录
    // 用于存储 TensorRT 引擎文件
    std::string engineFileDir = ".";

    // 最大允许的输入宽度
    // 默认为 -1，表示期望固定的输入大小
    int32_t maxInputWidth = -1;

    // 最小允许的输入宽度
    // 默认为 -1，表示期望固定的输入大小
    int32_t minInputWidth = -1;

    // 优化的输入宽度
    // 默认为 -1，表示期望固定的输入大小
    int32_t optInputWidth = -1;
};

二、构建tensorrt推理的engine

Engine<float> engine(options);

这里Engine是一个模板类，继承字IEngine类。IEngine 是一个抽象接口类，定义了 TensorRT 推理引擎的核心功能。
它包括构建和加载网络、运行推理、获取输入和输出张量维度等功能。
通过模板参数 T，可以支持不同的数据类型（如 float 或 int）。
具体的实现需要在派生类中完成。

IEngine类的实现在include/interfaces/IEngine.h中。

template <typename T>
class IEngine {
public:
    // 虚析构函数，确保派生类的资源能够正确释放
    virtual ~IEngine() = default;

    // 构建并加载 ONNX 模型到 TensorRT 引擎文件
    // 参数：
    // - onnxModelPath: ONNX 模型文件的路径
    // - subVals: 输入数据的减法均值，用于归一化
    // - divVals: 输入数据的除法均值，用于归一化
    // - normalize: 是否对输入数据进行归一化
    // 返回值：
    // - 如果成功构建并加载网络，返回 true；否则返回 false
    virtual bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                                  const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0;

    // 从磁盘加载 TensorRT 引擎文件到内存
    // 参数：
    // - trtModelPath: TensorRT 引擎文件的路径
    // - subVals: 输入数据的减法均值，用于归一化
    // - divVals: 输入数据的除法均值，用于归一化
    // - normalize: 是否对输入数据进行归一化
    // 返回值：
    // - 如果成功加载网络，返回 true；否则返回 false
    virtual bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                             const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0;

    // 运行推理
    // 参数：
    // - inputs: 输入数据，格式为 [input][batch][cv::cuda::GpuMat]
    // - featureVectors: 输出数据，格式为 [batch][output][feature_vector]
    // 返回值：
    // - 如果推理成功，返回 true；否则返回 false
    virtual bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, 
                              std::vector<std::vector<std::vector<T>>> &featureVectors) = 0;

    // 获取输入张量的维度
    // 返回值：
    // - 输入张量的维度列表，类型为 nvinfer1::Dims3
    virtual const std::vector<nvinfer1::Dims3> &getInputDims() const = 0;

    // 获取输出张量的维度
    // 返回值：
    // - 输出张量的维度列表，类型为 nvinfer1::Dims
    virtual const std::vector<nvinfer1::Dims> &getOutputDims() const = 0;
};

再来看Engine类的实现。

template <typename T>
class Engine : public IEngine<T> {
public:
    // 构造函数，初始化引擎选项
    Engine(const Options &options);

    // 析构函数，释放 GPU 缓冲区
    ~Engine();

    // 构建并加载 ONNX 模型到 TensorRT 引擎文件
    // 将模型缓存到磁盘以避免重复构建，并加载到内存中
    // 默认情况下，输入数据会被归一化到 [0.f, 1.f]
    // 如果需要 [-1.f, 1.f] 的归一化，可以通过 subVals 和 divVals 参数设置
    bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                          const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;

    // 从磁盘加载 TensorRT 引擎文件到内存
    // 默认情况下，输入数据会被归一化到 [0.f, 1.f]
    // 如果需要 [-1.f, 1.f] 的归一化，可以通过 subVals 和 divVals 参数设置
    bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                     const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;

    // 运行推理
    // 输入格式：[input][batch][cv::cuda::GpuMat]
    // 输出格式：[batch][output][feature_vector]
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, std::vector<std::vector<std::vector<T>>> &featureVectors) override;

    // 工具函数：调整图像大小并保持宽高比，通过在右侧或底部填充来实现
    // 适用于需要将检测坐标映射回原始图像的场景（例如 YOLO 模型）
    static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
                                                                const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));

    // 获取输入张量的维度
    [[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };

    // 获取输出张量的维度
    [[nodiscard]] const std::vector<nvinfer1::Dims> &getOutputDims() const override { return m_outputDims; };

    // 工具函数：将三维嵌套的输出数组转换为二维数组
    // 适用于批量大小为 1 且有多个输出特征向量的情况
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);

    // 工具函数：将三维嵌套的输出数组转换为一维数组
    // 适用于批量大小为 1 且只有一个输出特征向量的情况
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);

    // 工具函数：将输入从 NHWC 格式转换为 NCHW 格式，并应用归一化和均值减法
    static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
                                            const std::array<float, 3> &divVals, bool normalize, bool swapRB = false);

private:
    // 构建网络
    bool build(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals, bool normalize);

    // 将引擎选项序列化为字符串
    std::string serializeEngineOptions(const Options &options, const std::string &onnxModelPath);

    // 获取设备名称列表
    void getDeviceNames(std::vector<std::string> &deviceNames);

    // 清除 GPU 缓冲区
    void clearGpuBuffers();

    // 输入数据的归一化、缩放和均值减法参数
    std::array<float, 3> m_subVals{};
    std::array<float, 3> m_divVals{};
    bool m_normalize;

    // 存储输入和输出 GPU 缓冲区的指针
    std::vector<void *> m_buffers;
    std::vector<uint32_t> m_outputLengths{};
    std::vector<nvinfer1::Dims3> m_inputDims;
    std::vector<nvinfer1::Dims> m_outputDims;
    std::vector<std::string> m_IOTensorNames;
    int32_t m_inputBatchSize;

    // TensorRT 运行时和推理上下文
    std::unique_ptr<nvinfer1::IRuntime> m_runtime = nullptr;
    std::unique_ptr<Int8EntropyCalibrator2> m_calibrator = nullptr;
    std::unique_ptr<nvinfer1::ICudaEngine> m_engine = nullptr;
    std::unique_ptr<nvinfer1::IExecutionContext> m_context = nullptr;

    // 引擎选项
    const Options m_options;

    // TensorRT 日志器
    Logger m_logger;
};

// 构造函数：初始化引擎选项
template <typename T>
Engine<T>::Engine(const Options &options) : m_options(options) {}

// 析构函数：清除 GPU 缓冲区
template <typename T>
Engine<T>::~Engine() { clearGpuBuffers(); }

相较于抽象接口类IEngine，Engine类添加了resizeKeepAspectRatioPadRightBottom、transformOutput、blobFromGpuMats这几个static的public函数。

    // 工具函数：调整图像大小并保持宽高比，通过在右侧或底部填充来实现
    // 适用于需要将检测坐标映射回原始图像的场景（例如 YOLO 模型）
    static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
                                                                const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));

    // 工具函数：将三维嵌套的输出数组转换为二维数组
    // 适用于批量大小为 1 且有多个输出特征向量的情况
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);

    // 工具函数：将三维嵌套的输出数组转换为一维数组
    // 适用于批量大小为 1 且只有一个输出特征向量的情况
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);

    // 工具函数：将输入从 NHWC 格式转换为 NCHW 格式，并应用归一化和均值减法
    static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
                                            const std::array<float, 3> &divVals, bool normalize, bool swapRB = false);

在类中，static 修饰的成员函数是静态成员函数，它与普通成员函数有以下区别：

(1)静态成员函数不依赖于类的实例
静态成员函数属于类本身，而不是类的某个实例。
调用静态成员函数时，不需要创建类的对象，可以直接通过类名调用。

(2)静态成员函数不能访问非静态成员变量
静态成员函数没有 this 指针，因此无法访问类的非静态成员变量或非静态成员函数。
静态成员函数只能访问类的静态成员变量或调用其他静态成员函数。

在这里为什么使用 static？

(1)工具函数的设计
transformOutput 是一个工具函数，用于将三维嵌套的输出数组转换为二维数组。
它的功能与类的实例无关，因此设计为静态函数更合理。
这样可以直接通过类名调用，而不需要创建类的对象。

(2)提高代码的可读性和效率
将与类实例无关的函数声明为静态函数，可以明确表示该函数不依赖于类的状态。
静态函数的调用效率通常比非静态函数更高，因为它不需要隐式传递 this 指针。

第二个修改是对于getInputDims和getOutputDims函数的处理。在抽象接口类IEngine中，其实现方式是

virtual const std::vector<nvinfer1::Dims3> &getInputDims() const = 0;

在Engine类中，其声明变成了

    // 获取输入张量的维度
    [[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };

[[nodiscard]] 是 C++17 引入的一个属性，用于提示调用者不要忽略函数的返回值。
如果调用者忽略了带有 [[nodiscard]] 属性的函数的返回值，编译器会发出警告。

在这段代码中，getInputDims() 返回的是输入张量的维度信息。如果调用者忽略了这个返回值，可能会导致程序逻辑错误。
使用 [[nodiscard]] 可以提醒开发者注意返回值的重要性。

例如engine.getInputDims(); // 如果没有使用返回值，编译器会发出警告。

override 是 C++11 引入的关键字，用于显式声明一个函数是从基类继承并重写的虚函数。
如果函数没有正确地重写基类中的虚函数（例如函数签名不匹配），编译器会报错。

它可以防止由于函数签名错误而导致的意外行为。
在这段代码中，getInputDims() 是从基类 IEngine 中继承并重写的虚函数。

第一个 const: 表示返回的引用是常量，调用者不能修改返回的 std::vector。
第二个 const: 表示该成员函数不会修改类的成员变量，保证函数是只读的。

需要注意的是，在src/engine.h中，使用了模版类的实现，这有点要注意：


// 忽略之前的所有代码

// 构造函数：初始化引擎选项
template <typename T>
Engine<T>::Engine(const Options &options) : m_options(options) {}

// 析构函数：清除 GPU 缓冲区
template <typename T>
Engine<T>::~Engine() { clearGpuBuffers(); }

// Include inline implementations
#include "engine/EngineRunInference.inl"
#include "engine/EngineUtilities.inl"
#include "engine/EngineBuildLoadNetwork.inl"

在文件的最后，才include了这几个头文件。

EngineRunInference.inl: 包含 runInference 函数的实现。

EngineUtilities.inl: 包含工具函数（如 transformOutput）的实现。

EngineBuildLoadNetwork.inl: 包含 buildLoadNetwork 和 loadNetwork 函数的实现。

将 #include 头文件放在文件末尾的原因是为了包含内联实现文件（.inl 文件），这种设计通常用于模板类或需要将实现与声明分离的情况下。

.inl 文件的作用
.inl 文件通常用于存放模板类或函数的实现。
在 C++ 中，模板类或模板函数的实现必须在编译时可见，因此不能像普通类那样将实现放在 .cpp 文件中。
为了保持代码的清晰和模块化，开发者会将模板类的实现从头文件中分离出来，放入 .inl 文件中。
为什么将 .inl 文件放在文件末尾？
2.1 避免重复定义
如果在头文件的开头包含 .inl 文件，可能会导致重复定义的问题，尤其是在头文件被多次包含时。
将 .inl 文件放在头文件末尾，可以确保模板类的声明先被处理，然后再处理实现部分。

2.2 确保依赖的声明已完成
.inl 文件中的实现可能依赖于头文件中声明的类或函数。
将 .inl 文件放在头文件末尾，可以确保所有必要的声明都已完成，避免编译错误。

2.3 提高代码的可读性
将 .inl 文件放在头文件末尾，可以将类的声明和实现逻辑分开，增强代码的可读性和可维护性。
开发者可以在头文件中快速查看类的接口，而不需要直接阅读实现细节。
3. 为什么不直接放在 .cpp 文件中？
模板类或模板函数的实现不能放在 .cpp 文件中，因为模板的具体类型是在编译时实例化的，而 .cpp 文件是在链接阶段处理的。
如果将模板实现放在 .cpp 文件中，编译器在实例化模板时将无法找到实现，导致链接错误。

三、tensorrt推理Engine的具体实现

3.1 buildLoadNetwork实现

buildLoadNetwork在include/engine/EngineBuildLoadNetwork.inl中实现。

template <typename T>
bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals,
                                 bool normalize) {
    const auto engineName = serializeEngineOptions(m_options, onnxModelPath);
    const auto engineDir = std::filesystem::path(m_options.engineFileDir);
    std::filesystem::path enginePath = engineDir / engineName;
    spdlog::info("Searching for engine file with name: {}", enginePath.string());

    if (Util::doesFileExist(enginePath)) {
        spdlog::info("Engine found, not regenerating...");
    } else {
        if (!Util::doesFileExist(onnxModelPath)) {
            auto msg = "Could not find ONNX model at path: " + onnxModelPath;
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }

        spdlog::info("Engine not found, generating. This could take a while...");
        if (!std::filesystem::exists(engineDir)) {
            std::filesystem::create_directories(engineDir);
            spdlog::info("Created directory: {}", engineDir.string());
        }

        auto ret = build(onnxModelPath, subVals, divVals, normalize);
        if (!ret) {
            return false;
        }
    }

    return loadNetwork(enginePath, subVals, divVals, normalize);
}

其中，最重要的是build函数的实现。build的实现比较长，该函数的作用是从一个 ONNX 模型文件构建 TensorRT 引擎。
它会解析 ONNX 模型文件，设置优化配置（如动态批量大小、动态输入宽度、精度模式等），并最终生成一个序列化的 TensorRT 引擎文件。

其完整实现如下代码，后面会对其中重要代码展开解释，一并写在代码注释上。

3.2 build函数实现（最重要的函数）

template <typename T>
bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals, bool normalize) {
    // Create our engine builder.
    //创建 TensorRT 构建器和网络

    // nvinfer1::createInferBuilder:
    // 创建一个 TensorRT 构建器（IBuilder），用于构建和优化网络。
    // m_logger 是一个自定义的日志器，用于记录 TensorRT 的日志信息。


    auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));
    if (!builder) {
        return false;
    }

    // Define an explicit batch size and then create the network (implicit batch
    // size is deprecated). More info here:
    // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#explicit-implicit-batch
    
    // builder->createNetworkV2:
    // 创建一个网络定义（INetworkDefinition），用于描述神经网络的结构。
    // 使用 kEXPLICIT_BATCH 标志表示显式批量大小（TensorRT 7.0 之后推荐使用）。
    auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
    if (!network) {
        return false;
    }

    // Create a parser for reading the onnx file.
    //  创建 ONNX 解析器并解析模型

    // nvonnxparser::createParser:
    // 创建一个 ONNX 解析器（IParser），用于将 ONNX 模型解析为 TensorRT 的网络定义。

    auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_logger));
    if (!parser) {
        return false;
    }

    // We are going to first read the onnx file into memory, then pass that buffer
    // to the parser. Had our onnx model file been encrypted, this approach would
    // allow us to first decrypt the buffer.

    // 读取 ONNX 文件:
    // 将 ONNX 模型文件读入内存缓冲区（buffer），然后传递给解析器。
    // 这种方式允许对模型文件进行预处理（如解密）。
    std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate);
    std::streamsize size = file.tellg();
    file.seekg(0, std::ios::beg);

    std::vector<char> buffer(size);
    if (!file.read(buffer.data(), size)) {
        auto msg = "Error, unable to read engine file";
        spdlog::error(msg);
        throw std::runtime_error(msg);
    }

    // parser->parse:
    // 解析 ONNX 模型文件，将其转换为 TensorRT 的网络定义。
    // ONNX 是一种常见的模型格式，TensorRT 提供了专门的解析器来支持 ONNX 模型。
    // 通过将文件读入内存，可以灵活处理加密或压缩的模型文件。
    // Parse the buffer we read into memory.
    auto parsed = parser->parse(buffer.data(), buffer.size());
    if (!parsed) {
        return false;
    }

    // Ensure that all the inputs have the same batch size
 
    const auto numInputs = network->getNbInputs();
    if (numInputs < 1) {
        auto msg = "Error, model needs at least 1 input!";
        spdlog::error(msg);
        throw std::runtime_error(msg);
    }

    // 检查输入的批量大小
    // TensorRT 要求所有输入的批量大小必须一致。
    const auto input0Batch = network->getInput(0)->getDimensions().d[0];
    for (int32_t i = 1; i < numInputs; ++i) {
        if (network->getInput(i)->getDimensions().d[0] != input0Batch) {
            auto msg = "Error, the model has multiple inputs, each with differing batch sizes!";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }
    }

    // Check to see if the model supports dynamic batch size or not
    bool doesSupportDynamicBatch = false;
    if (input0Batch == -1) {
        doesSupportDynamicBatch = true;
        spdlog::info("Model supports dynamic batch size");
    } else {
        spdlog::info("Model only supports fixed batch size of {}", input0Batch);
        // If the model supports a fixed batch size, ensure that the maxBatchSize
        // and optBatchSize were set correctly.
        if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) {
            auto msg = "Error, model only supports a fixed batch size of " + std::to_string(input0Batch) +
                       ". Must set Options.optBatchSize and Options.maxBatchSize to 1";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }
    }

    const auto input3Batch = network->getInput(0)->getDimensions().d[3];
    bool doesSupportDynamicWidth = false;
    if (input3Batch == -1) {
        doesSupportDynamicWidth = true;
        spdlog::info("Model supports dynamic width. Using Options.maxInputWidth, Options.minInputWidth, and Options.optInputWidth to set the input width.");

        // Check that the values of maxInputWidth, minInputWidth, and optInputWidth are valid
        if (m_options.maxInputWidth < m_options.minInputWidth || m_options.maxInputWidth < m_options.optInputWidth ||
            m_options.minInputWidth > m_options.optInputWidth
            || m_options.maxInputWidth < 1 || m_options.minInputWidth < 1 || m_options.optInputWidth < 1) {
            auto msg = "Error, invalid values for Options.maxInputWidth, Options.minInputWidth, and Options.optInputWidth";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }
    }

    //设置优化配置

    auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if (!config) {
        return false;
    }

    //设置优化配置，包括动态批量大小和动态输入宽度。
    // 使用 IOptimizationProfile 定义输入的最小、优化和最大维度。
    // 为什么要这么写？
    // TensorRT 允许为动态输入设置优化配置，以便在推理时根据实际输入调整性能。
    // 通过设置优化配置，可以在性能和灵活性之间取得平衡。
    // Register a single optimization profile
    nvinfer1::IOptimizationProfile *optProfile = builder->createOptimizationProfile();
    for (int32_t i = 0; i < numInputs; ++i) {
        // Must specify dimensions for all the inputs the model expects.
        const auto input = network->getInput(i);
        const auto inputName = input->getName();
        const auto inputDims = input->getDimensions();
        int32_t inputC = inputDims.d[1];
        int32_t inputH = inputDims.d[2];
        int32_t inputW = inputDims.d[3];
    
        int32_t minInputWidth = std::max(m_options.minInputWidth, inputW);

        // Specify the optimization profile`
        if (doesSupportDynamicBatch) {
            optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, inputC, inputH, minInputWidth));
        } else {
            optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN,
                                      nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, minInputWidth));
        }

        if (doesSupportDynamicWidth) {
            optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT,
                                      nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, m_options.optInputWidth));
            optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX,
                                      nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, m_options.maxInputWidth));
        } else {
            optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT,
                                    nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, inputW));
            optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX,
                                    nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, inputW));
        }
    }
    config->addOptimizationProfile(optProfile);

    // Set the precision level
    const auto engineName = serializeEngineOptions(m_options, onnxModelPath);
    if (m_options.precision == Precision::FP16) {
        // Ensure the GPU supports FP16 inference
        if (!builder->platformHasFastFp16()) {
            auto msg = "Error: GPU does not support FP16 precision";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
    } else if (m_options.precision == Precision::INT8) {
        if (numInputs > 1) {
            auto msg = "Error, this implementation currently only supports INT8 "
                       "quantization for single input models";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }

        // Ensure the GPU supports INT8 Quantization
        if (!builder->platformHasFastInt8()) {
            auto msg = "Error: GPU does not support INT8 precision";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }

        // Ensure the user has provided path to calibration data directory
        if (m_options.calibrationDataDirectoryPath.empty()) {
            auto msg = "Error: If INT8 precision is selected, must provide path to "
                       "calibration data directory to Engine::build method";
            throw std::runtime_error(msg);
        }

        config->setFlag((nvinfer1::BuilderFlag::kINT8));

        const auto input = network->getInput(0);
        const auto inputName = input->getName();
        const auto inputDims = input->getDimensions();
        const auto calibrationFileName = engineName + ".calibration";

        m_calibrator = std::make_unique<Int8EntropyCalibrator2>(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2],
                                                                m_options.calibrationDataDirectoryPath, calibrationFileName, inputName,
                                                                subVals, divVals, normalize);
        config->setInt8Calibrator(m_calibrator.get());
    }

    // CUDA stream used for profiling by the builder.
    cudaStream_t profileStream;
    Util::checkCudaErrorCode(cudaStreamCreate(&profileStream));
    config->setProfileStream(profileStream);

    // Build the engine
    // If this call fails, it is suggested to increase the logger verbosity to
    // kVERBOSE and try rebuilding the engine. Doing so will provide you with more
    // information on why exactly it is failing.

    // 构建引擎并保存到磁盘
    // 使用 builder->buildSerializedNetwork 构建序列化的 TensorRT 引擎。
    // 将引擎保存到磁盘，以便后续加载和使用。
    // 为什么要这么写？
    // 构建引擎是一个耗时操作，将引擎保存到磁盘可以避免重复构建。
    // 序列化的引擎文件可以直接加载到内存中，节省时间。
    std::unique_ptr<nvinfer1::IHostMemory> plan{builder->buildSerializedNetwork(*network, *config)};
    if (!plan) {
        return false;
    }

    // Write the engine to disk
    const auto enginePath = std::filesystem::path(m_options.engineFileDir) / engineName;
    std::ofstream outfile(enginePath, std::ofstream::binary);
    outfile.write(reinterpret_cast<const char *>(plan->data()), plan->size());
    spdlog::info("Success, saved engine to {}", enginePath.string());

    Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
    return true;
}

函数的作用

解析 ONNX 模型文件。

设置优化配置（动态批量大小、动态输入宽度等）。

设置精度模式（FP16 或 INT8）。

构建 TensorRT 引擎并保存到磁盘。

涉及的 TensorRT API

nvinfer1::IBuilder: 用于构建和优化网络。

nvinfer1::INetworkDefinition: 描述神经网络的结构。

nvonnxparser::IParser: 解析 ONNX 模型文件。

nvinfer1::IBuilderConfig: 配置优化选项（如精度模式、优化配置）。

nvinfer1::IOptimizationProfile: 定义动态输入的最小、优化和最大维度。

builder->buildSerializedNetwork: 构建序列化的 TensorRT 引擎。

3.3 loadNetwork，load trt engine函数实现

在buildLoadNetwork函数中，是用build函数通过onnx构建完成trt engine后，紧接着就是loadNetwork的实现。

//省略其他代码
        auto ret = build(onnxModelPath, subVals, divVals, normalize);
        if (!ret) {
            return false;
        }
    }

    return loadNetwork(enginePath, subVals, divVals, normalize);
}

该函数的作用是从磁盘加载一个序列化的 TensorRT 引擎文件（.engine 文件），并将其反序列化为内存中的 TensorRT 引擎。
它还会初始化推理所需的上下文（IExecutionContext）和 GPU 缓冲区。

我们看一下其具体实现，然后一同将代码的具体解释写在注释上。

template <typename T>
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals,
                            bool normalize) {
    //保存归一化参数（subVals 和 divVals），用于后续对输入数据进行归一化处理。
    // normalize 参数决定是否对输入数据进行归一化。
    m_subVals = subVals;
    m_divVals = divVals;
    m_normalize = normalize;

    // Read the serialized model from disk
    if (!Util::doesFileExist(trtModelPath)) {
        auto msg = "Error, unable to read TensorRT model at path: " + trtModelPath;
        spdlog::error(msg);
        return false;
    } else {
        auto msg = "Loading TensorRT engine file at path: " + trtModelPath;
        spdlog::info(msg);
    }

    //读取引擎文件到内存
    //将序列化的 TensorRT 引擎文件读取到内存缓冲区（buffer）中。
    // 为什么要这么写？
    // TensorRT 的反序列化 API（deserializeCudaEngine）需要一个内存缓冲区作为输入。
    // 使用 std::ifstream 以二进制模式读取文件，确保文件内容不被修改。
    std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate);
    std::streamsize size = file.tellg();
    file.seekg(0, std::ios::beg);

    std::vector<char> buffer(size);
    if (!file.read(buffer.data(), size)) {
        auto msg = "Error, unable to read engine file";
        spdlog::error(msg);
        throw std::runtime_error(msg);
    }

    // Create a runtime to deserialize the engine file.
    // 创建 TensorRT 运行时
    // 涉及的 TensorRT API
    // nvinfer1::createInferRuntime:
    // 创建一个 TensorRT 运行时（IRuntime）对象，用于反序列化引擎文件。
    // m_logger 是一个自定义日志器，用于记录 TensorRT 的日志信息。
    // 作用
    // 创建一个运行时对象，用于管理 TensorRT 引擎的反序列化和执行。
    // 为什么要这么写？
    // TensorRT 的引擎文件是序列化的，需要通过运行时对象将其反序列化为内存中的引擎。
    m_runtime = std::unique_ptr<nvinfer1::IRuntime>{nvinfer1::createInferRuntime(m_logger)};
    if (!m_runtime) {
        return false;
    }

    // Set the device index
    auto ret = cudaSetDevice(m_options.deviceIndex);
    if (ret != 0) {
        int numGPUs;
        cudaGetDeviceCount(&numGPUs);
        auto errMsg = "Unable to set GPU device index to: " + std::to_string(m_options.deviceIndex) + ". Note, your device has " +
                      std::to_string(numGPUs) + " CUDA-capable GPU(s).";
        spdlog::error(errMsg);
        throw std::runtime_error(errMsg);
    }

    // Create an engine, a representation of the optimized model.
    // 反序列化引擎
    // 涉及的 TensorRT API
    // IRuntime::deserializeCudaEngine:
    // 将序列化的引擎文件反序列化为内存中的 TensorRT 引擎（ICudaEngine）。
    // 作用
    // 将序列化的引擎文件加载到内存中，生成一个可用于推理的 TensorRT 引擎。
    // 为什么要这么写？
    // TensorRT 的引擎文件是序列化的，需要通过反序列化生成内存中的引擎对象。
    m_engine = std::unique_ptr<nvinfer1::ICudaEngine>(m_runtime->deserializeCudaEngine(buffer.data(), buffer.size()));
    if (!m_engine) {
        return false;
    }

    // The execution context contains all of the state associated with a
    // particular invocation
    // 创建执行上下文
    //     涉及的 TensorRT API
    // ICudaEngine::createExecutionContext:
    // 创建一个执行上下文（IExecutionContext），用于管理推理时的状态。
    // 作用
    // 创建一个执行上下文，用于管理推理时的输入输出绑定和其他状态。
    // 为什么要这么写？
    // TensorRT 的推理需要一个执行上下文来管理输入输出缓冲区和执行状态。
    m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
    if (!m_context) {
        return false;
    }

    // Storage for holding the input and output buffers
    // This will be passed to TensorRT for inference
    // 初始化 GPU 缓冲区
    //     清除之前的 GPU 缓冲区，并为当前引擎的输入输出张量分配新的缓冲区。
    // 为什么要这么写？
    // 每个 TensorRT 引擎的输入输出张量可能不同，因此需要重新分配缓冲区。
    clearGpuBuffers();
    m_buffers.resize(m_engine->getNbIOTensors());

    m_outputLengths.clear();
    m_inputDims.clear();
    m_outputDims.clear();
    m_IOTensorNames.clear();

    // Create a cuda stream
    cudaStream_t stream;
    Util::checkCudaErrorCode(cudaStreamCreate(&stream));

    // Allocate GPU memory for input and output buffers
    // 分配输入和输出缓冲区
    //     涉及的 TensorRT API
    // ICudaEngine::getNbIOTensors:
    // 获取引擎的输入输出张量数量。
    // ICudaEngine::getIOTensorName:
    // 获取张量的名称。
    // ICudaEngine::getTensorShape:
    // 获取张量的形状。
    // ICudaEngine::getTensorDataType:
    // 获取张量的数据类型。
    // 作用
    // 遍历引擎的所有输入输出张量，为输出张量分配 GPU 缓冲区。
    // 为什么要这么写？
    // TensorRT 的推理需要将输入输出张量绑定到 GPU 缓冲区。
    m_outputLengths.clear();
    for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
        const auto tensorName = m_engine->getIOTensorName(i);
        m_IOTensorNames.emplace_back(tensorName);
        const auto tensorType = m_engine->getTensorIOMode(tensorName);
        const auto tensorShape = m_engine->getTensorShape(tensorName);
        const auto tensorDataType = m_engine->getTensorDataType(tensorName);

        if (tensorType == nvinfer1::TensorIOMode::kINPUT) {
            // The implementation currently only supports inputs of type float
            if (m_engine->getTensorDataType(tensorName) != nvinfer1::DataType::kFLOAT) {
                auto msg = "Error, the implementation currently only supports float inputs";
                spdlog::error(msg);
                throw std::runtime_error(msg);
            }

            // Don't need to allocate memory for inputs as we will be using the OpenCV
            // GpuMat buffer directly.

            // Store the input dims for later use
            m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]);
            m_inputBatchSize = tensorShape.d[0];
        } else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) {
            // Ensure the model output data type matches the template argument
            // specified by the user
            if (tensorDataType == nvinfer1::DataType::kFLOAT && !std::is_same<float, T>::value) {
                auto msg = "Error, the model has expected output of type float. Engine class template parameter must be adjusted.";
                spdlog::error(msg);
                throw std::runtime_error(msg);
            } else if (tensorDataType == nvinfer1::DataType::kHALF && !std::is_same<__half, T>::value) {
                auto msg = "Error, the model has expected output of type __half. Engine class template parameter must be adjusted.";
                spdlog::error(msg);
                throw std::runtime_error(msg);
            } else if (tensorDataType == nvinfer1::DataType::kINT8 && !std::is_same<int8_t, T>::value) {
                auto msg = "Error, the model has expected output of type int8_t. Engine class template parameter must be adjusted.";
                spdlog::error(msg);
                throw std::runtime_error(msg);
            } else if (tensorDataType == nvinfer1::DataType::kINT32 && !std::is_same<int32_t, T>::value) {
                auto msg = "Error, the model has expected output of type int32_t. Engine class template parameter must be adjusted.";
                spdlog::error(msg);
                throw std::runtime_error(msg);
            } else if (tensorDataType == nvinfer1::DataType::kBOOL && !std::is_same<bool, T>::value) {
                auto msg = "Error, the model has expected output of type bool. Engine class template parameter must be adjusted.";
                spdlog::error(msg);
                throw std::runtime_error(msg);
            } else if (tensorDataType == nvinfer1::DataType::kUINT8 && !std::is_same<uint8_t, T>::value) {
                auto msg = "Error, the model has expected output of type uint8_t. Engine class template parameter must be adjusted.";
                spdlog::error(msg);
                throw std::runtime_error(msg);
            } else if (tensorDataType == nvinfer1::DataType::kFP8) {
                auto msg = "Error, the model has expected output of type kFP8. This is not supported by the Engine class.";
                spdlog::error(msg);
                throw std::runtime_error(msg);
            }

            // The binding is an output
            uint32_t outputLength = 1;
            m_outputDims.push_back(tensorShape);

            for (int j = 1; j < tensorShape.nbDims; ++j) {
                // We ignore j = 0 because that is the batch size, and we will take that
                // into account when sizing the buffer
                outputLength *= tensorShape.d[j];
            }

            m_outputLengths.push_back(outputLength);
            // Now size the output buffer appropriately, taking into account the max
            // possible batch size (although we could actually end up using less
            // memory)
            Util::checkCudaErrorCode(cudaMallocAsync(&m_buffers[i], outputLength * m_options.maxBatchSize * sizeof(T), stream));
        } else {
            auto msg = "Error, IO Tensor is neither an input or output!";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }
    }

    // Synchronize and destroy the cuda stream
    Util::checkCudaErrorCode(cudaStreamSynchronize(stream));
    Util::checkCudaErrorCode(cudaStreamDestroy(stream));

    return true;
}

总结
函数的作用
从磁盘加载序列化的 TensorRT 引擎文件。

初始化运行时、引擎、执行上下文和 GPU 缓冲区。

涉及的 TensorRT API

IRuntime::deserializeCudaEngine: 反序列化引擎文件。

ICudaEngine::createExecutionContext: 创建执行上下文。

ICudaEngine::getNbIOTensors: 获取输入输出张量数量。

ICudaEngine::getTensorShape: 获取张量的形状。

3.4 runInference具体实现

基于前面的build和load trt model，现在就到了runInference的环节。runInference的实现在include/engine/EngineRunInference.inl中。该函数的作用是运行 TensorRT 推理。
它接收输入数据（GpuMat 格式），对其进行预处理，运行推理，并将推理结果拷贝回 CPU。

还是看一下runInference的全部实现，然后把实现过程的解释写在注释上面。

#pragma once
#include <filesystem>
#include <spdlog/spdlog.h>
#include "util/Util.h"

template <typename T>
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs,
                             std::vector<std::vector<std::vector<T>>> &featureVectors) {
    // First we do some error checking
    //输入检查
    //     检查输入数据的合法性，包括：
    // 输入是否为空。
    // 输入数量是否与模型的输入张量数量匹配。
    // 批量大小是否超过模型的最大批量大小。
    // 如果模型有固定批量大小，检查输入的批量大小是否匹配。
    // 为什么要这么写？
    // 确保输入数据的合法性是运行推理的前提条件。
    // 如果输入数据不合法，推理结果将是无效的。
    if (inputs.empty() || inputs[0].empty()) {
        spdlog::error("Provided input vector is empty!");
        return false;
    }

    const auto numInputs = m_inputDims.size();
    if (inputs.size() != numInputs) {
        spdlog::error("Incorrect number of inputs provided!");
        return false;
    }

    // Ensure the batch size does not exceed the max
    if (inputs[0].size() > static_cast<size_t>(m_options.maxBatchSize)) {
        spdlog::error("===== Error =====");
        spdlog::error("The batch size is larger than the model expects!");
        spdlog::error("Model max batch size: {}", m_options.maxBatchSize);
        spdlog::error("Batch size provided to call to runInference: {}", inputs[0].size());
        return false;
    }

    // Ensure that if the model has a fixed batch size that is greater than 1, the
    // input has the correct length
    if (m_inputBatchSize != -1 && inputs[0].size() != static_cast<size_t>(m_inputBatchSize)) {
        spdlog::error("===== Error =====");
        spdlog::error("The batch size is different from what the model expects!");
        spdlog::error("Model batch size: {}", m_inputBatchSize);
        spdlog::error("Batch size provided to call to runInference: {}", inputs[0].size());
        return false;
    }

    const auto batchSize = static_cast<int32_t>(inputs[0].size());
    // Make sure the same batch size was provided for all inputs
    for (size_t i = 1; i < inputs.size(); ++i) {
        if (inputs[i].size() != static_cast<size_t>(batchSize)) {
            spdlog::error("===== Error =====");
            spdlog::error("The batch size is different for each input!");
            return false;
        }
    }

    // Create the cuda stream that will be used for inference
    //创建 CUDA 流
    //     涉及的 CUDA API
    // cudaStreamCreate:
    // 创建一个 CUDA 流，用于异步执行 CUDA 操作。
    // 作用
    // 创建一个 CUDA 流，用于管理推理过程中的异步操作（如数据拷贝和推理执行）。
    // 为什么要这么写？
    // 使用 CUDA 流可以提高性能，因为它允许多个 CUDA 操作同时执行。
    cudaStream_t inferenceCudaStream;
    Util::checkCudaErrorCode(cudaStreamCreate(&inferenceCudaStream));

    //输入预处理
    std::vector<cv::cuda::GpuMat> preprocessedInputs;


    // Preprocess all the inputs
     //     涉及的 TensorRT API
    // IExecutionContext::setInputShape:
    // 设置动态输入张量的形状（如批量大小）。
    // 作用
    // 检查输入数据的尺寸是否与模型的输入张量匹配。
    // 如果模型支持动态输入，设置输入张量的形状。
    // 调用 blobFromGpuMats 函数，将输入数据从 NHWC 格式转换为 NCHW 格式，并进行归一化。
    // 为什么要这么写？
    // TensorRT 的输入张量通常是 NCHW 格式，而 OpenCV 的 GpuMat 通常是 NHWC 格式，因此需要进行格式转换。
    // 如果模型支持动态输入，必须在推理前设置输入张量的形状。
    for (size_t i = 0; i < numInputs; ++i) {
        const auto &batchInput = inputs[i];
        const auto &dims = m_inputDims[i];

        auto &input = batchInput[0];
        if (input.channels() != dims.d[0] || input.rows != dims.d[1] || input.cols != dims.d[2]) {
            spdlog::error("===== Error =====");
            spdlog::error("Input does not have correct size!");
            spdlog::error("Expected: ({}, {}, {})", dims.d[0], dims.d[1], dims.d[2]);
            spdlog::error("Got: ({}, {}, {})", input.channels(), input.rows, input.cols);
            spdlog::error("Ensure you resize your input image to the correct size");
            return false;
        }

        nvinfer1::Dims4 inputDims = {batchSize, dims.d[0], dims.d[1], dims.d[2]};
        m_context->setInputShape(m_IOTensorNames[i].c_str(),
                                 inputDims); // Define the batch size

        // OpenCV reads images into memory in NHWC format, while TensorRT expects
        // images in NCHW format. The following method converts NHWC to NCHW. Even
        // though TensorRT expects NCHW at IO, during optimization, it can
        // internally use NHWC to optimize cuda kernels See:
        // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#data-layout
        // Copy over the input data and perform the preprocessing
        auto mfloat = blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize);
        preprocessedInputs.push_back(mfloat);
        m_buffers[i] = mfloat.ptr<void>();
    }

    // Ensure all dynamic bindings have been defined.
    if (!m_context->allInputDimensionsSpecified()) {
        auto msg = "Error, not all required dimensions specified.";
        spdlog::error(msg);
        throw std::runtime_error(msg);
    }

    // Set the address of the input and output buffers
       //设置输入和输出缓冲区地址
    //     涉及的 TensorRT API
    // IExecutionContext::setTensorAddress:
    // 设置输入和输出张量的内存地址。
    // 作用
    // 将输入和输出缓冲区的地址绑定到 TensorRT 的执行上下文。
    // 为什么要这么写？
    // TensorRT 的推理需要知道输入和输出张量的内存地址。
    for (size_t i = 0; i < m_buffers.size(); ++i) {
        bool status = m_context->setTensorAddress(m_IOTensorNames[i].c_str(), m_buffers[i]);
        if (!status) {
            return false;
        }
    }

    // Run inference.
    // 运行推理
    //     涉及的 TensorRT API
    // IExecutionContext::enqueueV3:
    // 在指定的 CUDA 流中异步运行推理。
    // 作用
    // 在 GPU 上运行推理。
    // 为什么要这么写？
    // 使用异步推理可以提高性能，因为它允许推理和其他操作（如数据拷贝）同时执行。
    bool status = m_context->enqueueV3(inferenceCudaStream);
    if (!status) {
        return false;
    }

    // Copy the outputs back to CPU
    featureVectors.clear();

    //拷贝输出数据到 CPU
    //     涉及的 CUDA API
    // cudaMemcpyAsync:
    // 异步拷贝数据，从 GPU 内存到 CPU 内存。
    // 作用
    // 将推理结果从 GPU 内存拷贝到 CPU 内存。
    // 为什么要这么写？
    // 推理结果通常需要在 CPU 上进行后处理，因此需要将数据从 GPU 内存拷贝到 CPU 内存。
    for (int batch = 0; batch < batchSize; ++batch) {
        // Batch
        std::vector<std::vector<T>> batchOutputs{};
        for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbIOTensors(); ++outputBinding) {
            // We start at index m_inputDims.size() to account for the inputs in our
            // m_buffers
            std::vector<T> output;
            auto outputLength = m_outputLengths[outputBinding - numInputs];
            output.resize(outputLength);
            // Copy the output
            Util::checkCudaErrorCode(cudaMemcpyAsync(output.data(),
                                                     static_cast<char *>(m_buffers[outputBinding]) + (batch * sizeof(T) * outputLength),
                                                     outputLength * sizeof(T), cudaMemcpyDeviceToHost, inferenceCudaStream));
            batchOutputs.emplace_back(std::move(output));
        }
        featureVectors.emplace_back(std::move(batchOutputs));
    }

    // Synchronize the cuda stream
    //同步和销毁 CUDA 流
    //     涉及的 CUDA API
    // cudaStreamSynchronize:
    // 等待 CUDA 流中的所有操作完成。
    // cudaStreamDestroy:
    // 销毁 CUDA 流，释放资源。
    // 作用
    // 确保所有异步操作完成，并释放 CUDA 流资源。
    // 为什么要这么写？
    // 在销毁 CUDA 流之前，必须确保所有操作完成。
    // 销毁 CUDA 流可以释放 GPU 资源。
    Util::checkCudaErrorCode(cudaStreamSynchronize(inferenceCudaStream));
    Util::checkCudaErrorCode(cudaStreamDestroy(inferenceCudaStream));
    return true;
}

总结

函数的作用

检查输入数据的合法性。

对输入数据进行预处理。

设置输入和输出缓冲区地址。

在 GPU 上运行推理。

将推理结果拷贝到 CPU。

涉及的 TensorRT API

IExecutionContext::setInputShape: 设置动态输入张量的形状。

IExecutionContext::setTensorAddress: 设置输入和输出张量的内存地址。

IExecutionContext::enqueueV3: 异步运行推理。

cudaMemcpyAsync: 异步拷贝数据。

为什么要这么写？

遵循 TensorRT 的最佳实践，确保推理过程的高效性和正确性。

使用 CUDA 流和异步操作提高性能。

对输入数据进行严格检查，确保推理结果的可靠性。

在runInference的实现中，还有两个点值得看一下：

第一，blobFromGpuMats的实现。此函数的实现在include/engine/EngineUtilities.inl中。该函数的作用是将一批输入图像（GpuMat 格式）转换为 TensorRT 所需的输入张量格式。
它完成以下任务：
将输入图像从 HWC 格式（Height-Width-Channel）转换为 CHW 格式（Channel-Height-Width）。
对图像进行归一化（如果需要）。
应用减均值（subVals）和除以标准差（divVals）的操作。
返回预处理后的 GPU 张量。

template <typename T>
cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
                                            const std::array<float, 3> &divVals, bool normalize, bool swapRB) {
   
    CHECK(!batchInput.empty())
    CHECK(batchInput[0].channels() == 3)
    
    //创建目标 GPU 缓冲区
    //     创建一个目标 GPU 缓冲区（gpu_dst），用于存储批量图像的预处理结果。
    // 缓冲区大小为：高度 × 宽度 × 批量大小 × 通道数
    // 其中，CV_8UC3 表示每个像素有 3 个通道，每个通道占用 8 位（无符号整数）。
    // 为什么要这么写？
    // TensorRT 的输入张量通常是批量数据，因此需要将所有图像的预处理结果存储在一个连续的缓冲区中。
    cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3);

    //图像通道分离（HWC -> CHW）
    //     将每张图像从 HWC 格式转换为 CHW 格式。
    // 如果 swapRB 为 true，交换 R 和 B 通道（从 BGR 转换为 RGB）。
    // 关键点
    // cv::cuda::split:

    // 将输入图像的通道分离，并存储到 input_channels 中。
    // 例如，HWC 格式的图像 [H, W, C] 会被分离为 3 个单通道图像 [H, W]。
    // 通道存储位置:

    // 每个通道的数据存储在 gpu_dst 的不同位置：
    // 第 1 个通道：gpu_dst.ptr()[0 + width * 3 * img]
    // 第 2 个通道：gpu_dst.ptr()[width + width * 3 * img]
    // 第 3 个通道：gpu_dst.ptr()[width * 2 + width * 3 * img]
    // 为什么要这么写？
    // TensorRT 的输入张量通常是 CHW 格式，而 OpenCV 的 GpuMat 默认是 HWC 格式，因此需要进行格式转换。
    // swapRB 参数允许用户灵活地处理 RGB 和 BGR 格式的图像。
    size_t width = batchInput[0].cols * batchInput[0].rows;
    if (swapRB) {
        for (size_t img = 0; img < batchInput.size(); ++img) {
            std::vector<cv::cuda::GpuMat> input_channels{
                cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width * 2 + width * 3 * img])),
                cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])),
                cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img]))};
            cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW
        }
    } else {
        for (size_t img = 0; img < batchInput.size(); ++img) {
            std::vector<cv::cuda::GpuMat> input_channels{
                cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img])),
                cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])),
                cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width * 2 + width * 3 * img]))};
            cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW
        }
    }
    cv::cuda::GpuMat mfloat;
    //数据归一化
    //     作用
    // 将图像数据从 uint8 转换为 float32。
    // 如果 normalize 为 true，将像素值归一化到 [0, 1] 范围。
    // 关键点
    // cv::cuda::GpuMat::convertTo:
    // 将图像数据从一种数据类型转换为另一种数据类型。
    // 例如，将 CV_8UC3 转换为 CV_32FC3。
    // 为什么要这么写？
    // TensorRT 的输入张量通常是 float32 类型，因此需要将图像数据从 uint8 转换为 float32。
    // 归一化可以提高模型的数值稳定性。
    if (normalize) {
        // [0.f, 1.f]
        gpu_dst.convertTo(mfloat, CV_32FC3, 1.f / 255.f);
    } else {
        // [0.f, 255.f]
        gpu_dst.convertTo(mfloat, CV_32FC3);
    }

    // Apply scaling and mean subtraction
    //减均值和除以标准差
    //     作用
    // 对图像数据进行减均值和除以标准差的操作。
    // 公式：mfloat = (mfloat - subVals) / divVals
    // 关键点
    // cv::cuda::subtract:
    // 对每个像素值减去对应通道的均值（subVals）。
    // cv::cuda::divide:
    // 对每个像素值除以对应通道的标准差（divVals）。
    // 为什么要这么写？
    // 减均值和除以标准差是常见的图像预处理步骤，可以使输入数据的分布更适合模型的训练分布。
    // 例如，subVals 和 divVals 通常是根据训练数据集计算的均值和标准差。
    cv::cuda::subtract(mfloat, cv::Scalar(subVals[0], subVals[1], subVals[2]), mfloat, cv::noArray(), -1);
    cv::cuda::divide(mfloat, cv::Scalar(divVals[0], divVals[1], divVals[2]), mfloat, 1, -1);

    return mfloat;
}

函数的作用

将一批输入图像从 GpuMat 格式转换为 TensorRT 所需的输入张量格式。

包括以下步骤：

检查输入合法性。

将图像从 HWC 格式转换为 CHW 格式。

数据归一化。

减均值和除以标准差。

涉及的 OpenCV CUDA API

cv::cuda::GpuMat:
表示存储在 GPU 上的矩阵。

cv::cuda::split:
将多通道图像分离为单通道图像。

cv::cuda::GpuMat::convertTo:
转换图像数据类型。

cv::cuda::subtract:
对每个像素值减去标量。

cv::cuda::divide:
对每个像素值除以标量。

为什么要这么写？
符合 TensorRT 的输入要求（CHW 格式、float32 类型）。

避免数据拷贝，直接在 GPU 上完成所有预处理操作，提高性能。

提供灵活性（支持 RGB/BGR 通道交换、归一化、减均值和除以标准差）。

这里再重点解释下HWC->CHW的代码转换逻辑。

for (size_t img = 0; img < batchInput.size(); ++img) {
    std::vector<cv::cuda::GpuMat> input_channels{
        cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img])),
        cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])),
        cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width * 2 + width * 3 * img]))};
    cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW
}

这段代码的核心功能是将输入的图像数据从 HWC 格式（Height, Width, Channels）转换为 CHW 格式（Channels, Height, Width）。这种转换通常用于深度学习模型的输入预处理，因为许多模型（例如 TensorRT）要求输入数据以 CHW 格式排列。

输入数据格式：
batchInput 是一个包含多张图像的 std::vectorcv::cuda::GpuMat，每张图像的格式为 HWC（即高度、宽度和通道）。
每张图像有 3 个通道（通常是 RGB 或 BGR），每个像素由 3 个值表示。
目标数据格式：
gpu_dst 是一个 GPU 上的 cv::cuda::GpuMat，用于存储转换后的数据，格式为 CHW。
它的大小为 (1, rows * cols * batch_size)，其中 rows 和 cols 是单张图像的高度和宽度，batch_size 是图像的数量。
计算偏移量：

width 表示单张图像的像素总数（cols * rows）。
width * 3 * img 是当前图像在 gpu_dst 中的起始位置。
0, width, 和 width * 2 分别对应通道 0（R）、通道 1（G）、通道 2（B）的偏移量。
创建通道视图：

cv::cuda::GpuMat 对象的构造函数调用

cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img])),

这行代码是一个 cv::cuda::GpuMat 对象的构造函数调用，用于在 GPU 上创建一个矩阵（GpuMat 是 OpenCV 的 CUDA 模块中的类，用于处理 GPU 上的矩阵数据）。

batchInput[0].rows:
表示矩阵的行数（高度），这里取的是 batchInput[0] 的行数。
假设 batchInput 是一个包含多张图像的向量，每张图像是一个 cv::cuda::GpuMat。

batchInput[0].cols:
表示矩阵的列数（宽度），这里取的是 batchInput[0] 的列数。

CV_8U：
表示矩阵的类型，CV_8U 是 OpenCV 中的一个常量，表示每个像素是 8 位无符号整数（单通道）。
在这里，虽然图像是 3 通道（RGB 或 BGR），但每个通道的像素值仍然是 8 位无符号整数。

&(gpu_dst.ptr()[0 + width * 3 * img]):
表示矩阵的起始地址，指向 GPU 内存中的某个位置。
gpu_dst.ptr() 返回 gpu_dst 矩阵的首地址（指针）。
偏移量 0 + width * 3 * img 用于定位到 gpu_dst 中当前图像的起始位置。
width 是单张图像的像素总数（cols * rows）。
3 是通道数（RGB）。
img 是当前图像在批次中的索引。

这段代码的作用是：
在 GPU 上创建一个矩阵（GpuMat）。
该矩阵的大小为 batchInput[0].rows x batchInput[0].cols。
数据类型为 CV_8U（8 位无符号整数）。
数据存储在 gpu_dst 的某个偏移位置。

cv::cuda::GpuMat 的构造函数允许通过指针访问 GPU 内存。
&(gpu_dst.ptr()[…]) 指定了每个通道在 gpu_dst 中的起始位置。
input_channels 是一个包含 3 个 GpuMat 的向量，每个 GpuMat 对应一个通道。
通道分离：

cv::cuda::split(batchInput[img], input_channels) 将输入图像的通道分离，并将每个通道的数据写入 input_channels 中。
由于 input_channels 指向 gpu_dst 的不同位置，分离后的数据直接存储在 gpu_dst 中，完成了 HWC -> CHW 的转换。
关键点
HWC 格式
数据在内存中的排列方式是按行优先的，每行包含所有通道的数据。例如：[R1, G1, B1, R2, G2, B2, …, Rn, Gn, Bn]

其中 R1, G1, B1 是第一个像素的 RGB 值。
CHW 格式：

数据在内存中的排列方式是按通道优先的，每个通道的数据连续存储。例如：[R1, R2, …, Rn, G1, G2, …, Gn, B1, B2, …, Bn]

GPU 内存布局：

gpu_dst 是一个线性内存块，分为多个部分，每部分存储一个通道的数据。
通过计算偏移量，将每个通道的数据写入正确的位置。

3.5 resizeKeepAspectRatioPadRightBottom实现

这段代码实现了一个函数 resizeKeepAspectRatioPadRightBottom，用于对输入的 GPU 图像（cv::cuda::GpuMat）进行缩放和填充操作，以保持图像的宽高比，同时将图像调整到指定的目标尺寸。填充部分使用指定的背景颜色（bgcolor）。

template <typename T>
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
                                                                const cv::Scalar &bgcolor) {
    float r = std::min(width / (input.cols * 1.0), height / (input.rows * 1.0));
    int unpad_w = r * input.cols;
    int unpad_h = r * input.rows;
    cv::cuda::GpuMat re(unpad_h, unpad_w, CV_8UC3);
    cv::cuda::resize(input, re, re.size());
    //创建目标图像：
    // 创建一个新的 cv::cuda::GpuMat 对象 out，大小为目标尺寸 height x width，类型为 CV_8UC3。
    // 使用 bgcolor 初始化图像的所有像素值，作为填充部分的背景颜色。
    cv::cuda::GpuMat out(height, width, CV_8UC3, bgcolor);
    //     将缩放后的图像复制到目标图像：
    // 使用 cv::Rect(0, 0, re.cols, re.rows) 定义一个矩形区域，表示目标图像的左上角区域，其大小与缩放后的图像一致。
    // 使用 copyTo 将缩放后的图像 re 复制到目标图像 out 的该区域。
    re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
    return out;
}

四、main.cpp串联各个部分

我们来看main.cpp中是怎样将tensorrt api串联在一起的。

#include "cmd_line_parser.h"
#include "logger.h"
#include "engine.h"
#include <chrono>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/opencv.hpp>

int main(int argc, char *argv[]) {
    CommandLineArguments arguments;

    // 从环境变量中获取日志级别
    std::string logLevelStr = getLogLevelFromEnvironment();
    spdlog::level::level_enum logLevel = toSpdlogLevel(logLevelStr);
    spdlog::set_level(logLevel);

    // 解析命令行参数
    if (!parseArguments(argc, argv, arguments)) {
        return -1;
    }

    // 一、配置好构建tensorrt部署model和运行时的Options
    // 指定 GPU 推理的配置选项
    Options options;
    // 指定推理使用的精度
    // FP16 的速度大约是 FP32 的两倍
    options.precision = Precision::FP16;
    // 如果使用 INT8 精度，必须指定校准数据目录路径
    options.calibrationDataDirectoryPath = "";
    // 指定优化的批量大小
    options.optBatchSize = 1;
    // 指定计划运行的最大批量大小
    options.maxBatchSize = 1;
    // 指定保存模型引擎文件的目录
    options.engineFileDir = ".";

    // 创建推理引擎
    // 二、初始化Engine
    Engine<float> engine(options);

    // 定义预处理参数
    // 默认的 Engine::build 方法会将值归一化到 [0.f, 1.f]
    // 如果将 normalize 标志设置为 false，则值保持在 [0.f, 255.f]
    std::array<float, 3> subVals{0.f, 0.f, 0.f}; // 减去的均值
    std::array<float, 3> divVals{1.f, 1.f, 1.f}; // 除以的标准差
    bool normalize = true; // 是否归一化

    // 如果模型需要将值归一化到 [-1.f, 1.f]，可以使用以下参数：
    // subVals = {0.5f, 0.5f, 0.5f};
    // divVals = {0.5f, 0.5f, 0.5f};
    // normalize = true;

    if (!arguments.onnxModelPath.empty()) {
        // 构建 ONNX 模型为 TensorRT 引擎文件，并加载到内存中
        //三、构建tensorrt model
        // 回忆一下，这里面涉及了从onnx->trt,以及加载trt模型、优化trt model、创建上下文、创建cuda stream、给输入输出绑定地址
        bool succ = engine.buildLoadNetwork(arguments.onnxModelPath, subVals, divVals, normalize);
        if (!succ) {
            throw std::runtime_error("无法构建或加载 TensorRT 引擎。");
        }
    } else {
        // 直接加载 TensorRT 引擎文件
        bool succ = engine.loadNetwork(arguments.trtModelPath, subVals, divVals, normalize);
        if (!succ) {
            const std::string msg = "无法加载 TensorRT 引擎。";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }
    }

    // 读取输入图像
    const std::string inputImage = "../inputs/team.jpg";
    auto cpuImg = cv::imread(inputImage);
    if (cpuImg.empty()) {
        const std::string msg = "无法读取图像路径：" + inputImage;
        spdlog::error(msg);
        throw std::runtime_error(msg);
    }

    // 将图像上传到 GPU 内存
    cv::cuda::GpuMat img;
    img.upload(cpuImg);

    // 模型需要 RGB 格式的输入
    cv::cuda::cvtColor(img, img, cv::COLOR_BGR2RGB);

    // 填充输入向量以供推理使用
    const auto &inputDims = engine.getInputDims();
    std::vector<std::vector<cv::cuda::GpuMat>> inputs;

    // 使用与 Options.optBatchSize 匹配的批量大小
    size_t batchSize = options.optBatchSize;

    // 为演示目的，将同一张图像填充到所有输入中
    for (const auto &inputDim : inputDims) { // 遍历模型的每个输入
        std::vector<cv::cuda::GpuMat> input;
        for (size_t j = 0; j < batchSize; ++j) { // 为每个批量元素填充数据
            // 使用保持宽高比的方式调整图像大小
            //四、调整图像大小
            auto resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(img, inputDim.d[1], inputDim.d[2]);
            input.emplace_back(std::move(resized));
        }
        inputs.emplace_back(std::move(input));
    }

    // 在正式推理前对网络进行预热
    spdlog::info("正在预热网络...");
    std::vector<std::vector<std::vector<float>>> featureVectors;
    for (int i = 0; i < 100; ++i) {
        //五、运行推理
        //回忆一下，这个函数涉及了对输入做预处理、HWC->CHW、调用tensorrt的推理API、把输出转移到RAM
        bool succ = engine.runInference(inputs, featureVectors);
        if (!succ) {
            const std::string msg = "无法运行推理。";
            spdlog::error(msg);
            throw std::runtime_error(msg);
        }
    }

    // 基准测试推理时间
    size_t numIterations = 1000;
    spdlog::info("正在运行基准测试（{} 次迭代）...", numIterations);
    preciseStopwatch stopwatch;
    for (size_t i = 0; i < numIterations; ++i) {
        featureVectors.clear();
        engine.runInference(inputs, featureVectors);
    }
    auto totalElapsedTimeMs = stopwatch.elapsedTime<float, std::chrono::milliseconds>();
    auto avgElapsedTimeMs = totalElapsedTimeMs / numIterations / static_cast<float>(inputs[0].size());

    spdlog::info("基准测试完成！");
    spdlog::info("======================");
    spdlog::info("每个样本的平均时间：{} ms", avgElapsedTimeMs);
    spdlog::info("批量大小：{}", inputs[0].size());
    spdlog::info("平均 FPS：{} fps", static_cast<int>(1000 / avgElapsedTimeMs));
    spdlog::info("======================\n");

    // 打印特征向量
    for (size_t batch = 0; batch < featureVectors.size(); ++batch) {
        for (size_t outputNum = 0; outputNum < featureVectors[batch].size(); ++outputNum) {
            spdlog::info("批次 {}, 输出 {}", batch, outputNum);
            std::string output;
            int i = 0;
            for (const auto &e : featureVectors[batch][outputNum]) {
                output += std::to_string(e) + " ";
                if (++i == 10) {
                    output += "...";
                    break;
                }
            }
            spdlog::info("{}", output);
        }
    }

    // 如果模型需要后处理（例如将特征向量转换为边界框），可以在这里实现

    return 0;
}

五、总结：

使用 TensorRT 的 C++ API 在 Jetson Orin Nano（或其他平台）上部署模型，大体可分为以下五个主要阶段：

1. 配置推理选项

在开始之前，先构造一个 Options 结构体，指定：

精度模式：FP32、FP16 或 INT8
校准数据路径（仅 INT8 时需要）
优化批量大小 (optBatchSize) 和最大批量大小 (maxBatchSize)
设备索引（多 GPU 时指定）
Engine 文件保存目录 等

Options options;
options.precision = Precision::FP16;
options.optBatchSize = 1;
options.maxBatchSize = 1;
options.engineFileDir = ".";

2. 构造并（或）加载 TensorRT 引擎

构建并缓存引擎
```
Engine<float> engine(options);
engine.buildLoadNetwork("model.onnx");
```
- 解析 ONNX：nvonnxparser::IParser 读入到 INetworkDefinition
- 设置动态输入（Batch/Width）优化配置：IOptimizationProfile
- 选择精度：kFP16 或 kINT8 + 校准器
- 调用 builder->buildSerializedNetwork(...) 生成 .engine 文件
加载已有引擎
```
engine.loadNetwork("model.engine");
```
- 反序列化：IRuntime::deserializeCudaEngine
- 创建执行上下文：ICudaEngine::createExecutionContext
- 分配 GPU 缓冲区（输入/输出）

3. 输入预处理（Blob 构造）

在 runInference 调用前，将 OpenCV 的 cv::cuda::GpuMat 图像：

HWC → CHW：利用 cv::cuda::split 并按通道布局到连续 GPU 内存
归一化 & 减均值/除以标准差：convertTo + subtract + divide

直接将预处理后的 GPU 指针绑定到 TensorRT 输入缓冲：

blob = Engine<float>::blobFromGpuMats(batchMats, subVals, divVals, normalize);
context->setTensorAddress(inputName, blob.ptr<void>());

4. 执行推理

// 设置动态形状（如果支持）
context->setInputShape(inputName, Dims4(batchSize, C, H, W));
// 绑定所有 IO 缓冲区
for each tensorName:
    context->setTensorAddress(tensorName, bufferPtr);
// 异步执行
context->enqueueV3(cudaStream);