I. ONNX Conversion Steps
TensorRT inference requires an NVIDIA GPU environment. The overall workflow is:
- Set the batch size
- Set the precision
- Convert the model
- Verify accuracy before and after conversion
II. Converting ONNX to a TensorRT Engine
1. Using the command-line tool trtexec
Convert to a TensorRT engine:
trtexec --onnx=resnet50/model.onnx --saveEngine=resnet_engine_intro.trt --explicitBatch
# Flag explanations
--maxBatch: sets an upper bound on the batch size; used with implicit-batch models whose input batch size is not fixed
--explicitBatch: builds the engine with the explicit batch dimension defined by the ONNX model structure (explicit batch is the default for ONNX models in recent TensorRT versions)
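For an ONNX model exported with dynamic input shapes, trtexec can also take an optimization profile. A sketch (the tensor name "input" and the shapes are placeholders; check trtexec --help on your TensorRT version for the exact flags):
trtexec --onnx=resnet50/model.onnx --saveEngine=resnet_engine_dynamic.trt --minShapes=input:1x3x224x224 --optShapes=input:8x3x224x224 --maxShapes=input:32x3x224x224 --fp16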
2. Using the TensorRT Python API
import tensorrt as trt

def generate_engine(onnx_path, engine_path):
    # 1. Create the TensorRT logger
    logger = trt.Logger(trt.Logger.WARNING)
    # Initialize plugins
    trt.init_libnvinfer_plugins(logger, namespace="")
    # 2. Create a builder, passing in the logger
    builder = trt.Builder(logger)
    # 3. Create a builder config, which controls how TensorRT optimizes the model
    config = builder.create_builder_config()
    # Set the workspace memory limit
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)  # 1 MiB
    # Set the precision
    config.set_flag(trt.BuilderFlag.FP16)
    # INT8 additionally requires calibration
    # 4. Create a network. EXPLICIT_BATCH: the batch dimension is explicit in the
    #    network definition (required for ONNX models)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    # Create the ONNX parser
    parser = trt.OnnxParser(network, logger)
    # Parse the ONNX model and populate the network
    success = parser.parse_from_file(onnx_path)
    if not success:
        for idx in range(parser.num_errors):
            print(parser.get_error(idx))
        raise RuntimeError("Failed to parse ONNX model")
    # 5. Serialize the engine, i.e. produce the .engine model
    serialized_engine = builder.build_serialized_network(network, config)
    # Save the serialized engine for later use. The engine is not portable:
    # it is tied to the TensorRT version and GPU type it was built with.
    with open(engine_path, "wb") as f:
        f.write(serialized_engine)
    # 6. To deserialize the engine later (i.e. load it for inference), use the Runtime API:
    # runtime = trt.Runtime(logger)
    # with open(engine_path, "rb") as f:
    #     serialized_engine = f.read()
    # engine = runtime.deserialize_cuda_engine(serialized_engine)
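A minimal usage sketch (the engine filename is a placeholder; adjust both paths to your model):

if __name__ == "__main__":
    generate_engine("resnet50/model.onnx", "resnet_engine_api.trt")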
III. Deploying the TensorRT Engine
1. TensorRT engine inference
yolo.h:
// yolo.h
class YOLO
{
public:
    // 1. Load the engine model and initialize
    YOLO(std::string engine_file_path);
    // Virtual destructor: when deleting a derived object through a base-class
    // pointer, the derived destructor runs first, then the base destructor.
    // Without virtual, only the base destructor would run, so declare the
    // destructor virtual whenever a class is used as a base class.
    virtual ~YOLO();
    // 2. Load image + preprocessing + inference + postprocessing
    void detect_img(std::string image_path);
    void detect_video(std::string video_path);
    // Resize image + letterbox (rectangular inference)
    cv::Mat static_resize(cv::Mat& img);
    // Image normalization
    float* blobFromImage(cv::Mat& img);
    // 3. Inference
    void doInference(IExecutionContext& context, float* input, float* output, const int output_size, cv::Size input_shape);
private:
    static const int INPUT_W = 640;
    static const int INPUT_H = 640;
    // Names of the engine's input and output bindings
    const char* INPUT_BLOB_NAME = "image_arrays";
    const char* OUTPUT_BLOB_NAME = "outputs";
    float* prob;
    int output_size = 1;
    // The runtime creates the engine; the engine creates the context
    IRuntime* runtime;
    ICudaEngine* engine;
    IExecutionContext* context;
};
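Taken together, the flow is: the constructor deserializes the engine; detect_img then chains static_resize (letterbox), blobFromImage (normalization), doInference, and the postprocessing shown below.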
yolo.cpp:
YOLO::YOLO(std::string engine_file_path)
{
    size_t size{0};
    char *trtModelStream{nullptr};
    // Read the serialized engine file
    std::ifstream file(engine_file_path, std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }
    std::cout << "engine init finished" << std::endl;
    runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    // Compute the output element count from the output binding's dimensions
    // (binding 0 is assumed to be the input, binding 1 the output)
    auto out_dims = engine->getBindingDimensions(1);
    for (int j = 0; j < out_dims.nbDims; j++) {
        this->output_size *= out_dims.d[j];
    }
    this->prob = new float[this->output_size];
}
YOLO::~YOLO()
{
    std::cout << "yolo destroy" << std::endl;
    this->context->destroy();
    this->engine->destroy();
    this->runtime->destroy();
    // Free the host output buffer allocated in the constructor
    delete[] this->prob;
}
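Note: destroy() has been deprecated since TensorRT 8; on recent versions the objects can instead be released with plain delete (delete context; delete engine; delete runtime;).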
// Preprocessing: letterbox
cv::Mat YOLO::static_resize(cv::Mat& img) {
    float r = std::min(this->INPUT_W / (img.cols * 1.0), INPUT_H / (img.rows * 1.0));
    int unpad_w = r * img.cols;
    int unpad_h = r * img.rows;
    cv::Mat re(unpad_h, unpad_w, CV_8UC3);
    cv::resize(img, re, re.size());
    // cv::Mat takes (rows, cols); pad the border with the conventional value 114
    cv::Mat out(this->INPUT_H, this->INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114));
    re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
    return out;
}
// Preprocessing: normalization, HWC BGR uint8 -> CHW RGB float in [0, 1]
float* YOLO::blobFromImage(cv::Mat& img){
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    float* blob = new float[img.total() * 3];
    int channels = 3;
    int img_h = img.rows;
    int img_w = img.cols;
    for (int c = 0; c < channels; c++)
    {
        for (int h = 0; h < img_h; h++)
        {
            for (int w = 0; w < img_w; w++)
            {
                blob[c * img_w * img_h + h * img_w + w] =
                    (((float)img.at<cv::Vec3b>(h, w)[c]) / 255.0f);
            }
        }
    }
    return blob;
}
// Inference
void YOLO::doInference(IExecutionContext& context, float* input, float* output, const int output_size, cv::Size input_shape) {
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // To bind the buffers, we need the names of the input and output tensors.
    // Indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
    // Create GPU buffers on the device
    CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], output_size * sizeof(float)));
    // Create a stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to the device, infer asynchronously, and DMA output back to the host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));
    // enqueueV2 is the call for explicit-batch engines; enqueue(batch, ...) is only for implicit batch
    context.enqueueV2(buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release the stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
// Combine preprocessing, inference, and postprocessing
void YOLO::detect_img(std::string image_path)
{
    cv::Mat img = cv::imread(image_path);
    int img_w = img.cols;
    int img_h = img.rows;
    // Letterbox
    cv::Mat pr_img = this->static_resize(img);
    std::cout << "blob image" << std::endl;
    // Normalization
    float* blob;
    blob = blobFromImage(pr_img);
    // Letterbox scale factor, needed to map boxes back to the original image
    float scale = std::min(this->INPUT_W / (img.cols * 1.0), this->INPUT_H / (img.rows * 1.0));
    // Run inference
    auto start = std::chrono::system_clock::now();
    doInference(*context, blob, this->prob, output_size, pr_img.size());
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    // Postprocess the results
    std::vector<Object> objects;
    decode_outputs(this->prob, this->output_size, objects, scale, img_w, img_h);
    draw_objects(img, objects, image_path);
    delete[] blob;  // allocated with new[] in blobFromImage
}
// Postprocessing: decode raw output into boxes, run NMS, then map back to the original image
static void decode_outputs(float* prob, int output_size, std::vector<Object>& objects, float scale, const int img_w, const int img_h) {
    std::vector<Object> proposals;
    generate_yolo_proposals(prob, output_size, BBOX_CONF_THRESH, proposals);
    std::cout << "num of boxes before nms: " << proposals.size() << std::endl;
    qsort_descent_inplace(proposals);
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, NMS_THRESH);
    int count = picked.size();
    std::cout << "num of boxes: " << count << std::endl;
    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];
        // Undo the letterbox scaling to get coordinates in the original, unpadded image
        float x0 = (objects[i].rect.x) / scale;
        float y0 = (objects[i].rect.y) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
        // Clip to the image bounds
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }
}
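The helpers generate_yolo_proposals, qsort_descent_inplace, nms_sorted_bboxes, and draw_objects are standard YOLO decoding, sorting, NMS, and visualization routines; full implementations are in the repository linked under the references.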
2. Verifying accuracy before and after conversion
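A straightforward check is to feed the same input to ONNX Runtime and to the TensorRT engine and compare the outputs. Below is a minimal sketch, assuming onnxruntime and numpy are installed; "model.onnx" and the input name are placeholders (the YOLO engine above uses "image_arrays"), and trt_out stands for the output collected from the TensorRT side (e.g. the prob buffer filled by doInference):

import numpy as np
import onnxruntime as ort

def compare_outputs(onnx_out, trt_out, atol=1e-3):
    # Element-wise difference and cosine similarity between the two backends
    diff = np.abs(onnx_out - trt_out)
    cos = np.dot(onnx_out.ravel(), trt_out.ravel()) / (
        np.linalg.norm(onnx_out) * np.linalg.norm(trt_out) + 1e-12)
    print(f"max abs diff: {diff.max():.6f}, mean abs diff: {diff.mean():.6f}, cosine: {cos:.6f}")
    return bool(np.allclose(onnx_out, trt_out, atol=atol))

# Reference output from ONNX Runtime on a fixed input
dummy_input = np.random.rand(1, 3, 640, 640).astype(np.float32)
sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
onnx_out = sess.run(None, {"image_arrays": dummy_input})[0]
# compare_outputs(onnx_out, trt_out)

With FP16 enabled, expect small numeric differences rather than exact equality; a cosine similarity very close to 1 is the usual sign that the conversion is sound. NVIDIA's Polygraphy tool can automate the same comparison (e.g. polygraphy run model.onnx --trt --onnxrt).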
References:
Full code: https://github.com/Linaom1214/TensorRT-For-YOLO-Series
TensorRT Quick Start Guide: https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html