Yolov5 TensorRT: generating the .engine file, the inference process, and how to get images from the GPU for custom (task-specific) post-processing


warmup:
The runtime contains components that are lazily initialized, which can make the first request sent to the model after loading take a long time; this latency can be orders of magnitude higher than that of a single inference request. To reduce the impact of this initialization latency on requests, a set of sample inference requests can be supplied together with the SavedModel to trigger the initialization of those subsystems and components at model-load time. This process is called "warming up" the model.
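As a concrete illustration of the same idea in the TensorRT pipeline built later in this post, warmup simply means running a few throwaway inferences on dummy input right after deserializing the engine, so that lazy CUDA/TensorRT initialization does not land inside the first real request. This is a minimal sketch, assuming the context, stream, buffers, data/prob arrays and the doInference helper from section 三; it is not part of the original code.

```c++
// Warmup sketch (assumes context/stream/buffers/data/prob and doInference from section 三).
// Run a few dummy batches so that lazy initialization happens here, not in the first timed request.
for (int i = 0; i < 10; i++) {
    doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
}
cudaStreamSynchronize(stream);  // make sure all warmup work has finished before serving real requests
```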

一、create a model using the API directly and serialize it to a stream

// If a .wts weight-file name was provided, build the engine from it
    // create a model using the API directly and serialize it to a stream
    if (!wts_name.empty()) {
        IHostMemory* modelStream{ nullptr };
        // Class to handle library-allocated memory that is accessible to the user.
        // The memory allocated via the host memory object is owned by the library and will be de-allocated when the destroy method is called.
        APIToModel(BATCH_SIZE, &modelStream, is_p6, gd, gw, wts_name);
        // wts -> .engine
        assert(modelStream != nullptr);
        // modelStream -> serialized engine
        std::ofstream p(engine_name, std::ios::binary);
        // open the output file named "engine_name" in std::ios::binary mode
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        // APIToModel() / build_engine() fill "modelStream" with the serialized engine;
        // write it to the output file as the .engine file
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        return 0;
    }
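APIToModel is called above but not shown. In tensorrtx-style YOLOv5 code it typically creates a builder and a builder config, calls a build_engine helper that constructs the network from the .wts weights, and serializes the resulting engine into modelStream. The sketch below follows that pattern; build_engine / build_engine_p6 and gLogger are assumed to come from that code base and are not defined in this post.

```c++
// Sketch of APIToModel, assuming tensorrtx-style helpers build_engine / build_engine_p6 and a global gLogger.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream,
                bool& is_p6, float& gd, float& gw, std::string& wts_name) {
    // Create the builder and its configuration
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Build the network from the .wts weights and create an engine
    ICudaEngine* engine = nullptr;
    if (is_p6) {
        engine = build_engine_p6(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    } else {
        engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    }
    assert(engine != nullptr);

    // Serialize the engine into host memory; the caller writes it out as the .engine file
    (*modelStream) = engine->serialize();

    // Clean up
    engine->destroy();
    builder->destroy();
    config->destroy();
}
```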

二、deserialize the .engine and prepare the data

// deserialize the .engine and run inference
    std::ifstream file(engine_name, std::ios::binary);
    // use std::ifstream to open a file for input (reading only);
    // use std::ofstream to open a file for output (writing only);
    // use std::fstream to open a file for both input and output.
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        return -1;
    }
    // define trtModelStream
    char *trtModelStream = nullptr;
    size_t size = 0;
    // get the size of the engine file
    file.seekg(0, file.end);
    size = file.tellg();
    // go back to the beginning of the engine file
    file.seekg(0, file.beg);
    // allocate a char buffer of the engine file's size
    trtModelStream = new char[size];
    assert(trtModelStream);
    // read the file's content into trtModelStream
    file.read(trtModelStream, size);
    file.close();

    // file_names ['A','B',......]
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

The read_files_in_dir function

// static inline: small helper, hinted for inlining to avoid call overhead
// const parameter: the function never modifies p_dir_name; marking it const documents that and gives the compiler more room to optimize
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR *p_dir = opendir(p_dir_name);
    if (p_dir == nullptr) {
        return -1;
    }

    struct dirent* p_file = nullptr;
    while ((p_file = readdir(p_dir)) != nullptr) {
        if (strcmp(p_file->d_name, ".") != 0 &&
            strcmp(p_file->d_name, "..") != 0) {
            //std::string cur_file_name(p_dir_name);
            //cur_file_name += "/";
            //cur_file_name += p_file->d_name;
            std::string cur_file_name(p_file->d_name);
            file_names.push_back(cur_file_name);
            // push the image file name into file_names
        }
    }

    closedir(p_dir);
    return 0;
}

三、run inference

// prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    assert(engine->getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    int fcount = 0;
    for (int f = 0; f < (int)file_names.size(); f++) {
        fcount++;
        if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
        for (int b = 0; b < fcount; b++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
            if (img.empty()) continue;
            cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox BGR to RGB
            // convert the letterboxed image from HWC BGR (uchar) to CHW RGB (float in [0,1])
            int i = 0;
            for (int row = 0; row < INPUT_H; ++row) {
                uchar* uc_pixel = pr_img.data + row * pr_img.step;
                for (int col = 0; col < INPUT_W; ++col) {
                    data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;                         // R plane
                    data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;     // G plane
                    data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0; // B plane
                    uc_pixel += 3;
                    ++i;
                }
            }
        }

        // Run inference
        auto start = std::chrono::system_clock::now();
        doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
        }
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            //std::cout << res.size() << std::endl;
            cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
            for (size_t j = 0; j < res.size(); j++) {
                cv::Rect r = get_rect(img, res[j].bbox);
                cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
                cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
            }
            cv::imwrite("_" + file_names[f - fcount + 1 + b], img);
        }
        fcount = 0;
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
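doInference is called above but not shown. In tensorrtx-style code it is a thin wrapper that copies the host input batch to the device input binding, enqueues execution on the stream, copies the output binding back to the host, and synchronizes. The sketch below follows that pattern and reuses the constants and two-binding layout from the code above.

```c++
// Sketch of doInference, assuming the 2-binding layout above (buffers[0] = input, buffers[1] = output).
void doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers,
                 float* input, float* output, int batchSize) {
    // Host -> device copy of the input batch
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input,
                               batchSize * 3 * INPUT_H * INPUT_W * sizeof(float),
                               cudaMemcpyHostToDevice, stream));
    // Enqueue inference on the stream
    context.enqueue(batchSize, buffers, stream, nullptr);
    // Device -> host copy of the raw detections
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1],
                               batchSize * OUTPUT_SIZE * sizeof(float),
                               cudaMemcpyDeviceToHost, stream));
    // Wait until the copies and the inference have finished
    cudaStreamSynchronize(stream);
}
```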

四、get image

Not written up yet.

First, install CUDA and TensorRT 8 and set up the environment variables. Then download the Yolov5 code and convert the model to the TensorRT 8 format; the conversion can be done with the TensorRT Python API.

Next, load the TensorRT 8 model in C++ and send it to the GPU for inference, using the TensorRT C++ API. During inference, the model's pre-processing and post-processing can be implemented in CUDA. For pre-processing, a CUDA kernel can, for example, copy the image from CPU memory to GPU memory and normalize it. For post-processing, a CUDA kernel can, for example, copy the detection boxes from GPU memory to CPU memory and decode them.

Below is a simple pseudocode example showing how to run Yolov5 inference in C++ and where the CUDA pre-processing and post-processing fit in:

```c++
// Load the TensorRT 8 model
ICudaEngine* engine = loadTensorRTModel("yolov5.engine");

// Create the CUDA context and stream
cudaSetDevice(0);
cudaStream_t stream;
cudaStreamCreate(&stream);

// Allocate GPU memory
void* inputDeviceBuffer = nullptr;
void* outputDeviceBuffer = nullptr;
cudaMalloc(&inputDeviceBuffer, ...);
cudaMalloc(&outputDeviceBuffer, ...);

// Load the image into CPU memory and pre-process it
cv::Mat image = cv::imread("input.jpg");
cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
cv::resize(image, image, cv::Size(640, 640));
float* inputData = preprocessImage(image);

// Copy the image from CPU memory to GPU memory
cudaMemcpyAsync(inputDeviceBuffer, inputData, ...);

// Run inference
IExecutionContext* context = engine->createExecutionContext();
context->enqueueV2(...);

// Copy the detections from GPU memory to CPU memory and decode them
float* outputData = new float[...];
cudaMemcpyAsync(outputData, outputDeviceBuffer, ...);
decodeOutput(outputData);

// Clean up
cudaFree(inputDeviceBuffer);
cudaFree(outputDeviceBuffer);
cudaStreamDestroy(stream);
delete[] outputData;
```

In the code above, preprocessImage and decodeOutput implement the image pre-processing and the output decoding, respectively, using CUDA kernels.

Note that this is only a simplified example; a real implementation will be more involved. Also, running inference on multiple GPUs additionally requires CUDA's multi-GPU APIs.
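To actually get an image out of GPU memory for custom, task-specific post-processing, the device buffer has to be copied back to the host and repacked into a cv::Mat. The sketch below assumes a CHW RGB float image in [0,1] (for example the pre-processed network input, or the result of a CUDA post-processing kernel); the name d_image and the width/height parameters are placeholders for this example, not part of the code above.

```c++
#include <vector>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>

// Sketch: copy a CHW RGB float image from device memory back to the host and
// repack it into an HWC BGR 8-bit cv::Mat for custom post-processing.
// d_image, width, height and the CHW [0,1] float layout are assumptions.
cv::Mat imageFromDevice(const float* d_image, int width, int height, cudaStream_t stream) {
    std::vector<float> host(3 * width * height);
    cudaMemcpyAsync(host.data(), d_image, host.size() * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);  // the copy must finish before the host data is read

    cv::Mat img(height, width, CV_8UC3);
    for (int row = 0; row < height; ++row) {
        for (int col = 0; col < width; ++col) {
            int idx = row * width + col;
            // CHW RGB float in [0,1] -> HWC BGR uchar (the layout cv::imwrite expects)
            img.at<cv::Vec3b>(row, col) = cv::Vec3b(
                (uchar)(host[idx + 2 * width * height] * 255.0f),  // B
                (uchar)(host[idx + width * height] * 255.0f),      // G
                (uchar)(host[idx] * 255.0f));                      // R
        }
    }
    return img;
}
```

If the task-specific post-processing itself runs as a CUDA kernel, it can write into the same device buffer before this copy, so only the finished image crosses back to the host.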