TensorRT Deployment of YOLOv5 -- Dynamic Batch

For the full from-scratch tutorial on TensorRT-based YOLO deployment, see http://t.csdn.cn/HUn4T.

This article builds on that example and walks through it in detail from the multi-batch angle.

1. Multi-batch settings when building the TensorRT engine

Making full use of multi-batch inference can greatly speed up detection. For example, when running object detection on several video streams, we can grab a frame from each stream and send the images through the network together in a single batch.
In the code that builds the TensorRT engine, we add the following:

// Set up an optimization profile -- this is specific to dynamic batch
auto profile = builder->createOptimizationProfile();
auto input_tensor = network->getInput(0);
auto input_dims = input_tensor->getDimensions();

// Configure the minimum (kMIN), optimal (kOPT) and maximum (kMAX) ranges -- here these refer to the batch size
input_dims.d[0] = 1;  // set both the minimum and the optimal batch to 1
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
// set the maximum batch according to your own requirements
input_dims.d[0] = maxBatchSize;
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
// add this profile to the builder config
config->addOptimizationProfile(profile);
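
Note that optimization profiles only apply to a network created with the explicit-batch flag, and the batch dimension of the ONNX export must be dynamic (-1) for the profile to have any effect. As a minimal sketch of the network creation this snippet assumes (the full listing below passes the raw value 1, which sets the same bit):

// dynamic shapes require an explicit-batch network
auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
auto network = builder->createNetworkV2(explicitBatch);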

2. Inference-time code

At inference time, the hard part of multi-batch is allocating host and device memory and mapping each input image onto its designated slice of those buffers.
Setting the batch size:

int input_batch = 2;  // the batch size used for this inference
int input_channel = 3;
int input_height = 640;
int input_width = 640;
int input_numel = input_batch * input_channel * input_height * input_width;

float* input_data_host = nullptr;
float* input_data_device = nullptr;
// allocate matching buffers in pinned host memory and on the GPU
checkRuntime(cudaMallocHost(&input_data_host, input_numel * sizeof(float)));
checkRuntime(cudaMalloc(&input_data_device, input_numel * sizeof(float)));
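
One step that is easy to forget with a dynamic-batch engine: before enqueueing, the execution context must be told the concrete input shape for this run, otherwise enqueueV2 will fail. A short sketch, assuming the engine and execution_context objects created in the full listing below:

// replace the dynamic batch dimension (-1) with the actual batch for this inference
auto runtime_dims = engine->getBindingDimensions(0);   // e.g. [-1, 3, 640, 640]
runtime_dims.d[0] = input_batch;
execution_context->setBindingDimensions(0, runtime_dims);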

Copying the image data into the host buffer (the copy to device memory happens in the inference step below):

// grab one frame from each video stream
cap >> image;
cap1 >> image1;
// warpAffine performs the letterbox resize
cv::warpAffine(image,input_image,m2x3_i2d,input_image.size(),cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114)); // invertible translation + scaling of the image
cv::warpAffine(image1,input_image1,m2x3_i2d1,input_image1.size(),cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));


int image_area = input_image.cols * input_image.rows;
int image_area1 = input_image1.cols * input_image1.rows;
// pointer to the pixel data of the first stream's frame
unsigned char *pimage = input_image.data;

float *phost_b = input_data_host + image_area * 0;
float *phost_g = input_data_host + image_area * 1;
float *phost_r = input_data_host + image_area * 2;
// copy the first frame into its slot of the input tensor
for (int i = 0; i < image_area; ++i, pimage += 3)
{
      // note: the channel order is swapped here (interleaved BGR -> planar RGB)
      *phost_r++ = pimage[0] / 255.0f;
      *phost_g++ = pimage[1] / 255.0f;
      *phost_b++ = pimage[2] / 255.0f;
}
// pointer to the pixel data of the second stream's frame
unsigned char *pimage1 = input_image1.data;
// once the first frame is in place, the second frame goes into the next slot, offset by one full image
auto input_data_host1 = input_data_host + image_area * 3;
float *phost_b1 = input_data_host1 + image_area1 * 0;
float *phost_g1 = input_data_host1 + image_area1 * 1;
float *phost_r1 = input_data_host1 + image_area1 * 2;
// copy the second frame into its slot of the input tensor
for (int i = 0; i < image_area1; ++i, pimage1 += 3)
{
      // note: the channel order is swapped here (interleaved BGR -> planar RGB)
      *phost_r1++ = pimage1[0] / 255.0f;
      *phost_g1++ = pimage1[1] / 255.0f;
      *phost_b1++ = pimage1[2] / 255.0f;
}
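
The two loops are identical apart from the destination offset, so for more than two streams it is cleaner to factor them into a helper. A sketch under the assumption that every frame has already been letterboxed to the same network input size (the helper name is ours, not from the original):

// copy one letterboxed BGR frame into the CHW slot for batch index b,
// converting interleaved BGR to planar RGB scaled to [0, 1]
static void bgr_to_planar_rgb(const cv::Mat& img, float* input_data_host, int b){
    int area = img.cols * img.rows;
    float* base    = input_data_host + (size_t)b * area * 3;
    float* plane_r = base + area * 0;
    float* plane_g = base + area * 1;
    float* plane_b = base + area * 2;
    const unsigned char* p = img.data;
    for(int i = 0; i < area; ++i, p += 3){
        *plane_r++ = p[2] / 255.0f;  // OpenCV stores BGR, so p[2] is R
        *plane_g++ = p[1] / 255.0f;
        *plane_b++ = p[0] / 255.0f;
    }
}
// usage: bgr_to_planar_rgb(input_image, input_data_host, 0);
//        bgr_to_planar_rgb(input_image1, input_data_host, 1);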

Allocating host and device buffers for the output:

int output_numel = input_batch * output_numbox * output_numprob;    // the output size scales with the batch size
float* output_data_host = nullptr;
float* output_data_device = nullptr;
checkRuntime(cudaMallocHost(&output_data_host, output_numel * sizeof(float)));
checkRuntime(cudaMalloc(&output_data_device, output_numel * sizeof(float)));
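
output_numbox and output_numprob come straight from the engine's output binding; for a standard 640x640 YOLOv5s export they are typically 25200 and 85 (4 box coordinates + 1 objectness + 80 class scores). The full listing reads them like this:

auto output_dims = engine->getBindingDimensions(1); // e.g. [-1, 25200, 85]
int output_numbox = output_dims.d[1];   // candidate boxes per image
int output_numprob = output_dims.d[2];  // 5 + num_classes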

Running inference:

checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream));
float *bindings[] = {input_data_device, output_data_device};
bool success = execution_context->enqueueV2((void **)bindings, stream, nullptr);
checkRuntime(cudaMemcpyAsync(output_data_host, output_data_device, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream));
checkRuntime(cudaStreamSynchronize(stream));
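
Because all three calls are issued on the same CUDA stream, they execute in order; the final cudaStreamSynchronize is what makes it safe for the CPU to read output_data_host.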

Handling the multi-batch network output:

float *ptr = output_data_host  + i_num * output_numbox * output_numprob + i * output_numprob;   // i_num indexes the batch dimension, i indexes the box
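
In other words, the output tensor is laid out as [batch, box, prob], so decoding image i_num just means offsetting the base pointer by i_num * output_numbox * output_numprob. A sketch of the resulting loop structure (the full listing below unrolls it for two streams):

for (int i_num = 0; i_num < input_batch; ++i_num)            // batch dimension
{
    for (int i = 0; i < output_numbox; ++i)                  // boxes of image i_num
    {
        float *ptr = output_data_host
                   + i_num * output_numbox * output_numprob  // this image's block
                   + i * output_numprob;                     // this box's row
        // decode ptr[0..3] = cx, cy, w, h; ptr[4] = objectness; ptr[5..] = class scores
    }
}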

The complete dynamic-batch code


// tensorRT include
// headers needed to build the engine
#include <NvInfer.h>

// ONNX parser header
//#include <onnx-tensorrt/NvOnnxParser.h>
#include <NvOnnxParser.h>
// runtime header used for inference
#include <NvInferRuntime.h>

// cuda include
#include <cuda_runtime.h>

// system include
#include <stdio.h>
#include <math.h>

#include <iostream>
#include <fstream>
#include <vector>
#include <memory>
#include <functional>
#include <unistd.h>

#include <opencv2/opencv.hpp>


using namespace std;

#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    if(code != cudaSuccess){
        const char* err_name = cudaGetErrorName(code);
        const char* err_message = cudaGetErrorString(code);
        printf("runtime error %s:%d  %s failed. \n  code = %s, message = %s\n", file, line, op, err_name, err_message);
        return false;
    }
    return true;
}

inline const char* severity_string(nvinfer1::ILogger::Severity t){
    switch(t){
    case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
    case nvinfer1::ILogger::Severity::kERROR:   return "error";
    case nvinfer1::ILogger::Severity::kWARNING: return "warning";
    case nvinfer1::ILogger::Severity::kINFO:    return "info";
    case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
    default: return "unknow";
    }
}

// labels of the COCO dataset; about COCO: https://cocodataset.org/#home
static const char* cocolabels[] = {
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
    "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse",
    "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis",
    "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass",
    "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
    "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv",
    "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
    "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
    "scissors", "teddy bear", "hair drier", "toothbrush"
};

// HSV to BGR conversion
static std::tuple<uint8_t, uint8_t, uint8_t> hsv2bgr(float h, float s, float v){
    const int h_i = static_cast<int>(h * 6);
    const float f = h * 6 - h_i;
    const float p = v * (1 - s);
    const float q = v * (1 - f*s);
    const float t = v * (1 - (1 - f) * s);
    float r, g, b;
    switch (h_i) {
    case 0:r = v; g = t; b = p;break;
    case 1:r = q; g = v; b = p;break;
    case 2:r = p; g = v; b = t;break;
    case 3:r = p; g = q; b = v;break;
    case 4:r = t; g = p; b = v;break;
    case 5:r = v; g = p; b = q;break;
    default:r = 1; g = 1; b = 1;break;}
    return make_tuple(static_cast<uint8_t>(b * 255), static_cast<uint8_t>(g * 255), static_cast<uint8_t>(r * 255));
}

static std::tuple<uint8_t, uint8_t, uint8_t> random_color(int id){
    float h_plane = ((((unsigned int)id << 2) ^ 0x937151) % 100) / 100.0f;
    float s_plane = ((((unsigned int)id << 3) ^ 0x315793) % 100) / 100.0f;
    return hsv2bgr(h_plane, s_plane, 1);
}

class TRTLogger : public nvinfer1::ILogger{
public:
    virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override{
        if(severity <= Severity::kWARNING){
            // Print colored text with ANSI escape codes, for example:
            //   printf("\033[47;33mtext\033[0m");
            // \033[ starts the escape sequence, 47 is the background color,
            // ; is the separator, 33 is the foreground color, and m ends
            // the sequence; \033[0m resets the style. The background or
            // foreground color code may be omitted.
            // Some color codes: https://blog.csdn.net/ericbar/article/details/79652086
            if(severity == Severity::kWARNING){
                printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
            }
            else if(severity <= Severity::kERROR){
                printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
            }
            else{
                printf("%s: %s\n", severity_string(severity), msg);
            }
        }
    }
} logger;

// Manage the raw pointers returned by TensorRT through shared_ptr,
// so the memory is released automatically and cannot leak
template<typename _T>
shared_ptr<_T> make_nvshared(_T* ptr){
    return shared_ptr<_T>(ptr, [](_T* p){p->destroy();});
}

bool exists(const string& path){

#ifdef _WIN32
    return ::PathFileExistsA(path.c_str());
#else
    return access(path.c_str(), R_OK) == 0;
#endif
}

typedef std::function<void(
        int current, int count, const std::vector<std::string>& files,
        nvinfer1::Dims dims, float* ptensor
        )> Int8Process;

class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator2
{
public:
    Int8EntropyCalibrator(const vector<string>& imagefiles, nvinfer1::Dims dims, const Int8Process& preprocess) {
        assert(preprocess != nullptr);
        this->dims_ = dims;
        this->allimgs_ = imagefiles;
        this->preprocess_ = preprocess;
        this->fromCalibratorData_ = false;

        files_.resize(dims.d[0]);
    }

    // This constructor loads the calibration result from cached data,
    // so the calibration images do not need to be read and preprocessed again
    Int8EntropyCalibrator(const vector<uint8_t>& entropyCalibratorData, nvinfer1::Dims dims, const Int8Process& preprocess) {

        assert(preprocess != nullptr);
        this->dims_ = dims;
        this->entropyCalibratorData_ = entropyCalibratorData;
        this->preprocess_ = preprocess;
        this->fromCalibratorData_ = true;
        files_.resize(dims.d[0]);
    }

    virtual ~Int8EntropyCalibrator(){
        if(tensor_host_ != nullptr){
            checkRuntime(cudaFreeHost(tensor_host_));
            checkRuntime(cudaFree(tensor_device_));
            tensor_host_ = nullptr;
            tensor_device_ = nullptr;
        }
    }

    // the batch size to use during calibration
    int getBatchSize() const noexcept {
        return dims_.d[0];
    }

    bool next() {
        int batch_size = dims_.d[0];
        if (cursor_ + batch_size > allimgs_.size())
            return false;

        for(int i = 0; i < batch_size; ++i)
            files_[i] = allimgs_[cursor_++];

        if(tensor_host_ == nullptr){
            size_t volume = 1;
            for(int i = 0; i < dims_.nbDims; ++i)
                volume *= dims_.d[i];

            bytes_ = volume * sizeof(float);
            checkRuntime(cudaMallocHost(&tensor_host_, bytes_));
            checkRuntime(cudaMalloc(&tensor_device_, bytes_));
        }

        preprocess_(cursor_, allimgs_.size(), files_, dims_, tensor_host_);
        checkRuntime(cudaMemcpy(tensor_device_, tensor_host_, bytes_, cudaMemcpyHostToDevice));
        return true;
    }

    bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept {
        if (!next()) return false;
        bindings[0] = tensor_device_;
        return true;
    }

    const vector<uint8_t>& getEntropyCalibratorData() {
        return entropyCalibratorData_;
    }

    const void* readCalibrationCache(size_t& length) noexcept {
        if (fromCalibratorData_) {
            length = this->entropyCalibratorData_.size();
            return this->entropyCalibratorData_.data();
        }

        length = 0;
        return nullptr;
    }

    virtual void writeCalibrationCache(const void* cache, size_t length) noexcept {
        entropyCalibratorData_.assign((uint8_t*)cache, (uint8_t*)cache + length);
    }

private:
    Int8Process preprocess_;
    vector<string> allimgs_;
    size_t batchCudaSize_ = 0;
    int cursor_ = 0;
    size_t bytes_ = 0;
    nvinfer1::Dims dims_;
    vector<string> files_;
    float* tensor_host_ = nullptr;
    float* tensor_device_ = nullptr;
    vector<uint8_t> entropyCalibratorData_;
    bool fromCalibratorData_ = false;
};


// build code carried over from the base tutorial
bool build_model(){

    if(exists("yolov5s.trtmodel")){
        printf("yolov5s.trtmodel has exists.\n");
        return true;
    }

    // create the logger
    TRTLogger logger;

    // These are the basic components required.
    // The Builder is the entry point for model building: both the network's internal
    // TensorRT representation and the executable engine are produced by its member functions.
    auto builder = make_nvshared(nvinfer1::createInferBuilder(logger));
    // The BuilderConfig sets build parameters such as FP16 or INT8 mode; it works on top of the Builder.
    auto config = make_nvshared(builder->createBuilderConfig());
    // The Network is the computation graph, the most central module.
    // The argument 1 sets bit 0, i.e. NetworkDefinitionCreationFlag::kEXPLICIT_BATCH,
    // which dynamic shapes require.
    auto network = make_nvshared(builder->createNetworkV2(1));
    auto parser = make_nvshared(nvonnxparser::createParser(*network,logger));
    if(!parser->parseFromFile("yolov5s.onnx",1))   // the parser populates the network from the ONNX file; the second argument is the log verbosity level
    {
        printf("Failed to parse yolov5s.onnx\n");

        // note: several pointers have not been released on this path, so it leaks memory; a cleaner fix is left for later
        return false;
    }

    // multi-batch inference
    int maxBatchSize = 10;
    config->setMaxWorkspaceSize(1<<28);

    // if the model has multiple inputs, every input must be configured in the profile
    auto profile = builder->createOptimizationProfile();
    auto input_tensor = network->getInput(0);
    auto input_dims = input_tensor->getDimensions();


    config->setFlag(nvinfer1::BuilderFlag::kFP16);

    // configure the minimum, optimal and maximum batch range
    input_dims.d[0] = 1;
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
    input_dims.d[0] = maxBatchSize;
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
    config->addOptimizationProfile(profile);

    auto engine = make_nvshared(builder->buildEngineWithConfig(*network, *config));
    if (engine == nullptr)
    {
        printf("Build engine failed.\n");
        return false;
    }

    // serialize the engine and save it to a file
    auto model_data = make_nvshared(engine->serialize());
    FILE *f = fopen("yolov5s.trtmodel", "wb");
    fwrite(model_data->data(), 1, model_data->size(), f);
    fclose(f);


    return true;
}

///

vector<unsigned char> load_file(const string& file){
    ifstream in(file, ios::in | ios::binary);
    if (!in.is_open())
        return {};

    in.seekg(0, ios::end);
    size_t length = in.tellg();

    std::vector<uint8_t> data;
    if (length > 0){
        in.seekg(0, ios::beg);
        data.resize(length);

        in.read((char*)&data[0], length);
    }
    in.close();
    return data;
}

void inference(){
    TRTLogger logger;
    auto engine_data = load_file("yolov5s.trtmodel");
    auto runtime = make_nvshared(nvinfer1::createInferRuntime(logger));
    // deserialize the engine
    auto engine = make_nvshared(runtime->deserializeCudaEngine(engine_data.data(),engine_data.size()));
    if(engine == nullptr)
    {
        printf("Deserialize cuda engine failed.\n");
        //        runtime->destroy();
        return;
    }
    if(engine->getNbBindings()!=2)
    {
        printf("Your ONNX export is wrong: it must have exactly 1 input and 1 output, but this engine has %d outputs.\n", engine->getNbBindings() - 1);
        return;
    }
    cudaStream_t stream = nullptr;
    checkRuntime(cudaStreamCreate(&stream));
    auto execution_context = make_nvshared(engine->createExecutionContext());
    int input_batch = 2;
    int input_channel = 3;
    int input_height = 640;
    int input_width = 640;
    int input_numel = input_batch * input_channel * input_height * input_width;

    float* input_data_host = nullptr;
    float* input_data_device = nullptr;
    // allocate matching buffers in pinned host memory and on the GPU
    checkRuntime(cudaMallocHost(&input_data_host,input_numel * sizeof(float)));
    checkRuntime(cudaMalloc(&input_data_device,input_numel * sizeof(float)));

    auto output_dims = engine->getBindingDimensions(1);
    int output_numbox = output_dims.d[1];   // number of candidate boxes (25200 for a 640x640 YOLOv5 input)
    int output_numprob = output_dims.d[2];  // 5 + num_classes (85 for COCO)
    int num_classes = output_numprob - 5;
    int output_numel = input_batch * output_numbox * output_numprob;
    float* output_data_host = nullptr;
    float* output_data_device = nullptr;
    checkRuntime(cudaMallocHost(&output_data_host, output_numel * sizeof(float)));
    checkRuntime(cudaMalloc(&output_data_device, output_numel * sizeof(float)));


    // specify the concrete input size used for the current inference
    auto input_dims = engine->getBindingDimensions(0);
    input_dims.d[0] = input_batch;

    execution_context->setBindingDimensions(0, input_dims);

    cv::VideoCapture cap("/media/***/晴.mp4");
    cv::VideoCapture cap1("/media/***/极弯场景.mp4");
    cv::Mat image;
    cv::Mat image1;
    cap>>image;
    cap1>>image1;
    // letterbox resize via bilinear interpolation: first compute the scale
    float scale_x = input_width / (float)image.cols;
    float scale_y = input_height / (float)image.rows;
    float scale = std::min(scale_x,scale_y);
    float i2d[6],d2i[6];
    // resize so that the geometric centers of the source and destination images align
    i2d[0] = scale;
    i2d[1] = 0;
    i2d[2] = (-scale * image.cols + input_width + scale - 1) * 0.5;
    i2d[3] = 0;
    i2d[4] = scale;
    i2d[5] = (-scale * image.rows + input_height + scale - 1) * 0.5;

    cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);           // image to dst(network), 2x3 matrix
    cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);           // dst to image, 2x3 matrix
    cv::invertAffineTransform(m2x3_i2d, m2x3_d2i); // compute the inverse affine transform
    cv::Mat input_image(input_height, input_width, CV_8UC3);
    cv::Mat input_image1(input_height, input_width, CV_8UC3);

    float scale_x1 = input_width / (float)image1.cols;
    float scale_y1 = input_height / (float)image1.rows;
    float scale1 = std::min(scale_x1,scale_y1);
    float i2d1[6],d2i1[6];
    // resize so that the geometric centers of the source and destination images align
    i2d1[0] = scale1;
    i2d1[1] = 0;
    i2d1[2] = (-scale1 * image1.cols + input_width + scale1 - 1) * 0.5;
    i2d1[3] = 0;
    i2d1[4] = scale1;
    i2d1[5] = (-scale1 * image1.rows + input_height + scale1 - 1) * 0.5;

    cv::Mat m2x3_i2d1(2, 3, CV_32F, i2d1);           // image to dst(network), 2x3 matrix
    cv::Mat m2x3_d2i1(2, 3, CV_32F, d2i1);           // dst to image, 2x3 matrix
    cv::invertAffineTransform(m2x3_i2d1, m2x3_d2i1); // compute the inverse affine transform

    while(1)
    {
        clock_t startTime, endTime;
        startTime = clock();
        cap >> image;
        cap1 >> image1;
        cv::warpAffine(image,input_image,m2x3_i2d,input_image.size(),cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114)); // letterbox: invertible translation + scaling
        cv::warpAffine(image1,input_image1,m2x3_i2d1,input_image1.size(),cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));


        int image_area = input_image.cols * input_image.rows;
        int image_area1 = input_image1.cols * input_image1.rows;

        unsigned char *pimage = input_image.data;


        float *phost_b = input_data_host + image_area * 0;
        float *phost_g = input_data_host + image_area * 1;
        float *phost_r = input_data_host + image_area * 2;
        for (int i = 0; i < image_area; ++i, pimage += 3)
        {
            // note: the channel order is swapped here (interleaved BGR -> planar RGB)
            *phost_r++ = pimage[0] / 255.0f;
            *phost_g++ = pimage[1] / 255.0f;
            *phost_b++ = pimage[2] / 255.0f;
        }

        unsigned char *pimage1 = input_image1.data;
        auto input_data_host1 = input_data_host + image_area * 3;
        float *phost_b1 = input_data_host1 + image_area1 * 0;
        float *phost_g1 = input_data_host1 + image_area1 * 1;
        float *phost_r1 = input_data_host1 + image_area1 * 2;
        for (int i = 0; i < image_area1; ++i, pimage1 += 3)
        {
            // note: the channel order is swapped here (interleaved BGR -> planar RGB)
            *phost_r1++ = pimage1[0] / 255.0f;
            *phost_g1++ = pimage1[1] / 255.0f;
            *phost_b1++ = pimage1[2] / 255.0f;
        }
        ///
        std::cout << "预处理时间: " << (double)(clock() - startTime) / CLOCKS_PER_SEC << "s" << std::endl;

        auto startTime1 = clock();
        checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream));
        float *bindings[] = {input_data_device, output_data_device};
        bool success = execution_context->enqueueV2((void **)bindings, stream, nullptr);
        checkRuntime(cudaMemcpyAsync(output_data_host, output_data_device, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream));
        checkRuntime(cudaStreamSynchronize(stream));
        std::cout << "推理时间: " << (double)(clock() - startTime1) / CLOCKS_PER_SEC << "s" << std::endl;
        startTime1 = clock();
        // decode boxes: map the predictions back onto the original input image (box coordinates, class probabilities, confidence)
        vector<vector<float>> bboxes;
        float confidence_threshold = 0.25;
        float nms_threshold = 0.5;

        for (int i = 0; i < output_numbox; ++i)
        {
            // batch index 0: boxes for the first stream's frame
            float *ptr = output_data_host  + i * output_numprob;
            float objness = ptr[4];
            if (objness < confidence_threshold)
                continue;

            float *pclass = ptr + 5;
            int label = std::max_element(pclass, pclass + num_classes) - pclass;
            float prob = pclass[label];
            float confidence = prob * objness;
            if (confidence < confidence_threshold)
                continue;

            // center point, width, height
            float cx = ptr[0];
            float cy = ptr[1];
            float width = ptr[2];
            float height = ptr[3];

            // convert center/size to corner coordinates
            float left = cx - width * 0.5;
            float top = cy - height * 0.5;
            float right = cx + width * 0.5;
            float bottom = cy + height * 0.5;

            // map back onto the original image with the inverse transform of stream 0
            float image_base_left = d2i[0] * left + d2i[2];
            float image_base_right = d2i[0] * right + d2i[2];
            float image_base_top = d2i[0] * top + d2i[5];
            float image_base_bottom = d2i[0] * bottom + d2i[5];
            bboxes.push_back({image_base_left, image_base_top, image_base_right, image_base_bottom, (float)label, confidence});
        }
        //        printf("decoded bboxes.size = %d\n", bboxes.size());
        // NMS (non-maximum suppression)
        std::sort(bboxes.begin(), bboxes.end(), [](vector<float> &a, vector<float> &b)
        { return a[5] > b[5]; });
        std::vector<bool> remove_flags1(bboxes.size());
        std::vector<vector<float>> box_result1;
        box_result1.reserve(bboxes.size());

        auto iou1 = [](const vector<float> &a, const vector<float> &b)
        {
            float cross_left = std::max(a[0], b[0]);
            float cross_top = std::max(a[1], b[1]);
            float cross_right = std::min(a[2], b[2]);
            float cross_bottom = std::min(a[3], b[3]);

            float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
            float union_area = std::max(0.0f, a[2] - a[0]) * std::max(0.0f, a[3] - a[1]) + std::max(0.0f, b[2] - b[0]) * std::max(0.0f, b[3] - b[1]) - cross_area;
            if (cross_area == 0 || union_area == 0)
                return 0.0f;
            return cross_area / union_area;
        };

        for (int i = 0; i < bboxes.size(); ++i)
        {
            if (remove_flags1[i])
                continue;

            auto &ibox = bboxes[i];
            box_result1.emplace_back(ibox);
            for (int j = i + 1; j < bboxes.size(); ++j)
            {
                if (remove_flags1[j])
                    continue;

                auto &jbox = bboxes[j];
                if (ibox[4] == jbox[4])
                {
                    // class matched
                    if (iou1(ibox, jbox) >= nms_threshold)
                        remove_flags1[j] = true;
                }
            }
        }
        //        printf("box_result.size = %d\n", box_result.size());

        for (int i = 0; i < box_result1.size(); ++i)
        {
            auto &ibox = box_result1[i];
            float left = ibox[0];
            float top = ibox[1];
            float right = ibox[2];
            float bottom = ibox[3];
            int class_label = ibox[4];
            float confidence = ibox[5];
            cv::Scalar color;
            tie(color[0], color[1], color[2]) = random_color(class_label);
            cv::rectangle(image, cv::Point(left, top), cv::Point(right, bottom), color, 3);

            auto name = cocolabels[class_label];
            auto caption = cv::format("%s %.2f", name, confidence);
            int text_width = cv::getTextSize(caption, 0, 1, 2, nullptr).width + 10;
            cv::rectangle(image, cv::Point(left - 3, top - 33), cv::Point(left + text_width, top), color, -1);
            cv::putText(image, caption, cv::Point(left, top - 5), 0, 1, cv::Scalar::all(0), 2, 16);
        }
        cv::namedWindow("图像坐标系0",0);
        cv::imshow("图像坐标系0", image);
        cv::waitKey(1);
        bboxes.clear();
        for (int i = 0; i < output_numbox; ++i)
        {
            // batch index 1: offset by 1 * output_numbox * output_numprob to reach the second image's block
            float *ptr = output_data_host  + 1 * output_numbox * output_numprob + i * output_numprob;
            float objness = ptr[4];
            if (objness < confidence_threshold)
                continue;

            float *pclass = ptr + 5;
            int label = std::max_element(pclass, pclass + num_classes) - pclass;
            float prob = pclass[label];
            float confidence = prob * objness;
            if (confidence < confidence_threshold)
                continue;

            // center point, width, height
            float cx = ptr[0];
            float cy = ptr[1];
            float width = ptr[2];
            float height = ptr[3];

            // convert center/size to corner coordinates
            float left = cx - width * 0.5;
            float top = cy - height * 0.5;
            float right = cx + width * 0.5;
            float bottom = cy + height * 0.5;

            // map back onto the original image with the inverse transform of stream 1
            float image_base_left = d2i1[0] * left + d2i1[2];
            float image_base_right = d2i1[0] * right + d2i1[2];
            float image_base_top = d2i1[0] * top + d2i1[5];
            float image_base_bottom = d2i1[0] * bottom + d2i1[5];
            bboxes.push_back({image_base_left, image_base_top, image_base_right, image_base_bottom, (float)label, confidence});
        }
        //        printf("decoded bboxes.size = %d\n", bboxes.size());

        // NMS (non-maximum suppression)
        std::sort(bboxes.begin(), bboxes.end(), [](vector<float> &a, vector<float> &b)
        { return a[5] > b[5]; });
        std::vector<bool> remove_flags(bboxes.size());
        std::vector<vector<float>> box_result;
        box_result.reserve(bboxes.size());

        auto iou = [](const vector<float> &a, const vector<float> &b)
        {
            float cross_left = std::max(a[0], b[0]);
            float cross_top = std::max(a[1], b[1]);
            float cross_right = std::min(a[2], b[2]);
            float cross_bottom = std::min(a[3], b[3]);

            float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
            float union_area = std::max(0.0f, a[2] - a[0]) * std::max(0.0f, a[3] - a[1]) + std::max(0.0f, b[2] - b[0]) * std::max(0.0f, b[3] - b[1]) - cross_area;
            if (cross_area == 0 || union_area == 0)
                return 0.0f;
            return cross_area / union_area;
        };

        for (int i = 0; i < bboxes.size(); ++i)
        {
            if (remove_flags[i])
                continue;

            auto &ibox = bboxes[i];
            box_result.emplace_back(ibox);
            for (int j = i + 1; j < bboxes.size(); ++j)
            {
                if (remove_flags[j])
                    continue;

                auto &jbox = bboxes[j];
                if (ibox[4] == jbox[4])
                {
                    // class matched
                    if (iou(ibox, jbox) >= nms_threshold)
                        remove_flags[j] = true;
                }
            }
        }
        //        printf("box_result.size = %d\n", box_result.size());

        for (int i = 0; i < box_result.size(); ++i)
        {
            auto &ibox = box_result[i];
            float left = ibox[0];
            float top = ibox[1];
            float right = ibox[2];
            float bottom = ibox[3];
            int class_label = ibox[4];
            float confidence = ibox[5];
            cv::Scalar color;
            tie(color[0], color[1], color[2]) = random_color(class_label);
            cv::rectangle(image1, cv::Point(left, top), cv::Point(right, bottom), color, 3);

            auto name = cocolabels[class_label];
            auto caption = cv::format("%s %.2f", name, confidence);
            int text_width = cv::getTextSize(caption, 0, 1, 2, nullptr).width + 10;
            cv::rectangle(image1, cv::Point(left - 3, top - 33), cv::Point(left + text_width, top), color, -1);
            cv::putText(image1, caption, cv::Point(left, top - 5), 0, 1, cv::Scalar::all(0), 2, 16);
        }
        cv::namedWindow("图像坐标系1",0);
        cv::imshow("图像坐标系1", image1);
        cv::waitKey(1);
        std::cout << "后处理时间: " << (double)(clock() - startTime1) / CLOCKS_PER_SEC << "s" << std::endl;
        endTime = clock(); //计时结束
        std::cout << "预处理+推理+后处理的总时间: " << (double)(endTime - startTime) / CLOCKS_PER_SEC << "s" << std::endl;
    }
    // cv::imwrite("image-draw.jpg", image);
    checkRuntime(cudaStreamDestroy(stream));
    checkRuntime(cudaFreeHost(input_data_host));
    checkRuntime(cudaFreeHost(output_data_host));
    checkRuntime(cudaFree(input_data_device));
    checkRuntime(cudaFree(output_data_device));




}

int main(){
    if(!build_model()){
        return -1;
    }
    inference();
    return 0;
}

