YOLO v5 python版本TensorRT推理

kui9702

已于 2022-11-26 13:19:04 修改

阅读量1.7k

点赞数 1

分类专栏： # 目标检测

于 2022-05-10 21:09:14 首次发布

本文链接：https://blog.csdn.net/kui9702/article/details/124697556

版权

YOLOv5 TensorRT 推理速度 C++ NMS

关键词由CSDN通过智能技术生成

目标检测专栏收录该内容

15 篇文章

订阅专栏

该代码展示了如何使用TensorRT进行YOLOv5模型的推理，并对比Python实现，展示了C++版本的TensorRT推理速度更快的优势。代码包括加载预训练的TensorRT引擎，对输入图像进行预处理，执行推理，以及后处理步骤如NMS。通过letterbox函数调整图像尺寸，nms函数进行非极大值抑制，以减少重复的检测框。实验结果显示，C++版本的TensorRT推理时间大约是Python实现的一半。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

YOLO v5 TensorRT推理

#include "iostream"
#include "NvInfer.h"
#include "opencv2/opencv.hpp"

#include <assert.h>
#include <math.h>

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>


// Number of coordinates stored per bounding box (size of the bbox arrays below).
#define LOCATIONS 4
// Upper bound on the number of candidates nms() forwards to the suppression loop.
#define MAX_OUTPUT_BBOX_COUNT 30000
// Upper bound on the number of detections nms() appends to its result vector.
#define MAX_DET 300
// Floats per candidate in the flat list built by nms(): 4 box values, confidence, class id.
#define PERSIZE 6
// Network input width/height. Not referenced by active code in this file
// (main() hard-codes 416x416); only commented-out code mentions them.
#define INPUT_W 416
#define INPUT_H 416

// One raw row of the network output. nms() derives the row stride from
// sizeof(Detection) / sizeof(float), so the member count and order must match
// the engine's output layout exactly.
struct Detection {
        // Box as center_x, center_y, w, h (converted to corners by xywh2xyxy in nms()).
        float bbox[LOCATIONS];
        // Box confidence: nms() thresholds on this value and multiplies it by each class score.
        float conf;
        // Score of class 0 as read by nms() (row offset 5).
        float class_air;
        // Score of class 1 as read by nms() (row offset 6).
        float class_oxy;
};

// One candidate / final detection produced by compareAndfilter() and nms().
// compareAndfilter() brace-initializes this struct positionally, so the member
// order must not change.
struct ResultDetection {
        // Box values copied from the flat candidate list; nms() fills that list with
        // corner coordinates (xmin, ymin, xmax, ymax), not center/size.
        float bbox[LOCATIONS];
        // Box confidence multiplied by the winning class score.
        float conf;
        // Index of the winning class (0 or 1 as produced by nms()).
        int class_id;
        // Set to 1 by nms() when a higher-confidence box overlaps this one too much.
        int suppressed;
};


// Logger handed to TensorRT; prints warnings and anything more severe to stdout.
class Logger : public nvinfer1::ILogger
{
    // Callback invoked by TensorRT for each diagnostic message.
    void log(Severity severity, const char* msg) noexcept override
    {
        // Messages ranked above kWARNING (e.g. info-level) are dropped.
        const bool tooVerbose = severity > Severity::kWARNING;
        if (tooVerbose)
            return;
        std::cout << msg << std::endl;
    }
} glogger;


// Returns the size in bytes of one element of the given TensorRT data type.
// Throws std::runtime_error for any type not listed here.
unsigned int getElementSize(nvinfer1::DataType t)
{
    using nvinfer1::DataType;

    if (t == DataType::kINT32 || t == DataType::kFLOAT)
        return 4;
    if (t == DataType::kHALF)
        return 2;
    if (t == DataType::kBOOL || t == DataType::kINT8)
        return 1;

    throw std::runtime_error("Invalid DataType.");
}

// Returns the number of elements described by `d` (product of its first
// nbDims extents), or 1 when nbDims is 0.
//
// The product is accumulated in a 64-bit variable. The previous version seeded
// std::accumulate with an `int` literal, so every partial product was narrowed
// back to `int` and large tensors could overflow.
// NOTE(review): a dynamic extent (reported as -1) yields a non-positive result;
// callers should check for that before using the value as an allocation size.
int64_t volume(const nvinfer1::Dims& d)
{
    int64_t count = 1;
    for (int i = 0; i < d.nbDims; ++i)
        count *= static_cast<int64_t>(d.d[i]);
    return count;
}


// Reads a serialized TensorRT engine from `file` (already opened in binary
// mode), closes the stream, and deserializes it.
//
// Returns a non-null engine owned by the caller.
// Throws std::runtime_error when the stream yields no data, the TensorRT
// runtime cannot be created, or deserialization fails. These checks replace
// an assert() that disappears under NDEBUG and let a null engine escape.
nvinfer1::ICudaEngine* loadEngine(std::fstream& file){
    // Slurp the whole stream in one pass.
    std::stringstream buffer;
    buffer << file.rdbuf();
    file.close();

    const std::string cacheEngine = buffer.str();
    if (cacheEngine.empty())
        throw std::runtime_error("loadEngine: engine file is empty or could not be read.");

    // The runtime is deliberately not destroyed here.
    // NOTE(review): TensorRT documentation asks that the runtime outlive the
    // engines it deserialized in some versions — confirm before adding cleanup.
    nvinfer1::IRuntime* trtRuntime = nvinfer1::createInferRuntime(glogger);
    if (trtRuntime == nullptr)
        throw std::runtime_error("loadEngine: failed to create the TensorRT runtime.");

    nvinfer1::ICudaEngine* engine =
        trtRuntime->deserializeCudaEngine(cacheEngine.data(), cacheEngine.size(), nullptr);
    if (engine == nullptr)
        throw std::runtime_error("loadEngine: failed to deserialize the TensorRT engine.");
    return engine;
}
/*
    Resizes `image` in place so it fits inside newShape while keeping its
    aspect ratio, then pads the remainder with a constant color.

    newShape = [width, height]
    color    = three channel values used for the padding
    aut      = when non-zero, only the padding modulo 32 is added
    returns the scale factor that was applied to the image
*/
float letterbox(cv::Mat& image, int* newShape, int* color, int aut=0){
    const int srcW = image.cols;
    const int srcH = image.rows;

    // Use the smaller of the two ratios so the whole image fits.
    const float ratioW = static_cast<float>(newShape[0]) / srcW;
    const float ratioH = static_cast<float>(newShape[1]) / srcH;
    const double r = std::min(ratioW, ratioH);

    // Size of the image after scaling, before any padding.
    const int scaledW = static_cast<int>(round(srcW * r));
    const int scaledH = static_cast<int>(round(srcH * r));

    // Total padding still needed along each axis.
    float dw = newShape[0] - scaledW;
    float dh = newShape[1] - scaledH;
    if(aut){
        dw = static_cast<int>(dw) % 32;
        dh = static_cast<int>(dh) % 32;
    }

    // Half of the padding goes on each side.
    dw /= 2;
    dh /= 2;

    cv::resize(image, image, cv::Size(scaledW, scaledH));

    // The -0.1 / +0.1 offsets decide which side gets the extra pixel when the
    // total padding is odd.
    const int top = static_cast<int>(round(dh - 0.1));
    const int bottom = static_cast<int>(round(dh + 0.1));
    const int left = static_cast<int>(round(dw - 0.1));
    const int right = static_cast<int>(round(dw + 0.1));

    const cv::Scalar fill(color[0], color[1], color[2]);
    cv::copyMakeBorder(image, image, top, bottom, left, right, cv::BORDER_CONSTANT, fill);
    return static_cast<float>(r);
}

// Converts a box given as (center_x, center_y, width, height) in `xywh` into
// corner form (x_min, y_min, x_max, y_max) written to `xyxy`.
void xywh2xyxy(float *xywh, float * xyxy){
    const float halfWidth = xywh[2] / 2;
    const float halfHeight = xywh[3] / 2;

    xyxy[0] = xywh[0] - halfWidth;
    xyxy[1] = xywh[1] - halfHeight;
    xyxy[2] = xywh[0] + halfWidth;
    xyxy[3] = xywh[1] + halfHeight;
}

// Intersection-over-union of two axis-aligned boxes.
//
// Both boxes use the layout the area terms of this function have always
// assumed: [left, right, top, bottom].
// NOTE(review): this differs from the (xmin, ymin, xmax, ymax) order used by
// nms() in this file; convert before calling.
//
// Returns 0 when the boxes do not overlap or when the union has no area.
//
// The previous body had the intersection computation commented out and used
// lbox in its place, so it returned area(lbox) / area(rbox) regardless of
// whether the boxes overlapped.
float iou(float lbox[4], float rbox[4]) {
    const float interLeft = (std::max)(lbox[0], rbox[0]);
    const float interRight = (std::min)(lbox[1], rbox[1]);
    const float interTop = (std::max)(lbox[2], rbox[2]);
    const float interBottom = (std::min)(lbox[3], rbox[3]);

    // Empty intersection along either axis.
    if (interTop > interBottom || interLeft > interRight)
        return 0.0f;

    const float interBoxS = (interRight - interLeft) * (interBottom - interTop);
    const float lArea = (lbox[1] - lbox[0]) * (lbox[3] - lbox[2]);
    const float rArea = (rbox[1] - rbox[0]) * (rbox[3] - rbox[2]);
    const float unionS = lArea + rArea - interBoxS;

    // Two degenerate (zero-area) boxes would otherwise produce 0/0.
    if (unionS <= 0.0f)
        return 0.0f;
    return interBoxS / unionS;
}

// Ordering predicate for sorting detections by confidence, highest first.
bool cmp(const ResultDetection& a, const ResultDetection& b) {
    const bool aRanksFirst = b.conf < a.conf;
    return aRanksFirst;
}


void compareAndfilter(std::vector<float>& oriList, std::vector<ResultDetection>& resultList, int max_size){
    // 获取原始长度
    int len = oriList.size() / PERSIZE;
    // 遍历  ===================================================================当前状态下很难会产生超过max_size的候选框，故此不做对置信度的排序 python代码-》  x = x[x[:, 4].argsort(descending=True)[:max_nms]]
    for(int j = 0; j < max_size; ++j){
        int max = 0;
        // for(int i = 0;i < len; ++i){
        for(int k = 1; k < len; ++k){
            if(oriList[k * PERSIZE + 4] > oriList[max * PERSIZE + 4]){
                max = k;
            }
        }
        std::cout << max << std::endl;
        ResultDetection temp = {oriList[max * PERSIZE + 0], oriList[max * PERSIZE + 1], oriList[max * PERSIZE + 2], oriList[max * PERSIZE + 3], oriList[max * PERSIZE + 4], (int)oriList[max * PERSIZE + 5], 0};
        resultList.emplace_back(temp);
        oriList[max  * PERSIZE + 4] = -1;
    }
}

void nms(std::vector<ResultDetection>& res, float *output, int outSize, float conf_thresh, float nms_thresh = 0.5){
    int det_size = sizeof(Detection) / sizeof(float);
    // 筛选出第一轮结果，根据conf > conf_thresh
    std::vector<int> oneList;
    for(int i = 0; i < outSize / det_size; ++i){
        if(output[det_size * i + 4] > conf_thresh){
            // 记录下所有置信度大于conf_thresh的标签
            oneList.emplace_back(i);
        }
    }
    if(oneList.size() == 0) return;
    // nms数据
    std::vector<float> nmsList;
    for(int i : oneList){
        int class_index;
        float zero_class_conf = output[i*det_size + 5] * output[i*det_size + 4];
        float first_class_conf = output[i*det_size + 6] * output[i*det_size + 4];
        float xywh[4] = {output[i*det_size + 0], output[i*det_size + 1], output[i*det_size + 2], output[i*det_size + 3]};
        float xyxy[4];
        xywh2xyxy(xywh, xyxy);
        if(zero_class_conf > first_class_conf){
            nmsList.emplace_back(xyxy[0]);
            nmsList.emplace_back(xyxy[1]);
            nmsList.emplace_back(xyxy[2]);
            nmsList.emplace_back(xyxy[3]);
            nmsList.emplace_back(zero_class_conf);
            nmsList.emplace_back(0);
        }else{
            nmsList.emplace_back(xyxy[0]);
            nmsList.emplace_back(xyxy[1]);
            nmsList.emplace_back(xyxy[2]);
            nmsList.emplace_back(xyxy[3]);
            nmsList.emplace_back(first_class_conf);
            nmsList.emplace_back(1);
        }
    }
    std::vector<ResultDetection> resultList;
    if(nmsList.size()==0) return;
    else if ((nmsList.size()/PERSIZE) > MAX_OUTPUT_BBOX_COUNT)
    {
        // 提取前MAX_OUTPUT_BBOX_COUNT个候选框
        compareAndfilter(nmsList, resultList, MAX_OUTPUT_BBOX_COUNT);
    }else{
        // resultList.assign(nmsList.begin(), nmsList.end());
        compareAndfilter(nmsList, resultList, nmsList.size() / PERSIZE);
    }
    
    // 简单的排序
    std::cout << resultList.size() << std::endl;
    for(int i = 0; i <  resultList.size(); i++){
        auto tempDet = resultList[i];
        if(tempDet.suppressed == 1) continue;
        auto ix1 = tempDet.bbox[0];
        auto iy1 = tempDet.bbox[1];
        auto ix2 = tempDet.bbox[2];
        auto iy2 = tempDet.bbox[3];
        auto iarea = (ix2 - ix1) * (iy2 - iy1);
        for(int j = i+1; j < resultList.size();  ++j){
            auto yTempDet = resultList[j];
            if(yTempDet.suppressed == 1) continue;
            auto xx1 = std::max(ix1, yTempDet.bbox[0]);
            auto yy1 = std::max(iy1, yTempDet.bbox[1]);
            auto xx2 = std::min(ix2, yTempDet.bbox[2]);
            auto yy2 = std::min(iy2, yTempDet.bbox[3]);

            auto w = std::max((float)0, xx2 - xx1);
            auto h = std::max((float)0, yy2 - yy1);

            auto inter = w * h;
            auto ovr = inter / (iarea + (yTempDet.bbox[2] - yTempDet.bbox[0])*(yTempDet.bbox[3]-yTempDet.bbox[1]) - inter);
            if(ovr > nms_thresh){
                resultList[j].suppressed = 1;
            }
        }
    }

    for(int i = 0; i < resultList.size() && res.size() < MAX_DET; ++i){
        ResultDetection a = resultList[i];
        if(!a.suppressed) res.emplace_back(a);
    }

}


// Limits a coordinate to the range [0, size]: negative values become 0 and
// anything not below `size` becomes `size`.
float getTrueCor(float cor, float size){
    if (cor < 0)
        return 0;
    return (cor < size) ? cor : size;
}


// Maps a box predicted on the letterboxed image back onto the original image.
//
// img             original image (only its size is read)
// bbox            xmin, ymin, xmax, ymax in letterboxed-image pixels
// img_letter_box  the padded image that was fed to the network (only its size is read)
// scale           resize factor that was applied before padding
// returns the box as a cv::Rect clamped to the original image bounds
cv::Rect get_rect(cv::Mat& img, float bbox[4], cv::Mat& img_letter_box, float scale) {   // bbox[4]  ->  xmin, ymin, xmax, ymax
    // Resize ratio implied by the two image sizes; used only to derive the padding.
    const float ratioW = (float)img_letter_box.cols / img.cols;
    const float ratioH = (float)img_letter_box.rows / img.rows;
    const float gain = std::min(ratioW, ratioH);

    // Padding added on one side of each axis.
    const int pad_cols = int(img_letter_box.cols - img.cols*gain) / 2;   // w
    const int pad_rows = int(img_letter_box.rows - img.rows*gain) / 2;   // h

    // Remove the padding, undo the scaling, clamp to the image and truncate to a whole pixel.
    const auto toSource = [scale](float coord, int pad, int limit) -> float {
        return (int)(getTrueCor((coord - pad) / scale, limit));
    };

    const float left = toSource(bbox[0], pad_cols, img.cols);
    const float right = toSource(bbox[2], pad_cols, img.cols);
    const float top = toSource(bbox[1], pad_rows, img.rows);
    const float bottom = toSource(bbox[3], pad_rows, img.rows);

    return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
}


int main(){
    std::string enginePath="/data/kile/202204/yolov5/log/2.engine";
    // 创建文件流
    std::fstream file(enginePath, std::ios_base::binary | std::ios_base::in);
    if(!file.is_open()){
       return 10001; // engine文件打开失败 
    }
    nvinfer1::ICudaEngine* engine = loadEngine(file);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    
    clock_t start, end;


    start = clock();
    void *buffers[2];
    std::vector<int64_t> bufferSize;
    int nBindings = engine->getNbBindings();
    bufferSize.resize(nBindings);
    for(int i = 0; i < nBindings; ++i){
        nvinfer1::Dims dims = engine->getBindingDimensions(i);
        nvinfer1::DataType dtype = engine->getBindingDataType(i);
        int64_t totalSize = volume(dims) * 1 * getElementSize(dtype);
        bufferSize[i] = totalSize;
        cudaMalloc(&buffers[i], totalSize);
    }
    int outSize = bufferSize[1] / sizeof(float);

    std::string imagePath = "/data/kile/202204/yolov5/result/aircraft_4.jpg";
    cv::Mat image = cv::imread(imagePath);
    cv::Mat img1 = image.clone();
    int size[2] = {416, 416};
    int color[3] = {114, 114, 114};
    float scale = letterbox(image, size, color);
    cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
    // image.convertTo(image, CV_32F, 255.0, 0);
    // time_t start,end;
    // start = clock();
    std::vector<float> a;
    if(image.isContinuous()){
        // a->assign(image.datastart, image.dataend);
        a.assign(image.datastart, image.dataend);
    }

    float image_content[image.channels()*image.rows*image.cols];
    for (int i = 0; i < image.rows*image.cols; i++)
    {   // (float)(a[i*3] / 255)
        image_content[i] = (float)(a[i*3] / 255);
        image_content[i+image.cols*image.rows] = (float)(a[i*3+1] / 255);
        image_content[i+2*image.cols*image.rows] = (float)(a[i*3+2] / 255);   // 0.003
        // std::cout << image_content[i] << std::endl;
    }
    // end = clock();
    // std::cout << (float) (end - start) / CLOCKS_PER_SEC << std::endl;
    cudaError_t flag;
    cudaStream_t stream;
    flag = cudaStreamCreate(&stream);
    if (flag != cudaSuccess)
    {
        std::cout << "1 cudaStreamCreate error : " << flag <<  std::endl;
        return 40001;
    }

    flag = cudaMemcpyAsync(buffers[0],&image_content, bufferSize[0],cudaMemcpyHostToDevice,stream);
    if (flag != cudaSuccess)
    {
        std::cout << "2 cudaMemcpyAsync input error : " << cudaGetErrorString(flag) << std::endl;
        return 40002;
    }
    bool status = context->enqueueV2(buffers, stream, nullptr);
    if (!status){
        std::cout << "4 inference error : " << status << std::endl;
        return 40004;
    }
    float result[outSize];

    flag = cudaMemcpyAsync(result, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost,stream);
    if (flag != cudaSuccess)
    {   
        std::cout << "3 cudaMemcpyAsync output error : " << cudaGetErrorString(flag) << std::endl;
        return 40003;
    }
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    // std::fstream f;
    // f.open("/data/kile/202204/yolov5/log/imagecontent.txt", std::ios_base::out);
    // for (int i = 0; i < 74529; i++)
    // {   // (float)(a[i*3] / 255)
    //     f << std::to_string(result[i]) << std::endl;
    // }
    // f.close();

    std::vector<ResultDetection> res;
    nms(res, result, outSize, 0.25, 0.45);

    for (size_t j = 0; j < res.size(); j++) {
        cv::Rect r = get_rect(img1, res[j].bbox, image, scale);
        cv::rectangle(img1, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
        cv::putText(img1, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
    }
    cv::imwrite("/data/kile/202204/yolov5/log/test_c.jpg", img1);
    end = clock();

    std::cout << "hello world!! " << (float) (end - start) / CLOCKS_PER_SEC  << std::endl;
    getchar();
    return 1;
}

import time
from typing import List
import numpy as np
import tensorrt
import torch
from pycuda import driver
import pycuda.autoinit
from PIL import Image, ImageDraw, ImageFont

from utils.general import non_max_suppression, scale_coords

def time_sync():
    """Return the current wall-clock time, after waiting for pending CUDA work.

    When CUDA is available, torch.cuda.synchronize() is called first so the
    timestamp is taken after queued GPU work has finished.
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.synchronize()
    now = time.time()
    return now

def trt_pre(batch, context, d_size,
            d_type):  # Need to set both input and output precisions to FP16 to fully enable FP16
    """Run one inference through a TensorRT execution context using pycuda.

    Args:
        batch: input array; it is flattened before being copied to the device.
        context: TensorRT execution context exposing execute_async_v2.
        d_size: number of elements in the output buffer.
        d_type: numpy dtype of the output buffer.

    Returns:
        A 1-D numpy array of d_size elements holding the engine output.
    """
    host_out = np.empty(d_size, dtype=d_type)
    flat_in = batch.reshape(-1)

    # One device buffer for the input, one for the output.
    dev_in = driver.mem_alloc(1 * flat_in.nbytes)
    dev_out = driver.mem_alloc(host_out.nbytes)
    device_ptrs = [int(dev_in), int(dev_out)]

    cuda_stream = driver.Stream()
    # Upload, execute and download are queued on the same stream ...
    driver.memcpy_htod_async(dev_in, flat_in, cuda_stream)
    context.execute_async_v2(device_ptrs, cuda_stream.handle, None)
    driver.memcpy_dtoh_async(host_out, dev_out, cuda_stream)
    # ... so wait for the stream before handing the host buffer back.
    cuda_stream.synchronize()
    return host_out


def python_tensorrt_predict(model_path):
    """Run the TensorRT engine at ``model_path`` on a fixed image and save the result.

    The engine is deserialized, the image is loaded and letterboxed by YOLOv5's
    ``LoadImages``, inference runs through ``trt_pre``, and detections are
    filtered with ``non_max_suppression``, rescaled to the original image and
    drawn with ``drawImage``. The annotated image is written to ``log/test.jpg``
    and timings are printed.

    Args:
        model_path: path to a serialized TensorRT engine file.

    Raises:
        RuntimeError: if the engine cannot be deserialized or does not expose
            both an input and an output binding.
    """
    # Load the model
    trt_model = tensorrt.Runtime(tensorrt.Logger(tensorrt.Logger.WARNING))
    # Deserialize the engine; the context manager closes the file handle.
    with open(model_path, "rb") as engine_file:
        engine = trt_model.deserialize_cuda_engine(engine_file.read())
    if engine is None:
        raise RuntimeError(f"failed to deserialize TensorRT engine: {model_path}")
    # Create the inference context
    context = engine.create_execution_context()

    size = dtype = input_w = input_h = None
    for binding in engine:
        if not engine.binding_is_input(binding):
            size = tensorrt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = tensorrt.nptype(engine.get_binding_dtype(binding))
        else:
            input_w = engine.get_binding_shape(binding)[-1]
            input_h = engine.get_binding_shape(binding)[-2]
    if size is None or input_w is None:
        raise RuntimeError("engine must expose one input and one output binding")

    from utils.datasets import LoadImages
    # source = r"/data/kile/202204/yolov5/result/aircraft_4.jpg"
    source = r"/data/kile/202204/yolov5/video/data/data_data/cd3ed3d2cf7611eca8630050569379a7.jpg"
    start = time.perf_counter()
    # YOLOv5's letterbox takes the target shape as (height, width); the previous
    # [input_w, input_h] order was only correct for square inputs.
    dataset = LoadImages(source, img_size=[input_h, input_w], stride=32, auto=False)
    for path, im, im0s, vid_cap, s in dataset:
        image = Image.open(path)
        im = torch.from_numpy(im).to("cuda").float()
        im /= 255
        start1 = time.perf_counter()
        outputs = trt_pre(np.asarray(im.cpu(), dtype=np.float32), context, size, dtype)
        end1 = time.perf_counter()
        print(f"inference {end1 - start1}")
        # Each output row is read as 7 values; presumably 4 box values,
        # objectness and 2 class scores — TODO confirm against the exported model.
        outputs = torch.as_tensor(outputs).reshape((-1, 7)).unsqueeze(0)
        pred = non_max_suppression(outputs, 0.25, 0.45)
        pred[0][:,:4] = scale_coords(im.shape[1:], pred[0][:,:4], im0s.shape)
        image = drawImage(image, list(pred))
        image.save(r"log/test.jpg")
    end = time.perf_counter()
    print(f"{end -start}")

def drawImage(image, class_list):
    """Draw detection boxes and labels onto a PIL image.

    Args:
        image: PIL image to draw on; it is modified in place.
        class_list: sequence whose first element iterates over detections. Each
            detection unpacks as four box values (left, top, right, bottom)
            followed by a confidence and a class value; the box values must
            support ``.numpy()``.

    Returns:
        The same image object, annotated.
    """
    font = ImageFont.truetype(font='/data/kile/other/yolov3/font/FiraMono-Medium.otf',
                              size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
    thickness = (image.size[0] + image.size[1]) // 300
    # One drawing context serves every detection.
    draw = ImageDraw.Draw(image)
    for det in class_list[0]:
        if not isinstance(det, List):
            det = list(det)
        # NOTE(review): for tensor rows this renders as "tensor(..)_tensor(..)";
        # kept as-is, confirm whether plain numbers are wanted.
        label = str(det[-1])+"_"+str(det[-2])
        left, top, right, bottom = det[:-2]
        top = int(top.numpy())
        left = int(left.numpy())
        bottom = int(bottom.numpy())
        right = int(right.numpy())

        # ImageDraw.textsize was removed in Pillow 10; fall back to textbbox there.
        if hasattr(draw, "textsize"):
            label_size = draw.textsize(label, font)
        else:
            text_box = draw.textbbox((0, 0), label, font=font)
            label_size = (text_box[2], text_box[3])

        # Clamp the box to the image.
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))

        # Put the label above the box when it fits, otherwise just inside it.
        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])

        # Nested 1-pixel rectangles give an outline of the requested thickness.
        for inset in range(thickness):
            draw.rectangle(
                [left + inset, top + inset, right - inset, bottom - inset],
                outline=(0x27, 0xC1, 0x36))
        draw.rectangle(
            [tuple(text_origin), tuple(text_origin + label_size)],
            fill=(128, 0, 128))
        draw.text(tuple(text_origin), label, fill=(0, 0, 0), font=font)
    del draw
    return image


# Script entry point: run the Python TensorRT pipeline on a fixed engine file.
if __name__ == '__main__':
    # The TensorRT model is generated with the official export.py script
    # model_path = r"/data/kile/202204/yolov5/log/2.engine"
    model_path = r"/data/kile/202204/yolov5/log/1.trt"
    python_tensorrt_predict(model_path)
    # NOTE(review): this import is not used by any active code after it.
    from PIL import Image

    # Commented-out letterbox experiment kept from the original:
    # path = r"/data/kile/data/oridata_100/n0942195117838/n0942195117838.jpeg"
    # image = cv2.imread(path)
    # im, _, _ = letterbox(image, (416,416), auto=False)
    # cv2.imwrite("/data/1.jpg", im)