yolov8的TensorRT部署(C++版本)

一、环境配置

        需要配置好CUDA、CUDNN环境,还要装好TensorRT环境,参考下面这篇博客

【Ubuntu版】TensorRT安装教程(tar包方式)_ubuntu安装tensorrt-CSDN博客

二、模型准备

        首先你需要一个ONNX模型文件,我是Pytorch->ONNX

from ultralytics import YOLO

# Load the YOLOv8n checkpoint that sits next to this script.
model = YOLO('./yolov8n.pt')

if __name__ == "__main__":
    # Export the loaded model in ONNX format.
    model.export(format='onnx')

        然后得到yolov8n.onnx文件,接下来就是ONNX->trt

找到你们TensorRT安装位置

TensorRT-8.6.4.3/bin/trtexec --onnx=yolov8n.onnx --saveEngine=yolov8n.trt

这样就生成了TensorRT可以使用的模型文件

三、yolov8输出结果分析

        yolov8输出的output0.shape为 1x84x8400

8400是预选框的数量,比yolov5的25200少了很多

其中84个通道里,前4个是框的xywh(中心坐标加宽高),后80个是各个类别的confidence。与yolov5相比少了一个objectness,那么objectness怎么获得呢?本文的做法是把后面80个类的confidence中最大的那个当作objectness

最终的score=objectness*confidence

四、主要代码

detect.cpp

#include<iostream>  
#include<opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
#include<fstream>
#include "NvInfer.h"
#include "processing.hpp"
//#include "logging.h"

using namespace nvinfer1;
using namespace std;

// Width and height, in pixels, of the image tensor fed to the network.
constexpr int model_width = 640;
constexpr int model_height = 640;

class MyLogger : public nvinfer1::ILogger
{
    public:
    explicit MyLogger(nvinfer1::ILogger::Severity severity =nvinfer1::ILogger::Severity::kWARNING) : severity_(severity) {}

    void log(nvinfer1::ILogger::Severity severity, const char *msg) noexcept override
    {
        if (severity <= severity_) {
            std::cerr << msg << std::endl;
        }
    }
    nvinfer1::ILogger::Severity severity_;
};

int main()
{
//一、图像处理
    string image_path = "/home/hitcrt/code/tensorrt/TRT_test/street.jpg";  //填写自己图片路径(需要绝对路径)
    cv::Mat input_image = cv::imread(image_path);

    float* input_blob = new float[model_height * model_width * 3];
    cv::Mat resize_image;
	//比例
    const float _ratio = std::min(model_width / (input_image.cols * 1.0f),
                            model_height / (input_image.rows * 1.0f));
    // 等比例缩放
    const int border_width = input_image.cols * _ratio;
    const int border_height = input_image.rows * _ratio;
    // 计算偏移值
    const int x_offset = (model_width - border_width) / 2;
    const int y_offset = (model_height - border_height) / 2;

    //将输入图像缩放至resize_image
    cv::resize(input_image, resize_image, cv::Size(border_width, border_height));
    //复制图像并且制作边界
    cv::copyMakeBorder(resize_image, resize_image, y_offset, y_offset, x_offset,
                        x_offset, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
    // 转换为RGB格式
    cv::cvtColor(resize_image, resize_image, cv::COLOR_BGR2RGB);
    
    //归一化
    const int channels = resize_image.channels();
    const int width = resize_image.cols;
    const int height = resize_image.rows;
    for (int c = 0; c < channels; c++) {
        for (int h = 0; h < height; h++) {
            for (int w = 0; w < width; w++) {
                input_blob[c * width * height + h * width + w] =
                    resize_image.at<cv::Vec3b>(h, w)[c] / 255.0f;  //at<Vec3b> 是 OpenCV 中用于访问图像像素的一种方法,使用 at<Vec3b> 获取彩色图像中特定位置的像素颜色值
            }
        }
    }

//二、模型反序列化
    MyLogger logger;
    //读取trt信息
    const std::string engine_file_path = "/home/hitcrt/code/tensorrt/TRT_test/yolov8n.trt";  //填写自己trt文件路径(需要绝对路径)
    std::stringstream engine_file_stream;
    engine_file_stream.seekg(0, engine_file_stream.beg);  //从起始位置偏移0个字节,指针移动到文件流的开头
    std::ifstream ifs(engine_file_path);
    engine_file_stream << ifs.rdbuf();  //将读取到的数据流交给engine_file_stream
    ifs.close();

    engine_file_stream.seekg(0, std::ios::end); //先把文件输入流指针定位到文档末尾来获取文档的长度
    const int model_size = engine_file_stream.tellg();  //获取文件流的总长度
    engine_file_stream.seekg(0, std::ios::beg);
    void *model_mem = malloc(model_size);               //开辟一样长的空间
    engine_file_stream.read(static_cast<char *>(model_mem), model_size);    //将内容读取到model_mem中

    nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(model_mem, model_size);

    free(model_mem);

//三、模型推理
    nvinfer1::IExecutionContext *context = engine->createExecutionContext();

    void *buffers[2];
    // 获取模型输入尺寸并分配GPU内存
    nvinfer1::Dims input_dim = engine->getBindingDimensions(0);
    int input_size = 1;
    for (int j = 0; j < input_dim.nbDims; ++j) {
        if(input_dim.d[j] < 0)
            input_size *= -input_dim.d[j];
        else
            input_size *= input_dim.d[j];
    }
    cudaMalloc(&buffers[0], input_size * sizeof(float));

    // 获取模型输出尺寸并分配GPU内存
    nvinfer1::Dims output_dim = engine->getBindingDimensions(1);

    int output_size = 1;
    for (int j = 0; j < output_dim.nbDims; ++j) {
        if(output_dim.d[j] < 0)
            output_size *= -output_dim.d[j];
        else
            output_size *= output_dim.d[j];
    }
    cudaMalloc(&buffers[1], output_size * sizeof(float));

    // 给模型输出数据分配相应的CPU内存
    float *output_buffer = new float[output_size];
    //数据投入
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // 拷贝输入数据
    cudaMemcpyAsync(buffers[0], input_blob, input_size * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    // 执行推理
    if(context->enqueueV2(buffers, stream, nullptr))
    {
        cout << "enqueueV2执行推理成功" << endl;
    }
    else{
        cout << "enqueueV2执行推理失败" << endl;
        return -1;
    }
    // 拷贝输出数据
    cudaMemcpyAsync(output_buffer, buffers[1], output_size * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);

    cudaStreamSynchronize(stream);

    delete context;
    delete engine;
    delete runtime;
    delete[] input_blob;

//四、输出结果output_buffer,放入objs  xywh为中心点坐标 和宽高
    float *ptr = output_buffer;     // 1x84x8400  =  705600
    vector<vector<float>> temp(84, vector<float>(8400));
    vector<vector<float>> outVec(8400, vector<float>(84));
    for(int i = 0; i < 705600; i++)
    {
        temp[i/8400][i%8400] = *ptr;
        ptr++;
    }
    for(int i = 0; i < 84; i++)
    {
        for(int j = 0; j < 8400; j++)
        {
            outVec[j][i] = temp[i][j];
        }
    }
    std::vector<Object> objs;
    for (int i = 0; i < 8400; ++i)
    {
        const float objectness = *(std::max_element(outVec[i].begin() + 4, outVec[i].begin() + 83));
        if (objectness >= 0.45f)
        {
            const int label = std::max_element(outVec[i].begin() + 4, outVec[i].begin() + 83) - (outVec[i].begin() + 4);  //std::max_element返回范围内的最大元素
            const float confidence = outVec[i][label + 4] * objectness;
            if (confidence >= 0.25f) {
                const float bx = outVec[i][0];
                const float by = outVec[i][1];
                const float bw = outVec[i][2];
                const float bh = outVec[i][3];
                Object obj;
                // 还原图像尺寸中box的尺寸比例,这里要减掉偏移值,并把box中心点坐标xy转成左上角坐标xy
                obj.box.x = (bx - bw * 0.5f - x_offset) / _ratio;
                obj.box.y = (by - bh * 0.5f - y_offset) / _ratio;
                obj.box.width = bw / _ratio;
                obj.box.height = bh / _ratio;
                obj.label = label;
                obj.confidence = confidence;
                objs.push_back(std::move(obj));
            }
        }
    }  // i loop

//五、NMS非极大值抑制
    vector<Object> output;
    hardNMS(objs, output, 0.6, 10);

//六、画框
    vector<Object>::iterator it = output.begin();
    while(it != output.end()){
        cv::Point topLeft(it->box.x, it->box.y);
        cv::Point bottomRight(it->box.x + it->box.width, it->box.y + it->box.height);
        cv::rectangle(input_image, topLeft, bottomRight, cv::Scalar(0, 0, 255), 2);
        std::stringstream buff;
        buff.precision(2);  //覆盖默认精度,置信度保留2位小数
        buff.setf(std::ios::fixed);
        buff << it->confidence;
        string text =names[it->label] + " " + buff.str();
        cv::putText(input_image, text, topLeft, 0, 1, cv::Scalar(0, 255, 0), 2);
        it++;
    }
    cv::imwrite("detected.jpg", input_image);

    return 0;
}

processing.hpp

#include <algorithm>
#include <cmath>
#include <iostream>
#include <list>
#include <string>
#include <vector>
using namespace std;

// Class-id -> display name table, using the 80 COCO classes as the example.
// Entries must be plain names: they are drawn verbatim next to each detection,
// so stray quote characters inside a literal would show up in the output image.
// NOTE(review): this is a non-const global defined in a header; include the
// header from a single translation unit only.
std::string names[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"};

// Axis-aligned rectangle: a reference point (x, y) plus its extent.
// iou_of() treats (x, y) as the top-left corner.
struct BOX
{
    float x, y;           // reference point
    float width, height;  // extent along x and y
};

// A single detection: where it is, which class it is, and how strongly it scored.
struct Object
{
    BOX box;    // top-left ("lu") point plus width/height
    int label;  // class id of the detection
    float confidence;  // despite the name this holds the final score, i.e. objectness * confidence
};

bool cmp(Object &obj1, Object &obj2){
    return obj1.confidence > obj2.confidence;
}

// Intersection-over-union of the boxes of two detections.
// Each box is read as top-left corner (x, y) plus width/height.
// Returns a value in [0, 1]; 0 when the boxes do not overlap or when the
// union has no area.
inline float iou_of(const Object &obj1, const Object &obj2)
{
    // Corners of both boxes ("lu" = top-left, "rb" = bottom-right).
    const float x1_lu = obj1.box.x;
    const float y1_lu = obj1.box.y;
    const float x1_rb = x1_lu + obj1.box.width;
    const float y1_rb = y1_lu + obj1.box.height;
    const float x2_lu = obj2.box.x;
    const float y2_lu = obj2.box.y;
    const float x2_rb = x2_lu + obj2.box.width;
    const float y2_rb = y2_lu + obj2.box.height;

    // Intersection extent, clamped at zero: without the clamp two disjoint
    // boxes give negative width AND height, whose product is a positive "area".
    const float i_w = std::max(0.0f, std::min(x1_rb, x2_rb) - std::max(x1_lu, x2_lu));
    const float i_h = std::max(0.0f, std::min(y1_rb, y2_rb) - std::max(y1_lu, y2_lu));
    const float inter_area = i_w * i_h;

    // True union area = A + B - intersection (not the area of the enclosing box).
    const float area1 = obj1.box.width * obj1.box.height;
    const float area2 = obj2.box.width * obj2.box.height;
    const float union_area = area1 + area2 - inter_area;
    if (union_area <= 0.0f)
        return 0.0f;  // degenerate boxes: avoid dividing by zero

    return inter_area / union_area;
}

std::vector<int> hardNMS(std::vector<Object> &input, std::vector<Object> &output, float iou_threshold, unsigned int topk)
{  //Object只有confidence和label
    const unsigned int box_num = input.size(); 
    std::vector<int> merged(box_num, 0);
    std::vector<int> indices;

    if (input.empty())
        return indices;
    std::vector<Object> res;
    //先对bboxs按照conf进行排序
    std::sort(input.begin(), input.end(),
            [](const Object &a, const Object &b)
            { return a.confidence > b.confidence; });   //[]表示C++中的lambda函数
    
    unsigned int count = 0;
    for (unsigned int i = 0; i < box_num; ++i)
    {   //按照conf依次遍历bbox
        if (merged[i])
            continue;
        //如果已经被剔除,continue
        Object buf;
        buf = input[i];
        merged[i] = 1; //剔除当前bbox

        //由于后面的置信度低,只需要考虑当前bbox后面的即可
        for (unsigned int j = i + 1; j < box_num; ++j)
        {
            if (merged[j])
                continue;

            float iou = static_cast<float>(iou_of(input[j], input[i]));
            //计算iou
            if (iou > iou_threshold)
            { //超过阈值认为重合,剔除第j个bbox,
                merged[j] = 1;
            }
        }
        indices.push_back(i);
        res.push_back(buf); //将最高conf的bbox填入结果

        // keep top k
        //获取前k个输出,这个应该是针对密集输出的情况,此时input已经做了conf剔除
        count += 1;
        if (count >= topk)
            break;
    }
    output.swap(res);

    return indices;
}

// Logistic function 1 / (1 + e^-x), evaluated entirely in single precision.
// Float literals and std::exp keep the arithmetic from being promoted to
// double; `inline` because the definition lives in a header.
inline float sigmoid(float x)
{
    return 1.0f / (1.0f + std::exp(-x));
}

CMakeLists.txt

# Build script for the single-file TensorRT demo.
# 3.5 is the oldest policy version current CMake releases still accept.
cmake_minimum_required(VERSION 3.5)
project(Demo)

# Debug build so the binary can be stepped through with gdb.
set(CMAKE_BUILD_TYPE "Debug")

# Request C++11 once, through CMake, instead of repeating -std=c++11 in several places.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# option() takes a help string before the default value; without it "OFF" is
# parsed as the help text. Must be set before find_package(CUDA).
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)

find_package(CUDA REQUIRED)

if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("embed_platform on")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
    message("embed_platform off")
    include_directories(/usr/local/cuda/include)
    link_directories(/usr/local/cuda/lib64)
endif()

# Only one optimisation level is kept (-Ofast); the extra -O2 that used to be
# passed through add_definitions() contradicted it.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

find_package(OpenCV REQUIRED)

# pthread support via the Threads package instead of add_definitions(-pthread).
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

# TensorRT location; override with -DTENSORRT_ROOT=/path/to/TensorRT.
set(TENSORRT_ROOT "/home/home_expand/TensorRT-8.6.1.6" CACHE PATH "TensorRT installation directory")
include_directories(${OpenCV_INCLUDE_DIRS} ${TENSORRT_ROOT}/include)
link_directories(${TENSORRT_ROOT}/lib)

# The demo source shown in this article is detect.cpp.
add_executable(Demo detect.cpp)

target_link_libraries(Demo nvinfer cudart ${OpenCV_LIBRARIES} Threads::Threads)

YOLOv8是YOLOv4的一个改进版本,它包含更多的优化和改进,以提高目标检测的性能和准确性。 TensorRT是NVIDIA推出的一个高性能的深度学习加速库,它可以将训练好的模型转换为可部署的高效推理引擎。下面是YOLOv8在TensorRT上的部署步骤: 1. 首先,需要将YOLOv8的模型转换为TensorRT格式。可以使用NVIDIA的TensorRT转换工具,将Darknet框架训练的模型转换为TensorRT格式。具体步骤可以参考官方文档:https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work-with-models。 2. 在TensorRT中创建推理引擎。可以使用TensorRT提供的C++ API或Python API来创建推理引擎。具体代码可以参考官方示例:https://github.com/NVIDIA/TensorRT/tree/master/samples/python/yolov4_onnx。 3. 准备推理数据。需要将输入图像转换为TensorRT支持的格式,并将其传递给推理引擎进行推理。具体代码可以参考官方示例:https://github.com/NVIDIA/TensorRT/tree/master/samples/python/yolov4_onnx。 4. 运行推理。将推理数据传递给推理引擎,并从推理引擎获取输出结果。具体代码可以参考官方示例:https://github.com/NVIDIA/TensorRT/tree/master/samples/python/yolov4_onnx。 需要注意的是,YOLOv8相对于YOLOv4来说,对硬件要求更高,需要使用NVIDIA的Ampere架构或更高的GPU才能获得更好的性能。同时,部署时需要根据硬件配置和应用场景进行调整,以获得最佳的性能和准确性。
评论 14
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

窝工昆邪

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值