TensorRT部署YoloX

1、TensorRT简介:

TensorRT 是英伟达(NVIDIA)公司推出的一款用于高性能深度学习模型推理的软件开发工具包。它可以为深度学习应用提供低延迟、高吞吐率的部署推理,适用于超大规模数据中心、嵌入式平台或自动驾驶平台等场景。

2、YoloX简介

YoloX 是基于 YOLO(You Only Look Once)系列算法的实时目标检测算法的改进版本。YOLO 系列算法在计算机视觉领域具有很高的知名度和影响力,它们可以实现快速准确的目标检测。YoloX 对 YOLO 系列算法进行了优化和改进,以提高性能和准确性。其具有anchor free的特性,从一种内生的视角来识别覆盖了锚点的目标物。

3、本文任务:

笔者将基于TensorRT部署YoloX目标检测算法,用于检测透镜位置。透镜图像如下图所示:

为避免TensorRT Plugin的编写(插件编写后续也会实现),本文将原始YoloX的Focus操作使用5×5卷积进行替换,替换后的模型头部如图所示,模型的输入大小为1×3×H(480)×W(640)

模型输出三个尺度的特征层:L(1×10×60×80)预测小目标、M(1×10×30×40)预测中等尺度目标、S(1×10×15×20)预测大尺度目标。每个空间位置的10个通道依次为 dx、dy、w、h 四个框回归量、1个objectness置信度以及5个类别置信度。

4、模型构建

4.1、构建YoloxDetector推理类

#ifndef YOLOX_YOLOXDETECTOR_CUH
#define YOLOX_YOLOXDETECTOR_CUH

#include <iostream>
#include <fstream>
#include "NvOnnxParser.h"
#include "NvInfer.h"
// FIX: replaced the machine-specific relative vcpkg path with the canonical
// include; the vcpkg include directory belongs in the build configuration.
#include <opencv2/opencv.hpp>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>

// NOTE(review): using-directives in a header leak into every includer; kept
// only for source compatibility with the existing .cu files.
using namespace cv;
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace std;

// Minimal TensorRT logger: prints warnings/errors, suppresses info and below.
class Logger : public ILogger {
    void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

// One detection: box in original-image coordinates, score
// (objectness * best class confidence) and the winning class index.
typedef struct __BoundingBox {
    Rect rect;
    float confidence;
    int class_id;
} BoundingBox;

class YoloXDetector {
public:
    // Deserialize a prebuilt TensorRT engine file and create the execution context.
    bool  load_engine(const char* engine_path);
    // Build an engine from an ONNX model and serialize it to engine_save_path.
    bool  load_onnx_engine(const char* onnx_path, const char* engine_save_path);
    // Run detection on a BGR image; boxes scoring above conf_thread are appended to bboxes.
    void  detect(cv::Mat& src, float conf_thread, std::vector<BoundingBox>& bboxes);

    YoloXDetector();
    ~YoloXDetector();

private:
    float deconf = 0.0f;               // confidence threshold mapped to logit space
    Logger  gLogger;
    // FIX: initialize the TensorRT handles to nullptr so the destructor can
    // null-check them when load_engine()/load_onnx_engine() was never called
    // (previously these were indeterminate and the destructor dereferenced them).
    IRuntime* runtime = nullptr;
    IExecutionContext* context = nullptr;
    void* outblob[3] = { nullptr, nullptr, nullptr };                 // host-side output tensors
    void* buffers[4] = { nullptr, nullptr, nullptr, nullptr };        // device bindings: input + 3 heads
    void* byteImg = nullptr;           // device copy of the raw uchar BGR image
    cudaStream_t stream = nullptr;
    ICudaEngine* engine = nullptr;

    const int h = 480;                 // model input height
    const int w = 640;                 // model input width
    const int ch = 3;                  // input channels
    const int nBatchSize = 1;

    float desigmoid(float x);          // inverse sigmoid (logit)
    float sigmoid(float x);
};

#endif //YOLOX_YOLOXDETECTOR_CUH

4.2、模型构建

从TensorRT模型构建推理会话

// Deserialize a serialized TensorRT engine from disk into this->engine and
// create the execution context. Returns false on any failure.
bool  YoloXDetector::load_engine(const char* engine_path) {
    try {
        this->runtime = createInferRuntime(gLogger);
        if (!runtime) return false;

        std::ifstream trtModelFile(engine_path, std::ios_base::in | std::ios_base::binary);
        // FIX: an unreadable path previously produced tellg() == -1 and a
        // bogus allocation; fail fast instead.
        if (!trtModelFile.is_open()) return false;
        trtModelFile.seekg(0, ios::end);
        std::streamsize size = trtModelFile.tellg();
        trtModelFile.seekg(0, ios::beg);
        if (size <= 0) return false;

        char* buff = new char[static_cast<size_t>(size)];
        trtModelFile.read(buff, size);
        trtModelFile.close();
        // FIX: the original declared a LOCAL `ICudaEngine* engine`, shadowing
        // the member -- this->engine stayed uninitialized and the destructor
        // crashed. Also switched to the 2-arg overload; the IPluginFactory
        // overload was removed in TensorRT 8.
        this->engine = runtime->deserializeCudaEngine(buff, static_cast<size_t>(size));
        delete[] buff;
        if (!this->engine) return false;

        this->context = engine->createExecutionContext();
        return this->context != nullptr;
    }
    catch (const std::exception& e) {
        return false;
    }
}

5、推理流程

目标检测问题的推理大致流程如下,其他推理框架也大致相同

读入图片->不失真缩放到640*480->维度变换为(CH×H×W)->归一化(除255,减均值,除方差)->与模型输入输出缓冲区绑定->推理->读取推理结果张量->结果解析->NMS

本文使用OpenCV读取图片,读取完成后使用CUDA编程实现不失真缩放和归一化,然后输入到模型进行推理,推理完成后将输出张量从显存转移到内存,在CPU上进行后处理

5.1、读图

// Load the image as 3-channel BGR (alpha stripped, grayscale promoted).
cv::Mat src = imread(file_name, cv::IMREAD_COLOR);

5.2、内存拷贝&resize&归一化

内存拷贝

// Copy the raw interleaved image from host memory to the CUDA device buffer
// on the detector's stream (async; ordered before the kernel launch below).
std::cout << "将原始图像数据拷贝至cuda显存中:" << std::endl;
cudaMemcpyAsync(byteImg, src.data, (int)(nBatchSize * src.rows * src.cols * ch * sizeof(uchar)), cudaMemcpyHostToDevice, stream);

不失真缩放和归一化

//计算缩放系数float scale_w = (float)(this->w) / src.cols;//this->w为模型默认输入宽度float scale_h = (float)(this->h) / src.rows;//this->h为模型默认输入高度float resize_scale = min(scale_h, scale_w);//计算缩放系数std::cout << "resize_scale:" << resize_scale << std::endl;

cuda核函数

// Fused aspect-preserving resize + normalization kernel.
// input : interleaved uchar BGR image, input_width x input_height
// output: planar float tensor (3 x output_height x output_width)
// scale : uniform resize factor (min of the two axis ratios); the area
//         outside the resized image is padded with gray (128).
// Launch: one thread per output pixel; the grid must cover
//         output_width * output_height threads (host launches exactly 640*480).
// NOTE(review): channels are taken in memory order, so "_r" is actually the
// OpenCV B channel -- confirm this matches the channel order used in training.
// NOTE(review): normalization is written as (v/255 - std)/mean; the usual
// ImageNet convention is (v/255 - mean)/std with mean 0.485/0.456/0.406 and
// std 0.229/0.224/0.225. Kept as-is since the deployed model was presumably
// exported with this exact preprocessing -- verify against the training code.
__global__ void resize_normalize_cuda(uchar* input, float* output, int input_width, int input_height, int output_width, int output_height, float scale)
{
    // Flatten the (potentially 3-D) grid/block into one linear pixel index.
    uint idx = (gridDim.x * gridDim.y * blockIdx.z + gridDim.x * blockIdx.y + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z)
             + blockDim.x * blockDim.y * threadIdx.z + blockDim.x * threadIdx.y + threadIdx.x;
    // FIX: bounds guard -- a launch configuration that overshoots the output
    // no longer writes past the end of the tensor.
    if (idx >= (uint)(output_width * output_height)) return;
    uint col = idx % output_width;
    uint row = idx / output_width;

    const int plane = output_width * output_height;
    const int dst = row * output_width + col;

    // Outside the resized image area: write the normalized gray padding value.
    if (col >= (uint)(int)(input_width * scale) || row >= (uint)(int)(input_height * scale)) {
        output[0 * plane + dst] = (128.0f / 255.0f - 0.229f) / 0.485f;
        output[1 * plane + dst] = (128.0f / 255.0f - 0.224f) / 0.456f;
        output[2 * plane + dst] = (128.0f / 255.0f - 0.225f) / 0.406f;
        return;
    }

    // Sub-pixel source position of this output pixel.
    float origin_y = row / scale;
    int origin_row = (int)origin_y;
    float delta_y = origin_y - origin_row;
    float origin_x = col / scale;
    int origin_col = (int)origin_x;
    float delta_x = origin_x - origin_col;

    // Pick the 2x2 neighborhood whose pixel centers bracket the sample point
    // (row choice depends only on delta_y, column choice only on delta_x).
    int row1, row2, col1, col2;
    if (delta_y > 0.5f) { row1 = origin_row;     row2 = origin_row + 1; }
    else                { row1 = origin_row - 1; row2 = origin_row;     }
    if (delta_x > 0.5f) { col1 = origin_col;     col2 = origin_col + 1; }
    else                { col1 = origin_col - 1; col2 = origin_col;     }

    if (origin_y >= 1 && origin_y < input_height - 1 && origin_x >= 1 && origin_x < input_width - 1) {
        // Bilinear interpolation over the four neighbors, per channel.
        float fq11_r = input[row1 * input_width * 3 + col1 * 3 + 0];
        float fq11_g = input[row1 * input_width * 3 + col1 * 3 + 1];
        float fq11_b = input[row1 * input_width * 3 + col1 * 3 + 2];
        float fq21_r = input[row1 * input_width * 3 + col2 * 3 + 0];
        float fq21_g = input[row1 * input_width * 3 + col2 * 3 + 1];
        float fq21_b = input[row1 * input_width * 3 + col2 * 3 + 2];
        float fq12_r = input[row2 * input_width * 3 + col1 * 3 + 0];
        float fq12_g = input[row2 * input_width * 3 + col1 * 3 + 1];
        float fq12_b = input[row2 * input_width * 3 + col1 * 3 + 2];
        float fq22_r = input[row2 * input_width * 3 + col2 * 3 + 0];
        float fq22_g = input[row2 * input_width * 3 + col2 * 3 + 1];
        float fq22_b = input[row2 * input_width * 3 + col2 * 3 + 2];
        // The (int) cast reproduces the original integer quantization of the
        // interpolated value before normalization.
        float value_r = ((int)(fq11_r * (col2 - origin_x) * (row2 - origin_y) + fq21_r * (origin_x - col1) * (row2 - origin_y) + fq12_r * (col2 - origin_x) * (origin_y - row1) + fq22_r * (origin_x - col1) * (origin_y - row1)) / 255.0f - 0.229f) / 0.485f;
        float value_g = ((int)(fq11_g * (col2 - origin_x) * (row2 - origin_y) + fq21_g * (origin_x - col1) * (row2 - origin_y) + fq12_g * (col2 - origin_x) * (origin_y - row1) + fq22_g * (origin_x - col1) * (origin_y - row1)) / 255.0f - 0.224f) / 0.456f;
        float value_b = ((int)(fq11_b * (col2 - origin_x) * (row2 - origin_y) + fq21_b * (origin_x - col1) * (row2 - origin_y) + fq12_b * (col2 - origin_x) * (origin_y - row1) + fq22_b * (origin_x - col1) * (origin_y - row1)) / 255.0f - 0.225f) / 0.406f;
        output[0 * plane + dst] = value_r;
        output[1 * plane + dst] = value_g;
        output[2 * plane + dst] = value_b;
    }
    else {
        // FIX: pixels inside the valid resized area but too close to the source
        // border for the 2x2 window previously left the output UNINITIALIZED;
        // clamp and sample the nearest source pixel instead.
        int r = min(max(origin_row, 0), input_height - 1);
        int c = min(max(origin_col, 0), input_width - 1);
        output[0 * plane + dst] = (input[r * input_width * 3 + c * 3 + 0] / 255.0f - 0.229f) / 0.485f;
        output[1 * plane + dst] = (input[r * input_width * 3 + c * 3 + 1] / 255.0f - 0.224f) / 0.456f;
        output[2 * plane + dst] = (input[r * input_width * 3 + c * 3 + 2] / 255.0f - 0.225f) / 0.406f;
    }
}

核函数调用

// Launch the fused resize+normalize kernel on the detector's stream:
// 32x15 blocks of 32x20 threads = 640*480 threads, one per model-input pixel.
std::cout << "开始cuda的归一化预处理:" << std::endl;
dim3 gridDim = { 32,15 };
dim3 blockDim = { 32,20 };
resize_normalize_cuda << <gridDim, blockDim, 0, stream >> > ((uchar*)byteImg, (float*)buffers[0], src.cols, src.rows, this->w, this->h, resize_scale);

5.3、推理和结果拷贝

    // Enqueue inference, then copy the three output heads (stride 8/16/32,
    // 10 channels each) back to host memory on the same stream.
    // NOTE(review): no cudaStreamSynchronize follows these async copies --
    // the host must sync before reading outblob, otherwise the data may
    // still be in flight.
    context->enqueueV2(buffers, stream, nullptr);
    cudaMemcpyAsync(outblob[0], buffers[1], (int)(h / 8 * w / 8 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);
    cudaMemcpyAsync(outblob[1], buffers[2], (int)(h / 16 * w / 16 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);
    cudaMemcpyAsync(outblob[2], buffers[3], (int)(h / 32 * w / 32 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);

5.4、结果解析

    vector<Rect> o_rect; vector<float> o_rect_cof; vector<int> class_ids;    float scales[] = { 8.0f,16.0f,32.0f };    for (int i = 0; i < 3; i++)    {        float scale = scales[i];        int feature_w = w / scale;        int feature_h = h / scale;        float* outdata = (float*)outblob[i];        for (int row = 0; row < feature_h; ++row)        {            for (int col = 0; col < feature_w; ++col)            {                float isObj = outdata[feature_h * feature_w * 4 + row * feature_w + col];                if (isObj < deconf)                {                    continue;                }                isObj = sigmoid(isObj);                float conf[5];                for (int cls_id = 0; cls_id < 5; cls_id++) {                    conf[cls_id] = outdata[feature_h * feature_w * (5 + cls_id) + row * feature_w + col];                    conf[cls_id] = sigmoid(conf[cls_id]);
                }
                int idx = distance(conf, max_element(conf, conf + sizeof(conf) / sizeof(conf[0])));                float max = conf[idx];                float cof = max * isObj;

                //注意此处输出为中心点坐标,需要转化为角点坐标                float x = outdata[feature_h * feature_w * 0 + row * feature_w + col];                x += col;                x *= scale;                x /= resize_scale;                float y = outdata[feature_h * feature_w * 1 + row * feature_w + col];                y += row;                y *= scale;                y /= resize_scale;                float w = outdata[feature_h * feature_w * 2 + row * feature_w + col];                w = exp(w);                w *= scale;                w /= resize_scale;                float h = outdata[feature_h * feature_w * 3 + row * feature_w + col];                h = exp(h);                h *= scale;                h /= resize_scale;                float r_x = (x - w / 2);                float r_y = (y - h / 2);                Rect rect = Rect(round(r_x), round(r_y), round(w), round(h));                o_rect.push_back(rect);                o_rect_cof.push_back(cof);                class_ids.push_back(idx);            }        }    }

    vector<int> final_id;    dnn::NMSBoxes(o_rect, o_rect_cof, 0.5, 0.5, final_id);
    for (int i = 0; i < final_id.size(); i++) {        int idx = final_id[i];        Rect rect = o_rect[idx];        float cof = o_rect_cof[idx];        int class_id = class_ids[idx];        BoundingBox  bbox;        bbox.rect = rect;        bbox.confidence = cof;        bbox.class_id = class_id;        bboxes.push_back(bbox);    }

6、结果展示

7、完整代码

类的头文件

#ifndef YOLOX_YOLOXDETECTOR_CUH
#define YOLOX_YOLOXDETECTOR_CUH

#include <iostream>
#include <fstream>
#include "NvOnnxParser.h"
#include "NvInfer.h"
// FIX: replaced the machine-specific relative vcpkg path with the canonical
// include; the vcpkg include directory belongs in the build configuration.
#include <opencv2/opencv.hpp>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>

// NOTE(review): using-directives in a header leak into every includer; kept
// only for source compatibility with the existing .cu files.
using namespace cv;
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace std;

// Minimal TensorRT logger: prints warnings/errors, suppresses info and below.
class Logger : public ILogger {
    void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

// One detection: box in original-image coordinates, score
// (objectness * best class confidence) and the winning class index.
typedef struct __BoundingBox {
    Rect rect;
    float confidence;
    int class_id;
} BoundingBox;

class YoloXDetector {
public:
    // Deserialize a prebuilt TensorRT engine file and create the execution context.
    bool  load_engine(const char* engine_path);
    // Build an engine from an ONNX model and serialize it to engine_save_path.
    bool  load_onnx_engine(const char* onnx_path, const char* engine_save_path);
    // Run detection on a BGR image; boxes scoring above conf_thread are appended to bboxes.
    void  detect(cv::Mat& src, float conf_thread, std::vector<BoundingBox>& bboxes);

    YoloXDetector();
    ~YoloXDetector();

private:
    float deconf = 0.0f;               // confidence threshold mapped to logit space
    Logger  gLogger;
    // FIX: initialize the TensorRT handles to nullptr so the destructor can
    // null-check them when load_engine()/load_onnx_engine() was never called
    // (previously these were indeterminate and the destructor dereferenced them).
    IRuntime* runtime = nullptr;
    IExecutionContext* context = nullptr;
    void* outblob[3] = { nullptr, nullptr, nullptr };                 // host-side output tensors
    void* buffers[4] = { nullptr, nullptr, nullptr, nullptr };        // device bindings: input + 3 heads
    void* byteImg = nullptr;           // device copy of the raw uchar BGR image
    cudaStream_t stream = nullptr;
    ICudaEngine* engine = nullptr;

    const int h = 480;                 // model input height
    const int w = 640;                 // model input width
    const int ch = 3;                  // input channels
    const int nBatchSize = 1;

    float desigmoid(float x);          // inverse sigmoid (logit)
    float sigmoid(float x);
};

#endif //YOLOX_YOLOXDETECTOR_CUH

类的源文件

//// Created by 17706 on 2023/9/9.//
#include "../include/YoloXDetector.cuh"
// Fused aspect-preserving resize + normalization kernel.
// input : interleaved uchar BGR image, input_width x input_height
// output: planar float tensor (3 x output_height x output_width)
// scale : uniform resize factor (min of the two axis ratios); the area
//         outside the resized image is padded with gray (128).
// Launch: one thread per output pixel; the grid must cover
//         output_width * output_height threads (host launches exactly 640*480).
// NOTE(review): channels are taken in memory order, so "_r" is actually the
// OpenCV B channel -- confirm this matches the channel order used in training.
// NOTE(review): normalization is written as (v/255 - std)/mean; the usual
// ImageNet convention is (v/255 - mean)/std with mean 0.485/0.456/0.406 and
// std 0.229/0.224/0.225. Kept as-is since the deployed model was presumably
// exported with this exact preprocessing -- verify against the training code.
__global__ void resize_normalize_cuda(uchar* input, float* output, int input_width, int input_height, int output_width, int output_height, float scale)
{
    // Flatten the (potentially 3-D) grid/block into one linear pixel index.
    uint idx = (gridDim.x * gridDim.y * blockIdx.z + gridDim.x * blockIdx.y + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z)
             + blockDim.x * blockDim.y * threadIdx.z + blockDim.x * threadIdx.y + threadIdx.x;
    // FIX: bounds guard -- a launch configuration that overshoots the output
    // no longer writes past the end of the tensor.
    if (idx >= (uint)(output_width * output_height)) return;
    uint col = idx % output_width;
    uint row = idx / output_width;

    const int plane = output_width * output_height;
    const int dst = row * output_width + col;

    // Outside the resized image area: write the normalized gray padding value.
    if (col >= (uint)(int)(input_width * scale) || row >= (uint)(int)(input_height * scale)) {
        output[0 * plane + dst] = (128.0f / 255.0f - 0.229f) / 0.485f;
        output[1 * plane + dst] = (128.0f / 255.0f - 0.224f) / 0.456f;
        output[2 * plane + dst] = (128.0f / 255.0f - 0.225f) / 0.406f;
        return;
    }

    // Sub-pixel source position of this output pixel.
    float origin_y = row / scale;
    int origin_row = (int)origin_y;
    float delta_y = origin_y - origin_row;
    float origin_x = col / scale;
    int origin_col = (int)origin_x;
    float delta_x = origin_x - origin_col;

    // Pick the 2x2 neighborhood whose pixel centers bracket the sample point
    // (row choice depends only on delta_y, column choice only on delta_x).
    int row1, row2, col1, col2;
    if (delta_y > 0.5f) { row1 = origin_row;     row2 = origin_row + 1; }
    else                { row1 = origin_row - 1; row2 = origin_row;     }
    if (delta_x > 0.5f) { col1 = origin_col;     col2 = origin_col + 1; }
    else                { col1 = origin_col - 1; col2 = origin_col;     }

    if (origin_y >= 1 && origin_y < input_height - 1 && origin_x >= 1 && origin_x < input_width - 1) {
        // Bilinear interpolation over the four neighbors, per channel.
        float fq11_r = input[row1 * input_width * 3 + col1 * 3 + 0];
        float fq11_g = input[row1 * input_width * 3 + col1 * 3 + 1];
        float fq11_b = input[row1 * input_width * 3 + col1 * 3 + 2];
        float fq21_r = input[row1 * input_width * 3 + col2 * 3 + 0];
        float fq21_g = input[row1 * input_width * 3 + col2 * 3 + 1];
        float fq21_b = input[row1 * input_width * 3 + col2 * 3 + 2];
        float fq12_r = input[row2 * input_width * 3 + col1 * 3 + 0];
        float fq12_g = input[row2 * input_width * 3 + col1 * 3 + 1];
        float fq12_b = input[row2 * input_width * 3 + col1 * 3 + 2];
        float fq22_r = input[row2 * input_width * 3 + col2 * 3 + 0];
        float fq22_g = input[row2 * input_width * 3 + col2 * 3 + 1];
        float fq22_b = input[row2 * input_width * 3 + col2 * 3 + 2];
        // The (int) cast reproduces the original integer quantization of the
        // interpolated value before normalization.
        float value_r = ((int)(fq11_r * (col2 - origin_x) * (row2 - origin_y) + fq21_r * (origin_x - col1) * (row2 - origin_y) + fq12_r * (col2 - origin_x) * (origin_y - row1) + fq22_r * (origin_x - col1) * (origin_y - row1)) / 255.0f - 0.229f) / 0.485f;
        float value_g = ((int)(fq11_g * (col2 - origin_x) * (row2 - origin_y) + fq21_g * (origin_x - col1) * (row2 - origin_y) + fq12_g * (col2 - origin_x) * (origin_y - row1) + fq22_g * (origin_x - col1) * (origin_y - row1)) / 255.0f - 0.224f) / 0.456f;
        float value_b = ((int)(fq11_b * (col2 - origin_x) * (row2 - origin_y) + fq21_b * (origin_x - col1) * (row2 - origin_y) + fq12_b * (col2 - origin_x) * (origin_y - row1) + fq22_b * (origin_x - col1) * (origin_y - row1)) / 255.0f - 0.225f) / 0.406f;
        output[0 * plane + dst] = value_r;
        output[1 * plane + dst] = value_g;
        output[2 * plane + dst] = value_b;
    }
    else {
        // FIX: pixels inside the valid resized area but too close to the source
        // border for the 2x2 window previously left the output UNINITIALIZED;
        // clamp and sample the nearest source pixel instead.
        int r = min(max(origin_row, 0), input_height - 1);
        int c = min(max(origin_col, 0), input_width - 1);
        output[0 * plane + dst] = (input[r * input_width * 3 + c * 3 + 0] / 255.0f - 0.229f) / 0.485f;
        output[1 * plane + dst] = (input[r * input_width * 3 + c * 3 + 1] / 255.0f - 0.224f) / 0.456f;
        output[2 * plane + dst] = (input[r * input_width * 3 + c * 3 + 2] / 255.0f - 0.225f) / 0.406f;
    }
}
float YoloXDetector::desigmoid(float x) {    return  -log(1 / x - 1);}
float YoloXDetector::sigmoid(float x) {    return (1 / (1 + exp(-x)));}
// Build a TensorRT engine from an ONNX model and serialize it to
// engine_save_path. Returns false on parse/build/write failure.
bool  YoloXDetector::load_onnx_engine(const char* onnx_path, const char* engine_save_path) {
    try {
        // Builder + explicit-batch network definition (required for ONNX).
        IBuilder* builder = createInferBuilder(gLogger);
        const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
        nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

        // Parse the ONNX file.
        nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
        // FIX: honor the parse result -- previously a failed parse fell
        // through and built an engine from a broken network.
        const bool parsed = parser->parseFromFile(onnx_path, static_cast<int>(Logger::Severity::kWARNING));
        for (int i = 0; i < parser->getNbErrors(); ++i)
        {
            std::cout << "load error: " << parser->getError(i)->desc() << std::endl;
        }
        if (!parsed) {
            delete parser; delete network; delete builder;
            return false;
        }
        std::cout << "successfully load the onnx model" << std::endl;

        // Builder configuration: 16 MiB workspace, TF32 math allowed.
        IBuilderConfig* config = builder->createBuilderConfig();
        config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));
        config->setFlag(nvinfer1::BuilderFlag::kTF32);

        // Log I/O tensor names and input dimensions for sanity checking.
        const char* input_blob_name = network->getInput(0)->getName();
        const char* output_blob_name_l = network->getOutput(0)->getName();
        const char* output_blob_name_m = network->getOutput(1)->getName();
        const char* output_blob_name_s = network->getOutput(2)->getName();
        printf("input_blob_name : %s \n", input_blob_name);
        printf("output_blob_name_l : %s \n", output_blob_name_l);
        printf("output_blob_name_m : %s \n", output_blob_name_m);
        printf("output_blob_name_s : %s \n", output_blob_name_s);
        const int inputCH = network->getInput(0)->getDimensions().d[1];
        const int inputH = network->getInput(0)->getDimensions().d[2];
        const int inputW = network->getInput(0)->getDimensions().d[3];
        printf("inputCH:%d, inputH : %d, inputW: %d \n", inputCH, inputH, inputW);

        // Build and serialize the engine to disk.
        IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);
        // FIX: check the build result instead of dereferencing a null blob.
        if (!serializedModel) {
            delete parser; delete network; delete config; delete builder;
            return false;
        }
        std::ofstream serialize_output_stream(engine_save_path, std::ios_base::out | std::ios_base::binary);
        serialize_output_stream.write((char*)serializedModel->data(), serializedModel->size());
        const bool written = serialize_output_stream.good();
        serialize_output_stream.close();
        if (written) std::cout << "成功导出" << endl;

        // FIX: release the serialized blob (it was leaked before).
        delete serializedModel;
        delete parser;
        delete network;
        delete config;
        delete builder;

        return written;
    }
    catch (const std::exception& e) {
        std::cout << e.what() << std::endl;
        return false;
    }
}
// Deserialize a serialized TensorRT engine from disk into this->engine and
// create the execution context. Returns false on any failure.
bool  YoloXDetector::load_engine(const char* engine_path) {
    try {
        this->runtime = createInferRuntime(gLogger);
        if (!runtime) return false;

        std::ifstream trtModelFile(engine_path, std::ios_base::in | std::ios_base::binary);
        // FIX: an unreadable path previously produced tellg() == -1 and a
        // bogus allocation; fail fast instead.
        if (!trtModelFile.is_open()) return false;
        trtModelFile.seekg(0, ios::end);
        std::streamsize size = trtModelFile.tellg();
        trtModelFile.seekg(0, ios::beg);
        if (size <= 0) return false;

        char* buff = new char[static_cast<size_t>(size)];
        trtModelFile.read(buff, size);
        trtModelFile.close();
        // FIX: the original declared a LOCAL `ICudaEngine* engine`, shadowing
        // the member -- this->engine stayed uninitialized and the destructor
        // crashed. Also switched to the 2-arg overload; the IPluginFactory
        // overload was removed in TensorRT 8.
        this->engine = runtime->deserializeCudaEngine(buff, static_cast<size_t>(size));
        delete[] buff;
        if (!this->engine) return false;

        this->context = engine->createExecutionContext();
        return this->context != nullptr;
    }
    catch (const std::exception& e) {
        return false;
    }
}
// Allocate all host/device buffers once, up front, and create the CUDA stream
// used for every copy, kernel launch and inference enqueue.
YoloXDetector::YoloXDetector() {
    // FIX: make the TensorRT handles deterministically null so the destructor
    // can tell whether an engine was ever loaded.
    runtime = nullptr;
    context = nullptr;
    engine = nullptr;

    // Host buffers for the three output heads (stride 8/16/32, 10 channels each).
    outblob[0] = malloc((size_t)(h / 8 * w / 8 * 10) * sizeof(float));
    outblob[1] = malloc((size_t)(h / 16 * w / 16 * 10) * sizeof(float));
    outblob[2] = malloc((size_t)(h / 32 * w / 32 * 10) * sizeof(float));

    // Device buffers.
    // NOTE(review): byteImg is sized for a 960x640x3 source image; detect()
    // copies src.rows*src.cols*3 bytes into it, so larger frames would
    // overflow -- confirm the camera resolution is fixed.
    cudaMalloc(&byteImg, (size_t)(640 * 960 * 3) * sizeof(uchar));           // raw input image
    cudaMalloc(&buffers[0], (size_t)(h * w * 3) * sizeof(float));            // network input tensor
    cudaMalloc(&buffers[1], (size_t)(h / 8 * w / 8 * 10) * sizeof(float));   // L head (small objects)
    cudaMalloc(&buffers[2], (size_t)(h / 16 * w / 16 * 10) * sizeof(float)); // M head
    cudaMalloc(&buffers[3], (size_t)(h / 32 * w / 32 * 10) * sizeof(float)); // S head (large objects)

    // Stream ordering all async work of this detector.
    cudaStreamCreate(&(this->stream));
}
// Release the stream, host/device buffers and TensorRT objects.
YoloXDetector::~YoloXDetector() {
    cudaStreamDestroy(stream);

    std::free(outblob[0]);
    std::free(outblob[1]);
    std::free(outblob[2]);

    // FIX: null-check before releasing -- previously these handles were
    // released unconditionally and crashed when no engine had been loaded.
    // (destroy() is deprecated since TensorRT 8; delete is the supported form.)
    delete context;   // deleting nullptr is a no-op
    delete engine;
    delete runtime;

    cudaFree(byteImg);
    cudaFree(buffers[0]);
    cudaFree(buffers[1]);
    cudaFree(buffers[2]);
    cudaFree(buffers[3]);
}
// Run the full detection pipeline on a BGR image:
//   H2D copy -> fused resize+normalize kernel -> TensorRT inference ->
//   D2H copy -> decode the 3 heads -> NMS -> append results to bboxes.
// conf_threshold is a probability; it is mapped to logit space once so raw
// objectness outputs can be thresholded without per-cell sigmoid calls.
// FIX: removed two bare (non-comment) garbled text lines and the dead
// commented-out serial/PPL preprocessing code that made this function
// uncompilable as published.
void YoloXDetector::detect(cv::Mat& src, float conf_threshold, std::vector<BoundingBox>& bboxes) {
    this->deconf = desigmoid(conf_threshold);

    // Uniform letterbox scale that fits src into the model input (w x h).
    float scale_w = (float)(this->w) / src.cols;
    float scale_h = (float)(this->h) / src.rows;
    float resize_scale = min(scale_h, scale_w);
    std::cout << "resize_scale:" << resize_scale << std::endl;

    // FIX: byteImg was allocated for 640*960*3 bytes in the constructor; a
    // larger frame would overflow the device buffer, so reject it.
    if ((size_t)src.rows * src.cols * ch > (size_t)(640 * 960 * 3)) {
        std::cout << "detect: input image exceeds preallocated device buffer" << std::endl;
        return;
    }

    int64 start = cv::getTickCount();

    // Host -> device copy of the raw interleaved image.
    std::cout << "将原始图像数据拷贝至cuda显存中:" << std::endl;
    cudaMemcpyAsync(byteImg, src.data, (size_t)(nBatchSize * src.rows * src.cols * ch) * sizeof(uchar), cudaMemcpyHostToDevice, stream);

    // 32x15 blocks of 32x20 threads = exactly w*h (640*480) threads,
    // one per model-input pixel.
    std::cout << "开始cuda的归一化预处理:" << std::endl;
    dim3 gridDim = { 32,15 };
    dim3 blockDim = { 32,20 };
    resize_normalize_cuda << <gridDim, blockDim, 0, stream >> > ((uchar*)byteImg, (float*)buffers[0], src.cols, src.rows, this->w, this->h, resize_scale);

    std::cout << "start to infer image..." << std::endl;
    context->enqueueV2(buffers, stream, nullptr);
    // Copy the three output heads (stride 8/16/32, 10 channels each) to host.
    cudaMemcpyAsync(outblob[0], buffers[1], (size_t)(h / 8 * w / 8 * 10) * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaMemcpyAsync(outblob[1], buffers[2], (size_t)(h / 16 * w / 16 * 10) * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaMemcpyAsync(outblob[2], buffers[3], (size_t)(h / 32 * w / 32 * 10) * sizeof(float), cudaMemcpyDeviceToHost, stream);
    // FIX: the async copies were never synchronized before outblob was read
    // below -- the host could observe stale/in-flight data.
    cudaStreamSynchronize(stream);

    // Decode detections from each head.
    vector<Rect> o_rect;
    vector<float> o_rect_cof;
    vector<int> class_ids;
    float scales[] = { 8.0f, 16.0f, 32.0f };
    for (int i = 0; i < 3; i++) {
        float scale = scales[i];
        int feature_w = (int)(w / scale);
        int feature_h = (int)(h / scale);
        float* outdata = (float*)outblob[i];
        for (int row = 0; row < feature_h; ++row) {
            for (int col = 0; col < feature_w; ++col) {
                // Channel 4 is the objectness logit; compare in logit space.
                float isObj = outdata[feature_h * feature_w * 4 + row * feature_w + col];
                if (isObj < deconf) continue;
                isObj = sigmoid(isObj);
                // Channels 5..9: per-class logits (5 classes).
                float conf[5];
                for (int cls_id = 0; cls_id < 5; cls_id++) {
                    conf[cls_id] = sigmoid(outdata[feature_h * feature_w * (5 + cls_id) + row * feature_w + col]);
                }
                int best = (int)distance(conf, max_element(conf, conf + 5));
                float cof = conf[best] * isObj;

                // Channels 0..3: (dx, dy, log w, log h) relative to the cell;
                // map back to original-image coordinates, center -> top-left.
                // (Locals renamed so they no longer shadow the members w/h.)
                float cx = (outdata[feature_h * feature_w * 0 + row * feature_w + col] + col) * scale / resize_scale;
                float cy = (outdata[feature_h * feature_w * 1 + row * feature_w + col] + row) * scale / resize_scale;
                float bw = exp(outdata[feature_h * feature_w * 2 + row * feature_w + col]) * scale / resize_scale;
                float bh = exp(outdata[feature_h * feature_w * 3 + row * feature_w + col]) * scale / resize_scale;
                o_rect.push_back(Rect(round(cx - bw / 2), round(cy - bh / 2), round(bw), round(bh)));
                o_rect_cof.push_back(cof);
                class_ids.push_back(best);
            }
        }
    }

    // Class-agnostic NMS (score threshold 0.5, IoU threshold 0.5),
    // then emit the surviving boxes.
    vector<int> final_id;
    dnn::NMSBoxes(o_rect, o_rect_cof, 0.5, 0.5, final_id);
    for (int i = 0; i < (int)final_id.size(); i++) {
        int idx = final_id[i];
        BoundingBox bbox;
        bbox.rect = o_rect[idx];
        bbox.confidence = o_rect_cof[idx];
        bbox.class_id = class_ids[idx];
        bboxes.push_back(bbox);
    }
}

主函数

#include <iostream>#include "../include/YoloXDetector.cuh"
int main() {    std::string src_path = "D:\\WorkSpace\\graduate-student\\study\\graduation_study\\codes\\3-yolox-quantization\\TensorRT-int8-quantization\\dataset\\images\\";    std::vector<cv::String> file_vec;    cv::glob(src_path + "*.jpg", file_vec, false);    std::random_shuffle(file_vec.begin(), file_vec.end());    int idx_num = 0;    YoloXDetector detector =  YoloXDetector();    detector.load_engine("optim_model_fp16.engine");
    for (std::string file_name : file_vec)    {        vector<BoundingBox> bboxes;        cv::Mat src = imread(file_name, cv::IMREAD_COLOR);        if (!src.isContinuous()) {            return -1;        }        detector.detect(src, 0.5f,bboxes);        for(BoundingBox bbox : bboxes) {            std::cout << file_name << std::endl;            std::cout << bbox.class_id << " " << bbox.confidence << " " << bbox.rect << std::endl;            cv::rectangle(src, bbox.rect, cv::Scalar(0, 0, 255), 2, 8, 0);            cv::putText(src,                        std::to_string(bbox.class_id),                        cv::Point(bbox.rect.x,bbox.rect.y - 5),                        cv::FONT_HERSHEY_PLAIN,                        1.2,                        cv::Scalar(0, 255, 0), 2);        }        cv::imshow("src", src);        cv::waitKey();    }}

CMakeLists.txt

cmake_minimum_required (VERSION 3.8)
#=========================================================project(yolox LANGUAGES CUDA CXX C)                                 #1 工程名# OpenCV配置set(OpenCV_DIR "D:\\Documents\\vcpkg\\packages\\opencv4_x64-windows")    #2 opencv目录改成自己的目录set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include) #3set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\lib) #4file(GLOB  OpenCV_Release_LIBS "${OpenCV_DIR}\\lib\\*.lib")set(OpenCV_Release_LIBS            opencv_aruco4.lib            opencv_barcode4.lib            opencv_bgsegm4.lib            opencv_bioinspired4.lib            opencv_calib3d4.lib            opencv_ccalib4.lib            opencv_core4.lib            opencv_datasets4.lib            opencv_dnn4.lib            opencv_dnn_objdetect4.lib            opencv_dnn_superres4.lib            opencv_dpm4.lib            opencv_face4.lib            opencv_features2d4.lib            opencv_flann4.lib            opencv_fuzzy4.lib            opencv_hdf4.lib            opencv_hfs4.lib            opencv_highgui4.lib            opencv_imgcodecs4.lib            opencv_imgproc4.lib            opencv_img_hash4.lib            opencv_intensity_transform4.lib            opencv_line_descriptor4.lib            opencv_mcc4.lib            opencv_ml4.lib            opencv_objdetect4.lib            opencv_optflow4.lib            opencv_phase_unwrapping4.lib            opencv_photo4.lib            opencv_plot4.lib            opencv_quality4.lib            opencv_rapid4.lib            opencv_reg4.lib            opencv_saliency4.lib            opencv_shape4.lib            opencv_stereo4.lib            opencv_stitching4.lib            opencv_structured_light4.lib            opencv_superres4.lib            opencv_surface_matching4.lib            opencv_text4.lib            opencv_tracking4.lib            opencv_video4.lib            opencv_videoio4.lib            opencv_videostab4.lib            opencv_wechat_qrcode4.lib            opencv_xfeatures2d4.lib            opencv_ximgproc4.lib            opencv_xobjdetect4.lib         
   opencv_xphoto4.lib)    #6message(STATUS "    OpenCV_Release_LIBS: " ${OpenCV_Release_LIBS})
link_directories(${OpenCV_LIB_DIRS})include_directories(${OpenCV_INCLUDE_DIRS})

# TRT配置set(TRT_DIR "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\TensorRT-8.6.1.6")  #7set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include) #8set(TRT_LIB_DIRS ${TRT_DIR}\\lib)         #9set(TRT_LIBS    nvinfer.lib    nvinfer_plugin.lib    nvonnxparser.lib    nvparsers.lib)link_directories(${TRT_LIB_DIRS})include_directories(${TRT_INCLUDE_DIRS})
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)set(CMAKE_CXX_STANDARD 14) # CUDA配置find_package(CUDA REQUIRED)message(STATUS "    libraries: ${CUDA_LIBRARIES}")message(STATUS "    include path: ${CUDA_INCLUDE_DIRS}") include_directories(${CUDA_INCLUDE_DIRS})link_directories(${CUDA_LIB_DIRS}) # 设置set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED ")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED '-DCMAKE_MAKE_PROGRAM=C:/Program Files/JetBrains/CLion 2023.1.4/bin/ninja/win/x64/ninja.exe' -G Ninja -S 'D:/WorkSpace/CppSolution/YoloXTensorRT_Cmake' -B 'D:/WorkSpace/CppSolution/YoloXTensorRT_Cmake/cmake-build-release-visual-studio'")
set(CMAKE_INSTALL_PREFIX  "${CMAKE_CURRENT_SOURCE_DIR}/install")
cuda_add_executable(yolox_detector ${PROJECT_SOURCE_DIR}/src/source/YoloXTensorRT.cu)   #17target_link_libraries(yolox_detector    ${TRT_LIBS}    ${OpenCV_Release_LIBS}    ${CUDA_LIBRARIES}) #18set_target_properties(yolox_detector PROPERTIES CUDA_ARCHITECTURES "86")


file(GLOB_RECURSE CURRENT_HEADERS  *.h *.hpp *.cuh)file(GLOB CURRENT_SOURCES  *.cpp *.cu)source_group("Include" FILES ${CURRENT_HEADERS})source_group("Source" FILES ${CURRENT_SOURCES})cuda_add_library(YoloxDetector STATIC    ${CURRENT_HEADERS}    ${CURRENT_SOURCES}    src/source/YoloXDetector.cu)target_link_libraries(YoloxDetector    ${TRT_LIBS}    ${OpenCV_Release_LIBS}    ${CUDA_LIBRARIES}) #18set_target_properties(YoloxDetector PROPERTIES CUDA_ARCHITECTURES "86")install(TARGETS YoloxDetector DESTINATION bin) #将 test 安装到 /usr/local/bin 目录下

link_directories(main PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/install/")
cuda_add_executable(main    src/source/main.cpp#    ${CURRENT_HEADERS}    )target_link_libraries(main    ${TRT_LIBS}    ${OpenCV_Release_LIBS}    ${CUDA_LIBRARIES}    YoloxDetector) #18set_target_properties(main PROPERTIES CUDA_ARCHITECTURES "86")install(TARGETS main DESTINATION bin) #将 test 安装到 /usr/local/bin 目录下
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值