1、TensorRT简介:
TensorRT 是英伟达(NVIDIA)公司推出的一款用于高性能深度学习模型推理的软件开发工具包。它可以为深度学习应用提供低延迟、高吞吐率的部署推理,适用于超大规模数据中心、嵌入式平台或自动驾驶平台等场景。

2、YoloX简介
YoloX 是基于 YOLO(You Only Look Once)系列算法的实时目标检测算法的改进版本。YOLO 系列算法在计算机视觉领域具有很高的知名度和影响力,它们可以实现快速准确的目标检测。YoloX 对 YOLO 系列算法进行了优化和改进,以提高性能和准确性。其具有anchor free的特性,从一种内生的视角来识别覆盖了锚点的目标物。

3、本文任务:
笔者将基于TensorRT部署YoloX目标检测算法,用于检测透镜位置。透镜图像如下图所示:


为避免TensorRT Plugin的编写(插件编写后续也会实现),本文将原始YoloX的Focus操作使用5×5卷积进行替换,替换后的模型头部如图所示,模型的输入大小为1×3×H(480)×W(640)

模型输出三个尺度的特征层:L(1×10×60×80)预测小目标、M(1×10×30×40)预测中等尺度目标、S(1×10×15×20)预测大尺度目标。其中每个网格单元的 10 个通道依次为:4 个边界框回归量(dx,dy,w,h)、1 个目标置信度(objectness)、以及 5 个类别得分。



4、模型构建
4.1、构建YoloxDetector推理类
#ifndef YOLOX_YOLOXDETECTOR_CUH#define YOLOX_YOLOXDETECTOR_CUH#include <iostream>#include <fstream>#include "NvOnnxParser.h"#include "NvInfer.h"#include "../../../../../Documents/vcpkg/packages/opencv4_x64-windows/include/opencv2/opencv.hpp"#include <cuda_runtime_api.h>#include <device_launch_parameters.h>using namespace cv;using namespace nvinfer1;using namespace nvonnxparser;using namespace std;class Logger : public ILogger{//void log(Severity severity, const char* msg) overridevoid log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override{// suppress info-level messagesif (severity <= Severity::kWARNING)std::cout << msg << std::endl;}};typedef struct __BoundingBox{Rect rect;float confidence;int class_id;} BoundingBox;class YoloXDetector {public:// members// functionsbool load_engine(const char* engine_path);bool load_onnx_engine(const char* onnx_path, const char* engine_save_path);void detect(cv::Mat& src,float conf_thread,std::vector<BoundingBox>& bboxes);YoloXDetector();~YoloXDetector();private:// membersfloat deconf;Logger gLogger;IRuntime* runtime;IExecutionContext* context;void* outblob[3] = { nullptr, nullptr, nullptr };void* buffers[4] = { nullptr, nullptr, nullptr, nullptr };void* byteImg = nullptr;cudaStream_t stream;ICudaEngine* engine;const int h = 480;//获取图片的长const int w = 640;//获取图片的宽const int ch = 3;const int nBatchSize = 1;// functionsfloat desigmoid(float x);float sigmoid(float x);};#endif //YOLOX_YOLOXDETECTOR_CUH
4.2、模型构建
从TensorRT模型构建推理会话
// Deserialize a prebuilt TensorRT engine file and create the execution
// context. Returns false on any failure (missing file, bad engine, etc.).
bool YoloXDetector::load_engine(const char* engine_path)
{
    try {
        this->runtime = createInferRuntime(gLogger);
        if (this->runtime == nullptr) return false;
        std::ifstream trtModelFile(engine_path, std::ios_base::in | std::ios_base::binary);
        // FIX: the original never checked the open; it silently read garbage
        // from a closed stream when the path was wrong.
        if (!trtModelFile.is_open()) return false;
        trtModelFile.seekg(0, ios::end);
        int size = (int)trtModelFile.tellg();
        trtModelFile.seekg(0, ios::beg);
        char* buff = new char[size];
        trtModelFile.read(buff, size);
        trtModelFile.close();
        // BUG FIX: the original declared a *local* `engine`, shadowing the
        // member of the same name; ~YoloXDetector() then destroyed an
        // uninitialized pointer.
        this->engine = runtime->deserializeCudaEngine((void*)buff, size, NULL);
        delete[] buff;
        if (this->engine == nullptr) return false;
        // Create the execution context used by detect().
        this->context = engine->createExecutionContext();
        return this->context != nullptr;
    }
    catch (const std::exception& e) {
        return false;
    }
}
5、推理流程
目标检测问题的推理大致流程如下,其他推理框架也大致相同
读入图片->不失真缩放到640*480->维度变换为(CH×H×W)->归一化(除255,减均值,除方差)->与模型输入输出缓冲区绑定->推理->读取推理结果张量->结果解析->NMS
本文使用OpenCV读取图片,读取完成后使用CUDA编程实现不失真缩放和归一化,然后输入到模型进行推理,推理完成后将输出张量从显存转移到内存,在CPU上进行后处理
5.1、读图
// Load the image as 3-channel BGR (OpenCV's default channel order).
cv::Mat src = imread(file_name, cv::IMREAD_COLOR);
5.2、内存拷贝&resize&归一化
内存拷贝
// 内存到GPU显存std::cout << "将原始图像数据拷贝至cuda显存中:" << std::endl;cudaMemcpyAsync(byteImg, src.data, (int)(nBatchSize * src.rows * src.cols * ch * sizeof(uchar)), cudaMemcpyHostToDevice, stream);
不失真缩放和归一化
//计算缩放系数float scale_w = (float)(this->w) / src.cols;//this->w为模型默认输入宽度float scale_h = (float)(this->h) / src.rows;//this->h为模型默认输入高度float resize_scale = min(scale_h, scale_w);//计算缩放系数std::cout << "resize_scale:" << resize_scale << std::endl;
cuda核函数
// Aspect-preserving bilinear resize + per-channel normalization of an
// interleaved uchar image into a planar CHW float tensor; one thread per
// output pixel (the launch must cover output_width*output_height threads).
// NOTE(review): the constants pair as (x/255 - 0.229)/0.485 etc. -- the usual
// ImageNet mean/std values appear swapped; confirm against how the network
// was trained before changing anything.
__global__ void resize_normalize_cuda(uchar * input, float* output, int input_width, int input_height, int output_width, int output_height, float scale)
{
    // Linearize the (grid, block) coordinates into one output pixel index.
    uint idx = (gridDim.x * gridDim.y * blockIdx.z + gridDim.x * blockIdx.y + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z) + blockDim.x * blockDim.y * threadIdx.z + blockDim.x * threadIdx.y + threadIdx.x;
    uint col = idx % (output_width);
    uint row = idx / (output_width);
    // NOTE(review): col and row are unsigned, so the `< 0` tests below can
    // never fire.
    if (col < 0 || col >= (int)(input_width * scale) || row < 0 || row >= (int)(input_height * scale)) {
        // Letterbox padding region: gray (128) pushed through the same
        // per-channel normalization as real pixels.
        output[0 * output_width * output_height + row * output_width + col] = (128.0 / 255.0 - 0.229) / 0.485;
        output[1 * output_width * output_height + row * output_width + col] = (128.0 / 255.0 - 0.224) / 0.456;
        output[2 * output_width * output_height + row * output_width + col] = (128.0 / 255.0 - 0.225) / 0.406;
        return;
    }
    else {
        // Sub-pixel position of this output pixel back in the source image.
        float origin_y = row / scale;
        int origin_row = (int)origin_y;
        float delta_y = origin_y - origin_row;
        float origin_x = col / scale;
        int origin_col = (int)origin_x;
        float delta_x = origin_x - origin_col;
        // Pick the 2x2 neighborhood whose pixel centers bracket the sample point.
        int row1, row2, col1, col2;
        if (delta_x > 0.5 && delta_y > 0.5) {
            row1 = origin_row;
            row2 = origin_row + 1;
            col1 = origin_col;
            col2 = origin_col + 1;
        }
        else if (delta_x > 0.5 && delta_y <= 0.5) {
            row1 = origin_row - 1;
            row2 = origin_row;
            col1 = origin_col;
            col2 = origin_col + 1;
        }
        else if (delta_x <= 0.5 && delta_y > 0.5) {
            row1 = origin_row;
            row2 = origin_row + 1;
            col1 = origin_col - 1;
            col2 = origin_col;
        }
        else {
            row1 = origin_row - 1;
            row2 = origin_row;
            col1 = origin_col - 1;
            col2 = origin_col;
        }
        // NOTE(review): source positions within 1px of the image border are
        // skipped entirely, leaving that part of the output tensor
        // UNINITIALIZED -- confirm this is acceptable or clamp the fetches.
        if (origin_y >= 1 && origin_y < input_height - 1 && origin_x >= 1 && origin_x < input_width - 1) {
            // Fetch the 2x2 neighborhood per channel (interleaved HWC layout).
            float fq11_r = input[row1 * input_width * 3 + col1 * 3 + 0];
            float fq11_g = input[row1 * input_width * 3 + col1 * 3 + 1];
            float fq11_b = input[row1 * input_width * 3 + col1 * 3 + 2];
            float fq21_r = input[row1 * input_width * 3 + col2 * 3 + 0];
            float fq21_g = input[row1 * input_width * 3 + col2 * 3 + 1];
            float fq21_b = input[row1 * input_width * 3 + col2 * 3 + 2];
            float fq12_r = input[row2 * input_width * 3 + col1 * 3 + 0];
            float fq12_g = input[row2 * input_width * 3 + col1 * 3 + 1];
            float fq12_b = input[row2 * input_width * 3 + col1 * 3 + 2];
            float fq22_r = input[row2 * input_width * 3 + col2 * 3 + 0];
            float fq22_g = input[row2 * input_width * 3 + col2 * 3 + 1];
            float fq22_b = input[row2 * input_width * 3 + col2 * 3 + 2];
            // Bilinear blend, then normalize. Note the (int) truncation of the
            // blended value happens BEFORE the /255.
            float value_r = ((int)(fq11_r * (col2 - origin_x) * (row2 - origin_y) + fq21_r * (origin_x - col1) * (row2 - origin_y) + fq12_r * (col2 - origin_x) * (origin_y - row1) + fq22_r * (origin_x - col1) * (origin_y - row1)) / 255.0 - 0.229) / 0.485;
            float value_g = ((int)(fq11_g * (col2 - origin_x) * (row2 - origin_y) + fq21_g * (origin_x - col1) * (row2 - origin_y) + fq12_g * (col2 - origin_x) * (origin_y - row1) + fq22_g * (origin_x - col1) * (origin_y - row1)) / 255.0 - 0.224) / 0.456;
            float value_b = ((int)(fq11_b * (col2 - origin_x) * (row2 - origin_y) + fq21_b * (origin_x - col1) * (row2 - origin_y) + fq12_b * (col2 - origin_x) * (origin_y - row1) + fq22_b * (origin_x - col1) * (origin_y - row1)) / 255.0 - 0.225) / 0.406;
            // Write the planar (CHW) output.
            output[0 * output_width * output_height + row * output_width + col] = value_r;
            output[1 * output_width * output_height + row * output_width + col] = value_g;
            output[2 * output_width * output_height + row * output_width + col] = value_b;
        }
    }
}
核函数调用
std::cout << "开始cuda的归一化预处理:" << std::endl;
// 32*15 blocks of 32*20 threads = 307200 threads = exactly 640*480: one
// thread per output pixel. NOTE(review): this launch geometry is hard-coded
// to the 640x480 network input and must change if w/h change.
dim3 gridDim = { 32,15 };
dim3 blockDim = { 32,20 };
resize_normalize_cuda << <gridDim, blockDim, 0, stream >> > ((uchar*)byteImg, (float*)buffers[0], src.cols, src.rows, this->w, this->h, resize_scale);
5.3、推理和结果拷贝
// Enqueue inference, then copy the three output heads (strides 8/16/32,
// 10 channels per cell) back to host memory.
// NOTE(review): every call here is asynchronous on `stream`, and no
// cudaStreamSynchronize follows before the host parses outblob -- that is a
// race; confirm a synchronize happens before the results are read.
context->enqueueV2(buffers, stream, nullptr);
cudaMemcpyAsync(outblob[0], buffers[1], (int)(h / 8 * w / 8 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(outblob[1], buffers[2], (int)(h / 16 * w / 16 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(outblob[2], buffers[3], (int)(h / 32 * w / 32 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);
5.4、结果解析
// Decode the three output heads into candidate boxes, then run NMS.
vector<Rect> o_rect; vector<float> o_rect_cof; vector<int> class_ids;
float scales[] = { 8.0f,16.0f,32.0f };   // strides of the L/M/S heads
for (int i = 0; i < 3; i++)
{
    float scale = scales[i];
    int feature_w = w / scale;
    int feature_h = h / scale;
    float* outdata = (float*)outblob[i];
    for (int row = 0; row < feature_h; ++row)
    {
        for (int col = 0; col < feature_w; ++col)
        {
            // Channel layout per cell: 0..3 = dx,dy,w,h ; 4 = objectness
            // logit ; 5..9 = class logits. Compare the raw logit against
            // deconf = desigmoid(threshold) to reject cheaply without sigmoid.
            float isObj = outdata[feature_h * feature_w * 4 + row * feature_w + col];
            if (isObj < deconf)
            {
                continue;
            }
            isObj = sigmoid(isObj);
            float conf[5];
            for (int cls_id = 0; cls_id < 5; cls_id++) {
                conf[cls_id] = outdata[feature_h * feature_w * (5 + cls_id) + row * feature_w + col];
                conf[cls_id] = sigmoid(conf[cls_id]);
            }
            // Winning class and final score = class prob * objectness.
            int idx = distance(conf, max_element(conf, conf + sizeof(conf) / sizeof(conf[0])));
            float max = conf[idx];
            float cof = max * isObj;
            // Note: the network outputs center coordinates; they are converted
            // to corner coordinates below. Each value is mapped: cell offset ->
            // feature map -> network input pixels (*scale) -> source pixels
            // (/resize_scale).
            float x = outdata[feature_h * feature_w * 0 + row * feature_w + col];
            x += col;
            x *= scale;
            x /= resize_scale;
            float y = outdata[feature_h * feature_w * 1 + row * feature_w + col];
            y += row;
            y *= scale;
            y /= resize_scale;
            float w = outdata[feature_h * feature_w * 2 + row * feature_w + col];
            w = exp(w);   // box sizes are exp-encoded
            w *= scale;
            w /= resize_scale;
            float h = outdata[feature_h * feature_w * 3 + row * feature_w + col];
            h = exp(h);
            h *= scale;
            h /= resize_scale;
            // Center -> top-left corner.
            float r_x = (x - w / 2);
            float r_y = (y - h / 2);
            Rect rect = Rect(round(r_x), round(r_y), round(w), round(h));
            o_rect.push_back(rect);
            o_rect_cof.push_back(cof);
            class_ids.push_back(idx);
        }
    }
}
// Class-agnostic NMS (score threshold 0.5, IoU threshold 0.5), then emit the
// surviving boxes.
vector<int> final_id;
dnn::NMSBoxes(o_rect, o_rect_cof, 0.5, 0.5, final_id);
for (int i = 0; i < final_id.size(); i++) {
    int idx = final_id[i];
    Rect rect = o_rect[idx];
    float cof = o_rect_cof[idx];
    int class_id = class_ids[idx];
    BoundingBox bbox;
    bbox.rect = rect;
    bbox.confidence = cof;
    bbox.class_id = class_id;
    bboxes.push_back(bbox);
}
6、结果展示

7、完整代码
类的头文件
#ifndef YOLOX_YOLOXDETECTOR_CUH
#define YOLOX_YOLOXDETECTOR_CUH
#include <iostream>
//#include <ppl.h>
//#include <mutex>
#include <fstream>
#include "NvOnnxParser.h"
#include "NvInfer.h"
// NOTE(review): machine-specific relative include path; prefer
// <opencv2/opencv.hpp> with the include directory supplied by the build.
#include "../../../../../Documents/vcpkg/packages/opencv4_x64-windows/include/opencv2/opencv.hpp"
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
// NOTE(review): using-directives in a header leak into every includer; the
// existing .cu sources depend on them.
using namespace cv;
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace std;
// Minimal TensorRT logger: prints warnings and errors, drops info and below.
class Logger : public ILogger
{
    //void log(Severity severity, const char* msg) override
    void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};
// One detection result: box in source-image pixels, combined
// objectness*class confidence, and the winning class index.
typedef struct __BoundingBox
{
    Rect rect;
    float confidence;
    int class_id;
} BoundingBox;
class YoloXDetector {
public:
    // members
    // functions
    // Deserialize a prebuilt TensorRT engine file. Returns false on failure.
    bool load_engine(const char* engine_path);
    // Build an engine from an ONNX model and serialize it to disk.
    bool load_onnx_engine(const char* onnx_path, const char* engine_save_path);
    // Detect objects in src; results above conf_thread are appended to bboxes.
    void detect(cv::Mat& src,float conf_thread,std::vector<BoundingBox>& bboxes);
    YoloXDetector();
    ~YoloXDetector();
private:
    // members
    float deconf;                  // desigmoid(threshold): raw-logit cutoff
    Logger gLogger;
    // NOTE(review): runtime/context/engine are never null-initialized; the
    // destructor calls destroy() on them even if load_engine was never run.
    IRuntime* runtime;
    IExecutionContext* context;
    void* outblob[3] = { nullptr, nullptr, nullptr };            // host output tensors
    void* buffers[4] = { nullptr, nullptr, nullptr, nullptr };   // device: [0]=input, [1..3]=outputs
    void* byteImg = nullptr;       // device copy of the raw BGR bytes
    cudaStream_t stream;
    ICudaEngine* engine;
    const int h = 480;             // network input height
    const int w = 640;             // network input width
    const int ch = 3;
    const int nBatchSize = 1;
    // functions
    float desigmoid(float x);      // inverse sigmoid
    float sigmoid(float x);
};
#endif //YOLOX_YOLOXDETECTOR_CUH
类的源文件
//// Created by 17706 on 2023/9/9.//#include "../include/YoloXDetector.cuh"__global__ void resize_normalize_cuda(uchar * input, float* output, int input_width, int input_height, int output_width, int output_height, float scale){uint idx = (gridDim.x * gridDim.y * blockIdx.z + gridDim.x * blockIdx.y + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z) + blockDim.x * blockDim.y * threadIdx.z + blockDim.x * threadIdx.y + threadIdx.x;uint col = idx % (output_width);uint row = idx / (output_width);if (col < 0 || col >= (int)(input_width * scale) || row < 0 || row >= (int)(input_height * scale)) {output[0 * output_width * output_height + row * output_width + col] = (128.0 / 255.0 - 0.229) / 0.485;output[1 * output_width * output_height + row * output_width + col] = (128.0 / 255.0 - 0.224) / 0.456;output[2 * output_width * output_height + row * output_width + col] = (128.0 / 255.0 - 0.225) / 0.406;return;}else {// 获取在原图中的亚像素位置float origin_y = row / scale;int origin_row = (int)origin_y;float delta_y = origin_y - origin_row;float origin_x = col / scale;int origin_col = (int)origin_x;float delta_x = origin_x - origin_col;int row1, row2, col1, col2;if (delta_x > 0.5 && delta_y > 0.5) {row1 = origin_row;row2 = origin_row + 1;col1 = origin_col;col2 = origin_col + 1;}else if (delta_x > 0.5 && delta_y <= 0.5) {row1 = origin_row - 1;row2 = origin_row;col1 = origin_col;col2 = origin_col + 1;}else if (delta_x <= 0.5 && delta_y > 0.5) {row1 = origin_row;row2 = origin_row + 1;col1 = origin_col - 1;col2 = origin_col;}else {row1 = origin_row - 1;row2 = origin_row;col1 = origin_col - 1;col2 = origin_col;}if (origin_y >= 1 && origin_y < input_height - 1 && origin_x >= 1 && origin_x < input_width - 1) {float fq11_r = input[row1 * input_width * 3 + col1 * 3 + 0];float fq11_g = input[row1 * input_width * 3 + col1 * 3 + 1];float fq11_b = input[row1 * input_width * 3 + col1 * 3 + 2];float fq21_r = input[row1 * input_width * 3 + col2 * 3 + 0];float fq21_g = input[row1 * input_width * 3 + col2 
* 3 + 1];float fq21_b = input[row1 * input_width * 3 + col2 * 3 + 2];float fq12_r = input[row2 * input_width * 3 + col1 * 3 + 0];float fq12_g = input[row2 * input_width * 3 + col1 * 3 + 1];float fq12_b = input[row2 * input_width * 3 + col1 * 3 + 2];float fq22_r = input[row2 * input_width * 3 + col2 * 3 + 0];float fq22_g = input[row2 * input_width * 3 + col2 * 3 + 1];float fq22_b = input[row2 * input_width * 3 + col2 * 3 + 2];float value_r = ((int)(fq11_r * (col2 - origin_x) * (row2 - origin_y) + fq21_r * (origin_x - col1) * (row2 - origin_y) + fq12_r * (col2 - origin_x) * (origin_y - row1) + fq22_r * (origin_x - col1) * (origin_y - row1)) / 255.0 - 0.229) / 0.485;float value_g = ((int)(fq11_g * (col2 - origin_x) * (row2 - origin_y) + fq21_g * (origin_x - col1) * (row2 - origin_y) + fq12_g * (col2 - origin_x) * (origin_y - row1) + fq22_g * (origin_x - col1) * (origin_y - row1)) / 255.0 - 0.224) / 0.456;float value_b = ((int)(fq11_b * (col2 - origin_x) * (row2 - origin_y) + fq21_b * (origin_x - col1) * (row2 - origin_y) + fq12_b * (col2 - origin_x) * (origin_y - row1) + fq22_b * (origin_x - col1) * (origin_y - row1)) / 255.0 - 0.225) / 0.406;output[0 * output_width * output_height + row * output_width + col] = value_r;output[1 * output_width * output_height + row * output_width + col] = value_g;output[2 * output_width * output_height + row * output_width + col] = value_b;//float value_r = ((fq11_r * (col2 - origin_x) * (row2 - origin_y) + fq21_r * (origin_x - col1) * (row2 - origin_y) + fq12_r * (col2 - origin_x) * (origin_y - row1) + fq22_r * (origin_x - col1) * (origin_y - row1)));//float value_g = ((fq11_g * (col2 - origin_x) * (row2 - origin_y) + fq21_g * (origin_x - col1) * (row2 - origin_y) + fq12_g * (col2 - origin_x) * (origin_y - row1) + fq22_g * (origin_x - col1) * (origin_y - row1)));//float value_b = ((fq11_b * (col2 - origin_x) * (row2 - origin_y) + fq21_b * (origin_x - col1) * (row2 - origin_y) + fq12_b * (col2 - origin_x) * (origin_y - row1) + fq22_b * 
(origin_x - col1) * (origin_y - row1)));//output[row * output_width * 3 + col * 3 + 0] = value_r;//output[row * output_width * 3 + col * 3 + 1] = value_g;//output[row * output_width * 3 + col * 3 + 2] = value_b;}}}float YoloXDetector::desigmoid(float x) {return -log(1 / x - 1);}float YoloXDetector::sigmoid(float x) {return (1 / (1 + exp(-x)));}bool YoloXDetector::load_onnx_engine(const char* onnx_path, const char* engine_save_path){try {// 实例化builderIBuilder* builder = createInferBuilder(gLogger);const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);// 加载onnx文件nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);parser->parseFromFile(onnx_path, static_cast<int>(Logger::Severity::kWARNING));for (int i = 0; i < parser->getNbErrors(); ++i){std::cout << "load error: " << parser->getError(i)->desc() << std::endl;}std::cout << "successfully load the onnx model" << std::endl;// 创建引擎//unsigned int maxBatchSize = 1;//builder->setMaxBatchSize(maxBatchSize);IBuilderConfig* config = builder->createBuilderConfig();//config->setMaxWorkspaceSize(16 * (1 << 20));config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));config->setFlag(nvinfer1::BuilderFlag::kTF32);//设置精度//config->setQuantizationFlag(nvinfer1::QuantizationFlag::kCALIBRATE_BEFORE_FUSION)// 获取输入与输出名称,格式const char* input_blob_name = network->getInput(0)->getName();const char* output_blob_name_l = network->getOutput(0)->getName();const char* output_blob_name_m = network->getOutput(1)->getName();const char* output_blob_name_s = network->getOutput(2)->getName();printf("input_blob_name : %s \n", input_blob_name);printf("output_blob_name_l : %s \n", output_blob_name_l);printf("output_blob_name_m : %s \n", output_blob_name_m);printf("output_blob_name_s : %s \n", output_blob_name_s);const int inputCH = network->getInput(0)->getDimensions().d[1];const 
int inputH = network->getInput(0)->getDimensions().d[2];const int inputW = network->getInput(0)->getDimensions().d[3];printf("inputCH:%d, inputH : %d, inputW: %d \n", inputCH, inputH, inputW);// 序列化IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);std::ofstream serialize_output_stream = std::ofstream(engine_save_path, std::ios_base::out | std::ios_base::binary);serialize_output_stream.write((char*)serializedModel->data(), serializedModel->size());serialize_output_stream.close();std::cout << "成功导出" << endl;delete parser;delete network;delete config;delete builder;return true;}catch (const std::exception& e) {std::cout << e.what() << std::endl;return false;}}bool YoloXDetector::load_engine(const char* engine_path){try {this->runtime = createInferRuntime(gLogger);std::ifstream trtModelFile(engine_path, std::ios_base::in | std::ios_base::binary);trtModelFile.seekg(0, ios::end);int size = trtModelFile.tellg();trtModelFile.seekg(0, ios::beg);char* buff = new char[size];trtModelFile.read(buff, size);trtModelFile.close();ICudaEngine* engine = runtime->deserializeCudaEngine((void*)buff, size, NULL);delete[]buff;//创建contextthis->context = engine->createExecutionContext();return true;}catch (const std::exception& e) {return false;}}YoloXDetector::YoloXDetector() {outblob[0] = malloc((int)(h / 8 * w / 8 * 10 * sizeof(float)));outblob[1] = malloc((int)(h / 16 * w / 16 * 10 * sizeof(float)));outblob[2] = malloc((int)(h / 32 * w / 32 * 10 * sizeof(float)));cudaMalloc(&byteImg, (int)(640 * 960 * 3 * sizeof(uchar)));//原始图像数据cudaMalloc(&buffers[0], (int)(h * w * 3 * sizeof(float)));//输入特征层cudaMalloc(&buffers[1], (int)(h / 8 * w / 8 * 10 * sizeof(float)));//输出特征层cudaMalloc(&buffers[2], (int)(h / 16 * w / 16 * 10 * sizeof(float)));//输出特征层cudaMalloc(&buffers[3], (int)(h / 32 * w / 32 * 10 * sizeof(float)));//输出特征层//创建cuda流cudaStreamCreate(&(this->stream));}YoloXDetector::~YoloXDetector() 
{cudaStreamDestroy(stream);std::free(outblob[0]);std::free(outblob[1]);std::free(outblob[2]);context->destroy();engine->destroy();runtime->destroy();cudaFree(byteImg);cudaFree(buffers[0]);cudaFree(buffers[1]);cudaFree(buffers[2]);cudaFree(buffers[3]);}void YoloXDetector::detect(cv::Mat& src,float conf_threshold,std::vector<BoundingBox>& bboxes) {this->deconf = desigmoid(conf_threshold);float scale_w = (float)(this->w) / src.cols;float scale_h = (float)(this->h) / src.rows;float resize_scale = min(scale_h, scale_w);std::cout << "resize_scale:" << resize_scale << std::endl;串行计算版//for (size_t row = 0; row < 640; row++) {// for (size_t col = 0; col < 960; col++) {// for (size_t ch = 0; ch < 3; ch++) {// data[960 * 640 * ch + row * 960 + col] = float(img.at<Vec3b>(row, col)[ch]) / 255.0f;// data[960 * 640 * ch + row * 960 + col] -= mean_value[ch];// data[960 * 640 * ch + row * 960 + col] /= scale_value[ch];data[960 * 640 * ch + row * 960 + col] = float(img.at<Vec3b>(row, col)[ch]);// }// }//}并行计算版//parallel_for(0, 3, [&](int ch)// {// for (size_t row = 0; row < h; row++) {// for (size_t col = 0; col < w; col++) {// //data[w * h * ch + row * w + col] = (float(img.at<Vec3b>(row, col)[ch]) / 255.0 - mean_value[ch]) / scale_value[ch];// //data[960 * 640 * ch + row * 960 + col] = img.at<Vec3f>(row, col)[ch];// data[w * h * ch + row * w + col] = ((float)(img.data[row * w * 3 + col * 3 + ch] )/ 255.0 - mean_value[ch]) / scale_value[ch];// }// }// });int64 start = cv::getTickCount();// 内存到GPU显存std::cout << "将原始图像数据拷贝至cuda显存中:" << std::endl;cudaMemcpyAsync(byteImg, src.data, (int)(nBatchSize * src.rows * src.cols * ch * sizeof(uchar)), cudaMemcpyHostToDevice, stream);std::cout << "开始cuda的归一化预处理:" << std::endl;dim3 gridDim = { 32,15 };dim3 blockDim = { 32,20 };resize_normalize_cuda << <gridDim, blockDim, 0, stream >> > ((uchar*)byteImg, (float*)buffers[0], src.cols, src.rows, this->w, this->h, resize_scale);std::cout << "start to infer image..." 
<< std::endl;// 推理context->enqueueV2(buffers, stream, nullptr);cudaMemcpyAsync(outblob[0], buffers[1], (int)(h / 8 * w / 8 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);cudaMemcpyAsync(outblob[1], buffers[2], (int)(h / 16 * w / 16 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);cudaMemcpyAsync(outblob[2], buffers[3], (int)(h / 32 * w / 32 * 10 * sizeof(float)), cudaMemcpyDeviceToHost, stream);float fps = cv::getTickFrequency() / (cv::getTickCount() - start);float time = (cv::getTickCount() - start) / cv::getTickFrequency();//结果解析vector<Rect> o_rect; vector<float> o_rect_cof; vector<int> class_ids;float scales[] = { 8.0f,16.0f,32.0f };for (int i = 0; i < 3; i++){float scale = scales[i];int feature_w = w / scale;int feature_h = h / scale;float* outdata = (float*)outblob[i];for (int row = 0; row < feature_h; ++row){for (int col = 0; col < feature_w; ++col){float isObj = outdata[feature_h * feature_w * 4 + row * feature_w + col];if (isObj < deconf){continue;}isObj = sigmoid(isObj);float conf[5];for (int cls_id = 0; cls_id < 5; cls_id++) {conf[cls_id] = outdata[feature_h * feature_w * (5 + cls_id) + row * feature_w + col];conf[cls_id] = sigmoid(conf[cls_id]);}int idx = distance(conf, max_element(conf, conf + sizeof(conf) / sizeof(conf[0])));float max = conf[idx];float cof = max * isObj;//注意此处输出为中心点坐标,需要转化为角点坐标float x = outdata[feature_h * feature_w * 0 + row * feature_w + col];x += col;x *= scale;x /= resize_scale;float y = outdata[feature_h * feature_w * 1 + row * feature_w + col];y += row;y *= scale;y /= resize_scale;float w = outdata[feature_h * feature_w * 2 + row * feature_w + col];w = exp(w);w *= scale;w /= resize_scale;float h = outdata[feature_h * feature_w * 3 + row * feature_w + col];h = exp(h);h *= scale;h /= resize_scale;float r_x = (x - w / 2);float r_y = (y - h / 2);Rect rect = Rect(round(r_x), round(r_y), round(w), round(h));o_rect.push_back(rect);o_rect_cof.push_back(cof);class_ids.push_back(idx);}}}vector<int> 
final_id;dnn::NMSBoxes(o_rect, o_rect_cof, 0.5, 0.5, final_id);for (int i = 0; i < final_id.size(); i++) {int idx = final_id[i];Rect rect = o_rect[idx];float cof = o_rect_cof[idx];int class_id = class_ids[idx];BoundingBox bbox;bbox.rect = rect;bbox.confidence = cof;bbox.class_id = class_id;bboxes.push_back(bbox);}}
主函数
#include <iostream>#include "../include/YoloXDetector.cuh"int main() {std::string src_path = "D:\\WorkSpace\\graduate-student\\study\\graduation_study\\codes\\3-yolox-quantization\\TensorRT-int8-quantization\\dataset\\images\\";std::vector<cv::String> file_vec;cv::glob(src_path + "*.jpg", file_vec, false);std::random_shuffle(file_vec.begin(), file_vec.end());int idx_num = 0;YoloXDetector detector = YoloXDetector();detector.load_engine("optim_model_fp16.engine");for (std::string file_name : file_vec){vector<BoundingBox> bboxes;cv::Mat src = imread(file_name, cv::IMREAD_COLOR);if (!src.isContinuous()) {return -1;}detector.detect(src, 0.5f,bboxes);for(BoundingBox bbox : bboxes) {std::cout << file_name << std::endl;std::cout << bbox.class_id << " " << bbox.confidence << " " << bbox.rect << std::endl;cv::rectangle(src, bbox.rect, cv::Scalar(0, 0, 255), 2, 8, 0);cv::putText(src,std::to_string(bbox.class_id),cv::Point(bbox.rect.x,bbox.rect.y - 5),cv::FONT_HERSHEY_PLAIN,1.2,cv::Scalar(0, 255, 0), 2);}cv::imshow("src", src);cv::waitKey();}}
CMakeLists.txt
cmake_minimum_required (VERSION 3.8)
#=========================================================
project(yolox LANGUAGES CUDA CXX C) #1 project name
# OpenCV configuration
set(OpenCV_DIR "D:\\Documents\\vcpkg\\packages\\opencv4_x64-windows") #2 change to your own OpenCV directory
set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include) #3
set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\lib) #4
# NOTE(review): this GLOB result is immediately overwritten by the explicit
# set() below, so it has no effect -- keep one or the other.
file(GLOB OpenCV_Release_LIBS "${OpenCV_DIR}\\lib\\*.lib")
set(OpenCV_Release_LIBS
    opencv_aruco4.lib
    opencv_barcode4.lib
    opencv_bgsegm4.lib
    opencv_bioinspired4.lib
    opencv_calib3d4.lib
    opencv_ccalib4.lib
    opencv_core4.lib
    opencv_datasets4.lib
    opencv_dnn4.lib
    opencv_dnn_objdetect4.lib
    opencv_dnn_superres4.lib
    opencv_dpm4.lib
    opencv_face4.lib
    opencv_features2d4.lib
    opencv_flann4.lib
    opencv_fuzzy4.lib
    opencv_hdf4.lib
    opencv_hfs4.lib
    opencv_highgui4.lib
    opencv_imgcodecs4.lib
    opencv_imgproc4.lib
    opencv_img_hash4.lib
    opencv_intensity_transform4.lib
    opencv_line_descriptor4.lib
    opencv_mcc4.lib
    opencv_ml4.lib
    opencv_objdetect4.lib
    opencv_optflow4.lib
    opencv_phase_unwrapping4.lib
    opencv_photo4.lib
    opencv_plot4.lib
    opencv_quality4.lib
    opencv_rapid4.lib
    opencv_reg4.lib
    opencv_saliency4.lib
    opencv_shape4.lib
    opencv_stereo4.lib
    opencv_stitching4.lib
    opencv_structured_light4.lib
    opencv_superres4.lib
    opencv_surface_matching4.lib
    opencv_text4.lib
    opencv_tracking4.lib
    opencv_video4.lib
    opencv_videoio4.lib
    opencv_videostab4.lib
    opencv_wechat_qrcode4.lib
    opencv_xfeatures2d4.lib
    opencv_ximgproc4.lib
    opencv_xobjdetect4.lib
    opencv_xphoto4.lib) #6
message(STATUS " OpenCV_Release_LIBS: " ${OpenCV_Release_LIBS})
link_directories(${OpenCV_LIB_DIRS})
include_directories(${OpenCV_INCLUDE_DIRS})
# TensorRT configuration
set(TRT_DIR "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\TensorRT-8.6.1.6") #7
set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include) #8
set(TRT_LIB_DIRS ${TRT_DIR}\\lib) #9
set(TRT_LIBS
    nvinfer.lib
    nvinfer_plugin.lib
    nvonnxparser.lib
    nvparsers.lib)
link_directories(${TRT_LIB_DIRS})
include_directories(${TRT_INCLUDE_DIRS})
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 14)
# CUDA configuration
# NOTE(review): find_package(CUDA) / cuda_add_* are the deprecated CUDA macros;
# the project() call already enables CUDA as a first-class language.
find_package(CUDA REQUIRED)
message(STATUS " libraries: ${CUDA_LIBRARIES}")
message(STATUS " include path: ${CUDA_INCLUDE_DIRS}")
include_directories(${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIB_DIRS})
# Compiler flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED ")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED '-DCMAKE_MAKE_PROGRAM=C:/Program Files/JetBrains/CLion 2023.1.4/bin/ninja/win/x64/ninja.exe' -G Ninja -S 'D:/WorkSpace/CppSolution/YoloXTensorRT_Cmake' -B 'D:/WorkSpace/CppSolution/YoloXTensorRT_Cmake/cmake-build-release-visual-studio'")
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install")
# Standalone detector executable
cuda_add_executable(yolox_detector ${PROJECT_SOURCE_DIR}/src/source/YoloXTensorRT.cu) #17
target_link_libraries(yolox_detector
    ${TRT_LIBS}
    ${OpenCV_Release_LIBS}
    ${CUDA_LIBRARIES}) #18
set_target_properties(yolox_detector PROPERTIES CUDA_ARCHITECTURES "86")
# Static library target built from every header/source under the tree
file(GLOB_RECURSE CURRENT_HEADERS *.h *.hpp *.cuh)
file(GLOB CURRENT_SOURCES *.cpp *.cu)
source_group("Include" FILES ${CURRENT_HEADERS})
source_group("Source" FILES ${CURRENT_SOURCES})
cuda_add_library(YoloxDetector STATIC
    ${CURRENT_HEADERS}
    ${CURRENT_SOURCES}
    src/source/YoloXDetector.cu)
target_link_libraries(YoloxDetector
    ${TRT_LIBS}
    ${OpenCV_Release_LIBS}
    ${CUDA_LIBRARIES}) #18
set_target_properties(YoloxDetector PROPERTIES CUDA_ARCHITECTURES "86")
install(TARGETS YoloxDetector DESTINATION bin) # install the library under ${CMAKE_INSTALL_PREFIX}/bin
# NOTE(review): link_directories does not take a target name -- this call looks
# like it was meant to be target_link_directories(main PUBLIC ...); as written
# it adds bogus search paths.
link_directories(main PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/install/")
# Demo executable linking against the static library
cuda_add_executable(main
    src/source/main.cpp
    # ${CURRENT_HEADERS}
)
target_link_libraries(main
    ${TRT_LIBS}
    ${OpenCV_Release_LIBS}
    ${CUDA_LIBRARIES}
    YoloxDetector) #18
set_target_properties(main PROPERTIES CUDA_ARCHITECTURES "86")
install(TARGETS main DESTINATION bin) # install the demo under ${CMAKE_INSTALL_PREFIX}/bin

705

被折叠的 条评论
为什么被折叠?



