YOLOv8 实例分割模型有两个输出(output):一个用于实例 bbox 检测,另一个用于分割,两者结合即可提取实例分割的目标结果。
#include "logging.h"
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include <NvInferRuntime.h>
#include <cuda_runtime.h> // cuda include
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include<fstream>
#include<iostream>
#include<string>
#include <vector>
#include<opencv2/opencv.hpp>
// Bring the TensorRT API names (IRuntime, ICudaEngine, IExecutionContext, ...) into scope.
using namespace nvinfer1;
// Logger instance handed to createInferRuntime() when the engine is loaded.
// NOTE(review): the Logger type is presumably declared in logging.h — confirm.
static Logger gLogger;
// CHECK(call): evaluates a CUDA runtime call exactly once and, if it does not
// return cudaSuccess, prints the file, line, numeric code and error string and
// terminates the process with exit(1).
// The macro body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves
// like a single statement. The last line deliberately has NO trailing
// backslash: a continuation there would splice the following #define into this
// macro body and break preprocessing.
#define CHECK(call)\
do\
{\
    const cudaError_t error_code=(call);\
    if (error_code!=cudaSuccess)\
    {\
        printf("CUDA Error:\n");\
        printf(" FILE :%s",__FILE__);\
        printf("LINE %d\n",__LINE__);\
        printf("Error code:%d\n",static_cast<int>(error_code));\
        printf("Error text:%s\n",cudaGetErrorString(error_code));\
        exit(1);\
    }\
}while(0)

// clip(x): clamps x to the range [0, 255]. The argument is parenthesized so
// that compound expressions such as clip(a + b) expand correctly; note that x
// may be evaluated more than once, so do not pass expressions with side effects.
#define clip(x) ((x) < 0 ? 0.0 : ((x) > 255.0 ? 255.0 : (x)))
class YOLO_SEG
{
private:
char *_trtModelStream{nullptr};
IRuntime* _runtime = nullptr;
ICudaEngine* _engine=nullptr;
IExecutionContext* _context=nullptr;
void *_inferbuffers[3];
cudaStream_t _stream;
public:
int _max_batchsize = 1;
int _input_h = 960;
int _input_w = 1920;
int _inputSize = 3 * _input_h * _input_w;
int _outputSize0 = 1;
int _outputSize1 = 1;
int _inputIndex;
int _outputIndex0;
int _outputIndex1 ;
private:
void get_input_ouput_size(){
_inputIndex = _engine->getBindingIndex("images");
_outputIndex0 = _engine->getBindingIndex("output0");
_outputIndex1 = _engine->getBindingIndex("output1");
assert(_inputIndex == 0);
assert(_outputIndex0 == 2);
assert(_outputIndex1 == 1);
auto out_dims1 = _engine->getBindingDimensions(_outputIndex0);
// BCHW
for(int j = 1; j < out_dims1.nbDims; j++) {
std::cout << "j = " << j << " size = " << out_dims1.d[j] << std::endl;
_outputSize0 *= out_dims1.d[j];
}
// BCN
auto out_dims2 = _engine->getBindingDimensions(_outputIndex1);
for(int j = 1; j < out_dims2.nbDims; j++) {
std::cout << "j = " << j << " size = " << out_dims2.d[j] << std::endl;
_outputSize1 *= out_dims2.d[j];
}
//std::cout << " _outputSize2 = " << _outputSize2 << std::endl;
}
public:
YOLO_SEG(/* args */){};
~YOLO_SEG(){
if (nullptr != _trtModelStream){
delete [] _trtModelStream;
}
};
// 文件读取模型,并反序列化成engine
void load_trtmodel(std::string trt_model_path){
std::ifstream file(trt_model_path, std::ios::binary);
size_t size{0};
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
_trtModelStream = new char[size];
assert(_trtModelStream);
file.read(_trtModelStream, size);
file.close();
}
_runtime = createInferRuntime(gLogger);
assert(_runtime != nullptr);
_engine = _runtime->deserializeCudaEngine(_trtModelStream, size);
assert(_engine != nullptr);
_context = _engine->createExecutionContext();
assert(_context != nullptr);
}
//分配处理相关内存
void initbuff(){
get_input_ouput_size();
//fix _max_batchsize
_context->setBindingDimensions(0, nvinfer1::Dims4(_max_batchsize, 3, _input_h, _input_w));
assert(_engine->getNbBindings() == 3);
//_context->setBindingDimensions(1, nvinfer1::Dims4(_max_batchsize, 3, _input_h * 3, _input_w * 3));
std::cout << " outputIndex0 = " << _outputIndex0 << " outputIndex1 = " << _outputIndex1 << std::endl;
//const int outputIndex1 = 1;
//const int outputIndex2 = 2;
CHECK(cudaMalloc((void**)&_inferbuffers[_inputIndex], _max_batchsize * _inputSize * sizeof(float))); //trt输入内存申请
CHECK(cudaMalloc((void**)&_inferbuffers[_outputIndex0], _max_batchsize * _outputSize0 * sizeof(float))); //trt输出内存申请
CHECK(cudaMalloc((void**)&_inferbuffers[_outputIndex1], _max_batchsize * _outputSize1 * sizeof(float))); //trt输出内存申请
CHECK(cudaStreamCreate(&_stream));
}
void releasebuff(){
CHECK(cudaFree(_inferbuffers[_inputIndex]));
CHECK(cudaFree(_inferbuffers[_outputIndex0]));
CHECK(cudaFree(_inferbuffers[_outputIndex1]));
_context->destroy();
_engine->destroy();
_runtime->destroy();
}
// 推理
void infer_trtmodel(const int infer_batch, const float* input_data, float *outputbuff0, float *outputbuff1){
//图像数据填充_inferbuffers[0],GPU CUDA处理
cudaMemcpy(_inferbuffers[0], input_data, infer_batch * _inputSize * sizeof(float), cudaMemcpyHostToDevice);
printf("infer model \n");
_context->enqueueV2((void **)_inferbuffers, _stream, nullptr);
cudaStreamSynchronize(_stream);
printf("post process \n");
//
//_inferbuffers[1]模型输出后处理,可以GPU处理,否则拷贝到cpu处理
cudaMemcpy(outputbuff0, _inferbuffers[_outputIndex0], infer_batch * _outputSize0 * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(outputbuff1, _inferbuffers[_outputIndex1], infer_batch * _outputSize1 * sizeof(float), cudaMemcpyDeviceToHost);
}
};
// One instance-segmentation detection.
// Scalar members carry default initializers so that a default-constructed
// OutputSeg never holds indeterminate values.
struct OutputSeg {
    int id = 0;                // class id of the detection
    float confidence = 0.0f;   // confidence score of the detection
    cv::Rect box;              // bounding box
    cv::Mat boxMask;           // mask restricted to the box area (saves memory and time)
};
void DrawPred(cv::Mat& img, std:: vector<OutputSeg> result) {
std::vector<cv::Scalar> color = {cv::Scalar(0, 0, 255)} ;
cv::Mat mask = img.clone();
for (int i = 0; i < result.size(); i++) {
cv::rectangle(img, result[i].box, color[result[i].id], 2, 8);
mask(result[i].box).setTo(color[result[i].id], result[i].boxMask);
std::string label = std::to_string(result[i].id) + " " + std::to_string(result[i].confidence);
cv::putText(img, label, cv::Point(result[i].box.x, result[i].box.y-5), cv::FONT_HERSHEY_SIMPLEX, 0.5, color[result[i].id], 1);
}
cv::addWeighted(img, 0.7, mask, 0.3, 1, img); //将mask加在原图上面
}
int main(int argc, char** argv){
YOLO_SEG *yoloseg = new YOLO_SEG();
//srcnn ->load_trtmodel("../best.trt");
//const std::string trt_path = argv[1];
const std::string trt_path = "../best_seg.trt";
//const std::string img_path = argv[2];
const std::string img_path = "../input_seg.jpg";
yoloseg ->load_trtmodel(trt_path);
yoloseg ->initbuff();
int BatchSize = yoloseg->_max_batchsize;
int channel = 3;
int imgH = yoloseg->_input_h;
int imgW = yoloseg->_input_w;
float* input_last = new float[BatchSize * channel * imgH * imgW];
//bgr2rgb + resize + norm + hwc2chw
cv::Mat img;
for(int i =0; i< BatchSize; i++){
//std::string img_path = "../../imgs/face" + std::to_string(i) + ".png";
img = cv::imread(img_path);
int img_width = img.cols;
int img_height = img.rows;
// cv::imshow("img", img);
// cv::waitKey(0);
//std::cout << " img_height " << img_height << " imgH " << imgH << std::endl;
//std::cout << "size = " << (BatchSize * channel * imgH * imgW) << std::endl;
//crop + hwc->chw + bgr->rgb + norm
for (int h = img_height - imgH; h < img_height; ++h){
for (int w = 0; w < imgW; ++w){
for (int c = 0; c < channel; ++c){
int hid = h - (img_height - imgH);
//input_last[0] = static_cast<float>(img.at<cv::Vec3b>(h, w)[2-c]) / 255.0;
input_last[i * channel * imgH * imgW + c * imgH * imgW + hid * imgW + w] = static_cast<float>(img.at<cv::Vec3b>(h, w)[2-c]) / 255.0;
}
}
}
}
int outputSize0 = yoloseg->_outputSize0;
int outputSize1 = yoloseg->_outputSize1;
float *outputbuff0 = new float[BatchSize * outputSize0];
float *outputbuff1 = new float[BatchSize * outputSize1];
std::cout << " infer model " << std::endl;
yoloseg ->infer_trtmodel(BatchSize, input_last, outputbuff0, outputbuff1);
std::vector<int> classIds;//结果id数组
std::vector<float> confidences;//结果每个id对应置信度数组
std::vector<cv::Rect> boxes;//每个id矩形框
std::vector<cv::Mat> picked_proposals; //后续计算mask
// 处理box
static const float CONF_THRESHOLD = 0.1;
static const float NMS_THRESHOLD = 0.5;
static const float MASK_THRESHOLD = 0.5;
static const int _segWidth = imgW / 4;
static const int _segHeight = imgH / 4;
static const int _segChannels = 32;
static const int CLASSES = 1;
static const int Num_box = 37800;
int net_length = CLASSES + 4 + _segChannels;
// 处理box
cv::Mat out1 = cv::Mat(net_length, Num_box, CV_32F, outputbuff0);
auto start = std::chrono::system_clock::now();
for (int i = 0; i < Num_box; i++) {
//输出是1*net_length*Num_box;所以每个box的属性是每隔Num_box取一个值,共net_length个值
cv::Mat scores = out1(cv::Rect(i, 4, 1, CLASSES)).clone();
cv::Point classIdPoint;
double max_class_socre;
minMaxLoc(scores, 0, &max_class_socre, 0, &classIdPoint);
max_class_socre = (float)max_class_socre;
if (max_class_socre >= CONF_THRESHOLD) {
cv::Mat temp_proto = out1(cv::Rect(i, 4 + CLASSES, 1, _segChannels)).clone();
picked_proposals.push_back(temp_proto.t());
float x = out1.at<float>(0, i) ; //cx
float y = out1.at<float>(1, i); //cy
float w = out1.at<float>(2, i); //w
float h = out1.at<float>(3, i); //h
int left = MAX((x - 0.5 * w), 0);
int top = MAX((y - 0.5 * h), 0);
int width = (int)w;
int height = (int)h;
if (width <= 0 || height <= 0) { continue; }
classIds.push_back(classIdPoint.y);
confidences.push_back(max_class_socre);
boxes.push_back(cv::Rect(left, top, width, height));
}
}
//执行非最大抑制以消除具有较低置信度的冗余重叠框(NMS)
std::vector<int> nms_result;
cv::dnn::NMSBoxes(boxes, confidences, CONF_THRESHOLD, NMS_THRESHOLD, nms_result);
std::vector<cv::Mat> temp_mask_proposals;
std::vector<OutputSeg> output;
cv::Rect holeImgRect(0, 0, imgW, imgH);
for (int i = 0; i < nms_result.size(); ++i) {
int idx = nms_result[i];
OutputSeg result;
result.id = classIds[idx];
result.confidence = confidences[idx];
result.box = boxes[idx]&holeImgRect;
output.push_back(result);
temp_mask_proposals.push_back(picked_proposals[idx]);
}
std::cout << " process mask " << std::endl;
// 处理mask
cv::Mat maskProposals;
for (int i = 0; i < temp_mask_proposals.size(); ++i)
maskProposals.push_back(temp_mask_proposals[i]);
cv::Mat protos = cv::Mat(_segChannels, _segWidth * _segHeight, CV_32F, outputbuff1);
cv::Mat matmulRes = (maskProposals * protos).t();// A*B是以数学运算中矩阵相乘的方式实现的,要求A的列数等于B的行数时
cv::Mat masks = matmulRes.reshape(output.size(), { _segHeight, _segWidth});
std::vector<cv::Mat> maskChannels;
cv::split(masks, maskChannels);
for (int i = 0; i < output.size(); ++i) {
cv::Mat dest, mask;
cv::exp(-maskChannels[i], dest);//sigmoid
dest = 1.0 / (1.0 + dest);
std::cout << " cv::resize " << std::endl;
cv::resize(dest, mask, cv::Size(imgW, imgH), cv::INTER_NEAREST);
//crop----截取box中的mask作为该box对应的mask
cv::Rect temp_rect = output[i].box;
mask = mask(temp_rect) > MASK_THRESHOLD;
output[i].boxMask = mask;
}
auto end = std::chrono::system_clock::now();
std::cout << "后处理时间:" << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
DrawPred(img, output);
cv::imshow("output.jpg", img);
char c = cv::waitKey(0);
delete []input_last;
input_last =nullptr;
delete []outputbuff0;
outputbuff0 =nullptr;
delete []outputbuff1;
outputbuff1 =nullptr;
yoloseg ->releasebuff();
return 0;
}
CMakeLists.txt
# Build script for the TensorRT YOLOv8 segmentation demo (inference_seg).
cmake_minimum_required(VERSION 3.10)
project(trt_detect)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Default to a Debug build, but let the user override it on the command line
# (-DCMAKE_BUILD_TYPE=Release).
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

# Compiler flags belong in add_compile_options(), not add_definitions(), and
# must be set before the target is created.
# -w silences all warnings; -O2 keeps the optimisation level the project used.
add_compile_options(-w -O2)

# TensorRT location; override with -DTENSORRT_ROOT=/path/to/TensorRT.
set(TENSORRT_ROOT "/home/a/TensorRT-8.5.1.7" CACHE PATH "TensorRT installation directory")

find_package(CUDA REQUIRED)
# OpenCV package
find_package(OpenCV REQUIRED)

# OpenCV include directories
include_directories(${OpenCV_INCLUDE_DIRS})
# cuda
include_directories(${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
# TensorRT
include_directories(${TENSORRT_ROOT}/include)
link_directories(${TENSORRT_ROOT}/lib)

cuda_add_executable(inference_seg inference_seg.cpp)
# Plain (keyword-less) signature on purpose: cuda_add_executable() itself uses
# it, and CMake forbids mixing plain and keyword signatures on one target.
target_link_libraries(inference_seg nvinfer)
target_link_libraries(inference_seg cudart)
target_link_libraries(inference_seg nvonnxparser)
target_link_libraries(inference_seg ${OpenCV_LIBS})