主要文件结构如上图所示,其中最后一个 cpp 文件没有实际作用。源码会直接公开,不必浪费大家的积分去下载。
engine 文件可以通过 Python 源码导出,相关源码可自行搜索下载;导出方法可以参考之前的 TensorRT 部署文章,其中有详细的过程。
-
config.h
#pragma once
// Network input resolution and segmentation-head geometry used by the
// post-processing code.
static const int INPUT_H = 640;
static const int INPUT_W = 640;
// Number of object classes assumed when slicing the detection output.
static const int CLASSES = 80;
// Number of mask coefficients per detection / prototype channels.
static const int _segChannels = 32;
// Spatial size of the mask prototype output.
static const int _segWidth = 160;
static const int _segHeight = 160;
// Probability above which a mask pixel is considered foreground.
static const float MASK_THRESHOLD = 0.5;
// Parameters below must be reviewed every time the model is changed.
// Path to the serialized engine model.
const static char* engine_path = "yolov5s-seg.engine";
const static char* label_path = "coco.txt";
const static char* image_path = "zidane.jpg";
// These are used to define input/output tensor names,
// you can set them to whatever you want.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";
// Number of classes of the detection / segmentation model.
// NOTE(review): this differs from CLASSES (80) above and is not referenced by
// the code visible here - confirm which constant is authoritative.
constexpr static int kNumClass = 3;
// YOLO's input width and height must be divisible by 32.
constexpr static int kInputH = 640;
constexpr static int kInputW = 640;
// NMS overlap threshold and final detection confidence threshold.
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
// If your image size is larger than 4096 * 3112, please increase this value.
const static int kMaxInputImageSize = 4096 * 3112;
-
cuda_utils.h
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_
#include <cuda_runtime_api.h>
#include <cstdlib>
#include <iostream>
#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call exactly once and terminates the process if it
// did not return cudaSuccess.
//
// The failure report contains the numeric error code, the human readable
// description from cudaGetErrorString() and the source location. std::abort()
// is used instead of assert(0) so that the check is also effective in builds
// compiled with NDEBUG. The macro keeps its original brace-block form so that
// existing call sites (with or without a trailing semicolon) keep compiling.
#define CUDA_CHECK(callstr)\
{\
    cudaError_t error_code = (callstr);\
    if (error_code != cudaSuccess) {\
        std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) << ") at " << __FILE__ << ":" << __LINE__ << std::endl;\
        std::abort();\
    }\
}
#endif // CUDA_CHECK
#endif // TRTX_CUDA_UTILS_H_
-
preprocess22.h
#pragma once
#include <cuda_runtime.h>
#include <cstdint>
#include <opencv2/opencv.hpp>
//void cuda_preprocess_init(int max_image_size);
//void cuda_preprocess_destroy();
// Letterbox-resizes one packed 3-bytes-per-pixel image on the GPU.
//   src                      host pointer to the source pixels
//   src_width / src_height   source size in pixels
//   dst                      device pointer receiving 3 * dst_width * dst_height
//                            floats (planar, channel order swapped, values / 255)
//   img_buffer_host          host staging buffer the source is copied into
//   img_buffer_device        device buffer the staged image is uploaded to
//   stream                   CUDA stream used for the upload and the kernel
// Both staging buffers must hold at least src_width * src_height * 3 bytes.
void cuda_preprocess(uint8_t* src, int src_width, int src_height,
float* dst, int dst_width, int dst_height, uint8_t* img_buffer_host, uint8_t* img_buffer_device,
cudaStream_t stream);
// Runs cuda_preprocess() for every image of img_batch; image i is written to
// dst + i * 3 * dst_width * dst_height. The stream is synchronized after each
// image.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
float* dst, int dst_width, int dst_height, uint8_t* img_buffer_host, uint8_t* img_buffer_device,
cudaStream_t stream);
-
Tensor.h
#pragma once
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "NvinferRuntime.h"
#include <fstream>
#include <opencv2/core/utils/logger.hpp>
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/highgui/highgui_c.h>
#include "cuda_utils.h"
#include "config.h"
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace cv;
using namespace std;
// Result record for one segmented object.
struct Outputseg
{
int id; // class id
string name; // class name
float confidence; // confidence score
Rect box; // bounding box
Mat boxmask; // mask covering the local region of the result (the box area)
};
// Logging sink handed to the TensorRT runtime.
class Logger : public ILogger
{
    // Receives every log record emitted by TensorRT.
    //
    // Only warnings and more severe records (kWARNING, kERROR,
    // kINTERNAL_ERROR) are printed. The previous `severity != kINFO` test
    // suppressed kINFO but still printed the even chattier kVERBOSE records.
    virtual void log(Severity severity, const char* msg) noexcept override
    {
        if (severity <= Severity::kWARNING && msg != nullptr)
            std::cout << msg << std::endl;
    }
};
// YOLOv5-seg detector built on a serialized TensorRT engine.
//
// Typical call order (see main): construct -> initialize() -> Create_Buffer()
// -> Padding_Resize() -> [fill the input binding] -> detect() ->
// Post_processing().
//
// Every pointer and index member now carries an in-class initializer so that a
// default-constructed or partially set up object has a well-defined state.
// NOTE(review): the class owns raw CUDA/TensorRT resources released in the
// destructor but is still copy-constructible; do not copy instances.
class TensorRT_detect
{
public:
    // Creates an empty detector without a loaded engine.
    TensorRT_detect() = default;
    // Loads the label file, the test image and the serialized engine.
    TensorRT_detect(const char* model_path_engine, const char* image_path, const char* input_node_name, string classesFile, int srcwidth, int srcheight);
    // Queries binding indices/dimensions and allocates the host result arrays.
    void initialize();
    // Computes the letterbox geometry (neww/newh/padw/padh) for the source size.
    void Padding_Resize();
    // CPU letterbox resize of test_img into dstimg.
    void Resize_Image();
    // Releases all host, CUDA and TensorRT resources.
    ~TensorRT_detect();
    // Height and width of the image expected by the model input.
    int getinputh();
    int getinputw();
    // Allocates the device buffers for the input and output bindings.
    void Create_Buffer();
    // Runs inference and copies both outputs back to the host.
    void detect();
    // Decodes the outputs and draws boxes, labels and masks on test_img.
    void Post_processing();
    // CPU pre-processing of dstimg into input_data.
    void Pre_processing();
    Mat dstimg; // resized image
    std::vector<float> input_data; // host-side input tensor
    const char* image_path = nullptr; // path of the test image
    Mat test_img; // test image
    void** data_buffer = nullptr; // device buffers for the input/output bindings
    int input_node_index = -1; // binding index of the input node
    cudaStream_t stream = nullptr; // CUDA stream used for copies and inference
    uint8_t* img_buffer_host = nullptr; // pinned host staging memory
    uint8_t* img_buffer_device = nullptr; // device storage for the raw image
    // Draws a dashed line with thick end caps and a short centre tick.
    void drawDottedLine(cv::Mat& image, cv::Point start, cv::Point end, cv::Scalar color, int thickness, int lineType, int dotSize, int flag = 1);
    // Decodes the detection output and draws plain boxes/labels on dstimg.
    void postprocess_mask();
    int neww = 0; // width of the source image after aspect-preserving scaling
    int newh = 0; // height of the source image after aspect-preserving scaling
    int padw = 0; // x offset of the scaled image inside the network input
    int padh = 0; // y offset of the scaled image inside the network input
    float scale = 1.0f; // source aspect ratio (height / width), set by Padding_Resize()
private:
    const char* model_path_engine = nullptr; // path of the engine file
    std::string classesFile; // label file
    const char* input_node_name = nullptr; // name of the model input node
    const char* output_node_name_1 = "output0"; // name of the detection output node
    const char* output_node_name_2 = "output1"; // name of the mask prototype output node
    std::vector<std::string> class_names; // labels
    int num_ionode = 0; // total number of input and output bindings
    Logger logger;
    nvinfer1::IRuntime* runtime = nullptr; // runtime used for deserialization
    nvinfer1::ICudaEngine* engine = nullptr; // inference engine
    nvinfer1::IExecutionContext* context = nullptr; // execution context
    nvinfer1::Dims input_node_dim{}; // dimensions of the input node
    size_t input_data_length = 0; // number of input elements
    int output_node_index_1 = -1; // binding index of output node 1
    int output_node_index_2 = -1; // binding index of output node 2
    nvinfer1::Dims output_node_dim_1{}; // dimensions of output node 1
    nvinfer1::Dims output_node_dim_2{}; // dimensions of output node 2
    size_t output_data_length_1 = 0; // number of elements of output node 1
    size_t output_data_length_2 = 0; // number of elements of output node 2
    float* result_array = nullptr; // host copy of the detection output
    float* mask_result_array = nullptr; // host copy of the mask prototype output
    const int max_image_size = 4096 * 3112; // capacity (in pixels) of the staging buffers
    int src_width = 0; // original width of the camera frame or image
    int src_height = 0; // original height of the camera frame or image
};
-
main.cpp
#include "Tensor.h"
#include "preprocess22.h"
#include <iostream>
int main() {
cv::VideoCapture video("sample.mp4");
if (!video.isOpened()) {
std::cout << "Failed to open video file" << std::endl;
return -1;
}
Mat image = imread("zidane.jpg");
// 获取图像的宽度和高度
int width = static_cast<int>(video.get(cv::CAP_PROP_FRAME_WIDTH));
int height = static_cast<int>(video.get(cv::CAP_PROP_FRAME_HEIGHT));
//创建初始化ylov5s-seg检测器(需要更改)
TensorRT_detect TD(engine_path, "zidane.jpg", kInputTensorName, label_path, image.cols, image.rows);
// 创建一个名为 "Window" 的窗口
cv::namedWindow("Window", cv::WINDOW_NORMAL);
// 设置窗口的大小为 1280x720
cv::resizeWindow("Window", image.cols, image.rows);
TD.initialize();
TD.Create_Buffer();
TD.Padding_Resize();
int num = 1;
//统计每个阶段的耗时
while (num--) {
clock_t start1 = clock();
//video >> TD.test_img;
//TD.Resize_Image();
TD.test_img = image.clone();
vector<Mat> img_bach1;
img_bach1.push_back(image);
{
//unique_lock<mutex> lk(myMutex);
cuda_batch_preprocess(img_bach1, (float*)TD.data_buffer[TD.input_node_index], kInputW, kInputH, TD.img_buffer_host, TD.img_buffer_device, TD.stream);
}
clock_t end1 = clock();
double exec_time1 = static_cast<double>(end1 - start1) / CLOCKS_PER_SEC;
// 输出执行时间
std::cout << "预处理 Execution time : " << exec_time1 << " seconds" << std::endl;
clock_t start2 = clock();
TD.detect();
clock_t end2 = clock();
double exec_time2 = static_cast<double>(end2 - start2) / CLOCKS_PER_SEC;
// 输出执行时间
std::cout << "推理事件 Execution time : " << exec_time2 << " seconds" << std::endl;
clock_t start3 = clock();
TD.Post_processing();
clock_t end3 = clock();
double exec_time3 = static_cast<double>(end3 - start3) / CLOCKS_PER_SEC;
// 输出执行时间大概30ms
std::cout << "后处理时间 Execution time : " << exec_time3 << " seconds" << std::endl;
imshow("Window", TD.test_img);
cv::waitKey(0);
}
// 销毁窗口
cv::destroyAllWindows();
//释放相机资源
video.release();
return 0;
}
-
process22.cu
#include "preprocess22.h"
#include "cuda_utils.h"
#include <device_launch_parameters.h>
//static uint8_t* img_buffer_host = nullptr;
//static uint8_t* img_buffer_device = nullptr;
// Coefficients of a 2x3 affine transform stored row by row:
// value[0..2] produce the x coordinate, value[3..5] the y coordinate.
// Passed by value to the warp kernel.
struct AffineMatrix {
float value[6];
};
// Warps a packed 3-bytes-per-pixel image into a planar float image.
//
// Launch layout: 1-D grid, one thread per destination pixel; `edge` is the
// number of destination pixels and threads beyond it exit immediately.
// For each destination pixel the source position is obtained through `d2s`,
// sampled bilinearly (taps outside the source use `const_value_st`), the first
// and third channel are swapped (BGR -> RGB), values are divided by 255 and
// written as three consecutive planes of dst_width * dst_height floats.
__global__ void warpaffine_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, uint8_t const_value_st,
    AffineMatrix d2s, int edge) {
    const int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;

    const int dx = position % dst_width;
    const int dy = position / dst_width;

    // Destination -> source mapping.
    const float src_x = d2s.value[0] * dx + d2s.value[1] * dy + d2s.value[2] + 0.5f;
    const float src_y = d2s.value[3] * dx + d2s.value[4] * dy + d2s.value[5] + 0.5f;

    // Interpolated source channels 0, 1 and 2.
    float ch0, ch1, ch2;
    const bool outside = src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height;
    if (outside) {
        ch0 = const_value_st;
        ch1 = const_value_st;
        ch2 = const_value_st;
    } else {
        const int y_low = floorf(src_y);
        const int x_low = floorf(src_x);
        const int y_high = y_low + 1;
        const int x_high = x_low + 1;

        const float ly = src_y - y_low;
        const float lx = src_x - x_low;
        const float hy = 1 - ly;
        const float hx = 1 - lx;
        const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

        // Taps that fall outside the source read the constant fill pixel.
        const uint8_t fill[] = {const_value_st, const_value_st, const_value_st};
        const bool top_ok = y_low >= 0;
        const bool bottom_ok = y_high < src_height;
        const bool left_ok = x_low >= 0;
        const bool right_ok = x_high < src_width;

        const uint8_t* v1 = (top_ok && left_ok) ? src + y_low * src_line_size + x_low * 3 : fill;
        const uint8_t* v2 = (top_ok && right_ok) ? src + y_low * src_line_size + x_high * 3 : fill;
        const uint8_t* v3 = (bottom_ok && left_ok) ? src + y_high * src_line_size + x_low * 3 : fill;
        const uint8_t* v4 = (bottom_ok && right_ok) ? src + y_high * src_line_size + x_high * 3 : fill;

        ch0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        ch1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        ch2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // Interleaved -> planar, with the first and third channel exchanged and
    // every value normalized by 255.
    const int area = dst_width * dst_height;
    float* plane0 = dst + dy * dst_width + dx;
    float* plane1 = plane0 + area;
    float* plane2 = plane1 + area;
    *plane0 = ch2 / 255.0f;
    *plane1 = ch1 / 255.0f;
    *plane2 = ch0 / 255.0f;
}
// Uploads one packed 3-bytes-per-pixel image and letterbox-warps it into `dst`.
//
//   src                      host pointer to src_width * src_height * 3 bytes
//   dst                      device pointer receiving 3 * dst_width * dst_height floats
//   img_buffer_host          host staging buffer (pinned by the caller in this project)
//   img_buffer_device        device buffer for the raw image
//   stream                   stream on which the upload and the kernel are enqueued
//
// Preconditions: both staging buffers hold at least src_width * src_height * 3
// bytes (their capacity is not known here). The function only enqueues work;
// the caller must synchronize `stream` before reusing the staging buffers or
// reading `dst`. Invalid arguments are reported and the frame is skipped.
void cuda_preprocess(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,uint8_t* img_buffer_host, uint8_t* img_buffer_device,
    cudaStream_t stream) {
    if (src == nullptr || dst == nullptr || img_buffer_host == nullptr || img_buffer_device == nullptr ||
        src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
        std::cerr << "cuda_preprocess: invalid argument, frame skipped" << std::endl;
        return;
    }
    const size_t img_size = static_cast<size_t>(src_width) * src_height * 3;
    // copy data to the staging buffer
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);
    // Source -> destination: uniform scale plus a translation that centres the
    // scaled image inside the destination.
    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
    // The output Mat normally still wraps d2s.value, in which case the result
    // is already in place; copying a buffer onto itself with memcpy is
    // undefined, so only copy if OpenCV reallocated the output.
    if (m2x3_d2s.ptr<float>(0) != d2s.value) {
        memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
    }
    const int jobs = dst_height * dst_width;
    const int threads = 256;
    const int blocks = (jobs + threads - 1) / threads;  // integer ceil-div
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(
        img_buffer_device, src_width * 3, src_width,
        src_height, dst, dst_width,
        dst_height, 128, d2s, jobs);
    // Kernel launches do not return a status; pick up launch errors here.
    CUDA_CHECK(cudaGetLastError());
}
// Pre-processes every image of `img_batch` into consecutive slots of `dst`
// (slot i starts at dst + i * 3 * dst_width * dst_height).
//
// The single-image routine assumes tightly packed 3-byte pixels, so images
// that are empty or not CV_8UC3 are reported and skipped (their slot is left
// untouched), and non-continuous images (e.g. ROIs) are copied into a packed
// temporary first. The stream is synchronized after each image because the
// shared staging buffers are reused by the next iteration.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
    float* dst, int dst_width, int dst_height, uint8_t* img_buffer_host, uint8_t* img_buffer_device,
    cudaStream_t stream) {
    const size_t dst_size = static_cast<size_t>(dst_width) * dst_height * 3;
    for (size_t i = 0; i < img_batch.size(); i++) {
        const cv::Mat& img = img_batch[i];
        if (img.empty() || img.type() != CV_8UC3) {
            std::cerr << "cuda_batch_preprocess: image " << i << " is empty or not CV_8UC3, skipped" << std::endl;
            continue;
        }
        cv::Mat packed = img.isContinuous() ? img : img.clone();
        cuda_preprocess(packed.ptr(), packed.cols, packed.rows, &dst[dst_size * i], dst_width, dst_height, img_buffer_host, img_buffer_device, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}
//void cuda_preprocess_init(int max_image_size) {
// // prepare input data in pinned memory
// CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
// // prepare input data in device memory
// CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
//}
//
//void cuda_preprocess_destroy() {
// CUDA_CHECK(cudaFree(img_buffer_device));
// CUDA_CHECK(cudaFreeHost(img_buffer_host));
//}
-
Tensor.cpp
#include"Tensor.h"
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace cv;
using namespace std;
// Loads the label file, the test image and the serialized TensorRT engine.
//
//   model_path_engine  path of the serialized engine file
//   image_path         path of the test image (may be null; test_img stays empty)
//   input_node_name    name of the input binding
//   classesFile        text file with one class name per line
//   srcwidth/srcheight size of the frames that will be processed
//
// On any failure an error is printed and runtime/engine/context are left
// null, so callers can detect the problem instead of the constructor running
// into an invalid allocation. The serialized model bytes are held in a
// std::vector and released automatically once the engine has been built.
TensorRT_detect::TensorRT_detect(const char* model_path_engine, const char* image_path, const char* input_node_name, string classesFile, int srcwidth, int srcheight) :
    image_path(image_path), model_path_engine(model_path_engine), classesFile(classesFile), input_node_name(input_node_name),
    runtime(nullptr), engine(nullptr), context(nullptr)
{
    src_height = srcheight;
    src_width = srcwidth;
    // Load the class labels.
    std::ifstream ifs(classesFile.c_str());
    std::string line;
    while (getline(ifs, line)) class_names.push_back(line);
    cout << "标签的数量::::" << class_names.size() << endl;
    // Read the test image.
    if (image_path != nullptr) {
        test_img = imread(image_path);
    }
    // Allocate a 1280*1280 canvas (disabled).
    //dstimg = Mat(cv::Size(1280, 1280), CV_8UC3);
    //dstimg.setTo(cv::Scalar(0, 0, 0));
    // Read the engine file.
    if (model_path_engine == nullptr) {
        std::cerr << "Engine path is null" << std::endl;
        return;
    }
    std::ifstream file_ptr(model_path_engine, std::ios::binary);
    if (!file_ptr.good()) {
        std::cerr << "模型文件无法打开!" << std::endl;
        return;
    }
    file_ptr.seekg(0, file_ptr.end);
    const std::streamoff length = file_ptr.tellg();
    if (length <= 0) {
        std::cerr << "Engine file is empty or its size cannot be determined" << std::endl;
        return;
    }
    file_ptr.seekg(0, file_ptr.beg);
    std::vector<char> model_stream(static_cast<size_t>(length));
    file_ptr.read(model_stream.data(), length);
    if (!file_ptr) {
        std::cerr << "Failed to read the engine file" << std::endl;
        return;
    }
    file_ptr.close();
    // Build the runtime, the engine and the execution context.
    runtime = nvinfer1::createInferRuntime(logger);
    if (runtime == nullptr) {
        std::cerr << "Failed to create the TensorRT runtime" << std::endl;
        return;
    }
    engine = runtime->deserializeCudaEngine(model_stream.data(), model_stream.size());
    if (engine == nullptr) {
        std::cerr << "Failed to deserialize the engine" << std::endl;
        return;
    }
    context = engine->createExecutionContext();
    if (context == nullptr) {
        std::cerr << "Failed to create the execution context" << std::endl;
    }
}
void TensorRT_detect::initialize()
{
//得到模型所有输入输出节点的个数
num_ionode = engine->getNbBindings();
data_buffer = new void* [num_ionode];
//得到输入节点对应的编号
input_node_index = engine->getBindingIndex(input_node_name);
input_node_dim = engine->getBindingDimensions(input_node_index);
input_data_length = int(input_node_dim.d[1] * input_node_dim.d[2] * input_node_dim.d[3]);
input_data.resize(input_data_length);
//得到输出节点1对应的编号
output_node_index_1 = engine->getBindingIndex(output_node_name_1);
output_node_dim_1 = engine->getBindingDimensions(output_node_index_1);
output_data_length_1 = int(output_node_dim_1.d[1] * output_node_dim_1.d[2]);//(1,25200,117)
result_array = new float[output_data_length_1];
//得到输出节点2对应的编号
output_node_index_2 = engine->getBindingIndex(output_node_name_2);
output_node_dim_2 = engine->getBindingDimensions(output_node_index_2);
output_data_length_2 = int(output_node_dim_2.d[1] * output_node_dim_2.d[2] * output_node_dim_2.d[3]);//(1,32,160,160)
mask_result_array = new float[output_data_length_2];
}
//更新相关参数相机确定了四个相关参数也就确定了
// Computes the letterbox geometry that maps a src_width x src_height frame into
// the network input while preserving its aspect ratio.
//
// Results: neww/newh (size of the scaled frame), padw/padh (its offset inside
// the network input) and scale (source height / width).
//
// Changes compared with the previous version:
//  * the input binding is NCHW, so height is d[2] and width is d[3];
//  * padw and padh are both reset on every call instead of keeping a stale
//    value from an earlier call that took another branch;
//  * the source aspect ratio is compared with the aspect ratio of the network
//    input rather than with 1, which is identical for square inputs;
//  * an empty source size no longer divides by zero.
void TensorRT_detect::Padding_Resize()
{
    const int t_height = input_node_dim.d[2];
    const int t_width = input_node_dim.d[3];
    padw = 0;
    padh = 0;
    if (src_width <= 0 || src_height <= 0 || t_width <= 0 || t_height <= 0) {
        // No valid geometry available: fall back to the identity layout.
        neww = t_width;
        newh = t_height;
        scale = 1.0f;
        return;
    }
    //resize(test_img, dstimg, Size(t_width, t_height), INTER_AREA);
    scale = (float)src_height / src_width;
    const float target_aspect = (float)t_height / t_width;
    if (scale > target_aspect) {
        // Relatively taller than the network input: fit the height, pad left/right.
        newh = t_height;
        neww = int(t_height / scale);
        padw = int((t_width - neww) * 0.5);
    }
    else if (scale < target_aspect) {
        // Relatively wider than the network input: fit the width, pad top/bottom.
        neww = t_width;
        newh = int(t_width * scale);
        padh = int((t_height - newh) * 0.5);
    }
    else {
        newh = t_height;
        neww = t_width;
    }
}
// CPU letterbox: scales test_img with preserved aspect ratio and pads it with
// a constant border so that dstimg has the size of the network input.
//
// Height/width are read as d[2]/d[3] of the NCHW input binding. An empty
// test_img or an uninitialized input size is reported and leaves dstimg
// untouched. The scaled size is clamped to at least one pixel so that
// cv::resize never receives an empty target size.
void TensorRT_detect::Resize_Image() {
    const int t_height = input_node_dim.d[2];
    const int t_width = input_node_dim.d[3];
    if (test_img.empty() || t_height <= 0 || t_width <= 0) {
        std::cerr << "Resize_Image: empty image or unknown network input size" << std::endl;
        return;
    }
    //resize(test_img, dstimg, Size(t_width, t_height), INTER_AREA);
    const int srch = test_img.rows;
    const int srcw = test_img.cols;
    const float aspect = (float)srch / srcw;
    const float target_aspect = (float)t_height / t_width;
    if (aspect > target_aspect) {
        // Fit the height, pad left and right.
        const int newh = t_height;
        const int neww = std::max(1, int(t_height / aspect));
        resize(test_img, dstimg, Size(neww, newh), INTER_AREA);
        const int left = int((t_width - neww) * 0.5);
        copyMakeBorder(dstimg, dstimg, 0, 0, left, t_width - neww - left, BORDER_CONSTANT, 0);
    }
    else if (aspect < target_aspect) {
        // Fit the width, pad top and bottom.
        const int neww = t_width;
        const int newh = std::max(1, int(t_width * aspect));
        resize(test_img, dstimg, Size(neww, newh), INTER_AREA);
        const int top = int((t_height - newh) * 0.5);
        copyMakeBorder(dstimg, dstimg, top, t_height - newh - top, 0, 0, BORDER_CONSTANT, 0);
    }
    else {
        resize(test_img, dstimg, Size(t_width, t_height), INTER_AREA);
    }
}
// Returns the width of the image expected by the model input.
// The input binding is laid out as NCHW, so the width is dimension 3
// (dimension 2, returned previously, is the height).
int TensorRT_detect::getinputw()
{
    return input_node_dim.d[3];
}
// Returns the height of the image expected by the model input.
// The input binding is laid out as NCHW, so the height is dimension 2
// (dimension 3, returned previously, is the width).
int TensorRT_detect::getinputh()
{
    return input_node_dim.d[2];
}
// Releases every resource owned by the detector.
//
// All pointers are checked before use so that a detector whose construction or
// setup failed half-way can still be destroyed. CUDA failures are only logged:
// aborting from a destructor (as CUDA_CHECK does) would hide the original
// error and prevent the remaining resources from being released.
TensorRT_detect::~TensorRT_detect()
{
    // Wait for pending work before releasing the buffers it may still use.
    if (stream != nullptr)
    {
        cudaStreamSynchronize(stream);
    }
    if (data_buffer != nullptr) {
        for (int i = 0; i < num_ionode; i++)
        {
            cudaFree(data_buffer[i]);
        }
        delete[]data_buffer;
        data_buffer = nullptr;
    }
    delete[]result_array;
    result_array = nullptr;
    delete[]mask_result_array;
    mask_result_array = nullptr;
    // Release the TensorRT objects; the order (context, engine, runtime) matters.
    if (context != nullptr) {
        context->destroy();
        context = nullptr;
    }
    if (engine != nullptr) {
        engine->destroy();
        engine = nullptr;
    }
    if (runtime != nullptr) {
        runtime->destroy();
        runtime = nullptr;
    }
    // Release the staging buffers.
    if (img_buffer_device != nullptr) {
        cudaError_t err = cudaFree(img_buffer_device);
        if (err != cudaSuccess) {
            std::cerr << "Failed to free device image buffer: " << cudaGetErrorString(err) << std::endl;
        }
        img_buffer_device = nullptr;
    }
    if (img_buffer_host != nullptr) {
        cudaError_t err = cudaFreeHost(img_buffer_host);
        if (err != cudaSuccess) {
            std::cerr << "Failed to free host image buffer: " << cudaGetErrorString(err) << std::endl;
        }
        img_buffer_host = nullptr;
    }
    if (stream != nullptr)
    {
        cudaStreamDestroy(stream);
        stream = nullptr;
    }
}
void TensorRT_detect::Create_Buffer()
{
//创建输入缓存区
cudaError_t err1 = cudaMalloc(&(data_buffer[input_node_index]), input_data_length * sizeof(float));
if (err1 != cudaSuccess) {
std::cout << "Failed to allocate memory for input data: " << cudaGetErrorString(err1) << std::endl;
return;
}
cout << "创建输入缓冲区已完成:" << " " << input_data_length << endl;
//创建输出缓存区
cudaError_t err2 = cudaMalloc(&(data_buffer[output_node_index_1]), output_data_length_1 * sizeof(float));
if (err2 != cudaSuccess) {
std::cout << "Failed to allocate memory for output_1 data: " << cudaGetErrorString(err2) << std::endl;
return;
}
cout << "创建输出1缓冲区已完成:" << " " << output_data_length_1 << endl;
//创建mask掩膜原型输出缓存区
cudaError_t err3 = cudaMalloc(&(data_buffer[output_node_index_2]), output_data_length_2 * sizeof(float));
if (err3 != cudaSuccess) {
std::cout << "Failed to allocate memory for output_2 data: " << cudaGetErrorString(err3) << std::endl;
return;
}
cout << "创建输出2缓冲区已完成:" << " " << output_data_length_2 << endl;
cudaStreamCreate(&stream);
//初始化内存
// prepare input data in pinned memory
cudaMallocHost((void**)&img_buffer_host, max_image_size * 3);
// prepare input data in device memory
cudaMalloc((void**)&img_buffer_device, max_image_size * 3);
}
void TensorRT_detect::Pre_processing()
{
//vector<Mat> img_bach1;
//img_bach1.push_back(test_img);
//cuda_batch_preprocess(img_bach1, (float*)data_buffer[input_node_index], kInputW, kInputH, img_buffer_host, img_buffer_device, stream);
for (int c = 0; c < 3; c++)
{
for (int i = 0; i < 640; i++)
{
for (int j = 0; j < 640; j++)
{
float pix = dstimg.ptr<uchar>(i)[j * 3 + 2 - c];//
input_data[c * 640 * 640 + i * 640 + size_t(j)] = pix / 255.0;
}
}
}
}
// Draws a dashed line from `start` to `end` with thickened ends and a short
// perpendicular tick at the centre.
//
//   thickness  thickness of the dashes; ends and tick use thickness + 4
//   dotSize    length of one dash/gap in pixels
//   flag       +1 / -1, selects the side on which the centre tick is drawn
//
// When the line is shorter than `dotSize` (or `dotSize` is not positive) there
// is no room for a dash pattern; the previous code divided by zero in that
// case. Such lines are now drawn as a single solid segment.
void TensorRT_detect::drawDottedLine(cv::Mat& image, cv::Point start, cv::Point end, cv::Scalar color, int thickness, int lineType, int dotSize, int flag) {
    // Extent of the line along x and y.
    int dx = end.x - start.x;
    int dy = end.y - start.y;
    // Thick caps covering the first and last tenth of the line.
    cv::Point end_line1(start.x + dx / 10, start.y + dy / 10);
    cv::Point start_line2(end.x - dx / 10, end.y - dy / 10);
    cv::line(image, start, end_line1, color, thickness + 4, lineType);
    cv::line(image, start_line2, end, color, thickness + 4, lineType);
    // Dash pattern: every second segment is drawn.
    const int longest = std::max(std::abs(dx), std::abs(dy));
    const int segments = dotSize > 0 ? longest / dotSize : 0;
    if (segments > 0)
    {
        cv::Point increment(dx / segments, dy / segments);
        for (int i = 0; i < segments; ++i)
        {
            if (i % 2 == 0)
                cv::line(image, start + increment * i, start + increment * (i + 1), color, thickness, lineType);
        }
    }
    else if (longest > 0)
    {
        cv::line(image, start, end, color, thickness, lineType);
    }
    // Short solid tick at the centre of the line.
    cv::Point center(start.x + dx / 2, start.y + dy / 2);
    cv::Point end_center1;
    if (dx == 0)
    {
        end_center1.x = end.x + 7.5 * flag;
        end_center1.y = start.y + dy / 2;
    }
    else {
        end_center1.x = start.x + dx / 2;
        end_center1.y = start.y + 7.5 * flag;
    }
    cv::line(image, center, end_center1, color, thickness + 4, lineType);
}
void TensorRT_detect::detect()
{
//将数据拷贝至显卡
//cudaError_t err3 = cudaMemcpyAsync(data_buffer[input_node_index], input_data.data(), input_data_length * sizeof(float), cudaMemcpyHostToDevice, stream);
//if (err3 != cudaSuccess) {
// std::cout << "Failed to transfer input data to GPU1: " << cudaGetErrorString(err3) << std::endl;
// return;
//}
//进行推理
//clock_t start_time1 = clock();
context->enqueueV2(data_buffer, stream, nullptr);
//clock_t end_time = clock();
//double exec_time = static_cast<double>(end_time - start_time1) / CLOCKS_PER_SEC;
//将输出1数据拷贝至主机
cudaError_t err4 = cudaMemcpyAsync(result_array, data_buffer[output_node_index_1], output_data_length_1 * sizeof(float), cudaMemcpyDeviceToHost, stream);
if (err4 != cudaSuccess) {
std::cout << "Failed to transfer input data to HOST: " << cudaGetErrorString(err4) << std::endl;
return;
}
//将输出2数据拷贝至主机
cudaError_t err5 = cudaMemcpyAsync(mask_result_array, data_buffer[output_node_index_2], output_data_length_2 * sizeof(float), cudaMemcpyDeviceToHost, stream);
if (err5 != cudaSuccess) {
std::cout << "Failed to transfer input data to HOST: " << cudaGetErrorString(err5) << std::endl;
return;
}
}
void TensorRT_detect::Post_processing()
{
std::vector<float> output(result_array, result_array + output_data_length_1);
std::vector<cv::Rect> boxes;
std::vector<float> confs;
std::vector<int> classIds;
int numClasses = (int)output_node_dim_1.d[2] - 5 - _segChannels;
float confThreshold = 0.5;
//比例系数
float ratio_h = (float)src_height / newh;
float ratio_w = (float)src_width / neww;
//存储output0[:,:, 5 + _className.size():net_width]用以后续计算mask
std::vector<std::vector<float>> picked_proposals;
int net_width = CLASSES + 5 + _segChannels;
//对(1,25200,117)进行数据处理
for (auto it = output.begin(); it != output.begin() + output_data_length_1; it += output_node_dim_1.d[2])
{
float clsConf = *(it + 4);//object scores
if (clsConf > confThreshold)
{
//将坐标转化为原始图像上的坐标
float x = (*it - padw) * ratio_w; //x
float y = (*(it + 1) - padh) * ratio_h; //y
float w = *(it + 2) * ratio_w; //w
float h = *(it + 3) * ratio_h; //h
//避免越界
int left = MAX((x - 0.5 * w), 0);
int top = MAX((y - 0.5 * h), 0);
boxes.push_back(Rect(left, top, int(w), int(h)));
//存放每个检测框的85-117协方差系数
std::vector<float> temp_proto(it + 5 + CLASSES, it + net_width);
picked_proposals.push_back(temp_proto);
// first 5 element are x y w h and obj confidence
int bestClassId = -1;
float bestConf = 0.0;
for (int i = 5; i < numClasses + 5; i++)
{
if ((*(it + i)) > bestConf)
{
bestConf = it[i];
bestClassId = i - 5;
}
}
//confs.emplace_back(bestConf * clsConf);
confs.emplace_back(clsConf);
classIds.emplace_back(bestClassId);
}
}
float iouThreshold = 0.5;
std::vector<int> indices;
// Perform non maximum suppression to eliminate redundant overlapping boxes with
cv::dnn::NMSBoxes(boxes, confs, confThreshold, iouThreshold, indices);
//对掩膜数据进行处理(1,32,160,160)
std::vector<std::vector<float>> temp_mask_proposals;
Rect holeImgRect(0, 0, src_width, src_height);
std::vector<Outputseg> output_seg;
for (int i = 0; i < indices.size(); ++i) {
int idx = indices[i];
Outputseg result;
result.id = classIds[idx];
result.confidence = confs[idx];
//与操作,使得超出图像边界范围的矩形框多余部分裁掉,保证可靠性
result.box = boxes[idx] & holeImgRect;
//result.box = boxes[idx];
output_seg.push_back(result);
temp_mask_proposals.push_back(picked_proposals[idx]);
}
// 处理mask
Mat maskProposals;
for (int i = 0; i < temp_mask_proposals.size(); ++i)
maskProposals.push_back(Mat(temp_mask_proposals[i]).t());
//取出第二个节点的输出数据
std::vector<float> mask(mask_result_array, mask_result_array + _segChannels * _segWidth * _segHeight);
Mat mask_protos = Mat(mask);
Mat protos = mask_protos.reshape(0, { _segChannels,_segWidth * _segHeight });//将prob1的值 赋给mask_protos
//考虑并行处理
clock_t start = clock();
Mat matmulRes = (maskProposals * protos).t();//n*32 32*25600 A*B是以数学运算中矩阵相乘的方式实现的,要求A的列数等于B的行数时
clock_t end = clock();
std::cout << "矩阵乘法的时间: " << end - start << endl;
//开始时间,这一块处理时间太长
clock_t start_1 = clock();
Mat masks = matmulRes.reshape(output_seg.size(), { _segWidth,_segHeight });
std::vector<Mat> maskChannels;
split(masks, maskChannels);
for (int i = 0; i < output_seg.size(); ++i) {
Mat dest, mask;
//sigmoid
cv::exp(-maskChannels[i], dest);
dest = 1.0 / (1.0 + dest);//160*160
//将等比例缩放后的区域映射到160*160
Rect roi(int((float)padw / INPUT_W * _segWidth), int((float)padh / INPUT_H * _segHeight), int(_segWidth - padw / 2), int(_segHeight - padh / 2));
dest = dest(roi);
//比例不变
resize(dest, mask, Size(src_width,src_height), INTER_NEAREST);
//crop----截取box中的mask作为该box对应的mask
Rect temp_rect = output_seg[i].box;
//实现类似sigmod激活函数的作用,同时将mask调整到和box一样的大小
mask = mask(temp_rect) > MASK_THRESHOLD;
output_seg[i].boxmask = mask;
}
clock_t end_1 = clock();
cout << "mask时间:" << end_1 - start_1 << endl;
Mat madk = test_img.clone();
RNG rng((unsigned)time(NULL));
for (size_t i = 0; i < indices.size(); ++i)
{
int index = indices[i];
int colorR = rng.uniform(0, 255);
int colorG = rng.uniform(0, 255);
int colorB = rng.uniform(0, 255);
Point textPos0(boxes[index].br().x, boxes[index].tl().y);
// 背景区域
Rect bgRect(textPos0, Size(1, 1));
// 在背景区域上方创建模糊背景
Mat bg = test_img(bgRect);
// 对图像 bg 进行均值模糊处理,核越大,模糊效果越明显
blur(bg, bg, Size(20, 20));
// 亮度调整因子,可根据需要进行调整
double brightness = 1.2;
//相乘之后会使得局部变亮
bg *= brightness;
madk(output_seg[i].box).setTo(cv::Scalar(colorR, colorG, colorB), output_seg[i].boxmask);
//保留两位小数
float scores = round(confs[index] * 100);
std::ostringstream oss;
oss << scores;
//rectangle(dstimg, Point(boxes[index].tl().x, boxes[index].tl().y), Point(boxes[index].br().x, boxes[index].br().y), Scalar(colorR, colorG, colorB), 2 ,cv::LINE_AA);
cv::Rect rect(boxes[index].tl(), boxes[index].br());
drawDottedLine(test_img, rect.tl(), cv::Point(rect.br().x, rect.tl().y), cv::Scalar(colorR, colorG, colorB), 2, cv::LINE_AA, 5, 1);
drawDottedLine(test_img, cv::Point(rect.br().x, rect.tl().y), rect.br(), cv::Scalar(colorR, colorG, colorB), 2, cv::LINE_AA, 5, -1);
drawDottedLine(test_img, rect.br(), cv::Point(rect.tl().x, rect.br().y), cv::Scalar(colorR, colorG, colorB), 2, cv::LINE_AA, 5, -1);
drawDottedLine(test_img, cv::Point(rect.tl().x, rect.br().y), rect.tl(), cv::Scalar(colorR, colorG, colorB), 2, cv::LINE_AA, 5, 1);
std::string s0 = "ID: " + std::to_string(classIds[index]);
std::string s1 = "Name: " + class_names[classIds[index]];
std::string s2 = oss.str() + " %";
std::string s3 = "Area: " + std::to_string(boxes[index].area());
putText(test_img, s0, Point(boxes[index].br().x + 10, boxes[index].tl().y + 20), FONT_HERSHEY_SIMPLEX, 0.8, Scalar(colorR, colorG, colorB), 2);
putText(test_img, s1, Point(boxes[index].br().x + 10, boxes[index].tl().y + 45), FONT_HERSHEY_SIMPLEX, 0.8, Scalar(colorR, colorG, colorB), 2);
putText(test_img, s3, Point(boxes[index].br().x + 10, boxes[index].tl().y + 70), FONT_HERSHEY_SIMPLEX, 0.8, Scalar(colorR, colorG, colorB), 2);
putText(test_img, s2, Point(boxes[index].br().x + 40, boxes[index].tl().y + 105), FONT_HERSHEY_SIMPLEX, 1.0, Scalar(255, 255, 255), 4);
}
addWeighted(test_img, 0.5, madk, 0.5, 0, test_img); //将mask加在原图上面
}
void TensorRT_detect::postprocess_mask() {
std::vector<float> output(result_array, result_array + output_data_length_1);
std::vector<cv::Rect> boxes;
std::vector<float> confs;
std::vector<int> classIds;
//cout<<"输出矩阵的大小:"<<output.size()<<endl;
int numClasses = (int)output_node_dim_1.d[2] - 5 - 32;
float confThreshold = 0.5;
for (auto it = output.begin(); it != output.begin() + output_data_length_1; it += output_node_dim_1.d[2])
{
float clsConf = *(it + 4);//object scores
if (clsConf > confThreshold)
{
int centerX = (int)(*it);
int centerY = (int)(*(it + 1));
int width = (int)(*(it + 2));
int height = (int)(*(it + 3));
int x1 = centerX - width / 2;
int y1 = centerY - height / 2;
boxes.emplace_back(cv::Rect(x1, y1, width, height));
// first 5 element are x y w h and obj confidence
int bestClassId = -1;
float bestConf = 0.0;
for (int i = 5; i < numClasses + 5; i++)
{
if ((*(it + i)) > bestConf)
{
bestConf = it[i];
bestClassId = i - 5;
}
}
//confs.emplace_back(bestConf * clsConf);
confs.emplace_back(clsConf);
classIds.emplace_back(bestClassId);
}
}
//std::cout<<"11111111"<<std::endl;
float iouThreshold = 0.5;
std::vector<int> indices;
// Perform non maximum suppression to eliminate redundant overlapping boxes with
cv::dnn::NMSBoxes(boxes, confs, confThreshold, iouThreshold, indices);
//std::cout<<"22222222"<<std::endl;
RNG rng((unsigned)time(NULL));
for (size_t i = 0; i < indices.size(); ++i)
{
int index = indices[i];
int colorR = rng.uniform(0, 255);
int colorG = rng.uniform(0, 255);
int colorB = rng.uniform(0, 255);
//
float scores = round(confs[index] * 100) / 100;
std::ostringstream oss;
oss << scores;
rectangle(dstimg, Point(boxes[index].tl().x, boxes[index].tl().y), Point(boxes[index].br().x, boxes[index].br().y), Scalar(colorR, colorG, colorB), 1.5);
putText(dstimg, class_names[classIds[index]] + " " + oss.str(), Point(boxes[index].tl().x, boxes[index].tl().y - 5), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(colorR, colorG, colorB), 2);
}
}