Tensorrt加速部署yoloV5 7.0分割算法

最新推荐文章于 2024-07-19 11:20:37 发布

Rainbow Sea

最新推荐文章于 2024-07-19 11:20:37 发布

阅读量600

点赞数 9

文章标签： YOLO

本文链接：https://blog.csdn.net/qq_46463876/article/details/134968230

版权

主要文件结构如上图所示，其中最后一个 cpp 文件没有实际作用。源码会直接公开，不必浪费大家的积分去下载。

engine 文件可以通过 Python 源码导出，源码可自行搜索下载；导出方法可以参考之前的 TensorRT 部署文章，其中有详细的过程。

config.h

#pragma once

// ---------------------------------------------------------------------------
// Model geometry. Update these whenever the exported model changes.
// ---------------------------------------------------------------------------
static const int INPUT_H = 640;           // network input height in pixels
static const int INPUT_W = 640;           // network input width in pixels
static const int CLASSES = 80;            // number of detection classes
static const int _segChannels = 32;       // mask coefficients per detection / prototype channels
static const int _segWidth = 160;         // prototype mask width
static const int _segHeight = 160;        // prototype mask height
static const float MASK_THRESHOLD = 0.5f; // mask probability above which a pixel counts as foreground

// Paths used by the demo: serialized engine, label list and test image.
const static char* engine_path = "yolov5s-seg.engine";
const static char* label_path = "coco.txt";
const static char* image_path = "zidane.jpg";

// These are used to define input/output tensor names,
// you can set them to whatever you want.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";

// Detection model and Segmentation model' number of classes
// NOTE(review): this value differs from CLASSES above and is not read by any of
// the code shown alongside this header - confirm which one is authoritative.
constexpr static int kNumClass = 3;

// Yolo's input width and height must by divisible by 32.
// Derived from INPUT_H / INPUT_W so the two sets of constants cannot drift apart.
constexpr static int kInputH = INPUT_H;
constexpr static int kInputW = INPUT_W;

// NMS overlapping thresh and final detection confidence thresh
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;

// If your image size is larger than 4096 * 3112, please increase this value
const static int kMaxInputImageSize = 4096 * 3112;

cuda_utils.h

#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cassert>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime_api.h>

// CUDA_CHECK(call): evaluates a CUDA runtime call exactly once and terminates
// the process if it did not return cudaSuccess.
//
// - The error is reported with both its numeric code and cudaGetErrorString()
//   text, plus the file/line of the failing call.
// - std::abort() is used instead of assert(0) so the check is still active in
//   builds compiled with NDEBUG (where assert expands to nothing and the error
//   would otherwise be ignored).
// - The do { ... } while (0) wrapper makes the macro behave like a single
//   statement, so `if (x) CUDA_CHECK(...); else ...` parses as expected.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    do {\
        cudaError_t error_code = (callstr);\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code) << ") at " << __FILE__ << ":" << __LINE__ << std::endl;\
            std::abort();\
        }\
    } while (0)
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_

preprocess22.h

#pragma once

#include <cuda_runtime.h>
#include <cstdint>
#include <opencv2/opencv.hpp>

//void cuda_preprocess_init(int max_image_size);
//void cuda_preprocess_destroy();

// Letterboxes one interleaved 8-bit, 3-channel image into a planar float tensor.
//
// src               host pointer to src_width * src_height * 3 tightly packed bytes
// src_width/height  source image size in pixels
// dst               device pointer receiving 3 planes of dst_width * dst_height floats
// dst_width/height  network input size in pixels
// img_buffer_host   staging buffer the source bytes are memcpy'd into before upload
//                   (the caller in this project allocates it with cudaMallocHost)
// img_buffer_device device buffer that receives the uploaded source bytes
// stream            stream used for the upload and the warp kernel
//
// The work is only enqueued on `stream`; this function does not synchronise.
void cuda_preprocess(uint8_t* src, int src_width, int src_height,
                     float* dst, int dst_width, int dst_height, uint8_t* img_buffer_host, uint8_t* img_buffer_device,
                     cudaStream_t stream);

// Runs cuda_preprocess for every image in img_batch, writing image i at offset
// i * (dst_width * dst_height * 3) floats inside dst, and synchronises `stream`
// after each image (all images share the same two staging buffers).
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height, uint8_t* img_buffer_host, uint8_t* img_buffer_device,
                           cudaStream_t stream);

Tensor.h

#pragma once
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "NvinferRuntime.h"

#include <ctime>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <opencv2/core/utils/logger.hpp>
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/highgui/highgui_c.h>

#include "cuda_utils.h"
#include "config.h"

using namespace nvinfer1;
using namespace nvonnxparser;
using namespace cv;
using namespace std;

//分割结果结构体
struct Outputseg
{
	int id;					//类别ID
	string name;			//类别名称	
	float confidence;		//置信度
	Rect box;				//矩形框
	Mat boxmask;			//结果局部位置的掩膜
};

// Logging sink handed to TensorRT: prints every message except info-level ones.
class Logger : public ILogger
{
	virtual void log(Severity severity, const char* msg) noexcept override
	{
		// Info-level messages are suppressed; everything else goes to stdout.
		const bool isInfo = (severity == Severity::kINFO);
		if (isInfo)
			return;
		std::cout << msg << std::endl;
	}
};

// Wraps a serialized YOLOv5-seg TensorRT engine: owns the runtime/engine/context,
// the device bindings and the host-side output buffers, and provides the
// pre-processing, inference and post-processing steps used by the demo.
//
// Every pointer, index and size member carries a default initializer: the class
// has a defaulted constructor and its destructor inspects these members, so
// leaving them indeterminate would be undefined behaviour.
class TensorRT_detect
{
public:
	// Default constructor: creates an object that owns nothing.
	TensorRT_detect() = default;
	// Loads labels, the test image and the serialized engine, then creates the execution context.
	TensorRT_detect(const char* model_path_engine, const char* image_path, const char* input_node_name, string classesFile, int srcwidth, int srcheight);
	// Queries binding indices/dimensions and allocates the host output arrays.
	void initialize();
	// Computes the letterbox geometry (neww/newh/padw/padh) for the source size.
	void Padding_Resize();
	// Resizes test_img into dstimg with aspect-preserving padding.
	void Resize_Image();
	// Destructor: releases CUDA and TensorRT resources.
	~TensorRT_detect();
	// Height / width the model expects for its input image.
	int getinputh();
	int getinputw();
	// Allocates the device input/output buffers, the stream and the image staging buffers.
	void Create_Buffer();
	// Runs inference and copies both outputs back to the host.
	void detect();
	// Decodes detections and masks and draws them onto test_img.
	void Post_processing();
	// CPU pre-processing: dstimg -> input_data.
	void Pre_processing();
	Mat dstimg;                             // resized (letterboxed) image
	std::vector<float> input_data;          // host-side network input
	const char* image_path = nullptr;       // path of the test image
	Mat test_img;                           // test image
	void** data_buffer = nullptr;           // device buffers for the input/output bindings
	int input_node_index = -1;              // binding index of the input node
	cudaStream_t stream = nullptr;          // CUDA stream
	uint8_t* img_buffer_host = nullptr;		// page-locked host staging buffer
	uint8_t* img_buffer_device = nullptr;	// device buffer holding the raw image before pre-processing
	// Draws a dashed line with thickened ends and a short tick at its centre.
	void drawDottedLine(cv::Mat& image, cv::Point start, cv::Point end, cv::Scalar color, int thickness, int lineType, int dotSize, int flag = 1);
	// Decodes detections and draws boxes and labels onto dstimg.
	void postprocess_mask();
	int neww = 0;							// width of the source after aspect-preserving scaling
	int newh = 0;							// height of the source after aspect-preserving scaling
	int padw = 0;							// x offset of the scaled image inside the network input
	int padh = 0;							// y offset of the scaled image inside the network input
	float scale = 0.0f;						// source height / source width
private:
	const char* model_path_engine = nullptr;            // path of the serialized engine file
	std::string classesFile;                            // label file path
	const char* input_node_name = nullptr;              // name of the model input node
	const char* output_node_name_1 = "output0";         // name of the detection output node
	const char* output_node_name_2 = "output1";         // name of the mask-prototype output node
	std::vector<std::string> class_names;   // label list
	int num_ionode = 0;                     // total number of input and output bindings
	Logger logger;
	nvinfer1::IRuntime* runtime = nullptr;            // runtime used to deserialize the engine
	nvinfer1::ICudaEngine* engine = nullptr;          // inference engine
	nvinfer1::IExecutionContext* context = nullptr;   // execution context
	nvinfer1::Dims input_node_dim{};        // dimensions of the input node
	size_t input_data_length = 0;           // number of input elements
	int output_node_index_1 = -1;           // binding index of output node 1
	int output_node_index_2 = -1;           // binding index of output node 2
	nvinfer1::Dims output_node_dim_1{};     // dimensions of output node 1
	nvinfer1::Dims output_node_dim_2{};     // dimensions of output node 2
	size_t output_data_length_1 = 0;        // number of elements of output node 1
	size_t output_data_length_2 = 0;        // number of elements of output node 2
	float* result_array = nullptr;          // host copy of output node 1
	float* mask_result_array = nullptr;     // host copy of the mask prototypes (output node 2)
	const int max_image_size = 4096 * 3112;	// pixel capacity of the image staging buffers
	int src_width = 0;						// original width of the camera frame or image
	int src_height = 0;						// original height of the camera frame or image
};

main.cpp

#include "Tensor.h"
#include "preprocess22.h"
#include <iostream>


int main() {
	cv::VideoCapture video("sample.mp4");

    if (!video.isOpened()) {
        std::cout << "Failed to open video file" << std::endl;
        return -1;
    }
    Mat image = imread("zidane.jpg");
    // 获取图像的宽度和高度
    int width = static_cast<int>(video.get(cv::CAP_PROP_FRAME_WIDTH));
    int height = static_cast<int>(video.get(cv::CAP_PROP_FRAME_HEIGHT));
    //创建初始化ylov5s-seg检测器(需要更改)
    TensorRT_detect TD(engine_path, "zidane.jpg", kInputTensorName, label_path, image.cols, image.rows);

    // 创建一个名为 "Window" 的窗口
    cv::namedWindow("Window", cv::WINDOW_NORMAL);

    // 设置窗口的大小为 1280x720
    cv::resizeWindow("Window", image.cols, image.rows);

    TD.initialize();
    TD.Create_Buffer();
    TD.Padding_Resize();
    int num = 1;
    //统计每个阶段的耗时
    while (num--) {
        clock_t start1 = clock();
        //video >> TD.test_img;
        //TD.Resize_Image();
        TD.test_img = image.clone();
        vector<Mat> img_bach1;
        img_bach1.push_back(image);
        {
            //unique_lock<mutex> lk(myMutex);
            cuda_batch_preprocess(img_bach1, (float*)TD.data_buffer[TD.input_node_index], kInputW, kInputH, TD.img_buffer_host, TD.img_buffer_device, TD.stream);
        }
        clock_t end1 = clock();
        double exec_time1 = static_cast<double>(end1 - start1) / CLOCKS_PER_SEC;
        // 输出执行时间
        std::cout << "预处理 Execution time : " << exec_time1 << " seconds" << std::endl;
        clock_t start2 = clock();
        TD.detect();
        clock_t end2 = clock();
        double exec_time2 = static_cast<double>(end2 - start2) / CLOCKS_PER_SEC;
        // 输出执行时间
        std::cout << "推理事件 Execution time : " << exec_time2 << " seconds" << std::endl;
        clock_t start3 = clock();
        TD.Post_processing();
        clock_t end3 = clock();
        double exec_time3 = static_cast<double>(end3 - start3) / CLOCKS_PER_SEC;
        // 输出执行时间大概30ms
        std::cout << "后处理时间 Execution time : " << exec_time3 << " seconds" << std::endl;
        imshow("Window", TD.test_img);
        cv::waitKey(0);
    }
    // 销毁窗口
    cv::destroyAllWindows();
    //释放相机资源
    video.release();
    return 0;
}

process22.cu

#include "preprocess22.h"
#include "cuda_utils.h"
#include <device_launch_parameters.h>

//static uint8_t* img_buffer_host = nullptr;
//static uint8_t* img_buffer_device = nullptr;

// Six coefficients of a 2x3 affine transform stored row by row:
// x' = value[0]*x + value[1]*y + value[2],  y' = value[3]*x + value[4]*y + value[5].
// Passed by value to warpaffine_kernel.
struct AffineMatrix {
  float value[6];
};

// Warps an interleaved 8-bit BGR image into a planar, normalised RGB float image.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per destination pixel;
// threads whose flat index is >= edge do nothing. No shared memory is used.
//
//   src            interleaved 3-byte pixels, src_line_size bytes per row
//   dst            three consecutive planes of dst_width * dst_height floats
//   const_value_st value used for samples that fall outside the source image
//   d2s            destination -> source affine transform
//   edge           number of destination pixels to process
__global__ void warpaffine_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, uint8_t const_value_st,
    AffineMatrix d2s, int edge) {
  const int pixel = blockDim.x * blockIdx.x + threadIdx.x;
  if (pixel >= edge) return;

  const int dx = pixel % dst_width;
  const int dy = pixel / dst_width;

  // Map the destination pixel back into source coordinates.
  const float src_x = d2s.value[0] * dx + d2s.value[1] * dy + d2s.value[2] + 0.5f;
  const float src_y = d2s.value[3] * dx + d2s.value[4] * dy + d2s.value[5] + 0.5f;

  // Channel values in source byte order; they stay at the fill value when the
  // sample lies outside the source image.
  float ch0 = const_value_st;
  float ch1 = const_value_st;
  float ch2 = const_value_st;

  const bool outside =
      src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height;
  if (!outside) {
    const int y_low = floorf(src_y);
    const int x_low = floorf(src_x);
    const int y_high = y_low + 1;
    const int x_high = x_low + 1;

    // Bilinear weights of the four neighbouring taps.
    const float ly = src_y - y_low;
    const float lx = src_x - x_low;
    const float hy = 1 - ly;
    const float hx = 1 - lx;
    const float w_tl = hy * hx, w_tr = hy * lx, w_bl = ly * hx, w_br = ly * lx;

    // Taps that fall outside the image read the fill pixel instead.
    uint8_t fill[] = {const_value_st, const_value_st, const_value_st};
    const bool top_ok = y_low >= 0;
    const bool bottom_ok = y_high < src_height;
    const bool left_ok = x_low >= 0;
    const bool right_ok = x_high < src_width;

    const uint8_t* tl = (top_ok && left_ok) ? src + y_low * src_line_size + x_low * 3 : fill;
    const uint8_t* tr = (top_ok && right_ok) ? src + y_low * src_line_size + x_high * 3 : fill;
    const uint8_t* bl = (bottom_ok && left_ok) ? src + y_high * src_line_size + x_low * 3 : fill;
    const uint8_t* br = (bottom_ok && right_ok) ? src + y_high * src_line_size + x_high * 3 : fill;

    ch0 = w_tl * tl[0] + w_tr * tr[0] + w_bl * bl[0] + w_br * br[0];
    ch1 = w_tl * tl[1] + w_tr * tr[1] + w_bl * bl[1] + w_br * br[1];
    ch2 = w_tl * tl[2] + w_tr * tr[2] + w_bl * bl[2] + w_br * br[2];
  }

  // Swap the first and third channel (bgr to rgb), scale by 1/255 and store
  // each channel in its own plane (rgbrgbrgb to rrrgggbbb).
  const int area = dst_width * dst_height;
  float* plane0 = dst + dy * dst_width + dx;
  float* plane1 = plane0 + area;
  float* plane2 = plane1 + area;
  *plane0 = ch2 / 255.0f;
  *plane1 = ch1 / 255.0f;
  *plane2 = ch0 / 255.0f;
}

// Uploads one interleaved 8-bit, 3-channel image and letterboxes it into `dst`.
//
//   src               host pointer to src_width * src_height * 3 tightly packed bytes
//   dst               device pointer receiving 3 planes of dst_width * dst_height floats
//   img_buffer_host   host staging buffer, at least src_width * src_height * 3 bytes
//   img_buffer_device device staging buffer of the same capacity
//   stream            stream on which the upload and the kernel are enqueued
//
// The image is scaled by min(dst_h / src_h, dst_w / src_w), centred, and the
// border is filled with 128. The function only enqueues work; the caller must
// synchronise `stream` before reusing the staging buffers or reading `dst`.
// NOTE(review): the capacity of the staging buffers is not known here, so the
// caller is responsible for keeping the image within it.
void cuda_preprocess(
    uint8_t* src, int src_width, int src_height, 
    float* dst, int dst_width, int dst_height,uint8_t* img_buffer_host, uint8_t* img_buffer_device,
    cudaStream_t stream) {

  // A null pointer or a zero-sized image would lead to an invalid copy or an
  // infinite scale factor, so reject such input up front.
  if (src == nullptr || dst == nullptr || img_buffer_host == nullptr || img_buffer_device == nullptr ||
      src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
    std::cerr << "cuda_preprocess: invalid arguments, nothing was written to dst" << std::endl;
    return;
  }

  const size_t img_size = static_cast<size_t>(src_width) * src_height * 3;
  // copy data to pinned memory
  memcpy(img_buffer_host, src, img_size);
  // copy data to device memory
  CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

  AffineMatrix s2d, d2s;
  float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

  // Source -> destination: uniform scale plus the translation that centres the image.
  s2d.value[0] = scale;
  s2d.value[1] = 0;
  s2d.value[2] = -scale * src_width  * 0.5  + dst_width * 0.5;
  s2d.value[3] = 0;
  s2d.value[4] = scale;
  s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

  // The kernel needs the inverse mapping (destination -> source).
  cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
  cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
  cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

  // m2x3_d2s normally still aliases d2s.value; only copy if the result ended up
  // in a different buffer (memcpy with identical source and destination is
  // not allowed).
  if (m2x3_d2s.ptr<float>(0) != d2s.value) {
    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
  }

  // One thread per destination pixel.
  const int jobs = dst_height * dst_width;
  const int threads = 256;
  const int blocks = (jobs + threads - 1) / threads;

  warpaffine_kernel<<<blocks, threads, 0, stream>>>(
      img_buffer_device, src_width * 3, src_width,
      src_height, dst, dst_width,
      dst_height, 128, d2s, jobs);
  // A kernel launch does not return an error code; launch failures are only
  // visible through cudaGetLastError().
  CUDA_CHECK(cudaGetLastError());
}

// Pre-processes every image of img_batch into consecutive slots of `dst`
// (slot i starts at i * dst_width * dst_height * 3 floats).
//
// Images must be 8-bit, 3-channel; empty images or images of another type are
// reported and skipped, leaving their slot untouched. Non-continuous images
// (for example ROI views) are cloned first, because cuda_preprocess reads the
// pixels as one tightly packed block.
//
// The stream is synchronised after each image: all images share the same
// staging buffers, which must not be overwritten while an upload is in flight.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height, uint8_t* img_buffer_host, uint8_t* img_buffer_device,
                           cudaStream_t stream) {
  const size_t dst_size = static_cast<size_t>(dst_width) * dst_height * 3;
  for (size_t i = 0; i < img_batch.size(); i++) {
    const cv::Mat& img = img_batch[i];
    if (img.empty() || img.type() != CV_8UC3) {
      std::cerr << "cuda_batch_preprocess: image " << i << " is empty or not CV_8UC3, skipped" << std::endl;
      continue;
    }
    // Shallow copy when already packed, deep copy otherwise.
    cv::Mat packed = img.isContinuous() ? img : img.clone();
    cuda_preprocess(packed.ptr(), packed.cols, packed.rows, &dst[dst_size * i], dst_width, dst_height,img_buffer_host, img_buffer_device, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
  }
}

//void cuda_preprocess_init(int max_image_size) {
//  // prepare input data in pinned memory
//  CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
//  // prepare input data in device memory
//  CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
//}
//
//void cuda_preprocess_destroy() {
//  CUDA_CHECK(cudaFree(img_buffer_device));
//  CUDA_CHECK(cudaFreeHost(img_buffer_host));
//}

Tensor.cpp


#include"Tensor.h"

using namespace nvinfer1;
using namespace nvonnxparser;
using namespace cv;
using namespace std;

// Builds a detector from a serialized engine.
//
//   model_path_engine  path of the serialized TensorRT engine
//   image_path         path of the test image loaded into test_img
//   input_node_name    name of the engine's input binding
//   classesFile        text file with one class label per line
//   srcwidth/srcheight size of the frames that will be processed
//
// Throws std::runtime_error if the engine file cannot be read or if the
// runtime, engine or execution context cannot be created. Anything created
// before the failure is released again, because the destructor does not run
// for an object whose constructor throws.
TensorRT_detect::TensorRT_detect(const char* model_path_engine, const char* image_path, const char* input_node_name,  string classesFile, int srcwidth, int srcheight) :
	image_path(image_path), model_path_engine(model_path_engine), classesFile(classesFile), input_node_name(input_node_name)
{
	// Start from a well-defined "nothing owned" state.
	runtime = nullptr;
	engine = nullptr;
	context = nullptr;

	src_height = srcheight;
	src_width = srcwidth;

	// Load the class labels.
	std::ifstream ifs(classesFile.c_str());
	std::string line;
	while (getline(ifs, line)) class_names.push_back(line);
	cout << "标签的数量：：：：" << class_names.size() << endl;

	// Open the engine file.
	if (model_path_engine == nullptr) {
		throw std::runtime_error("engine path is null");
	}
	std::ifstream file_ptr(model_path_engine, std::ios::binary);
	if (!file_ptr.good()) {
		std::cerr << "模型文件无法打开！" << std::endl;
		throw std::runtime_error(std::string("cannot open engine file: ") + model_path_engine);
	}

	// Load the test image (an unreadable image leaves test_img empty).
	if (image_path != nullptr) {
		test_img = imread(image_path);
	}
	// Reserve a 1280*1280 canvas (disabled).
	//dstimg = Mat(cv::Size(1280, 1280), CV_8UC3);
	//dstimg.setTo(cv::Scalar(0, 0, 0));

	// Read the whole file into memory. A vector releases the bytes on every
	// exit path; the engine does not need them after deserialization.
	file_ptr.seekg(0, file_ptr.end);
	const std::streamoff file_size = file_ptr.tellg();
	file_ptr.seekg(0, file_ptr.beg);
	if (file_size <= 0) {
		throw std::runtime_error(std::string("engine file is empty or unreadable: ") + model_path_engine);
	}
	std::vector<char> model_stream(static_cast<size_t>(file_size));
	file_ptr.read(model_stream.data(), file_size);
	if (!file_ptr) {
		throw std::runtime_error(std::string("failed to read engine file: ") + model_path_engine);
	}
	file_ptr.close();

	// Create the runtime, deserialize the engine and create the context.
	runtime = nvinfer1::createInferRuntime(logger);
	if (runtime == nullptr) {
		throw std::runtime_error("createInferRuntime failed");
	}
	engine = runtime->deserializeCudaEngine(model_stream.data(), model_stream.size());
	if (engine == nullptr) {
		runtime->destroy();
		runtime = nullptr;
		throw std::runtime_error("deserializeCudaEngine failed");
	}
	context = engine->createExecutionContext();
	if (context == nullptr) {
		engine->destroy();
		engine = nullptr;
		runtime->destroy();
		runtime = nullptr;
		throw std::runtime_error("createExecutionContext failed");
	}
}

// Looks up the input binding and the two output bindings ("output0"/"output1"),
// records their dimensions and element counts, and allocates the host arrays
// that receive the outputs.
//
// Throws std::runtime_error if no engine is loaded or a binding name is not
// found (getBindingIndex returns a negative index in that case).
void TensorRT_detect::initialize()
{
	if (engine == nullptr) {
		throw std::runtime_error("initialize() called without a loaded engine");
	}

	// Total number of input and output bindings.
	num_ionode = engine->getNbBindings();

	// Resolve all binding indices before allocating anything.
	input_node_index = engine->getBindingIndex(input_node_name);
	output_node_index_1 = engine->getBindingIndex(output_node_name_1);
	output_node_index_2 = engine->getBindingIndex(output_node_name_2);
	if (input_node_index < 0 || output_node_index_1 < 0 || output_node_index_2 < 0) {
		throw std::runtime_error("engine does not expose the expected input/output bindings");
	}

	// Zero-initialise the slots: the destructor passes every slot to cudaFree,
	// which is only safe for null or valid device pointers.
	data_buffer = new void* [num_ionode]();

	// Input binding, e.g. (1,3,640,640).
	input_node_dim = engine->getBindingDimensions(input_node_index);
	input_data_length = int(input_node_dim.d[1] * input_node_dim.d[2] * input_node_dim.d[3]);
	input_data.resize(input_data_length);

	// Output 1, e.g. (1,25200,117).
	output_node_dim_1 = engine->getBindingDimensions(output_node_index_1);
	output_data_length_1 = int(output_node_dim_1.d[1] * output_node_dim_1.d[2]);
	result_array = new float[output_data_length_1];

	// Output 2, e.g. (1,32,160,160).
	output_node_dim_2 = engine->getBindingDimensions(output_node_index_2);
	output_data_length_2 = int(output_node_dim_2.d[1] * output_node_dim_2.d[2] * output_node_dim_2.d[3]);
	mask_result_array = new float[output_data_length_2];
}

// Computes the letterbox geometry for a source of src_width x src_height
// placed inside the network input: the scaled size (neww, newh) and the offset
// of the scaled image (padw, padh). For a fixed camera these four values only
// need to be computed once.
//
// The input binding is laid out as N,C,H,W, so d[2] is the height and d[3] the
// width. All four outputs are rewritten on every call, so no value from an
// earlier call can survive. With invalid sizes the outputs are left at 0.
void TensorRT_detect::Padding_Resize()
{
	const int t_height = input_node_dim.d[2];
	const int t_width = input_node_dim.d[3];
	//resize(test_img, dstimg, Size(t_width, t_height), INTER_AREA);

	neww = 0;
	newh = 0;
	padw = 0;
	padh = 0;
	if (src_width <= 0 || src_height <= 0 || t_width <= 0 || t_height <= 0) {
		std::cout << "Padding_Resize: source or network size is not set" << std::endl;
		return;
	}

	scale = (float)src_height / src_width;

	// Compare the aspect ratios without dividing: the source is relatively
	// taller than the target when src_h / src_w > t_h / t_w.
	const long long tall = (long long)src_height * t_width;
	const long long wide = (long long)src_width * t_height;
	if (tall > wide) {
		// Height is the limiting side: pad left and right.
		newh = t_height;
		neww = int(t_height / scale);
		padw = (t_width - neww) / 2;
	}
	else if (tall < wide) {
		// Width is the limiting side: pad top and bottom.
		neww = t_width;
		newh = int(t_width * scale);
		padh = (t_height - newh) / 2;
	}
	else {
		newh = t_height;
		neww = t_width;
	}
}

// Resizes test_img into dstimg so that it fits the network input while keeping
// its aspect ratio; the remaining border is filled with 0.
//
// The input binding is laid out as N,C,H,W, so d[2] is the height and d[3] the
// width. If test_img is empty or the network size is unknown, dstimg is left
// unchanged.
void TensorRT_detect::Resize_Image() {
	const int t_height = input_node_dim.d[2];
	const int t_width = input_node_dim.d[3];
	if (test_img.empty() || t_width <= 0 || t_height <= 0) {
		std::cout << "Resize_Image: no image or network size available" << std::endl;
		return;
	}

	const int srch = test_img.rows;
	const int srcw = test_img.cols;
	const float aspect = (float)srch / srcw;

	// The source is relatively taller than the target when srch / srcw > t_h / t_w.
	const long long tall = (long long)srch * t_width;
	const long long wide = (long long)srcw * t_height;

	if (tall > wide) {
		// Height is the limiting side: pad left and right.
		const int newh = t_height;
		const int neww = int(t_height / aspect);
		if (neww <= 0) {
			std::cout << "Resize_Image: image is too narrow to be resized" << std::endl;
			return;
		}
		resize(test_img, dstimg, Size(neww, newh), INTER_AREA);
		const int left = (t_width - neww) / 2;
		copyMakeBorder(dstimg, dstimg, 0, 0, left, t_width - neww - left, BORDER_CONSTANT, 0);
	}
	else if (tall < wide) {
		// Width is the limiting side: pad top and bottom.
		const int neww = t_width;
		const int newh = int(t_width * aspect);
		if (newh <= 0) {
			std::cout << "Resize_Image: image is too flat to be resized" << std::endl;
			return;
		}
		resize(test_img, dstimg, Size(neww, newh), INTER_AREA);
		const int top = (t_height - newh) / 2;
		copyMakeBorder(dstimg, dstimg, top, t_height - newh - top, 0, 0, BORDER_CONSTANT, 0);
	}
	else {
		resize(test_img, dstimg, Size(t_width, t_height), INTER_AREA);
	}
}

// Width the model expects for its input image.
// The input binding is laid out as N,C,H,W, so the width is dimension 3.
int TensorRT_detect::getinputw()
{
	return input_node_dim.d[3];
}

// Height the model expects for its input image.
// The input binding is laid out as N,C,H,W, so the height is dimension 2.
int TensorRT_detect::getinputh()
{
	return input_node_dim.d[2];
}

// Releases everything the object owns: the stream, the device bindings, the
// host output arrays, the TensorRT objects and the image staging buffers.
// Every resource is checked before it is released, so an object whose set-up
// never ran or stopped half-way is destroyed safely.
TensorRT_detect::~TensorRT_detect()
{
	if (stream != nullptr)
	{
		cudaStreamDestroy(stream);
		stream = nullptr;
	}
	if (data_buffer != nullptr) {
		for (int i = 0; i < num_ionode; i++)
		{
			cudaFree(data_buffer[i]);
		}
		delete[]data_buffer;
		data_buffer = nullptr;
	}

	if (result_array != nullptr) {
		delete[]result_array;
		result_array = nullptr;
	}

	if (mask_result_array != nullptr) {
		delete[]mask_result_array;
		mask_result_array = nullptr;
	}

	// Release the TensorRT objects in reverse order of creation:
	// context, then engine, then runtime.
	if (context != nullptr) {
		context->destroy();
		context = nullptr;
	}
	if (engine != nullptr) {
		engine->destroy();
		engine = nullptr;
	}
	if (runtime != nullptr) {
		runtime->destroy();
		runtime = nullptr;
	}

	// Release the image staging buffers.
	if (img_buffer_device != nullptr) {
		CUDA_CHECK(cudaFree(img_buffer_device));
		img_buffer_device = nullptr;
	}
	if (img_buffer_host != nullptr) {
		CUDA_CHECK(cudaFreeHost(img_buffer_host));
		img_buffer_host = nullptr;
	}
}

void TensorRT_detect::Create_Buffer()
{

	//创建输入缓存区
	cudaError_t err1 = cudaMalloc(&(data_buffer[input_node_index]), input_data_length * sizeof(float));
	if (err1 != cudaSuccess) {
		std::cout << "Failed to allocate memory for input data: " << cudaGetErrorString(err1) << std::endl;
		return;
	}
	cout << "创建输入缓冲区已完成：" << "  " << input_data_length << endl;
	//创建输出缓存区
	cudaError_t err2 = cudaMalloc(&(data_buffer[output_node_index_1]), output_data_length_1 * sizeof(float));
	if (err2 != cudaSuccess) {
		std::cout << "Failed to allocate memory for output_1 data: " << cudaGetErrorString(err2) << std::endl;
		return;
	}
	cout << "创建输出1缓冲区已完成：" << "  " << output_data_length_1 << endl;
	//创建mask掩膜原型输出缓存区
	cudaError_t err3 = cudaMalloc(&(data_buffer[output_node_index_2]), output_data_length_2 * sizeof(float));
	if (err3 != cudaSuccess) {
		std::cout << "Failed to allocate memory for output_2 data: " << cudaGetErrorString(err3) << std::endl;
		return;
	}
	cout << "创建输出2缓冲区已完成：" << "  " << output_data_length_2 << endl;
	cudaStreamCreate(&stream);

	//初始化内存
    // prepare input data in pinned memory
	cudaMallocHost((void**)&img_buffer_host, max_image_size * 3);
	// prepare input data in device memory
	cudaMalloc((void**)&img_buffer_device, max_image_size * 3);
}

void TensorRT_detect::Pre_processing()
{
	//vector<Mat> img_bach1;
	//img_bach1.push_back(test_img);
	//cuda_batch_preprocess(img_bach1, (float*)data_buffer[input_node_index], kInputW, kInputH, img_buffer_host, img_buffer_device,  stream);
	for (int c = 0; c < 3; c++)
	{
		for (int i = 0; i < 640; i++)
		{
			for (int j = 0; j < 640; j++)
			{
				float pix = dstimg.ptr<uchar>(i)[j * 3 + 2 - c];//
				input_data[c * 640 * 640 + i * 640 + size_t(j)] = pix / 255.0;
			}
		}
	}
}

// Draws a decorated line from start to end:
//   - the first and last tenth are drawn thicker (thickness + 4),
//   - the body is drawn dashed, every other segment of about dotSize pixels,
//   - a short thick tick is drawn at the centre, perpendicular for horizontal
//     or vertical lines; `flag` (+1 / -1) selects the side the tick points to.
//
// If the line is shorter than dotSize, or dotSize is not positive, there are no
// dash segments; the body is then drawn as one solid line so the per-segment
// step is never divided by zero.
void TensorRT_detect::drawDottedLine(cv::Mat& image, cv::Point start, cv::Point end, cv::Scalar color, int thickness, int lineType, int dotSize, int flag) {
	// Extent of the line along x and y.
	int dx = end.x - start.x;
	int dy = end.y - start.y;

	// Thickened ends.
	cv::Point end_line1(start.x + dx / 10, start.y + dy / 10);
	cv::Point start_line2(end.x - dx / 10, end.y - dy / 10);
	cv::line(image, start, end_line1, color, thickness + 4, lineType);
	cv::line(image, start_line2, end, color, thickness + 4, lineType);

	// Dashed body.
	const int span = std::max(std::abs(dx), std::abs(dy));
	const int segments = (dotSize > 0) ? span / dotSize : 0;
	if (segments > 0)
	{
		cv::Point increment(dx / segments, dy / segments);
		for (int i = 0; i < segments; ++i)
		{
			if (i % 2 == 0)
				cv::line(image, start + increment * i, start + increment * (i + 1), color, thickness, lineType);
		}
	}
	else if (span > 0)
	{
		cv::line(image, start, end, color, thickness, lineType);
	}

	// Short tick at the centre of the line.
	cv::Point center(start.x + dx / 2, start.y + dy / 2);
	cv::Point end_center1;
	if (dx == 0)
	{
		end_center1.x = end.x + 7.5 * flag;
		end_center1.y = start.y + dy / 2;
	}
	else {
		end_center1.x = start.x + dx / 2;
		end_center1.y = start.y + 7.5 * flag;
	}
	cv::line(image, center, end_center1, color, thickness + 4, lineType);
}

void TensorRT_detect::detect()
{
	//将数据拷贝至显卡
	//cudaError_t err3 = cudaMemcpyAsync(data_buffer[input_node_index], input_data.data(), input_data_length * sizeof(float), cudaMemcpyHostToDevice, stream);
	//if (err3 != cudaSuccess) {
	//	std::cout << "Failed to transfer input data to GPU1: " << cudaGetErrorString(err3) << std::endl;
	//	return;
	//}
	//进行推理
	//clock_t start_time1 = clock();
	context->enqueueV2(data_buffer, stream, nullptr);
	//clock_t end_time = clock();
	//double exec_time = static_cast<double>(end_time - start_time1) / CLOCKS_PER_SEC;

	//将输出1数据拷贝至主机
	cudaError_t err4 = cudaMemcpyAsync(result_array, data_buffer[output_node_index_1], output_data_length_1 * sizeof(float), cudaMemcpyDeviceToHost, stream);
	if (err4 != cudaSuccess) {
		std::cout << "Failed to transfer input data to HOST: " << cudaGetErrorString(err4) << std::endl;
		return;
	}

	//将输出2数据拷贝至主机
	cudaError_t err5 = cudaMemcpyAsync(mask_result_array, data_buffer[output_node_index_2], output_data_length_2 * sizeof(float), cudaMemcpyDeviceToHost, stream);
	if (err5 != cudaSuccess) {
		std::cout << "Failed to transfer input data to HOST: " << cudaGetErrorString(err5) << std::endl;
		return;
	}
}

void TensorRT_detect::Post_processing()
{

	std::vector<float> output(result_array, result_array + output_data_length_1);
	std::vector<cv::Rect> boxes;
	std::vector<float> confs;
	std::vector<int> classIds;
	int numClasses = (int)output_node_dim_1.d[2] - 5 - _segChannels;
	float confThreshold = 0.5;

	//比例系数
	float ratio_h = (float)src_height / newh;
	float ratio_w = (float)src_width / neww;

	//存储output0[:,:, 5 + _className.size():net_width]用以后续计算mask
	std::vector<std::vector<float>> picked_proposals;
	int net_width = CLASSES + 5 + _segChannels;

	//对（1，25200，117）进行数据处理
	for (auto it = output.begin(); it != output.begin() + output_data_length_1; it += output_node_dim_1.d[2])
	{
		float clsConf = *(it + 4);//object scores
		if (clsConf > confThreshold)
		{
			//将坐标转化为原始图像上的坐标
			float x = (*it - padw) * ratio_w;  //x
			float y = (*(it + 1) - padh) * ratio_h;  //y
			float w = *(it + 2) * ratio_w;  //w
			float h = *(it + 3) * ratio_h;  //h

			//避免越界
			int left = MAX((x - 0.5 * w), 0);
			int top = MAX((y - 0.5 * h), 0);
			boxes.push_back(Rect(left, top, int(w), int(h)));

			//存放每个检测框的85-117协方差系数
			std::vector<float> temp_proto(it + 5 + CLASSES, it + net_width);
			picked_proposals.push_back(temp_proto);

			// first 5 element are x y w h and obj confidence
			int bestClassId = -1;
			float bestConf = 0.0;

			for (int i = 5; i < numClasses + 5; i++)
			{
				if ((*(it + i)) > bestConf)
				{
					bestConf = it[i];
					bestClassId = i - 5;
				}
			}

			//confs.emplace_back(bestConf * clsConf);
			confs.emplace_back(clsConf);
			classIds.emplace_back(bestClassId);
		}
	}
	
	float iouThreshold = 0.5;
	std::vector<int> indices;
	// Perform non maximum suppression to eliminate redundant overlapping boxes with
	cv::dnn::NMSBoxes(boxes, confs, confThreshold, iouThreshold, indices);

	//对掩膜数据进行处理(1,32,160,160)
	std::vector<std::vector<float>> temp_mask_proposals;
	Rect holeImgRect(0, 0, src_width, src_height);
	std::vector<Outputseg> output_seg;

	for (int i = 0; i < indices.size(); ++i) {
		int idx = indices[i];
		Outputseg result;
		result.id = classIds[idx];
		result.confidence = confs[idx];
		//与操作，使得超出图像边界范围的矩形框多余部分裁掉，保证可靠性
		result.box = boxes[idx] & holeImgRect;
		//result.box = boxes[idx];
		output_seg.push_back(result);

		temp_mask_proposals.push_back(picked_proposals[idx]);
	}


	// 处理mask
	Mat maskProposals;
	for (int i = 0; i < temp_mask_proposals.size(); ++i)
		maskProposals.push_back(Mat(temp_mask_proposals[i]).t());

	//取出第二个节点的输出数据
	std::vector<float> mask(mask_result_array, mask_result_array + _segChannels * _segWidth * _segHeight);
	Mat mask_protos = Mat(mask);
	Mat protos = mask_protos.reshape(0, { _segChannels,_segWidth * _segHeight });//将prob1的值 赋给mask_protos


	//考虑并行处理
	clock_t start = clock();
	Mat matmulRes = (maskProposals * protos).t();//n*32 32*25600 A*B是以数学运算中矩阵相乘的方式实现的，要求A的列数等于B的行数时
	clock_t end = clock();
	std::cout << "矩阵乘法的时间： " << end - start << endl;
	//开始时间,这一块处理时间太长
	clock_t start_1 = clock();
	Mat masks = matmulRes.reshape(output_seg.size(), { _segWidth,_segHeight });
	std::vector<Mat> maskChannels;
	split(masks, maskChannels);	
	for (int i = 0; i < output_seg.size(); ++i) {
		Mat dest, mask;
		//sigmoid
		cv::exp(-maskChannels[i], dest);
		dest = 1.0 / (1.0 + dest);//160*160
		//将等比例缩放后的区域映射到160*160
		Rect roi(int((float)padw / INPUT_W * _segWidth), int((float)padh / INPUT_H * _segHeight), int(_segWidth - padw / 2), int(_segHeight - padh / 2));
		dest = dest(roi);
		//比例不变
		resize(dest, mask, Size(src_width,src_height), INTER_NEAREST);
		//crop----截取box中的mask作为该box对应的mask
		Rect temp_rect = output_seg[i].box;
		//实现类似sigmod激活函数的作用,同时将mask调整到和box一样的大小
		mask = mask(temp_rect) > MASK_THRESHOLD;
		output_seg[i].boxmask = mask;
	}
	clock_t end_1 = clock();
	cout << "mask时间：" << end_1 - start_1 << endl;


	Mat madk = test_img.clone();

	RNG rng((unsigned)time(NULL));
	for (size_t i = 0; i < indices.size(); ++i)
	{
		int index = indices[i];
		int colorR = rng.uniform(0, 255);
		int colorG = rng.uniform(0, 255);
		int colorB = rng.uniform(0, 255);

		Point textPos0(boxes[index].br().x, boxes[index].tl().y);
		// 背景区域
		Rect bgRect(textPos0, Size(1, 1));
		// 在背景区域上方创建模糊背景
		Mat bg = test_img(bgRect);

		// 对图像 bg 进行均值模糊处理,核越大，模糊效果越明显
		blur(bg, bg, Size(20, 20));
		// 亮度调整因子，可根据需要进行调整
		double brightness = 1.2;
		//相乘之后会使得局部变亮
		bg *= brightness;

		madk(output_seg[i].box).setTo(cv::Scalar(colorR, colorG, colorB), output_seg[i].boxmask);
		//保留两位小数
		float scores = round(confs[index] * 100);
		std::ostringstream oss;
		oss << scores;
		//rectangle(dstimg, Point(boxes[index].tl().x, boxes[index].tl().y), Point(boxes[index].br().x, boxes[index].br().y), Scalar(colorR, colorG, colorB), 2 ,cv::LINE_AA);
		cv::Rect rect(boxes[index].tl(), boxes[index].br());
		drawDottedLine(test_img, rect.tl(), cv::Point(rect.br().x, rect.tl().y), cv::Scalar(colorR, colorG, colorB), 2, cv::LINE_AA, 5, 1);
		drawDottedLine(test_img, cv::Point(rect.br().x, rect.tl().y), rect.br(), cv::Scalar(colorR, colorG, colorB), 2, cv::LINE_AA, 5, -1);
		drawDottedLine(test_img, rect.br(), cv::Point(rect.tl().x, rect.br().y), cv::Scalar(colorR, colorG, colorB), 2, cv::LINE_AA, 5, -1);
		drawDottedLine(test_img, cv::Point(rect.tl().x, rect.br().y), rect.tl(), cv::Scalar(colorR, colorG, colorB), 2, cv::LINE_AA, 5, 1);

		std::string s0 = "ID: " + std::to_string(classIds[index]);
		std::string s1 = "Name: " + class_names[classIds[index]];
		std::string s2 = oss.str() + " %";
		std::string s3 = "Area: " + std::to_string(boxes[index].area());


		putText(test_img, s0, Point(boxes[index].br().x + 10, boxes[index].tl().y + 20), FONT_HERSHEY_SIMPLEX, 0.8, Scalar(colorR, colorG, colorB), 2);
		putText(test_img, s1, Point(boxes[index].br().x + 10, boxes[index].tl().y + 45), FONT_HERSHEY_SIMPLEX, 0.8, Scalar(colorR, colorG, colorB), 2);
		putText(test_img, s3, Point(boxes[index].br().x + 10, boxes[index].tl().y + 70), FONT_HERSHEY_SIMPLEX, 0.8, Scalar(colorR, colorG, colorB), 2);
		putText(test_img, s2, Point(boxes[index].br().x + 40, boxes[index].tl().y + 105), FONT_HERSHEY_SIMPLEX, 1.0, Scalar(255, 255, 255), 4);
	}
	addWeighted(test_img, 0.5, madk, 0.5, 0, test_img); //将mask加在原图上面
}

// Detection-only post-processing: decodes the boxes of output 1 in network
// input coordinates, applies non-maximum suppression and draws each kept box
// with its class name and score onto dstimg. Masks are not evaluated.
//
// If the output array or dstimg is not available the function reports this and
// returns. A class id without a matching entry in the label list is drawn as
// "unknown" instead of indexing past the end of the list.
void TensorRT_detect::postprocess_mask() {
	const int rowLen = (int)output_node_dim_1.d[2];
	const int numClasses = rowLen - 5 - 32;
	const float confThreshold = 0.5;

	if (result_array == nullptr || numClasses <= 0 || dstimg.empty()) {
		std::cout << "postprocess_mask skipped: output or dstimg is not ready" << std::endl;
		return;
	}

	std::vector<cv::Rect> boxes;
	std::vector<float> confs;
	std::vector<int> classIds;

	const size_t stride = (size_t)rowLen;
	for (size_t offset = 0; offset + stride <= output_data_length_1; offset += stride)
	{
		const float* it = result_array + offset;
		float clsConf = *(it + 4);//object scores
		if (clsConf > confThreshold)
		{
			int centerX = (int)(*it);
			int centerY = (int)(*(it + 1));
			int width = (int)(*(it + 2));
			int height = (int)(*(it + 3));
			int x1 = centerX - width / 2;
			int y1 = centerY - height / 2;
			boxes.emplace_back(cv::Rect(x1, y1, width, height));

			// first 5 element are x y w h and obj confidence
			int bestClassId = -1;
			float bestConf = 0.0;

			for (int i = 5; i < numClasses + 5; i++)
			{
				if ((*(it + i)) > bestConf)
				{
					bestConf = it[i];
					bestClassId = i - 5;
				}
			}

			//confs.emplace_back(bestConf * clsConf);
			confs.emplace_back(clsConf);
			classIds.emplace_back(bestClassId);
		}
	}

	float iouThreshold = 0.5;
	std::vector<int> indices;
	// Perform non maximum suppression to eliminate redundant overlapping boxes with
	cv::dnn::NMSBoxes(boxes, confs, confThreshold, iouThreshold, indices);

	RNG rng((unsigned)time(NULL));
	for (size_t i = 0; i < indices.size(); ++i)
	{
		int index = indices[i];
		int colorR = rng.uniform(0, 255);
		int colorG = rng.uniform(0, 255);
		int colorB = rng.uniform(0, 255);

		// Score rounded to two decimals.
		float scores = round(confs[index] * 100) / 100;
		std::ostringstream oss;
		oss << scores;

		const int classId = classIds[index];
		const bool hasName = classId >= 0 && (size_t)classId < class_names.size();
		const std::string label = (hasName ? class_names[classId] : std::string("unknown")) + " " + oss.str();

		// Line thickness is an integer; the former literal 1.5 was truncated to 1.
		rectangle(dstimg, Point(boxes[index].tl().x, boxes[index].tl().y), Point(boxes[index].br().x, boxes[index].br().y), Scalar(colorR, colorG, colorB), 1);
		putText(dstimg, label, Point(boxes[index].tl().x, boxes[index].tl().y - 5), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(colorR, colorG, colorB), 2);
	}
}