tensorrt+yolov5+VS2017的简单模型部署应用(c++版)

本文链接：https://blog.csdn.net/DOtafriad/article/details/140845566

引言

之前在做YOLOv5的模型部署工作，由于是第一次接触模型部署的相关技术领域，故对此一窍不通。为了完成部署工作，真是查阅了诸多资料，发现很多文章对tensorrt+yolov5的简单部署都是点到为止。本文将通过最简单的部署方式协助大家完成模型部署工作，希望能帮助到各位新手朋友。

如果有讲解错误的地方，请各位批评指正！

配置tensorrt环境

cuda:11.6

cudnn:8.4.1

tensorrt:8.6.1

vs:2017

opencv:4.5.5

首先，下载合适的tensorrt版本，点击此处下载，tensorrt版本需要和cuda版本对应起来。

下载安装cuda,可以参考CUDA安装教程（超详细）-CSDN博客。

engine文件导出

首先打开你的yolov5项目。

打开export.py,选择训练时的yaml文件，选择训练好的pt文件，再将后面的default改为onnx,右击运行即可，onnx文件会自动存入pt文件所在的目录中。

也可以直接输入命令行。

//weights 后面加上训练好的权重文件路径。
python export.py --weights yolov5s.pt --include onnx

随后，我们将导出的onnx文件存入之前下载安装的TensorRT-8.6.1.6\bin\目录下。

左键单击目录，输入cmd。回车。

输入以下指令。

//best.onnx替换为你的onnx模型
trtexec.exe --onnx=best.onnx --saveEngine=best.engine

大约等待十几分钟时间，即可在目录下得到engine文件。

模型初始化

打开vs，对vs进行环境配置。可参考文章：Win10+TensorRT 8.5安装+VS2022配置_tensorrt-8.5.2.2-CSDN博客

完成vs环境配置以后，开始对模型进行部署推理。

首先进行模型初始化。

cv::RNG rng;
	std::string filepath = "C:/Users/1/Desktop/tensorrt/best.engine";//engine文件路径。
	// runtime:运行时候的接口实例
	//engine:序列化文件
	//context:管理中间激活的其他状态。

	std::unique_ptr<nvinfer1::IRuntime> trtRuntime(nullptr);
	std::unique_ptr<nvinfer1::ICudaEngine> engine(nullptr);
	std::unique_ptr<nvinfer1::IExecutionContext> context(nullptr);

	loadEngine(filepath, engine, context);//读取初始化engine文件

	std::vector<nvinfer1::Dims> input_dims; // 
	std::vector<nvinfer1::Dims> output_dims; // 
	std::vector<void *> buffers(engine->getNbBindings());

	cudaGetMem(input_dims, output_dims, engine, buffers);//载入模型输出数据，为后续推理后处理做准备。
	//cout查看模型结构
	std::cout <<"input_dims[0].d[0]" <<input_dims[0].d[0] << std::endl;
	std::cout << "input_dims[0].d[1]" << input_dims[0].d[1] << std::endl;
	std::cout << "input_dims[0].d[2]" << input_dims[0].d[2] << std::endl;
	std::cout << "input_dims[0].d[3]" << input_dims[0].d[3] << std::endl;
	std::cout  << std::endl; std::cout << std::endl;
	std::cout << "output_dims[0].d[0]" << output_dims[0].d[0] << std::endl;
	std::cout << "output_dims[0].d[1]" << output_dims[0].d[1] << std::endl;
	std::cout << "output_dims[0].d[2]" << output_dims[0].d[2] << std::endl;
	std::cout << "output_dims[0].d[3]" << output_dims[0].d[3] << std::endl;
	std::cout << std::endl; std::cout << std::endl;
	std::cout << "output_dims[1].d[0]"<< output_dims[1].d[0] << std::endl;
	std::cout << "output_dims[1].d[1]" << output_dims[1].d[1] << std::endl;
	std::cout << "output_dims[1].d[2]" << output_dims[1].d[2] << std::endl;
	std::cout << "output_dims[1].d[3]" << output_dims[1].d[3] << std::endl;

// Loads a serialized TensorRT engine from disk and creates an execution context.
//
// filepath: path of the engine file (e.g. written by trtexec --saveEngine).
// engine:   receives the deserialized engine.
// context:  receives an execution context created from `engine`.
//
// On failure (file unreadable, deserialization or context creation fails) a
// message is written to std::cerr and the process exits with status 1, the
// same way cudaGetMem() reports a fatal error.
void loadEngine(const std::string &filepath, std::unique_ptr<nvinfer1::ICudaEngine> &engine,
	std::unique_ptr<nvinfer1::IExecutionContext> &context)
{
	// Open at the end so tellg() directly yields the file size.
	std::ifstream file(filepath, std::ios::binary | std::ios::ate);
	if (!file)
	{
		std::cerr << "Failed to open engine file: " << filepath << std::endl;
		exit(1);
	}

	const std::streamsize size = file.tellg();
	if (size <= 0)
	{
		std::cerr << "Engine file is empty or unreadable: " << filepath << std::endl;
		exit(1);
	}
	file.seekg(0, std::ios::beg);

	std::vector<char> data(static_cast<size_t>(size));
	if (!file.read(data.data(), size))
	{
		std::cerr << "Failed to read engine file: " << filepath << std::endl;
		exit(1);
	}
	file.close();

	// NOTE(review): NVIDIA's object-lifetime guidance says a factory object (the
	// runtime) should outlive what it creates (the engine). As in the original
	// code the runtime is destroyed when this function returns - confirm this is
	// acceptable for the TensorRT version in use.
	std::unique_ptr<nvinfer1::IRuntime> trtRuntime(nvinfer1::createInferRuntime(gLogger));
	if (!trtRuntime)
	{
		std::cerr << "Failed to create TensorRT runtime" << std::endl;
		exit(1);
	}

	// Drop a previously held context before its engine is replaced.
	context.reset();
	engine.reset(trtRuntime->deserializeCudaEngine(data.data(), data.size()));
	if (!engine)
	{
		std::cerr << "Failed to deserialize engine: " << filepath << std::endl;
		exit(1);
	}

	context.reset(engine->createExecutionContext());
	if (!context)
	{
		std::cerr << "Failed to create execution context" << std::endl;
		exit(1);
	}
}

// Allocates one device buffer per engine binding and records the binding shapes.
//
// input_dims:  receives the dimensions of every input binding, in binding order.
// output_dims: receives the dimensions of every output binding, in binding order.
// engine:      the deserialized engine to query.
// buffers:     one slot per binding; each slot receives a cudaMalloc'ed pointer
//              sized as (element count) * sizeof(float).
//
// Writes to std::cerr and exits with status 1 if an allocation fails or if the
// engine has no input or no output binding.
void cudaGetMem(std::vector<nvinfer1::Dims> &input_dims, std::vector<nvinfer1::Dims> &output_dims,
	const std::unique_ptr<nvinfer1::ICudaEngine> &engine, std::vector<void *> &buffers)
{
	if (!engine)
	{
		std::cerr << "Failed load network" << std::endl;
		exit(1);
	}

	for (size_t i = 0; i < buffers.size(); ++i)
	{
		const int binding = static_cast<int>(i);
		const nvinfer1::Dims dims = engine->getBindingDimensions(binding);
		const size_t binding_size = getSizeDims(dims) * sizeof(float);

		const cudaError_t status = cudaMalloc(&buffers[i], binding_size);
		if (status != cudaSuccess)
		{
			std::cerr << "cudaMalloc failed for binding " << i << ": "
				<< cudaGetErrorString(status) << std::endl;
			exit(1);
		}

		if (engine->bindingIsInput(binding))
		{
			input_dims.emplace_back(dims);
		}
		else
		{
			output_dims.emplace_back(dims);
		}
	}

	if (input_dims.empty() || output_dims.empty())
	{
		std::cerr << "Failed load network" << std::endl;
		exit(1);
	}
}

图像前处理

对图像进行前处理操作：1、letterbox缩放，将图像大小变为640*640；2、将图像数据进行归一化；3、将图像从BGR转化为RGB；4、HWC格式和CHW格式的转化。

1、letterbox缩放。相较于resize函数，letterbox就是等比例缩放，其他的部分用背景色填充，避免了图像失真的问题出现。

// paddings receives the letterbox parameters; they are needed later to map
// detections back onto the original image.
// img is the original image.
std::vector<float> paddings(3);       // scale, half_h, half_w
cv::Mat resized_img = letterbox(img, paddings); // letterbox with the default target shape { 640, 640 }

// Scales `img` to fit inside new_shape while keeping its aspect ratio, then pads
// the remaining area with gray (114, 114, 114) so the result is exactly new_shape.
//
// img:       source image; must not be empty.
// paddings:  output, filled with { scale, half_h, half_w } (grown to 3 elements
//            if shorter) so detections can be mapped back to the original image.
// new_shape: target size; element 0 is used as the width, element 1 as the height.
// Returns the resized and padded image.
// Throws cv::Exception (via CV_Assert) for an empty image or an invalid new_shape.
cv::Mat letterbox(cv::Mat& img, std::vector<float>& paddings, std::vector<int> new_shape = { 640, 640 })
{
	CV_Assert(!img.empty());
	CV_Assert(new_shape.size() >= 2 && new_shape[0] > 0 && new_shape[1] > 0);
	if (paddings.size() < 3)
	{
		paddings.resize(3);
	}

	// Current image shape [height, width]
	const int img_h = img.rows;
	const int img_w = img.cols;
	const int target_w = new_shape[0];
	const int target_h = new_shape[1];

	// Scale ratio (new / old): the smaller ratio makes both sides fit.
	const float scale = static_cast<float>(std::min(target_h * 1.0 / img_h, target_w * 1.0 / img_w));
	// Keep at least one pixel per side so cv::resize never gets an empty size.
	const int resize_h = std::max(1, int(round(img_h * scale)));
	const int resize_w = std::max(1, int(round(img_w * scale)));
	paddings[0] = scale;

	cv::Mat resized_img;
	cv::resize(img, resized_img, cv::Size(resize_w, resize_h));

	// Total padding, split evenly over the two sides.
	const int pad_h = target_h - resize_h;
	const int pad_w = target_w - resize_w;
	const float half_h = pad_h * 1.0 / 2;
	const float half_w = pad_w * 1.0 / 2;
	paddings[1] = half_h;
	paddings[2] = half_w;

	// The -0.1 / +0.1 offsets send the odd pixel of an odd padding to the
	// bottom/right side, so top + bottom == pad_h and left + right == pad_w.
	const int top = int(round(half_h - 0.1));
	const int bottom = int(round(half_h + 0.1));
	const int left = int(round(half_w - 0.1));
	const int right = int(round(half_w + 0.1));

	cv::copyMakeBorder(resized_img, resized_img, top, bottom, left, right,
		cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));

	return resized_img;
}

2、将图像数据进行归一化。

// Convert in place to 3-channel float and scale every value by 1/255.
resized_img.convertTo(resized_img, CV_32FC3, 1.0f / 255.0f);

3、将图像从BGR转化为RGB。

// Swap the channel order in place from BGR to RGB.
cv::cvtColor(resized_img, resized_img, cv::COLOR_BGR2RGB);

4、HWC格式和CHW格式的转化

        //图像前处理后的数据存入warp_dst_nchw
        std::vector<cv::Mat> warp_dst_nchw_channels;
	    cv::split(resized_img, warp_dst_nchw_channels);
		for (auto &img1 : warp_dst_nchw_channels)
		{
			img1 = img1.reshape(1, 1);
		}
		cv::Mat warp_dst_nchw;

		cv::hconcat(warp_dst_nchw_channels, warp_dst_nchw);

图像推理

图像推理需将预处理后的图像推入GPU进行推理。

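yolov5的onnx输出为25200*(5+nc)，其中25200个cell，每个cell里面有4+1+nc个输出值：4对应预测框参数（cx,cy,width,height）+1个目标置信度（objectness，即该框内存在目标的概率，并不是最大类别概率）+nc个类别概率（nc为类别数，根据自己设置的类别情况做调整）。注意4+1+80=85，所以官方COCO的80类模型输出为25200*85；本文代码中使用的25200*84对应的是4+1+79（79个类别），请按自己模型的实际输出维度修改。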

首先申请GPU内存。

float *gpu_data[2];
//gpu_data[0]推入的预处理后的图片数据：3 * 640*640* sizeof(float)。
//gpu_data[1]推入的推理后的数据：25200 * 84 * sizeof(float)。
cudaMalloc(&gpu_data[0], 3 * 640*640* sizeof(float));
cudaMalloc(&gpu_data[1], 25200 * 84 * sizeof(float));
cudaMemcpy(gpu_data[0], warp_dst_nchw.ptr(), 3 * 640*640 * sizeof(float), cudaMemcpyHostToDevice);//将预处理后的图片数据推入gpu_data[0]。

进行推理。

cudaStream_t stream;
		bool success = context->enqueueV2((void**)gpu_data, 0, nullptr);
		if (!success)
		{
			std::cout << "Failed to context!";
		}

数据后处理

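yolov5的onnx输出为25200*(5+nc)，其中25200个cell，每个cell里面有4+1+nc个输出值：4对应预测框参数（cx,cy,width,height）+1个目标置信度（objectness，即该框内存在目标的概率，并不是最大类别概率）+nc个类别概率（nc为类别数，根据自己模型输出情况做调整）。注意4+1+80=85，所以官方COCO的80类模型输出为25200*85；本文代码中使用的25200*84对应的是4+1+79（79个类别），请按自己模型的实际输出维度修改。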

将数据从GPU传入CPU，后处理显示图像。

        float* cpu_output_buffer = nullptr；
		
		cpu_output_buffer = new float[25200 * 84 * sizeof(float)];
        cudaMemcpyAsync(cpu_output_buffer, gpu_data[1], 25200 * 84* sizeof(float), cudaMemcpyDeviceToHost);
	
		cv::Mat detect_buffer(output_dims[1].d[1], output_dims[1].d[2], CV_32F, cpu_output_buffer);//将数据转化为25200*84的矩阵，以便后续操作。
cv::Mat img_show=tar_img(img, detect_buffer2, paddings); 
cv::imshow("img_show",img_show);
cv::WaitKey(0);

后处理部分函数tar_img（）代码为:

// Decodes the raw YOLOv5 output, runs non-maximum suppression and draws the
// boxes that survive.
//
// img:           image the boxes are drawn on; box coordinates are mapped back
//                to it using `paddings`.
// detect_buffer: CV_32F matrix with one candidate per row, laid out as
//                cx, cy, w, h, objectness, class score 1..N.
// paddings:      { scale, half_h, half_w } as filled in by letterbox().
// Returns img with a red rectangle for every detection kept by NMS. If
// detect_buffer or paddings is unusable, img is returned without changes.
cv::Mat tar_img(cv::Mat img, cv::Mat detect_buffer, std::vector<float> paddings)
{
	const float conf_threshold = 0.2f; // minimum confidence; tune for your model
	const float nms_threshold = 0.2f;  // overlap threshold passed to NMS
	const int first_class_col = 5;     // columns 0..4: cx, cy, w, h, objectness

	if (detect_buffer.empty() || detect_buffer.type() != CV_32F ||
		detect_buffer.cols <= first_class_col || paddings.size() < 3 || paddings[0] <= 0.0f)
	{
		return img;
	}

	const float scale = paddings[0];
	std::vector<cv::Rect> boxes;
	std::vector<float> confidences;

	for (int i = 0; i < detect_buffer.rows; i++) {
		const float objectness = detect_buffer.at<float>(i, 4);
		if (objectness < conf_threshold) {
			continue;
		}

		// Use every class column that is present rather than a fixed column count.
		const cv::Mat classes_scores = detect_buffer.row(i).colRange(first_class_col, detect_buffer.cols);
		double best_class_score = 0.0;
		cv::minMaxLoc(classes_scores, nullptr, &best_class_score, nullptr, nullptr);

		// Final confidence = objectness * best class probability.
		const float score = objectness * static_cast<float>(best_class_score);
		if (score <= conf_threshold) {
			continue;
		}

		const float cx = detect_buffer.at<float>(i, 0);
		const float cy = detect_buffer.at<float>(i, 1);
		const float w = detect_buffer.at<float>(i, 2);
		const float h = detect_buffer.at<float>(i, 3);

		// Undo the letterbox: remove the padding offset, then undo the scaling.
		const int left = static_cast<int>((cx - 0.5 * w - paddings[2]) / scale);
		const int top = static_cast<int>((cy - 0.5 * h - paddings[1]) / scale);
		const int width = static_cast<int>(w / scale);
		const int height = static_cast<int>(h / scale);

		boxes.emplace_back(left, top, width, height);
		confidences.push_back(score);
	}

	// Non-maximum suppression: keep the best box of each overlapping group.
	std::vector<int> indices;
	cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, nms_threshold, indices);

	// -------- Visualize the detection results -----------
	for (const int index : indices) {
		cv::rectangle(img, boxes[index], cv::Scalar(0, 0, 255), 2); // axis-aligned box
	}

	return img; // image with the detection boxes drawn
}

最后

汇总

完整代码如下。

#include <iostream>
#include <string>
#include <memory>
#include <fstream>
#include <vector>
#include <algorithm>
#include <chrono>
#include <NvInferRuntime.h>
#include <cuda_runtime_api.h>

#include <numeric>

#include <opencv2/opencv.hpp>
//#include <opencv2/cudawarping.hpp>
//#include <opencv2/cudaarithm.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>

// Logger handed to the TensorRT runtime (see createInferRuntime(gLogger)).
// Warnings and errors are forwarded to std::cerr so that failures such as a
// rejected engine file are not silently lost; info/verbose messages are dropped.
class Loger : public nvinfer1::ILogger
{
public:
	void log(Severity severity, char const *msg) noexcept override
	{
		// Severity values grow with decreasing importance, so "<= kWARNING"
		// selects internal errors, errors and warnings.
		if (severity <= Severity::kWARNING)
		{
			std::cerr << "[TensorRT] " << (msg ? msg : "") << std::endl;
		}
	}

} gLogger;

// Decodes the raw YOLOv5 output, runs non-maximum suppression and draws the
// boxes that survive.
//
// img:           image the boxes are drawn on; box coordinates are mapped back
//                to it using `paddings`.
// detect_buffer: CV_32F matrix with one candidate per row, laid out as
//                cx, cy, w, h, objectness, class score 1..N.
// paddings:      { scale, half_h, half_w } as filled in by letterbox().
// Returns img with a red rectangle for every detection kept by NMS. If
// detect_buffer or paddings is unusable, img is returned without changes.
cv::Mat tar_img(cv::Mat img, cv::Mat detect_buffer, std::vector<float> paddings)
{
	const float conf_threshold = 0.2f; // minimum confidence; tune for your model
	const float nms_threshold = 0.2f;  // overlap threshold passed to NMS
	const int first_class_col = 5;     // columns 0..4: cx, cy, w, h, objectness

	if (detect_buffer.empty() || detect_buffer.type() != CV_32F ||
		detect_buffer.cols <= first_class_col || paddings.size() < 3 || paddings[0] <= 0.0f)
	{
		return img;
	}

	const float scale = paddings[0];
	std::vector<cv::Rect> boxes;
	std::vector<float> confidences;

	for (int i = 0; i < detect_buffer.rows; i++) {
		const float objectness = detect_buffer.at<float>(i, 4);
		if (objectness < conf_threshold) {
			continue;
		}

		// Use every class column that is present rather than a fixed column count.
		const cv::Mat classes_scores = detect_buffer.row(i).colRange(first_class_col, detect_buffer.cols);
		double best_class_score = 0.0;
		cv::minMaxLoc(classes_scores, nullptr, &best_class_score, nullptr, nullptr);

		// Final confidence = objectness * best class probability.
		const float score = objectness * static_cast<float>(best_class_score);
		if (score <= conf_threshold) {
			continue;
		}

		const float cx = detect_buffer.at<float>(i, 0);
		const float cy = detect_buffer.at<float>(i, 1);
		const float w = detect_buffer.at<float>(i, 2);
		const float h = detect_buffer.at<float>(i, 3);

		// Undo the letterbox: remove the padding offset, then undo the scaling.
		const int left = static_cast<int>((cx - 0.5 * w - paddings[2]) / scale);
		const int top = static_cast<int>((cy - 0.5 * h - paddings[1]) / scale);
		const int width = static_cast<int>(w / scale);
		const int height = static_cast<int>(h / scale);

		boxes.emplace_back(left, top, width, height);
		confidences.push_back(score);
	}

	// Non-maximum suppression: keep the best box of each overlapping group.
	std::vector<int> indices;
	cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, nms_threshold, indices);

	// -------- Visualize the detection results -----------
	for (const int index : indices) {
		cv::rectangle(img, boxes[index], cv::Scalar(0, 0, 255), 2); // axis-aligned box
	}

	return img; // image with the detection boxes drawn
}

// Scales `img` to fit inside new_shape while keeping its aspect ratio, then pads
// the remaining area with gray (114, 114, 114) so the result is exactly new_shape.
//
// img:       source image; must not be empty.
// paddings:  output, filled with { scale, half_h, half_w } (grown to 3 elements
//            if shorter) so detections can be mapped back to the original image.
// new_shape: target size; element 0 is used as the width, element 1 as the height.
// Returns the resized and padded image.
// Throws cv::Exception (via CV_Assert) for an empty image or an invalid new_shape.
cv::Mat letterbox(cv::Mat& img, std::vector<float>& paddings, std::vector<int> new_shape = { 640, 640 })
{
	CV_Assert(!img.empty());
	CV_Assert(new_shape.size() >= 2 && new_shape[0] > 0 && new_shape[1] > 0);
	if (paddings.size() < 3)
	{
		paddings.resize(3);
	}

	// Current image shape [height, width]
	const int img_h = img.rows;
	const int img_w = img.cols;
	const int target_w = new_shape[0];
	const int target_h = new_shape[1];

	// Scale ratio (new / old): the smaller ratio makes both sides fit.
	const float scale = static_cast<float>(std::min(target_h * 1.0 / img_h, target_w * 1.0 / img_w));
	// Keep at least one pixel per side so cv::resize never gets an empty size.
	const int resize_h = std::max(1, int(round(img_h * scale)));
	const int resize_w = std::max(1, int(round(img_w * scale)));
	paddings[0] = scale;

	cv::Mat resized_img;
	cv::resize(img, resized_img, cv::Size(resize_w, resize_h));

	// Total padding, split evenly over the two sides.
	const int pad_h = target_h - resize_h;
	const int pad_w = target_w - resize_w;
	const float half_h = pad_h * 1.0 / 2;
	const float half_w = pad_w * 1.0 / 2;
	paddings[1] = half_h;
	paddings[2] = half_w;

	// The -0.1 / +0.1 offsets send the odd pixel of an odd padding to the
	// bottom/right side, so top + bottom == pad_h and left + right == pad_w.
	const int top = int(round(half_h - 0.1));
	const int bottom = int(round(half_h + 0.1));
	const int left = int(round(half_w - 0.1));
	const int right = int(round(half_w + 0.1));

	cv::copyMakeBorder(resized_img, resized_img, top, bottom, left, right,
		cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));

	return resized_img;
}


// Loads a serialized TensorRT engine from disk and creates an execution context.
//
// filepath: path of the engine file (e.g. written by trtexec --saveEngine).
// engine:   receives the deserialized engine.
// context:  receives an execution context created from `engine`.
//
// On failure (file unreadable, deserialization or context creation fails) a
// message is written to std::cerr and the process exits with status 1, the
// same way cudaGetMem() reports a fatal error.
void loadEngine(const std::string &filepath, std::unique_ptr<nvinfer1::ICudaEngine> &engine,
	std::unique_ptr<nvinfer1::IExecutionContext> &context)
{
	// Open at the end so tellg() directly yields the file size.
	std::ifstream file(filepath, std::ios::binary | std::ios::ate);
	if (!file)
	{
		std::cerr << "Failed to open engine file: " << filepath << std::endl;
		exit(1);
	}

	const std::streamsize size = file.tellg();
	if (size <= 0)
	{
		std::cerr << "Engine file is empty or unreadable: " << filepath << std::endl;
		exit(1);
	}
	file.seekg(0, std::ios::beg);

	std::vector<char> data(static_cast<size_t>(size));
	if (!file.read(data.data(), size))
	{
		std::cerr << "Failed to read engine file: " << filepath << std::endl;
		exit(1);
	}
	file.close();

	// NOTE(review): NVIDIA's object-lifetime guidance says a factory object (the
	// runtime) should outlive what it creates (the engine). As in the original
	// code the runtime is destroyed when this function returns - confirm this is
	// acceptable for the TensorRT version in use.
	std::unique_ptr<nvinfer1::IRuntime> trtRuntime(nvinfer1::createInferRuntime(gLogger));
	if (!trtRuntime)
	{
		std::cerr << "Failed to create TensorRT runtime" << std::endl;
		exit(1);
	}

	// Drop a previously held context before its engine is replaced.
	context.reset();
	engine.reset(trtRuntime->deserializeCudaEngine(data.data(), data.size()));
	if (!engine)
	{
		std::cerr << "Failed to deserialize engine: " << filepath << std::endl;
		exit(1);
	}

	context.reset(engine->createExecutionContext());
	if (!context)
	{
		std::cerr << "Failed to create execution context" << std::endl;
		exit(1);
	}
}

// Returns the product of the first dims.nbDims extents of `dims`, i.e. the
// number of elements the shape describes. Yields 1 when nbDims is not positive.
size_t getSizeDims(const nvinfer1::Dims &dims)
{
	size_t element_count = 1;
	int axis = 0;
	while (axis < dims.nbDims)
	{
		element_count *= dims.d[axis];
		++axis;
	}
	return element_count;
}

// Allocates one device buffer per engine binding and records the binding shapes.
//
// input_dims:  receives the dimensions of every input binding, in binding order.
// output_dims: receives the dimensions of every output binding, in binding order.
// engine:      the deserialized engine to query.
// buffers:     one slot per binding; each slot receives a cudaMalloc'ed pointer
//              sized as (element count) * sizeof(float).
//
// Prints the number of bindings and the element count of each one. Writes to
// std::cerr and exits with status 1 if an allocation fails or if the engine has
// no input or no output binding.
void cudaGetMem(std::vector<nvinfer1::Dims> &input_dims, std::vector<nvinfer1::Dims> &output_dims,
	const std::unique_ptr<nvinfer1::ICudaEngine> &engine, std::vector<void *> &buffers)
{
	if (!engine)
	{
		std::cerr << "Failed load network" << std::endl;
		exit(1);
	}

	std::cout <<"buffers.size():"<< buffers.size() << std::endl;
	for (size_t i = 0; i < buffers.size(); ++i)
	{
		const int binding = static_cast<int>(i);
		const nvinfer1::Dims dims = engine->getBindingDimensions(binding);
		const size_t element_count = getSizeDims(dims);
		const size_t binding_size = element_count * sizeof(float);

		std::cout << "binding_size"<< i<<":::" << element_count << std::endl;

		const cudaError_t status = cudaMalloc(&buffers[i], binding_size);
		if (status != cudaSuccess)
		{
			std::cerr << "cudaMalloc failed for binding " << i << ": "
				<< cudaGetErrorString(status) << std::endl;
			exit(1);
		}

		if (engine->bindingIsInput(binding))
		{
			input_dims.emplace_back(dims);
		}
		else
		{
			output_dims.emplace_back(dims);
		}
	}

	if (input_dims.empty() || output_dims.empty())
	{
		std::cerr << "Failed load network" << std::endl;
		exit(1);
	}
}

//更换模型时，注意更改。
//1、更改output
//2、更改内存容量分配
//3、更改标签
int main()
{
	cv::RNG rng;
	std::cout << "Hello, World!" << std::endl;
	//-------------------模型载入----------------
	std::string filepath = "yolov5s.engine";//载入模型
	std::string str_dir = "C:/Users/1/Desktop/1.bmp";//输入图片地址
	// runtime:运行时候的接口实例
	//engine:序列化文件
	//context:管理中间激活的其他状态。

	std::unique_ptr<nvinfer1::IRuntime> trtRuntime(nullptr);
	std::unique_ptr<nvinfer1::ICudaEngine> engine(nullptr);
	std::unique_ptr<nvinfer1::IExecutionContext> context(nullptr);

	loadEngine(filepath, engine, context);

	std::vector<nvinfer1::Dims> input_dims; // we expect only one input
	std::vector<nvinfer1::Dims> output_dims; // and one output
	std::vector<void *> buffers(engine->getNbBindings());

	cudaGetMem(input_dims, output_dims, engine, buffers);

	std::cout <<"input_dims[0].d[0]" <<input_dims[0].d[0] << std::endl;
	std::cout << "input_dims[0].d[1]" << input_dims[0].d[1] << std::endl;
	std::cout << "input_dims[0].d[2]" << input_dims[0].d[2] << std::endl;
	std::cout << "input_dims[0].d[3]" << input_dims[0].d[3] << std::endl;
	std::cout  << std::endl; std::cout << std::endl;
	std::cout << "output_dims[0].d[0]" << output_dims[0].d[0] << std::endl;
	std::cout << "output_dims[0].d[1]" << output_dims[0].d[1] << std::endl;
	std::cout << "output_dims[0].d[2]" << output_dims[0].d[2] << std::endl;
	std::cout << "output_dims[0].d[3]" << output_dims[0].d[3] << std::endl;
	std::cout << std::endl; std::cout << std::endl;
	std::cout << "output_dims[1].d[0]"<< output_dims[1].d[0] << std::endl;
	std::cout << "output_dims[1].d[1]" << output_dims[1].d[1] << std::endl;
	std::cout << "output_dims[1].d[2]" << output_dims[1].d[2] << std::endl;
	std::cout << "output_dims[1].d[3]" << output_dims[1].d[3] << std::endl;



	const std::vector<std::string> class_names = {
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
	"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
	"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
	"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
	"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
	"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
	"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
	"hair drier", "toothbrush" };
	std::vector<cv::Scalar> colors = { cv::Scalar(0, 0, 255) , cv::Scalar(0, 255, 0) , cv::Scalar(255, 0, 0) ,
								   cv::Scalar(255, 255, 0) , cv::Scalar(0, 255, 255) , cv::Scalar(255, 0, 255) };
	//读取图片数据
	

		cv::Mat frame = cv::imread(str_dir);; //cpu
		cv::Mat img = frame.clone();
		std::vector<float> paddings(3);       //scale, half_h, half_w
		cv::Mat resized_img = letterbox(img, paddings); //resize to (640,640) by letterbox
		
		cv::Mat masked_img;
		//--------------- 图像预处理------------
		resized_img.convertTo(resized_img, CV_32FC3, 1.0f / 255.0f); 

		int _networkRows = input_dims[0].d[2];
		int _networkCols = input_dims[0].d[3];
		//std::cout << _networkRows << std::endl;
		//
		const cv::Size networkSize(_networkCols, _networkRows);

		float *inputptr = (float *)buffers.at(0);
		//std::vector<cv::cuda::GpuMat> channels;
		const int channelSize = networkSize.area();
		cv::cvtColor(resized_img, resized_img, cv::COLOR_BGR2RGB);
		std::vector<cv::Mat> warp_dst_nchw_channels;

		cv::split(resized_img, warp_dst_nchw_channels);
		for (auto &img1 : warp_dst_nchw_channels)
		{
			img1 = img1.reshape(1, 1);
		}
		cv::Mat warp_dst_nchw;

		cv::hconcat(warp_dst_nchw_channels, warp_dst_nchw);


		//------------------ 模型推理-------------------------
		float *gpu_data[2];
		float* cpu_output_buffer = nullptr;
		cpu_output_buffer = new float[25200 * 84 * sizeof(float)];

		cudaMalloc(&gpu_data[0], 3 * channelSize * sizeof(float)); //channelSize=640*640
		cudaMalloc(&gpu_data[1], 25200 * 42 * sizeof(float));

		cudaMemcpy(gpu_data[0], warp_dst_nchw.ptr(), 3 * channelSize * sizeof(float), cudaMemcpyHostToDevice);
		cudaStreamSynchronize(0);
		cudaStream_t stream;
		bool success = context->enqueueV2((void**)gpu_data, 0, nullptr);
		if (!success)
		{
			std::cout << "Failed to context!";
		}
		else
			std::cout << "Succeed to context!" << std::endl;
		cudaMemcpyAsync(cpu_output_buffer, gpu_data[1], 25200 * 84 * sizeof(float), cudaMemcpyDeviceToHost);
		cv::Mat detect_buffer(output_dims[1].d[1], output_dims[1].d[2], CV_32F, cpu_output_buffer);

		//------------------ 图像后处理-------------------------
		cv::Mat img_show = tar_img(img, detect_buffer, paddings);
		cv::imshow("img_show", img_show);
		cv::waitKey(0);

		cv::destroyAllWindows();

		cudaFree(&gpu_data[1]);
		cudaFree(&gpu_data[2]);
	
		return 0;
	}