CMakeLists.txt:
cmake_minimum_required(VERSION 3.5) sets the minimum required CMake version to 3.5.
project(Mask) names the project "Mask".
set(CMAKE_CXX_STANDARD 11) sets the C++ standard to C++11.
set(Cudart_LIBS "/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudart.so") stores the path to the CUDA Runtime library.
enable_language(CUDA) enables CUDA language support.
find_package(CUDA REQUIRED) locates CUDA and fails the configure step if it is missing.
find_package(OpenCV 4 REQUIRED) locates OpenCV 4 and fails if it is missing.
include_directories(
Yolo
${OpenCV_INCLUDE_DIRS}
/usr/local/cuda/include
) sets the header search paths: the Yolo directory, the OpenCV include directories, and the CUDA include directory.
add_library(
Yolo
Yolo/yolov5.cpp
) adds a library named "Yolo" built from Yolo/yolov5.cpp (static by default, since neither STATIC nor SHARED is specified).
cuda_add_library(myplugins SHARED ./Yolo/yololayer.cu)
compiles Yolo/yololayer.cu with the CUDA compiler
and produces a shared library (dynamic link library) named "myplugins".
add_executable(Mask
Mask_Detection.cpp
Yolo/yolov5.cpp
)
adds an executable named "Mask" built from Mask_Detection.cpp and Yolo/yolov5.cpp.
(Note that Yolo/yolov5.cpp is compiled a second time here even though it is already part of the Yolo library.)
target_link_libraries(
Mask
nvinfer
cudart
myplugins
/usr/local/cuda/lib64
${Cudart_LIBS}
${OpenCV_LIBS}
)
specifies the libraries the "Mask" executable links against: nvinfer (NVIDIA's TensorRT inference library), cudart (the CUDA Runtime),
myplugins (the shared library built above), the CUDA Runtime path stored in ${Cudart_LIBS}, and the OpenCV libraries. Note that /usr/local/cuda/lib64 is a directory rather than a library file; a library search path would normally be added with link_directories instead.
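For reference, the same build can be expressed with CMake's first-class CUDA support instead of the deprecated FindCUDA module and cuda_add_library. This is a sketch under the assumption of CMake 3.13+ and the same source layout, not the project's actual file:

    cmake_minimum_required(VERSION 3.13)
    project(Mask LANGUAGES CXX CUDA)                 # CUDA enabled at project() time
    set(CMAKE_CXX_STANDARD 11)
    find_package(OpenCV 4 REQUIRED)
    add_library(myplugins SHARED Yolo/yololayer.cu)  # .cu files compile natively
    add_executable(Mask Mask_Detection.cpp Yolo/yolov5.cpp)
    target_include_directories(Mask PRIVATE Yolo ${OpenCV_INCLUDE_DIRS} /usr/local/cuda/include)
    target_link_directories(Mask PRIVATE /usr/local/cuda/lib64)
    target_link_libraries(Mask PRIVATE nvinfer cudart myplugins ${OpenCV_LIBS})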
yolov5.cpp:
#include "Yolov5.h"
#include "string.h"
#define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32
#define DEVICE 0 // GPU id
#define NMS_THRESH 0.4
#define CONF_THRESH 0.5
#define BATCH_SIZE 1
#define cam_U0 323.3152 // camera intrinsics: principal point u0 (pixels)
#define cam_V0 227.4562 // principal point v0 (pixels)
#define cam_F 828.0 // focal length (pixels)
using namespace nvinfer1;
// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
static const int CLASS_NUM = Yolo::CLASS_NUM;
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;
IRuntime* runtime;
ICudaEngine* engine;
IExecutionContext* context;
int inputIndex = 0;
int outputIndex = 0;
/*
get_rect maps a bounding box from the YOLO network's input coordinate system
back to the original image's coordinate system. Networks like YOLO expect a
fixed input size, so the image is letterbox-resized before inference, and the
detections must be mapped back afterwards.
Steps:
1. Compute the ratio r_w between the network input width (Yolo::INPUT_W) and the
   image width (img.cols), and the ratio r_h between the network input height
   (Yolo::INPUT_H) and the image height (img.rows).
2. Compare r_h and r_w to decide which axis was padded during letterboxing.
   If r_h > r_w, the image was scaled by r_w and padded top and bottom.
3. Convert the center-format box to corner coordinates: left (l), right (r),
   top (t), and bottom (b) are the center coordinates (bbox[0], bbox[1])
   minus/plus half the width/height.
4. Subtract the padding offset from t and b to account for the difference between
   the network input height and the scaled image height, then divide all four
   coordinates by r_w to return to original-image pixels.
5. If r_h <= r_w, the width axis was padded instead; the procedure is the same
   with the roles of width and height swapped, dividing by r_h.
6. Return a cv::Rect with left (l), top (t), width (r - l), and height (b - t),
   locating the box in original-image coordinates.
*/
// Input: an image img and a 4-element array bbox whose entries are
// 0: box center x, 1: box center y, 2: box width, 3: box height
// (all in network input coordinates).
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
int l, r, t, b;
float r_w = Yolo::INPUT_W / (img.cols * 1.0);//ratio of the network input width (Yolo::INPUT_W) to the image width (img.cols)
float r_h = Yolo::INPUT_H / (img.rows * 1.0);//ratio of the network input height to the image height; together these determine the letterbox scale
if (r_h > r_w) {
l = bbox[0] - bbox[2] / 2.f;
r = bbox[0] + bbox[2] / 2.f;
t = bbox[1] - bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2;
b = bbox[1] + bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2;
l = l / r_w;
r = r / r_w;
t = t / r_w;
b = b / r_w;
} else {
l = bbox[0] - bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2;
r = bbox[0] + bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2;
t = bbox[1] - bbox[3] / 2.f;
b = bbox[1] + bbox[3] / 2.f;
l = l / r_h;
r = r / r_h;
t = t / r_h;
b = b / r_h;
}
return cv::Rect(l, t, r - l, b - t);
}
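A worked example with hypothetical numbers: for a 1280x720 frame and a 640x640 network input, r_w = 0.5 and r_h ≈ 0.889, so the first branch runs with (640 - 0.5*720)/2 = 140 pixels of top/bottom padding:

    cv::Mat frame(720, 1280, CV_8UC3);              // blank 1280x720 frame
    float bbox[4] = { 320.f, 320.f, 100.f, 100.f }; // center-format box in 640x640 network coords
    cv::Rect r = get_rect(frame, bbox);
    // l = (320 - 50) / 0.5 = 540, t = (320 - 50 - 140) / 0.5 = 260
    // r == cv::Rect(540, 260, 200, 200): a 200x200 box centered at (640, 360)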
/*
iou computes the Intersection over Union (IoU) of two bounding boxes, the standard
metric in object detection for how much two boxes overlap.
Both lbox and rbox are 4-element arrays in center format: box center x, box center y,
box width, box height (the same format the YOLO layer outputs).
Steps:
1. Compute the intersection rectangle and store it in interBox:
   interBox[0] is the left edge (maximum of the two left edges),
   interBox[1] is the right edge (minimum of the two right edges),
   interBox[2] is the top edge (maximum of the two top edges),
   interBox[3] is the bottom edge (minimum of the two bottom edges).
2. Check that the intersection is valid: if top > bottom or left > right, the boxes
   do not overlap and the function returns 0.0f.
3. Compute the intersection area interBoxS as width times height.
4. Return interBoxS / (area of lbox + area of rbox - interBoxS). The value lies in
   [0, 1]: closer to 1 means heavier overlap, closer to 0 means little or none.
iou is used to judge how strongly two boxes overlap; in NMS (and when matching
detections to ground truth) a threshold on the IoU decides whether two boxes are
treated as the same object.
*/
float iou(float lbox[4], float rbox[4]) {
float interBox[] = {
(std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left
(std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right
(std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top
(std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom
};
if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
return 0.0f;
float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]);
return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
}
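A quick numeric check: two 50x50 boxes (center format) whose centers are 20 pixels apart horizontally intersect in a 30x50 region, giving IoU = 1500 / (2500 + 2500 - 1500) ≈ 0.43:

    float lbox[4] = { 100.f, 100.f, 50.f, 50.f };
    float rbox[4] = { 120.f, 100.f, 50.f, 50.f };
    float v = iou(lbox, rbox); // intersection [95,125]x[75,125] = 1500, union = 3500, v ≈ 0.4286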
// Comparator: sorts detections by descending confidence so the most confident boxes come first.
bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
return a.conf > b.conf;
}
void nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
int det_size = sizeof(Yolo::Detection) / sizeof(float);
std::map<float, std::vector<Yolo::Detection>> m;
for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
if (output[1 + det_size * i + 4] <= conf_thresh) continue;
Yolo::Detection det;
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
m[det.class_id].push_back(det);
}
for (auto it = m.begin(); it != m.end(); it++) {
//std::cout << it->second[0].class_id << " --- " << std::endl;
auto& dets = it->second;
std::sort(dets.begin(), dets.end(), cmp);
// Keep the best remaining box, then drop every lower-scored box that overlaps it too much.
for (size_t i = 0; i < dets.size(); ++i) {
auto& item = dets[i];
res.push_back(item);
for (size_t n = i + 1; n < dets.size(); ++n) {
if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
dets.erase(dets.begin() + n);
--n;
}
}
}
}
}
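The raw output buffer that nms parses has a fixed layout matching the OUTPUT_SIZE computation above: output[0] holds the detection count, followed by up to MAX_OUTPUT_BBOX_COUNT records of det_size floats each. The indexing in nms assumes a record shaped like the following sketch (the actual struct is defined in the Yolo headers, e.g. yololayer.h):

    // Assumed field order; output[1 + det_size * i + 4] reads the conf field.
    struct Detection {
        float bbox[4];   // center x, center y, width, height
        float conf;      // object confidence
        float class_id;  // class index, stored as a float
    };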
// TensorRT weight files have a simple space delimited format:
// [type] [size] <data x size in hex>
// Loads a .wts weight file and parses it into a std::map<std::string, Weights>.
/*
The function takes the weight-file path as its parameter and works as follows:
1. Create an empty std::map<std::string, Weights> to store the weights.
2. Open the weight file; if it cannot be opened, print an error and abort.
3. Read the number of weight blobs (count) and require count > 0, otherwise the
   file is considered invalid.
4. Loop count times, reading one blob per iteration: read the blob's name and its
   size (number of values), create a Weights struct wt (data type
   nvinfer1::DataType::kFLOAT, a values pointer, and a count), allocate memory for
   the values, read the hex-encoded values into that memory, and insert wt into
   weightMap keyed by the blob name so it can later be looked up by name.
5. After the loop, every blob has been read and stored; return weightMap.
This is the usual way to load pretrained weights when building a TensorRT network
through the API: it converts the .wts file into the format TensorRT's
layer-creation calls expect.
*/
std::map<std::string, Weights> loadWeights(const std::string file) {
std::cout << "Loading weights: " << file << std::endl;
std::map<std::string, Weights> weightMap;
// Open weights file
std::ifstream input(file);
assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");
// Read number of weight blobs
int32_t count;
input >> count;
assert(count > 0 && "Invalid weight map file.");
while (count--)
{
Weights wt{ nvinfer1::DataType::kFLOAT, nullptr, 0 };
uint32_t size;
// Read name and type of blob
std::string name;
input >> name >> std::dec >> size;
wt.type = nvinfer1::DataType::kFLOAT;
// Load blob
uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size)); // allocate size 32-bit values (sizeof(val) would be the pointer size)
for (uint32_t x = 0, y = size; x < y; ++x)
{
input >> std::hex >> val[x];
}
wt.values = val;
wt.count = size;
weightMap[name] = wt;
}
return weightMap;
}
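For illustration, a file in this format starts with the blob count, followed by one line per blob: the blob name, the number of values, and the values as hex-encoded 32-bit floats. A made-up two-blob example (names and values are hypothetical):

    2
    model.0.conv.weight 4 3f800000 40000000 40400000 40800000
    model.0.bn.bias 2 00000000 3f000000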
// Adds a 2D batch-normalization (BatchNorm2d) layer to a TensorRT INetworkDefinition by folding the stored BN parameters into an IScaleLayer.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
float *gamma = (float*)weightMap[lname + ".weight"].values;
float *beta = (float*)weightMap[lname + ".bias"].values;
float *mean = (float*)weightMap[lname + ".running_mean"].values;
float *var = (float*)weightMap[lname + ".running_var"].values;
int len = weightMap[lname + ".running_var"].count;
float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
for (int i = 0; i < len; i++) {
scval[i] = gamma[i] / sqrt(var[i] + eps);
}
Weights scale{ nvinfer1::DataType::kFLOAT, scval, len };
float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
for (int i = 0; i < len; i++) {
shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
}
Weights shift{ nvinfer1::DataType::kFLOAT, shval, len };
float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
for (int i = 0; i < len; i++) {
pval[i] = 1.0;
}
Weights power{ nvinfer1::DataType::kFLOAT, pval, len };
weightMap[lname + ".scale"] = scale;
weightMap[lname + ".shift"] = shift;
weightMap[lname + ".power"] = power;
IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
assert(scale_1);
return scale_1;
}
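The folding above works because inference-time batch normalization is a pure affine transform. Per channel, y = gamma * (x - mean) / sqrt(var + eps) + beta; grouping terms gives y = scale * x + shift with scale = gamma / sqrt(var + eps) and shift = beta - mean * scale, which is exactly what the two loops compute. power is fixed at 1.0, so the IScaleLayer's general form (scale * x + shift)^power reduces to the affine map, and no separate BN layer is needed at inference time.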
ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
Weights emptywts{ nvinfer1::DataType::kFLOAT, nullptr, 0 };
int p = ksize / 2;
IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts);
assert(conv1);
conv1->setStrideNd(DimsHW{ s, s });
conv1->setPaddingNd(DimsHW{ p, p });
conv1->setNbGroups(g);
IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3);
// silu = x * sigmoid
auto sig = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID);
assert(sig);
auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD);
assert(ew);
return ew;
}
ILayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) {
ISliceLayer *s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
ISliceLayer *s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
ISliceLayer *s3 = network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
ISliceLayer *s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) };
auto cat = network->addConcatenation(inputTensors, 4);
auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
return conv;
}
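The four addSlice calls above implement YOLOv5's Focus (space-to-depth) stem: each slice picks one phase of a stride-2 pixel grid, so concatenating them turns a 3 x INPUT_H x INPUT_W input into a 12 x INPUT_H/2 x INPUT_W/2 tensor (3x640x640 becomes 12x320x320 at the default input size) before the first convBlock runs.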
ILayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
auto cv1 = convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1");
auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2");
if (shortcut && c1 == c2) {
auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM);
return ew;
}
return cv2;
}
ILayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
Weights emptywts{ nvinfer1::DataType::kFLOAT, nullptr, 0 };
int c_ = (int)((float)c2 * e);
auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
auto cv2 = network->addConvolutionNd(input, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], emptywts);
ITensor *y1 = cv1->getOutput(0);
for (int i = 0; i < n; i++) {
auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
y1 = b->getOutput(0);
}
auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], emptywts);
ITensor* inputTensors[] = { cv3->getOutput(0), cv2->getOutput(0) };
auto cat = network->addConcatenation(inputTensors, 2);
IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4);
auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
lr->setAlpha(0.1);
auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
return cv4;
}
ILayer* C3(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
int c_ = (int)((float)c2 * e);
auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
auto cv2 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv2");
ITensor *y1 = cv1->getOutput(0);
for (int i = 0; i < n; i++) {
auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
y1 = b->getOutput(0);
}
ITensor* inputTensors[] = { y1, cv2->getOutput(0) };
auto cat = network->addConcatenation(inputTensors, 2);
auto cv3 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
return cv3;
}
ILayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) {
int c_ = c1 / 2;
auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k1, k1 });
pool1->setPaddingNd(DimsHW{ k1 / 2, k1 / 2 });
pool1->setStrideNd(DimsHW{ 1, 1 });
auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k2, k2 });
pool2->setPaddingNd(DimsHW{ k2 / 2, k2 / 2 });
pool2->setStrideNd(DimsHW{ 1, 1 });
auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k3, k3 });
pool3->setPaddingNd(DimsHW{ k3 / 2, k3 / 2 });
pool3->setStrideNd(DimsHW{ 1, 1 });
ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
auto cat = network->addConcatenation(inputTensors, 4);
auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
return cv2;
}
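SPP pools the cv1 output with 5x5, 9x9, and 13x13 max-pooling windows at stride 1; the k/2 padding keeps the spatial size unchanged, so the three pooled maps can be concatenated with the unpooled input, quadrupling the channels to 4*c_ before cv2 projects them down to c2.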
std::vector<float> getAnchors(std::map<std::string, Weights>& weightMap)
{
std::vector<float> anchors_yolo;
Weights Yolo_Anchors = weightMap["model.24.anchor_grid"];
assert(Yolo_Anchors.count == 18);
const float* tempAnchors = (const float*)(Yolo_Anchors.values);
// 18 values = 3 yolo layers x 3 anchors x (w, h); copy them in order.
for (int i = 0; i < Yolo_Anchors.count; i++)
{
anchors_yolo.push_back(tempAnchors[i]);
}
return anchors_yolo;
}
IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, IConvolutionLayer* det0, IConvolutionLayer* det1, IConvolutionLayer* det2)
{
auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
std::vector<float> anchors_yolo = getAnchors(weightMap);
PluginField pluginMultidata[4];
int NetData[4];
NetData[0] = Yolo::CLASS_NUM;
NetData[1] = Yolo::INPUT_W;
NetData[2] = Yolo::INPUT_H;
NetData[3] = Yolo::MAX_OUTPUT_BBOX_COUNT;
pluginMultidata[0].data = NetData;
pluginMultidata[0].length = 3;
pluginMultidata[0].name = "netdata";
pluginMultidata[0].type = PluginFieldType::kFLOAT32;
int scale[3] = { 8, 16, 32 };
int plugindata[3][8];
std::string names[3];
for (int k = 1; k < 4; k++)
{
plugindata[k - 1][0] = Yolo::INPUT_W / scale[k - 1];
plugindata[k - 1][1] = Yolo::INPUT_H / scale[k - 1];
for (int i = 2; i < 8; i++)
{
plugindata[k - 1][i] = int(anchors_yolo[(k - 1) * 6 + i - 2]);
}
pluginMultidata[k].data = plugindata[k - 1];
pluginMultidata[k].length = 8;
names[k - 1] = "yolodata" + std::to_string(k);
pluginMultidata[k].name = names[k - 1].c_str();
pluginMultidata[k].type = PluginFieldType::kFLOAT32;
}
PluginFieldCollection pluginData;
pluginData.nbFields = 4;
pluginData.fields = pluginMultidata;
IPluginV2 *pluginObj = creator->createPlugin("yololayer", &pluginData);
ITensor* inputTensors_yolo[] = { det2->getOutput(0), det1->getOutput(0), det0->getOutput(0) };
auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);
return yolo;
}
//char *trtModelStream = nullptr;//read engine
//size_t size = 0;//unsigned int//read engine --size
// Comparator for std::sort: returns true when x's box center is closer than y's to
// the reference point (320, 480), presumably the bottom center of a 640x480 frame.
bool my_compare(result x, result y){
// if(x.box_.area() > y.box_.area())
float temp_x = pow(x.box_.x + x.box_.width/2 - 320, 2) + pow(x.box_.y + x.box_.height/2 - 480, 2);
float temp_y = pow(y.box_.x + y.box_.width/2 - 320, 2) + pow(y.box_.y + y.box_.height/2 - 480, 2);
return temp_x < temp_y;
}
// Comparator for std::sort: orders targetPose objects by ascending distance.
bool my_compare2(targetPose x, targetPose y){
return x.distance < y.distance;
}
static int get_width(int x, float gw, int divisor = 8) {
//return math.ceil(x / divisor) * divisor
if (int(x * gw) % divisor == 0) {
return int(x * gw);
}
return (int(x * gw / divisor) + 1) * divisor;
}
static int get_depth(int x, float gd) {
if (x == 1) {
return 1;
} else {
return round(x * gd) > 1 ? round(x * gd) : 1;
}
}
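As a quick worked example, assuming the YOLOv5s multipliers gd = 0.33 and gw = 0.50 (an assumption; the actual values are passed into build_engine): get_width(64, 0.50) = 32 and get_width(128, 0.50) = 64, both already multiples of 8, while get_depth(9, 0.33) rounds 2.97 to 3, so the channel widths and C3 repeat counts below scale down consistently for the smaller model.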
ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
INetworkDefinition* network = builder->createNetworkV2(0U);
// Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
assert(data);
std::map<std::string, Weights> weightMap = loadWeights(wts_name);
/* ------ yolov5 backbone------ */
auto focus0 = focus(network, weightMap, *data, 3, get_width(64, gw), 3, "model.0");
auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(9, gd), true, 1, 0.5, "model.4");
auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, 9, 13, "model.8");
/* ------ yolov5 head ------ */
auto bottleneck_csp9 = C3(network, weightMap, *spp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.9");
auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");
auto upsample11 = network->addResize(*conv10->getOutput(0));
assert(upsample11);
upsample11->setResizeMode(ResizeMode::kNEAREST);
upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());
ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
auto cat12 = network->addConcatenation(inputTensors12, 2);
auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");
auto upsample15 = network->addResize(*conv14->getOutput(0));
assert(upsample15);
upsample15->setResizeMode(ResizeMode::kNEAREST);
upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());
ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
auto cat16 = network->addConcatenation(inputTensors16, 2);
auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");
// yolo layer 0
IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
auto cat19 = network->addConcatenation(inputTensors19, 2);
auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
//yolo layer 1
IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
auto cat22 = network->addConcatenation(inputTensors22, 2);
auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*yolo->getOutput(0));
// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB
#if defined(USE_FP16)
config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
assert(builder->platformHasFastInt8());
config->setFlag(BuilderFlag::kINT8);
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
config->setInt8Calibrator(calibrator);
#endif
std::cout << "Building engine, please wait for a while..." << std::endl;
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
std::cout << "Build engine successfully!" << std::endl;
// Don't need the network any more
network->destroy();
// Release host memory
for (auto& mem : weightMap)
{
free((void*)(mem.second.values));
}
return engine;
}
//void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, float& gd, float& gw, std::string& wts_name) {
// // Create builder
// IBuilder* builder = createInferBuilder(gLogger);
// IBuilderConfig* config = builder->createBuilderConfig();
//
// // Create model to populate the network, then set the outputs and create an engine
// ICudaEngine* engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
// assert(engine != nullptr);
//
// // Serialize the engine
// (*modelStream) = engine->serialize();
//
// // Close everything down
// engine->destroy();
// builder->destroy();
// config->destroy();
//}
void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
}
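doInference relies on a CUDA_CHECK macro that is assumed to come from the project's headers; a typical definition looks like the following sketch (not necessarily this project's exact macro):

    #ifndef CUDA_CHECK
    #define CUDA_CHECK(callstr)                                                   \
        {                                                                         \
            cudaError_t error_code = callstr;                                     \
            if (error_code != cudaSuccess) {                                      \
                std::cerr << "CUDA error " << error_code << " at " << __FILE__    \
                          << ":" << __LINE__ << std::endl;                        \
                assert(0);                                                        \
            }                                                                     \
        }
    #endif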
int test()
{
std::cout << "ss" << std::endl;
return 0; // the function is declared int, so it must return a value
}
// Loads a pre-serialized TensorRT engine file and prepares the runtime, engine, and execution context needed for inference.
bool engine_init(std::string engine_name)
{
cudaSetDevice(DEVICE);
std::ifstream file(engine_name, std::ios::binary);//open an input file stream in binary mode
if (!file.good()) {
std::cerr << "read " << engine_name << " error!" << std::endl;
return false;
}//check the stream state; report an error if the engine file cannot be loaded
char *trtModelStream = nullptr;
size_t size = 0;
file.seekg(0, file.end);//move the file pointer to the end
size = file.tellg();//the current position is the file size in bytes
file.seekg(0, file.beg);//move the file pointer back to the beginning
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);//read size bytes from the stream into trtModelStream
file.close();
//Create a TensorRT runtime object with createInferRuntime
runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
//Deserialize the CUDA engine from trtModelStream with deserializeCudaEngine
engine = runtime->deserializeCudaEngine(trtModelStream, size);
assert(engine != nullptr);
//Create an execution context for running inference with createExecutionContext
context = engine->createExecutionContext();
assert(context != nullptr);
//Free the host buffer trtModelStream that held the serialized engine
delete[] trtModelStream;
//The engine is expected to have exactly two bindings: one input and one output.
assert(engine->getNbBindings() == 2);
//Look up the binding indices in the engine; by convention the input is index 0 and the output index 1.
inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
assert(inputIndex == 0);
assert(outputIndex == 1);
return true;
}
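engine_init reports success through its bool return value, so callers can guard on it; a minimal sketch (the engine file name is hypothetical, and this assumes it runs inside main()):

    if (!engine_init("mask.engine")) {
        std::cerr << "failed to load the TensorRT engine" << std::endl;
        return -1;
    }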
// Main function that runs YOLO object detection on a single frame
std::vector<BoundingBox> yolo_main(cv::Mat &src)
{
// Prepare static host arrays for the input (data) and output (prob) tensors,
// and GPU buffers on the device to hold the input and output during inference.
static float prob[BATCH_SIZE * OUTPUT_SIZE];//output: detection records
static float data[BATCH_SIZE * 3 * 640 * 640];//input: 3-channel image tensor
void* buffers[2];
// Clone the source image src into img and bail out if it is empty.
cv::Mat img = src.clone();
if (img.empty()) return {};
//Allocate device memory for the input and output tensors, and create a CUDA stream for asynchronous execution
// Create GPU buffers on device
CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
// Create stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
//Preprocess the input image: letterbox-resize it to the network input size, convert
//BGR to RGB, normalize the pixels to [0,1], and store the result in data in CHW order
//if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue
cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox BGR to RGB
long int i = 0;
for (int row = 0; row < INPUT_H; ++row)
{
uchar* uc_pixel = pr_img.data + row * pr_img.step;
for (int col = 0; col < INPUT_W; ++col)
{
data[i] = (float)uc_pixel[2] / 255.0;
data[ i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
data[ i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
uc_pixel += 3;
++i;
}
}
// Run inference with the execution context, timing the call so the frame rate (FPS) can be computed
auto start = std::chrono::system_clock::now();
//(The results are post-processed below: non-maximum suppression (NMS), drawing the detections on src, and overlaying the FPS.)
doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
auto end = std::chrono::system_clock::now();
//fps display
//std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
float tri_time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
int tri_fps = 1000/tri_time;
std::string fps = "fps: ";
fps += std::to_string (tri_fps);
cv::Point text;
text.x = 540;
text.y = 460;
cv::putText(src, fps, text, cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0, 0, 255), 1);
// Abort the program if a single inference takes longer than 150 ms.
if(std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()>150)
{
exit(1);
}
std::vector<Yolo::Detection> batch_res;
auto& res = batch_res;
nms(res, &prob[0], CONF_THRESH, NMS_THRESH);
if(res.empty())
{
cudaStreamDestroy(stream);
CUDA_CHECK(cudaFree(buffers[inputIndex]));
CUDA_CHECK(cudaFree(buffers[outputIndex]));
return {};//no detections survived NMS
}
// result result_sets[res.size()];
//std::vector<result> result_sets;
std::vector<BoundingBox> detected_boxes;
for (size_t j = 0; j < res.size(); j++)
{
cv::Rect r = get_rect(img, res[j].bbox);//map the box back to original-image coordinates
if(r.area()<150)//skip boxes smaller than 150 square pixels
{
continue;
}
if((int)res[j].class_id == 0)
{
cv::putText(src, "no_mask", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 2, cv::Scalar(0, 0,255 ), 1);
cv::rectangle(src, r, cv::Scalar(0, 0, 255), 2);
}
else if((int)res[j].class_id == 1)
{
cv::putText(src, "mask", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 2, cv::Scalar(0, 0,255 ), 1);
cv::rectangle(src, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
}
BoundingBox temp;
//result temp;
temp.class_ = (int)res[j].class_id;
temp.box_ = r;
detected_boxes.emplace_back(temp);
}
// Print the coordinates of each detected box
for (const auto& box : detected_boxes) {
std::cout << "Class: " << box.class_ << ", X: " << box.box_.x << ", Y: " << box.box_.y
<< ", Width: " << box.box_.width << ", Height: " << box.box_.height << std::endl;
}
cudaStreamDestroy(stream);
CUDA_CHECK(cudaFree(buffers[inputIndex]));
CUDA_CHECK(cudaFree(buffers[outputIndex]));
return detected_boxes;
}
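Mask_Detection.cpp is not shown in this section; a minimal caller would wire the two functions together roughly like this sketch (the engine file name and camera index are assumptions):

    int main() {
        if (!engine_init("mask.engine")) return -1;  // hypothetical engine file name
        cv::VideoCapture cap(0);                     // default camera, an assumption
        cv::Mat frame;
        while (cap.read(frame)) {
            std::vector<BoundingBox> boxes = yolo_main(frame); // detects and draws on frame
            cv::imshow("Mask", frame);
            if (cv::waitKey(1) == 27) break;         // ESC exits
        }
        return 0;
    }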