YOLOv8-Pose human pose keypoint detection: ONNX Runtime and TensorRT inference

ONNX Runtime inference

Export the ONNX model:

from ultralytics import YOLO
model = YOLO("yolov8n-pose.pt")  
model.export(format="onnx") 
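Optionally, the exported file can be sanity-checked with the onnx package before inference (a small optional step, assuming onnx is installed):

import onnx
onnx_model = onnx.load("yolov8n-pose.onnx")
onnx.checker.check_model(onnx_model)  # raises an exception if the graph is malformed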

The ONNX model structure is as follows:
[Figure: ONNX model graph]
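If no graph viewer is at hand, the input/output signature can also be read programmatically; for this export it is an input named images of shape (1, 3, 640, 640) and an output named output0 of shape (1, 56, 8400):

import onnxruntime
session = onnxruntime.InferenceSession("yolov8n-pose.onnx", providers=["CPUExecutionProvider"])
for t in session.get_inputs():
    print(t.name, t.shape)    # images [1, 3, 640, 640]
for t in session.get_outputs():
    print(t.name, t.shape)    # output0 [1, 56, 8400]

Python inference code: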

import cv2
import numpy as np
import onnxruntime


# Color palette
palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102],
                    [230, 230, 0], [255, 153, 255], [153, 204, 255],
                    [255, 102, 255], [255, 51, 255], [102, 178, 255],
                    [51, 153, 255], [255, 153, 153], [255, 102, 102],
                    [255, 51, 51], [153, 255, 153], [102, 255, 102],
                    [51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0],
                    [255, 255, 255]])
# Connection order of the 17 keypoints (1-indexed)
skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12],
            [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3],
            [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
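# For reference, the keypoints follow the COCO order (the skeleton pairs above
# index 1-based into this table):
# 1 nose, 2 left eye, 3 right eye, 4 left ear, 5 right ear,
# 6 left shoulder, 7 right shoulder, 8 left elbow, 9 right elbow,
# 10 left wrist, 11 right wrist, 12 left hip, 13 right hip,
# 14 left knee, 15 right knee, 16 left ankle, 17 right ankle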
# Limb (skeleton) colors
pose_limb_color = palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
# Keypoint colors
pose_kpt_color = palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]

score_threshold = 0.5  
nms_threshold = 0.5

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), scaleup=True):
    ''' Resize the image and pad both sides with gray borders '''
    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    # Only scale down; upsampling would blur the image
    if not scaleup:
        r = min(r, 1.0)
    # Compute the unpadded size, keeping the original aspect ratio
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    # Split the padding evenly between both sides of the shorter edge
    dw /= 2
    dh /= 2
    # Resize the original image to new_unpad (same aspect ratio, long side matched)
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    # Top/bottom padding
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    # Left/right padding
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    # Add the gray borders
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im
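# Quick sanity check (assuming the usual ultralytics sample bus.jpg, 810x1080):
# the long side scales to 640 and gray borders pad the short side, so
#   letterbox(cv2.imread('bus.jpg')).shape == (640, 640, 3)
# holds for any input size.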

def pre_process(img):
    # BGR -> RGB, normalize to [0, 1], and reorder to (1, 3, 640, 640)
    img = img[:, :, ::-1]
    img = img / 255.
    img = np.transpose(img, (2, 0, 1))
    data = np.expand_dims(img, axis=0)
    return data

def xywh2xyxy(x):
    ''' (center x, center y, w, h) ------>>> (top-left, bottom-right) '''
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y

# NMS
def nms(dets, iou_thresh):
    # dets: N x M; N is the number of bboxes, and the first 4 of the M columns are the top-left and bottom-right corners
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)  # area of every bbox
    order = scores.argsort()[::-1]  # sort scores in descending order
    keep = []  # indices of the bboxes that survive
    while order.size > 0:
        i = order[0]  # always keep the highest-scoring remaining bbox
        keep.append(i)
        # intersection of the highest-scoring bbox with the remaining ones
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        # intersection areas
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        # IoU: intersection over the union of the two boxes
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        # keep only the bboxes whose IoU is below the threshold for the next iteration
        inds = np.where(ovr <= iou_thresh)[0]
        # indices in ovr exclude order[0], so shift them by one
        order = order[inds + 1]
    output = []
    for i in keep:
        output.append(dets[i].tolist())
    return np.array(output)
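# Toy example of the suppression behavior (illustrative values):
#   dets = np.array([[0, 0, 10, 10, 0.9],
#                    [1, 1, 11, 11, 0.8],
#                    [50, 50, 60, 60, 0.7]])
#   nms(dets, iou_thresh=0.5)
# The 0.8-score box overlaps the 0.9-score box with IoU of about 0.70 > 0.5,
# so it is suppressed; the rows with scores 0.9 and 0.7 survive.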

def xyxy2xywh(x):
    ''' (top-left, bottom-right) ------>>> (top-left, width, height) '''
    y = np.copy(x)
    y[:, 2] = x[:, 2] - x[:, 0]  # w
    y[:, 3] = x[:, 3] - x[:, 1]  # h
    return y

def scale_boxes(img1_shape, boxes, img0_shape):
    ''' Map the predicted coordinates back to the original image scale
    :param img1_shape: shape of the letterboxed (network input) image
    :param boxes: predictions (box, confidence and keypoints per row)
    :param img0_shape: shape of the original image
    '''
    # Rescale the boxes (x y w h) from img1_shape (network input) to img0_shape (original image)
    gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = letterboxed / original
    pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    boxes[:, 0] -= pad[0]
    boxes[:, 1] -= pad[1]
    boxes[:, :4] /= gain  # box coordinates back on the original image
    num_kpts = boxes.shape[1] // 3   # 56 // 3 = 18
    for kid in range(2, num_kpts + 1):
        boxes[:, kid * 3 - 1] = (boxes[:, kid * 3 - 1] - pad[0]) / gain
        boxes[:, kid * 3] = (boxes[:, kid * 3] - pad[1]) / gain
    clip_boxes(boxes, img0_shape)
    return boxes
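# For reference, each row passed to scale_boxes is one complete 56-value
# prediction, which is why the keypoint columns are shifted and rescaled
# together with the box:
#   0..3    box as top-left x, top-left y, width, height (after xyxy2xywh)
#   4       box confidence
#   5..55   17 keypoints as consecutive (x, y, conf) triplets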

def clip_boxes(boxes, shape):
    # Clip to the image border, and convert the boxes from
    # (top-left x, top-left y, width, height) to (top-left x, top-left y, bottom-right x, bottom-right y)
    top_left_x = boxes[:, 0].clip(0, shape[1])
    top_left_y = boxes[:, 1].clip(0, shape[0])
    bottom_right_x = (boxes[:, 0] + boxes[:, 2]).clip(0, shape[1])
    bottom_right_y = (boxes[:, 1] + boxes[:, 3]).clip(0, shape[0])
    boxes[:, 0] = top_left_x      # top-left
    boxes[:, 1] = top_left_y
    boxes[:, 2] = bottom_right_x  # bottom-right
    boxes[:, 3] = bottom_right_y

def plot_skeleton_kpts(im, kpts, steps=3):
    num_kpts = len(kpts) // steps  # 51 / 3 = 17
    # Draw the keypoints
    for kid in range(num_kpts):
        r, g, b = pose_kpt_color[kid]
        x_coord, y_coord = kpts[steps * kid], kpts[steps * kid + 1]
        conf = kpts[steps * kid + 2]
        if conf > score_threshold:
            cv2.circle(im, (int(x_coord), int(y_coord)), 5, (int(r), int(g), int(b)), -1)
    # Draw the skeleton
    for sk_id, sk in enumerate(skeleton):
        r, g, b = pose_limb_color[sk_id]
        pos1 = (int(kpts[(sk[0]-1)*steps]), int(kpts[(sk[0]-1)*steps+1]))
        pos2 = (int(kpts[(sk[1]-1)*steps]), int(kpts[(sk[1]-1)*steps+1]))
        conf1 = kpts[(sk[0]-1)*steps+2]
        conf2 = kpts[(sk[1]-1)*steps+2]
        if conf1 > score_threshold and conf2 > score_threshold:  # draw a limb only if both of its keypoints clear the threshold
            cv2.line(im, pos1, pos2, (int(r), int(g), int(b)), thickness=2)


if __name__ == '__main__':
    image = cv2.imread('bus.jpg')
    img = letterbox(image)
    data = pre_process(img)
    session = onnxruntime.InferenceSession('yolov8n-pose.onnx', providers=['CPUExecutionProvider'])
    input_name = session.get_inputs()[0].name
    pred = session.run(None, {input_name: data.astype(np.float32)})[0]
    pred = np.transpose(pred[0], (1, 0))                        # (1, 56, 8400) -> (8400, 56)
    pred = pred[pred[:, 4] > score_threshold]                   # confidence filtering
    bboxs = xywh2xyxy(pred)                                     # center + size to top-left, bottom-right
    bboxs = nms(bboxs, nms_threshold)                           # NMS
    bboxs = xyxy2xywh(np.array(bboxs))                          # top-left, bottom-right to top-left + width, height
    bboxs = scale_boxes(img.shape, bboxs, image.shape)          # map back to the original image
    for box in bboxs:
        det_bbox, det_scores, kpts = box[0:4], box[4], box[5:]
        cv2.rectangle(image, (int(det_bbox[0]), int(det_bbox[1])), (int(det_bbox[2]), int(det_bbox[3])), (0, 0, 255), 2)
        cv2.putText(image, "conf:{:.2f}".format(det_scores), (int(det_bbox[0]) + 5, int(det_bbox[1]) - 5), cv2.FONT_HERSHEY_DUPLEX, 0.8, (0, 0, 255), 1)
        plot_skeleton_kpts(image, kpts)
    cv2.imshow("keypoint", image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
C++ inference code:

#include <iostream>
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>


const int input_width = 640;
const int input_height = 640;
const float score_threshold = 0.5;
const float nms_threshold = 0.5;
const int output_numbox = input_width / 8 * input_height / 8 + input_width / 16 * input_height / 16 + input_width / 32 * input_height / 32;  // 80*80 + 40*40 + 20*20 = 8400 candidate boxes
const std::unordered_multimap<int, int> skeletons = { {16, 14}, { 14, 12 }, {17, 15}, {15, 13}, {12, 13}, {6, 12},
	{7, 13}, {6, 7}, {6, 8}, {7, 9}, {8, 10}, {9, 11}, {2, 3}, {1, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}, {5, 7} };


// LetterBox preprocessing (resize and pad with gray borders)
void LetterBox(const cv::Mat& image, cv::Mat& outImage,
	const cv::Size& newShape = cv::Size(640, 640), const cv::Scalar& color = cv::Scalar(114, 114, 114))
{
	cv::Size shape = image.size();
	float r = std::min((float)newShape.height / (float)shape.height, (float)newShape.width / (float)shape.width);
	float ratio[2]{ r, r };
	int new_un_pad[2] = { (int)std::round((float)shape.width * r),(int)std::round((float)shape.height * r) };

	auto dw = (float)(newShape.width - new_un_pad[0]) / 2;
	auto dh = (float)(newShape.height - new_un_pad[1]) / 2;

	if (shape.width != new_un_pad[0] || shape.height != new_un_pad[1])
		cv::resize(image, outImage, cv::Size(new_un_pad[0], new_un_pad[1]));
	else
		outImage = image.clone();

	int top = int(std::round(dh - 0.1f));
	int bottom = int(std::round(dh + 0.1f));
	int left = int(std::round(dw - 0.1f));
	int right = int(std::round(dw + 0.1f));

	cv::Vec4d params;
	params[0] = ratio[0];
	params[1] = ratio[1];
	params[2] = left;
	params[3] = top;

	cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}


// Preprocessing
void pre_process(cv::Mat& image, std::vector<float>& inputs)
{
	cv::Vec4d params;
	cv::Mat letterbox;
	LetterBox(image, letterbox, cv::Size(input_width, input_height));

	cv::cvtColor(letterbox, letterbox, cv::COLOR_BGR2RGB);
	letterbox.convertTo(letterbox, CV_32FC3, 1.0f / 255.0f);
	std::vector<cv::Mat> split_images;
	cv::split(letterbox, split_images);
	for (size_t i = 0; i < letterbox.channels(); ++i)
	{
		std::vector<float> split_image_data = split_images[i].reshape(1, 1);
		inputs.insert(inputs.end(), split_image_data.begin(), split_image_data.end());
	}
}


// Network inference
void process(const wchar_t* model, std::vector<float>& inputs, std::vector<Ort::Value>& outputs)
{
	Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "yolov8n-pose");
	Ort::SessionOptions session_options;
	session_options.SetIntraOpNumThreads(12);  // number of intra-op threads
	session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);  // enable extended graph optimizations

	//CUDA option set
	//OrtCUDAProviderOptions cuda_option;
	//cuda_option.device_id = 0;
	//cuda_option.arena_extend_strategy = 0;
	//cuda_option.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive;
	//cuda_option.gpu_mem_limit = SIZE_MAX;
	//cuda_option.do_copy_in_default_stream = 1;
	//session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
	//session_options.AppendExecutionProvider_CUDA(cuda_option);
	Ort::Session session(env, model, session_options);

	std::vector<const char*>  input_node_names;
	for (size_t i = 0; i < session.GetInputCount(); i++)
	{
		input_node_names.push_back("images");    // input name of the exported yolov8 model
	}

	std::vector<const char*> output_node_names;
	for (size_t i = 0; i < session.GetOutputCount(); i++)
	{
		output_node_names.push_back("output0");  // output name of the exported yolov8 model
	}

	// create input tensor object from data values
	std::vector<int64_t> input_node_dims = { 1, 3, input_height, input_width };  // NCHW
	auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
	Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, inputs.data(), inputs.size(), input_node_dims.data(), input_node_dims.size());

	std::vector<Ort::Value> ort_inputs;
	ort_inputs.push_back(std::move(input_tensor));  // move to avoid an unnecessary copy

	// score model & input tensor, get back output tensor	
	outputs = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), ort_inputs.data(), input_node_names.size(), output_node_names.data(), output_node_names.size());
}


//NMS
void nms(std::vector<cv::Rect>& boxes, std::vector<float>& scores, float score_threshold, float nms_threshold, std::vector<int>& indices)
{
	struct BoxScore
	{
		cv::Rect box;
		float score;
		int id;
	};
	std::vector<BoxScore> boxes_scores;
	for (size_t i = 0; i < boxes.size(); i++)
	{
		BoxScore box_conf;
		box_conf.box = boxes[i];
		box_conf.score = scores[i];
		box_conf.id = i;
		if (scores[i] > score_threshold)	boxes_scores.push_back(box_conf);
	}

	std::sort(boxes_scores.begin(), boxes_scores.end(), [](BoxScore a, BoxScore b) { return a.score > b.score; });

	std::vector<float> area(boxes_scores.size());
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		area[i] = boxes_scores[i].box.width * boxes_scores[i].box.height;
	}

	std::vector<bool> isSuppressed(boxes_scores.size(), false);
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		if (isSuppressed[i])  continue;
		for (size_t j = i + 1; j < boxes_scores.size(); ++j)
		{
			if (isSuppressed[j])  continue;

			float x1 = (std::max)(boxes_scores[i].box.x, boxes_scores[j].box.x);
			float y1 = (std::max)(boxes_scores[i].box.y, boxes_scores[j].box.y);
			float x2 = (std::min)(boxes_scores[i].box.x + boxes_scores[i].box.width, boxes_scores[j].box.x + boxes_scores[j].box.width);
			float y2 = (std::min)(boxes_scores[i].box.y + boxes_scores[i].box.height, boxes_scores[j].box.y + boxes_scores[j].box.height);
			float w = (std::max)(0.0f, x2 - x1);
			float h = (std::max)(0.0f, y2 - y1);
			float inter = w * h;
			float ovr = inter / (area[i] + area[j] - inter);

			if (ovr >= nms_threshold)  isSuppressed[j] = true;
		}
	}

	for (int i = 0; i < boxes_scores.size(); ++i)
	{
		if (!isSuppressed[i])	indices.push_back(boxes_scores[i].id);
	}
}


// Scale the box and keypoints back to the original image size
void scale_box(cv::Rect& box, std::vector<float>& keypoint,  cv::Size size)
{
	float gain = std::min(input_width * 1.0 / size.width, input_height * 1.0 / size.height);
	int pad_w = (input_width - size.width * gain) / 2;
	int pad_h = (input_height - size.height * gain) / 2;
	box.x -= pad_w;
	box.y -= pad_h;
	box.x /= gain;
	box.y /= gain;
	box.width /= gain;
	box.height /= gain;

	for (size_t i = 0; i < keypoint.size() / 3; i++)
	{
		keypoint[3 * i] = (keypoint[3 * i] - pad_w) / gain;
		keypoint[3 * i + 1] = (keypoint[3 * i + 1] - pad_h) / gain;
	}
}


// Visualization
void draw_result(cv::Mat& image, std::string label, cv::Rect box, std::vector<float> keypoint)
{
	cv::rectangle(image, box, cv::Scalar(255, 0, 0), 1);
	int baseLine;
	cv::Size label_size = cv::getTextSize(label, 1, 1, 1, &baseLine);
	cv::Point tlc = cv::Point(box.x, box.y);
	cv::Point brc = cv::Point(box.x, box.y + label_size.height + baseLine);
	cv::putText(image, label, cv::Point(box.x, box.y), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 0, 255), 1);

	for (size_t i = 0; i < keypoint.size() / 3; i++)
	{
		if (keypoint[3 * i + 2] < score_threshold)
			continue;
		cv::circle(image, cv::Point(keypoint[3 * i], keypoint[3 * i + 1]), 4, cv::Scalar(0, 255, 0), -1);
	}

	for (auto skeleton : skeletons)
	{
		cv::Point pos1(keypoint[3 * (skeleton.first - 1)], keypoint[3 * (skeleton.first - 1) + 1]);
		cv::Point pos2(keypoint[3 * (skeleton.second - 1)], keypoint[3 * (skeleton.second - 1) + 1]);
		float conf1 = keypoint[3 * (skeleton.first - 1) + 2];
		float conf2 = keypoint[3 * (skeleton.second - 1) + 2];
		if (conf1 > score_threshold && conf2 > score_threshold)
		{
			cv::line(image, pos1, pos2, cv::Scalar(0, 255, 0), 2);
		}
	}
}


// Post-processing
void post_process(cv::Mat& image, cv::Mat& result, std::vector<Ort::Value>& outputs)
{
	std::vector<cv::Rect> boxes;
	std::vector<float> scores;
	std::vector<int> class_ids;
	std::vector<std::vector<float>> keypoints;

	// the (1, 56, 8400) output is laid out attribute-major: 56 rows of 8400 values
	const float* output_data = outputs[0].GetTensorData<float>();
	for (int i = 0; i < output_numbox; i++)
	{
		float score = output_data[4 * output_numbox + i];
		if (score < score_threshold)
			continue;
		float x = output_data[0 * output_numbox + i];
		float y = output_data[1 * output_numbox + i];
		float w = output_data[2 * output_numbox + i];
		float h = output_data[3 * output_numbox + i];
		int left = int(x - 0.5 * w);
		int top = int(y - 0.5 * h);
		int width = int(w);
		int height = int(h);
		cv::Rect box = cv::Rect(left, top, width, height);
		std::vector<float> keypoint(51);
		for (size_t j = 0; j < keypoint.size(); j++)
		{
			keypoint[j] = output_data[(5 + j) * output_numbox + i];
		}
		scale_box(box, keypoint, image.size());
		boxes.push_back(box);
		scores.push_back(score);
		keypoints.push_back(keypoint);
	}

	std::vector<int> indices;
	nms(boxes, scores, score_threshold, nms_threshold, indices);
	for (int i = 0; i < indices.size(); i++)
	{
		int idx = indices[i];
		cv::Rect box = boxes[idx];
		std::string label = cv::format("%.2f", scores[idx]);
		std::vector<float> keypoint = keypoints[idx];
		draw_result(result, label, box, keypoint);
	}
}


int main(int argc, char* argv[])
{
	cv::Mat image = cv::imread("bus.jpg");
	std::vector<float> inputs;
	pre_process(image, inputs);

	const wchar_t* model = L"yolov8n-pose.onnx";
	std::vector<Ort::Value> outputs;
	process(model, inputs, outputs);

	cv::Mat result = image.clone();
	post_process(image, result, outputs);

	cv::imwrite("result.jpg", result);
	return 0;
}

TensorRT inference
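The code below assumes a serialized engine file (yolov8n-pose.engine) already exists. A minimal build sketch from the exported ONNX model, assuming the TensorRT 8.x Python API (trtexec --onnx=yolov8n-pose.onnx --saveEngine=yolov8n-pose.engine achieves the same from the command line):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
with open("yolov8n-pose.onnx", "rb") as f:
    if not parser.parse(f.read()):
        raise RuntimeError(str(parser.get_error(0)))
config = builder.create_builder_config()
engine_bytes = builder.build_serialized_network(network, config)
with open("yolov8n-pose.engine", "wb") as f:
    f.write(engine_bytes)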

Python inference code:

import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit 
import pycuda.driver as cuda  


# Color palette
palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102],
                    [230, 230, 0], [255, 153, 255], [153, 204, 255],
                    [255, 102, 255], [255, 51, 255], [102, 178, 255],
                    [51, 153, 255], [255, 153, 153], [255, 102, 102],
                    [255, 51, 51], [153, 255, 153], [102, 255, 102],
                    [51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0],
                    [255, 255, 255]])
# Connection order of the 17 keypoints (1-indexed)
skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12],
            [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3],
            [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
# Limb (skeleton) colors
pose_limb_color = palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
# Keypoint colors
pose_kpt_color = palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]

score_threshold = 0.5  
nms_threshold = 0.5


def letterbox(im, new_shape=(640, 640), color=(114, 114, 114)):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    
    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))    
    dw, dh = (new_shape[1] - new_unpad[0])/2, (new_shape[0] - new_unpad[1])/2  # wh padding 
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im

def pre_process(img):
    # BGR -> RGB, normalize to [0, 1], and reorder to (1, 3, 640, 640)
    img = img[:, :, ::-1]
    img = img / 255.
    img = np.transpose(img, (2, 0, 1))
    data = np.expand_dims(img, axis=0)
    return data

def xywh2xyxy(x):
    ''' (center x, center y, w, h) ------>>> (top-left, bottom-right) '''
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y

# NMS
def nms(dets, iou_thresh):
    # dets: N x M; N is the number of bboxes, and the first 4 of the M columns are the top-left and bottom-right corners
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)  # area of every bbox
    order = scores.argsort()[::-1]  # sort scores in descending order
    keep = []  # indices of the bboxes that survive
    while order.size > 0:
        i = order[0]  # always keep the highest-scoring remaining bbox
        keep.append(i)
        # intersection of the highest-scoring bbox with the remaining ones
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        # intersection areas
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        # IoU: intersection over the union of the two boxes
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        # keep only the bboxes whose IoU is below the threshold for the next iteration
        inds = np.where(ovr <= iou_thresh)[0]
        # indices in ovr exclude order[0], so shift them by one
        order = order[inds + 1]
    output = []
    for i in keep:
        output.append(dets[i].tolist())
    return np.array(output)

def xyxy2xywh(x):
    ''' (top-left, bottom-right) ------>>> (top-left, width, height) '''
    y = np.copy(x)
    y[:, 2] = x[:, 2] - x[:, 0]  # w
    y[:, 3] = x[:, 3] - x[:, 1]  # h
    return y

def scale_boxes(img1_shape, boxes, img0_shape):
    ''' Map the predicted coordinates back to the original image scale
    :param img1_shape: shape of the letterboxed (network input) image
    :param boxes: predictions (box, confidence and keypoints per row)
    :param img0_shape: shape of the original image
    '''
    # Rescale the boxes (x y w h) from img1_shape (network input) to img0_shape (original image)
    gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = letterboxed / original
    pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    boxes[:, 0] -= pad[0]
    boxes[:, 1] -= pad[1]
    boxes[:, :4] /= gain  # box coordinates back on the original image
    num_kpts = boxes.shape[1] // 3   # 56 // 3 = 18
    for kid in range(2, num_kpts + 1):
        boxes[:, kid * 3 - 1] = (boxes[:, kid * 3 - 1] - pad[0]) / gain
        boxes[:, kid * 3] = (boxes[:, kid * 3] - pad[1]) / gain
    clip_boxes(boxes, img0_shape)
    return boxes

def clip_boxes(boxes, shape):
    top_left_x = boxes[:, 0].clip(0, shape[1])
    top_left_y = boxes[:, 1].clip(0, shape[0])
    bottom_right_x = (boxes[:, 0] + boxes[:, 2]).clip(0, shape[1])
    bottom_right_y = (boxes[:, 1] + boxes[:, 3]).clip(0, shape[0])
    boxes[:, 0] = top_left_x      # top-left
    boxes[:, 1] = top_left_y
    boxes[:, 2] = bottom_right_x  # bottom-right
    boxes[:, 3] = bottom_right_y

def plot_skeleton_kpts(im, kpts, steps=3):
    num_kpts = len(kpts) // steps  # 51 / 3 = 17
    for kid in range(num_kpts):     # draw the keypoints
        r, g, b = pose_kpt_color[kid]
        x_coord, y_coord = kpts[steps * kid], kpts[steps * kid + 1]
        conf = kpts[steps * kid + 2]
        if conf > score_threshold:
            cv2.circle(im, (int(x_coord), int(y_coord)), 5, (int(r), int(g), int(b)), -1)
    for sk_id, sk in enumerate(skeleton):       # draw the skeleton
        r, g, b = pose_limb_color[sk_id]
        pos1 = (int(kpts[(sk[0]-1)*steps]), int(kpts[(sk[0]-1)*steps+1]))
        pos2 = (int(kpts[(sk[1]-1)*steps]), int(kpts[(sk[1]-1)*steps+1]))
        conf1 = kpts[(sk[0]-1)*steps+2]
        conf2 = kpts[(sk[1]-1)*steps+2]
        if conf1 > score_threshold and conf2 > score_threshold:  # draw a limb only if both of its keypoints clear the threshold
            cv2.line(im, pos1, pos2, (int(r), int(g), int(b)), thickness=2)


if __name__ == '__main__':
    logger = trt.Logger(trt.Logger.WARNING)
    with open("yolov8n-pose.engine", "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()
    inputs_host = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)   # page-locked host buffer for the input
    outputs_host = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)  # page-locked host buffer for the output
    inputs_device = cuda.mem_alloc(inputs_host.nbytes)    # device-side input buffer
    outputs_device = cuda.mem_alloc(outputs_host.nbytes)  # device-side output buffer
    stream = cuda.Stream()
    
    image = cv2.imread('bus.jpg')
    img = letterbox(image)
    data = img[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)  # BGR to RGB and HWC to CHW
    data = data / 255.0
    data = np.expand_dims(data, axis=0)
    np.copyto(inputs_host, data.ravel())

    cuda.memcpy_htod_async(inputs_device, inputs_host, stream)
    context.execute_async_v2(bindings=[int(inputs_device), int(outputs_device)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(outputs_host, outputs_device, stream)
    stream.synchronize()
    pred = outputs_host.reshape(context.get_binding_shape(1))
    pred = np.transpose(pred[0], (1, 0))                        # (1, 56, 8400) -> (8400, 56)
    pred = pred[pred[:, 4] > score_threshold]                   # confidence filtering
    bboxs = xywh2xyxy(pred)                                     # center + size to top-left, bottom-right
    bboxs = nms(bboxs, nms_threshold)                           # NMS
    bboxs = xyxy2xywh(np.array(bboxs))                          # top-left, bottom-right to top-left + width, height
    bboxs = scale_boxes(img.shape, bboxs, image.shape)          # map back to the original image
    for box in bboxs:
        det_bbox, det_scores, kpts = box[0:4], box[4], box[5:]
        cv2.rectangle(image, (int(det_bbox[0]), int(det_bbox[1])), (int(det_bbox[2]), int(det_bbox[3])), (0, 0, 255), 2)
        cv2.putText(image, "conf:{:.2f}".format(det_scores), (int(det_bbox[0]) + 5, int(det_bbox[1]) - 5), cv2.FONT_HERSHEY_DUPLEX, 0.8, (0, 0, 255), 1)
        plot_skeleton_kpts(image, kpts)
    cv2.imwrite('result.jpg', image)

C++ inference code:

#include <iostream>
#include <fstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>


const int input_width = 640;
const int input_height = 640;
const float score_threshold = 0.5;
const float nms_threshold = 0.5;
const int output_numprob = 56;  // 4 box values + 1 confidence + 17 keypoints * (x, y, conf)
const int output_numbox = input_width / 8 * input_height / 8 + input_width / 16 * input_height / 16 + input_width / 32 * input_height / 32;  // 80*80 + 40*40 + 20*20 = 8400
const int input_numel = 1 * 3 * input_width * input_height;
const int output_numel = 1 * output_numprob * output_numbox;
const std::unordered_multimap<int, int> skeletons = { {16, 14}, { 14, 12 }, {17, 15}, {15, 13}, {12, 13}, {6, 12},
	{7, 13}, {6, 7}, {6, 8}, {7, 9}, {8, 10}, {9, 11}, {2, 3}, {1, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}, {5, 7} };


inline const char* severity_string(nvinfer1::ILogger::Severity t)
{
	switch (t)
	{
	case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
	case nvinfer1::ILogger::Severity::kERROR:   return "error";
	case nvinfer1::ILogger::Severity::kWARNING: return "warning";
	case nvinfer1::ILogger::Severity::kINFO:    return "info";
	case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
	default: return "unknow";
	}
}


class TRTLogger : public nvinfer1::ILogger
{
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		if (severity <= Severity::kINFO)
		{
			if (severity == Severity::kWARNING)
				printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
			else if (severity <= Severity::kERROR)
				printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
			else
				printf("%s: %s\n", severity_string(severity), msg);
		}
	}
} logger;


std::vector<unsigned char> load_file(const std::string& file)
{
	std::ifstream in(file, std::ios::in | std::ios::binary);
	if (!in.is_open())
		return {};

	in.seekg(0, std::ios::end);
	size_t length = in.tellg();

	std::vector<uint8_t> data;
	if (length > 0)
	{
		in.seekg(0, std::ios::beg);
		data.resize(length);
		in.read((char*)&data[0], length);
	}
	in.close();
	return data;
}


// LetterBox preprocessing (resize and pad with gray borders)
void LetterBox(const cv::Mat& image, cv::Mat& outImage,
	const cv::Size& newShape = cv::Size(640, 640), const cv::Scalar& color = cv::Scalar(114, 114, 114))
{
	cv::Size shape = image.size();
	float r = std::min((float)newShape.height / (float)shape.height, (float)newShape.width / (float)shape.width);
	float ratio[2]{ r, r };
	int new_un_pad[2] = { (int)std::round((float)shape.width * r),(int)std::round((float)shape.height * r) };

	auto dw = (float)(newShape.width - new_un_pad[0]) / 2;
	auto dh = (float)(newShape.height - new_un_pad[1]) / 2;

	if (shape.width != new_un_pad[0] || shape.height != new_un_pad[1])
		cv::resize(image, outImage, cv::Size(new_un_pad[0], new_un_pad[1]));
	else
		outImage = image.clone();

	int top = int(std::round(dh - 0.1f));
	int bottom = int(std::round(dh + 0.1f));
	int left = int(std::round(dw - 0.1f));
	int right = int(std::round(dw + 0.1f));

	cv::Vec4d params;
	params[0] = ratio[0];
	params[1] = ratio[1];
	params[2] = left;
	params[3] = top;

	cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}


// Preprocessing
void pre_process(cv::Mat& image, float* input_data_host)
{
	cv::Mat letterbox;
	LetterBox(image, letterbox, cv::Size(input_width, input_height));
	letterbox.convertTo(letterbox, CV_32FC3, 1.0f / 255.0f);

	int image_area = letterbox.cols * letterbox.rows;
	float* pimage = (float*)letterbox.data;  // interleaved BGR pixels
	float* phost_r = input_data_host + image_area * 0;
	float* phost_g = input_data_host + image_area * 1;
	float* phost_b = input_data_host + image_area * 2;
	for (int i = 0; i < image_area; ++i, pimage += 3)
	{
		// split the interleaved BGR pixels into planar RGB
		*phost_b++ = pimage[0];
		*phost_g++ = pimage[1];
		*phost_r++ = pimage[2];
	}
}


// Network inference
void process(std::string model, float* input_data_host, float* output_data_host)
{
	TRTLogger logger;
	auto engine_data = load_file(model);
	auto runtime = nvinfer1::createInferRuntime(logger);
	auto engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());

	cudaStream_t stream = nullptr;
	cudaStreamCreate(&stream);
	auto execution_context = engine->createExecutionContext();

	float* input_data_device = nullptr;
	cudaMalloc(&input_data_device, sizeof(float) * input_numel);
	cudaMemcpyAsync(input_data_device, input_data_host, sizeof(float) * input_numel, cudaMemcpyHostToDevice, stream);

	float* output_data_device = nullptr;
	cudaMalloc(&output_data_device, sizeof(float) * output_numel);

	float* bindings[] = { input_data_device, output_data_device };
	execution_context->enqueueV2((void**)bindings, stream, nullptr);
	cudaMemcpyAsync(output_data_host, output_data_device, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	cudaStreamDestroy(stream);
	cudaFree(input_data_device);
	cudaFree(output_data_device);
	// release the TensorRT objects (plain delete is supported since TensorRT 8)
	delete execution_context;
	delete engine;
	delete runtime;
}


//NMS
void nms(std::vector<cv::Rect>& boxes, std::vector<float>& scores, float score_threshold, float nms_threshold, std::vector<int>& indices)
{
	assert(boxes.size() == scores.size());

	struct BoxScore
	{
		cv::Rect box;
		float score;
		int id;
	};
	std::vector<BoxScore> boxes_scores;
	for (size_t i = 0; i < boxes.size(); i++)
	{
		BoxScore box_conf;
		box_conf.box = boxes[i];
		box_conf.score = scores[i];
		box_conf.id = i;
		if (scores[i] > score_threshold)	boxes_scores.push_back(box_conf);
	}

	std::sort(boxes_scores.begin(), boxes_scores.end(), [](BoxScore a, BoxScore b) { return a.score > b.score; });

	std::vector<float> area(boxes_scores.size());
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		area[i] = boxes_scores[i].box.width * boxes_scores[i].box.height;
	}

	std::vector<bool> isSuppressed(boxes_scores.size(), false);
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		if (isSuppressed[i])  continue;
		for (size_t j = i + 1; j < boxes_scores.size(); ++j)
		{
			if (isSuppressed[j])  continue;

			float x1 = (std::max)(boxes_scores[i].box.x, boxes_scores[j].box.x);
			float y1 = (std::max)(boxes_scores[i].box.y, boxes_scores[j].box.y);
			float x2 = (std::min)(boxes_scores[i].box.x + boxes_scores[i].box.width, boxes_scores[j].box.x + boxes_scores[j].box.width);
			float y2 = (std::min)(boxes_scores[i].box.y + boxes_scores[i].box.height, boxes_scores[j].box.y + boxes_scores[j].box.height);
			float w = (std::max)(0.0f, x2 - x1);
			float h = (std::max)(0.0f, y2 - y1);
			float inter = w * h;
			float ovr = inter / (area[i] + area[j] - inter);

			if (ovr >= nms_threshold)  isSuppressed[j] = true;
		}
	}

	for (int i = 0; i < boxes_scores.size(); ++i)
	{
		if (!isSuppressed[i])	indices.push_back(boxes_scores[i].id);
	}
}


// Scale the box and keypoints back to the original image size
void scale_box(cv::Rect& box, std::vector<float>& keypoint, cv::Size size)
{
	float gain = std::min(input_width * 1.0 / size.width, input_height * 1.0 / size.height);
	int pad_w = (input_width - size.width * gain) / 2;
	int pad_h = (input_height - size.height * gain) / 2;
	box.x -= pad_w;
	box.y -= pad_h;
	box.x /= gain;
	box.y /= gain;
	box.width /= gain;
	box.height /= gain;

	for (size_t i = 0; i < keypoint.size() / 3; i++)
	{
		keypoint[3 * i] = (keypoint[3 * i] - pad_w) / gain;
		keypoint[3 * i + 1] = (keypoint[3 * i + 1] - pad_h) / gain;
	}
}


// Visualization
void draw_result(cv::Mat& image, std::string label, cv::Rect box, std::vector<float> keypoint)
{
	cv::rectangle(image, box, cv::Scalar(255, 0, 0), 1);
	int baseLine;
	cv::Size label_size = cv::getTextSize(label, 1, 1, 1, &baseLine);
	cv::Point tlc = cv::Point(box.x, box.y);
	cv::Point brc = cv::Point(box.x, box.y + label_size.height + baseLine);
	cv::putText(image, label, cv::Point(box.x, box.y), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 0, 255), 1);

	for (size_t i = 0; i < keypoint.size() / 3; i++)
	{
		if (keypoint[3 * i + 2] < score_threshold)
			continue;
		cv::circle(image, cv::Point(keypoint[3 * i], keypoint[3 * i + 1]), 4, cv::Scalar(0, 255, 0), -1);
	}

	for (auto skeleton : skeletons)
	{
		cv::Point pos1(keypoint[3 * (skeleton.first - 1)], keypoint[3 * (skeleton.first - 1) + 1]);
		cv::Point pos2(keypoint[3 * (skeleton.second - 1)], keypoint[3 * (skeleton.second - 1) + 1]);
		float conf1 = keypoint[3 * (skeleton.first - 1) + 2];
		float conf2 = keypoint[3 * (skeleton.second - 1) + 2];
		if (conf1 > score_threshold && conf2 > score_threshold)
		{
			cv::line(image, pos1, pos2, cv::Scalar(0, 255, 0), 2);
		}
	}
}


// Post-processing
void post_process(const cv::Mat image, cv::Mat& result, float* output_data_host)
{
	std::vector<cv::Rect> boxes;
	std::vector<float> scores;
	std::vector<int> class_ids;
	std::vector<std::vector<float>> keypoints;

	for (int i = 0; i < output_numbox; i++)
	{
		float score = (output_data_host + 4 * output_numbox + i)[0];
		if (score < score_threshold)
			continue;
		float x = (output_data_host + 0 * output_numbox + i)[0];
		float y = (output_data_host + 1 * output_numbox + i)[0];
		float w = (output_data_host + 2 * output_numbox + i)[0];
		float h = (output_data_host + 3 * output_numbox + i)[0];
		int left = int(x - 0.5 * w);
		int top = int(y - 0.5 * h);
		int width = int(w);
		int height = int(h);
		cv::Rect box = cv::Rect(left, top, width, height);
		std::vector<float> keypoint(51);
		for (size_t j = 0; j < keypoint.size(); j++)
		{
			keypoint[j] = (output_data_host + (5 + j) * output_numbox + i)[0];
		}
		scale_box(box, keypoint, image.size());
		boxes.push_back(box);
		scores.push_back(score);
		keypoints.push_back(keypoint);
	}

	std::vector<int> indices;
	nms(boxes, scores, score_threshold, nms_threshold, indices);
	for (int i = 0; i < indices.size(); i++)
	{
		int idx = indices[i];
		cv::Rect box = boxes[idx];
		std::string label = cv::format("%.2f", scores[idx]);
		std::vector<float> keypoint = keypoints[idx];
		draw_result(result, label, box, keypoint);
	}
}


int main(int argc, char* argv[])
{
	float* inputs = nullptr;
	float* outputs = nullptr;
	cudaMallocHost(&inputs, sizeof(float) * input_numel);
	cudaMallocHost(&outputs, sizeof(float) * output_numel);
	cv::Mat image = cv::imread("bus.jpg");
	pre_process(image, inputs);

	std::string model = "yolov8n-pose.engine";
	process(model, inputs, outputs);

	cv::Mat result = image.clone();
	post_process(image, result, outputs);

	cv::imwrite("result.jpg", result);
	cudaFreeHost(outputs);
	cudaFreeHost(inputs);
	return 0;
}

The visualized result:
[Figure: bus.jpg with detected boxes, confidences and skeleton overlays]
