使用 TensorRT 推理 YOLOv5

朱永博

已于 2024-05-29 21:23:13 修改

阅读量467

点赞数 4

文章标签： YOLO

于 2024-05-21 22:37:12 首次发布

本文链接：https://blog.csdn.net/qq_49595983/article/details/139104210

版权

Python

import os
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
class HostDeviceMem(object):
    """Pair a host-side buffer with the device-side buffer that mirrors it."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Both members are rendered with str(), one per labelled section.
        return f"Host:\n{self.host!s}\nDevice:\n{self.device!s}"

    def __repr__(self):
        # repr() intentionally mirrors str().
        return str(self)
def onnx_to_engine(onnx_file_path, engine_file_path, precision_type=None):
    """Build a TensorRT engine from an ONNX model and write it to disk.

    Args:
        onnx_file_path: Path of the ONNX model to parse.
        engine_file_path: Path the serialized engine is written to.
        precision_type: ``'fp16'`` sets the FP16 builder flag; any other
            value leaves the builder flags untouched and prints a warning.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed, or if the builder
            does not return an engine.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)

    with open(onnx_file_path, 'rb') as model:
        parsed = parser.parse(model.read())
    if not parsed:
        print('ERROR: Failed to parse the ONNX file.')
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        # Building from a network that failed to parse cannot succeed, so
        # stop here instead of continuing into the builder.
        raise RuntimeError(f'Failed to parse ONNX file: {onnx_file_path}')

    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1 GiB
    if precision_type == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    else:
        print('WARNING: FP32 is used by default.')

    # No shape ranges are set on this profile.
    # NOTE(review): that is presumably only adequate when the ONNX model has
    # static input shapes — confirm for dynamically shaped exports.
    profile = builder.create_optimization_profile()
    config.add_optimization_profile(profile)

    engine = builder.build_engine(network, config)
    if engine is None:
        # Check before opening the output file: opening it first would leave
        # an empty engine file behind that later looks like a valid cache.
        raise RuntimeError(f'Failed to build a TensorRT engine from: {onnx_file_path}')

    with open(engine_file_path, 'wb') as f:
        f.write(engine.serialize())
def readClassesNames(file_path):
    """Return every line of a UTF-8 text file with surrounding whitespace removed.

    Lines are returned in file order; blank lines are kept as empty strings.
    """
    with open(file_path, encoding='utf-8') as names_file:
        return [line.strip() for line in names_file]
# Detection thresholds: score_thresold filters raw predictions, while
# conf_thresold and iou_threshold are handed to the NMS step.
conf_thresold = 0.25
iou_threshold = 0.45
score_thresold = 0.25

# Input files, resolved relative to the current working directory.
classes_names = 'coco.names'
onnx_path = 'yolov5s.onnx'
engine_path = 'yolov5s.engine'

classes = readClassesNames(classes_names)

image = cv2.imread("bus.jpg")
if image is None:
    # cv2.imread signals an unreadable or missing file by returning None;
    # fail here with a clear message instead of on the attribute access below.
    raise FileNotFoundError('Could not read input image: bus.jpg')
image_height, image_width = image.shape[:2]

TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
if os.path.exists(engine_path):
    # --- Load the serialized engine written by an earlier conversion run. ---
    with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    if engine is None:
        raise RuntimeError(f'Failed to deserialize TensorRT engine: {engine_path}')

    input_index = engine.get_binding_index('images')
    input_shape = engine.get_binding_shape(input_index)
    # The tensor fed to this binding is built below as
    # (batch, channel, height, width), so the trailing two dimensions are
    # height first, then width.
    input_height, input_width = input_shape[2:]

    # --- Preprocess: resize, BGR -> RGB, scale to [0, 1], HWC -> NCHW. ---
    resized = cv2.resize(image, (input_width, input_height))
    # cv2.imread yields BGR; convert to RGB, the channel order YOLOv5 expects.
    resized = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    input_image = resized / 255.0
    input_image = input_image.transpose(2, 0, 1)
    input_tensor = np.ascontiguousarray(input_image[np.newaxis, :, :, :], dtype=np.float32)

    start_time = cv2.getTickCount()

    # --- Allocate one host/device buffer pair per engine binding. ---
    inputs_alloc_buf = []
    outputs_alloc_buf = []
    bindings_alloc_buf = []
    output_shapes = []
    stream_alloc_buf = cuda.Stream()
    context = engine.create_execution_context()
    for binding in engine:
        # Every buffer uses the dtype of its own binding.
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        is_input = engine.binding_is_input(binding)
        if is_input:
            size = int(input_tensor.size)
        else:
            binding_shape = engine.get_binding_shape(binding)
            output_shapes.append(binding_shape)
            size = trt.volume(binding_shape[1:]) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings_alloc_buf.append(int(device_mem))
        target = inputs_alloc_buf if is_input else outputs_alloc_buf
        target.append(HostDeviceMem(host_mem, device_mem))

    # Copy INTO the page-locked buffer; rebinding `.host` to the numpy array
    # would discard the pinned allocation that the async copy relies on.
    np.copyto(inputs_alloc_buf[0].host, input_tensor.ravel())

    # --- Run inference on the stream. ---
    for inp in inputs_alloc_buf:
        cuda.memcpy_htod_async(inp.device, inp.host, stream_alloc_buf)
    context.set_binding_shape(input_index, input_tensor.shape)
    # The engine is built with EXPLICIT_BATCH, so use the v2 entry point,
    # which takes no separate batch size.
    context.execute_async_v2(bindings=bindings_alloc_buf, stream_handle=stream_alloc_buf.handle)
    for out in outputs_alloc_buf:
        cuda.memcpy_dtoh_async(out.host, out.device, stream_alloc_buf)
    stream_alloc_buf.synchronize()
    net_output = [out.host for out in outputs_alloc_buf]

    # --- Postprocess. ---
    # Each prediction row is read below as [cx, cy, w, h, objectness, class
    # scores...]; the row length comes from the output binding, not a constant.
    first_output_shape = output_shapes[0]
    row_length = first_output_shape[len(first_output_shape) - 1]
    predictions = net_output[0].reshape(-1, row_length)

    scores = predictions[:, 4]
    keep = scores > score_thresold
    predictions = predictions[keep, :]
    scores = scores[keep]
    class_ids = np.argmax(predictions[:, 5:], axis=1)

    # Rescale boxes from network-input pixels to original-image pixels.
    boxes = predictions[:, :4]
    input_shape = np.array([input_width, input_height, input_width, input_height])
    boxes = np.divide(boxes, input_shape, dtype=np.float32)
    boxes *= np.array([image_width, image_height, image_width, image_height])
    boxes = boxes.astype(np.int32)

    # cv2.dnn.NMSBoxes expects (x, y, w, h) with (x, y) the top-left corner,
    # whereas `boxes` holds centre coordinates; shift before running NMS.
    nms_boxes = boxes.copy()
    nms_boxes[:, 0] -= boxes[:, 2] // 2
    nms_boxes[:, 1] -= boxes[:, 3] // 2
    indices = cv2.dnn.NMSBoxes(nms_boxes.tolist(), scores.tolist(),
                               score_threshold=conf_thresold, nms_threshold=iou_threshold)
    # Depending on the OpenCV version the result is flat, Nx1, or an empty
    # tuple; normalise it to a flat integer index array.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1)

    detections = []

    def xywh2xyxy(x):
        """Convert (cx, cy, w, h) rows into (x1, y1, x2, y2) corner rows."""
        y = np.copy(x)
        y[..., 0] = x[..., 0] - x[..., 2] / 2
        y[..., 1] = x[..., 1] - x[..., 3] / 2
        y[..., 2] = x[..., 0] + x[..., 2] / 2
        y[..., 3] = x[..., 1] + x[..., 3] / 2
        return y

    # --- Draw the surviving detections on the original image. ---
    for (bbox, score, label) in zip(xywh2xyxy(boxes[indices]), scores[indices], class_ids[indices]):
        bbox = bbox.round().astype(np.int32).tolist()
        cls_id = int(label)
        cls = classes[cls_id]
        cv2.rectangle(image, tuple(bbox[:2]), tuple(bbox[2:]), (0, 0, 255), 2, 8)
        # Filled strip above the box used as the label background.
        cv2.rectangle(image, (bbox[0], (bbox[1] - 20)), (bbox[2], bbox[1]), (0, 255, 255), -1)
        cv2.putText(image, f'{cls}', (bbox[0], bbox[1] - 5),
                    cv2.FONT_HERSHEY_PLAIN, 2, [225, 0, 0], thickness=2)

    # The measured interval covers buffer allocation, inference and drawing.
    end_time = cv2.getTickCount()
    t = (end_time - start_time) / cv2.getTickFrequency()
    fps = 1 / t
    print(f"EStimated FPS: {fps:.2f}")
    cv2.putText(image, 'FPS: {:.2f}'.format(fps), (20, 40), cv2.FONT_HERSHEY_PLAIN, 2, [225, 0, 0], 2, 8)
    cv2.imshow("Python + Tensorrt + Yolov5 推理结果", image)
    cv2.waitKey(0)
else:
    # No cached engine yet: build one from the ONNX model. The script has to
    # be run again afterwards to perform inference.
    onnx_to_engine(onnx_path, engine_path, 'fp16')

Cpp

#include <windows.h>
#include <fstream>
#include <iostream>
#include <vector>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <opencv2/opencv.hpp>
// Reads a label file and returns one class name per line, in file order.
//
// A trailing '\r' is removed from every line so that files with CRLF line
// endings yield the same names as LF files.
//
// Debug builds still assert that the file could be opened. When assertions
// are compiled out, the failure is reported on std::cerr and an empty vector
// is returned instead of silently reading from a closed stream.
std::vector<std::string> read_class_names(std::string path_name)
{
	std::vector<std::string> class_names;
	std::ifstream infile(path_name);
	assert(infile.is_open());
	if (!infile.is_open()) {
		std::cerr << "could not open class names file: " << path_name << std::endl;
		return class_names;
	}
	std::string line;
	while (std::getline(infile, line)) {
		if (!line.empty() && line.back() == '\r') {
			line.pop_back();
		}
		class_names.push_back(line);
	}
	return class_names;
}
// TensorRT logger that prints warnings and errors to std::cout.
//
// Severity values are ordered from most to least severe, so comparing against
// kWARNING drops both kINFO and kVERBOSE messages; the previous `!= kINFO`
// test let every verbose message through.
class Logger : public nvinfer1::ILogger
{
	void log(Severity severity, const char* message) noexcept override
	{
		if (severity <= Severity::kWARNING)
			std::cout << message << std::endl;
	}
} gLogger;
// Builds a TensorRT engine from an ONNX model and writes the serialized
// engine to `engine_file_path`.
//
//   onnx_file_path   - ONNX model to parse.
//   engine_file_path - destination of the serialized engine.
//   type             - "fp16" sets the FP16 builder flag; any other value
//                      leaves the builder flags untouched.
//
// Every failure (object creation, parsing, building, serializing, opening the
// output file) is reported on std::cerr and the function returns without
// writing anything. All TensorRT objects created here are released on every
// exit path.
void onnx_to_engine(std::string onnx_file_path, std::string engine_file_path, std::string type) {
	nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
	nvinfer1::INetworkDefinition* network = nullptr;
	nvonnxparser::IParser* parser = nullptr;
	nvinfer1::IBuilderConfig* config = nullptr;
	nvinfer1::ICudaEngine* engine = nullptr;
	nvinfer1::IHostMemory* model_stream = nullptr;

	// Releases whatever has been created so far, newest object first.
	auto release_all = [&]() {
		if (model_stream) model_stream->destroy();
		if (engine) engine->destroy();
		if (config) config->destroy();
		if (parser) parser->destroy();
		if (network) network->destroy();
		if (builder) builder->destroy();
	};

	if (!builder) {
		std::cerr << "could not create TensorRT builder" << std::endl;
		return;
	}
	const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
	network = builder->createNetworkV2(explicitBatch);
	if (!network) {
		std::cerr << "could not create TensorRT network definition" << std::endl;
		release_all();
		return;
	}
	parser = nvonnxparser::createParser(*network, gLogger);
	if (!parser) {
		std::cerr << "could not create ONNX parser" << std::endl;
		release_all();
		return;
	}

	// The second argument is the parser verbosity (2 == warning level).
	const bool parsed = parser->parseFromFile(onnx_file_path.c_str(), 2);
	for (int i = 0; i < parser->getNbErrors(); ++i)
	{
		std::cout << "load error: " << parser->getError(i)->desc() << std::endl;
	}
	if (!parsed) {
		// Do not announce success or try to build from a network that failed to parse.
		std::cerr << "could not parse onnx model: " << onnx_file_path << std::endl;
		release_all();
		return;
	}
	printf("tensorRT load mask onnx model successfully!!!...\n");

	config = builder->createBuilderConfig();
	if (!config) {
		std::cerr << "could not create TensorRT builder config" << std::endl;
		release_all();
		return;
	}
	config->setMaxWorkspaceSize(1 << 30);	// 1 GiB
	if (type == "fp16") {
		config->setFlag(nvinfer1::BuilderFlag::kFP16);
	}
	else {
		std::cout << "WARNING: FP32 is used by default." << std::endl;
	}

	engine = builder->buildEngineWithConfig(*network, *config);
	if (!engine) {
		std::cerr << "could not build TensorRT engine" << std::endl;
		release_all();
		return;
	}
	model_stream = engine->serialize();
	if (!model_stream) {
		std::cerr << "could not serialize TensorRT engine" << std::endl;
		release_all();
		return;
	}

	// Open the output only once there is something to write, so a failed
	// build never leaves an empty engine file behind.
	std::cout << "try to save engine file now~~~" << std::endl;
	std::ofstream file_ptr(engine_file_path, std::ios::binary);
	if (!file_ptr) {
		std::cerr << "could not open plan output file" << std::endl;
		release_all();
		return;
	}
	file_ptr.write(reinterpret_cast<const char*>(model_stream->data()), model_stream->size());
	release_all();
	std::cout << "convert onnx model to TensorRT engine model successfully!" << std::endl;
}
int main() 
{
	const char* model_path_onnx = "yolov5s.onnx";
	const char* model_path_engine = "yolov5s.engine";
	const char* image_path = "bus.jpg";
	std::string label_path = "coco.names";
	const char* input_node_name = "images";
	const char* output_node_name = "output0";
	int num_ionode = 2;
	std::vector<std::string> class_names;
	float factor;
	std::ifstream f(model_path_engine);
	bool engine_file_exist = f.good();
	std::ifstream file_ptr(model_path_engine, std::ios::binary);
	if (engine_file_exist) {
		size_t size = 0;
		file_ptr.seekg(0, file_ptr.end);	
		size = file_ptr.tellg();	
		file_ptr.seekg(0, file_ptr.beg);	
		char* model_stream = new char[size];
		file_ptr.read(model_stream, size);
		file_ptr.close();
		Logger logger;
		nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
		nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(model_stream, size);
		nvinfer1::IExecutionContext* context = engine->createExecutionContext();
		delete[] model_stream;
		void** data_buffer = new void* [num_ionode];
		int input_node_index = engine->getBindingIndex(input_node_name);
		nvinfer1::Dims input_node_dim = engine->getBindingDimensions(input_node_index);
		size_t input_data_length = input_node_dim.d[1] * input_node_dim.d[2] * input_node_dim.d[3];
		cudaMalloc(&(data_buffer[input_node_index]), input_data_length * sizeof(float));
		int output_node_index = engine->getBindingIndex(output_node_name);
		nvinfer1::Dims output_node_dim = engine->getBindingDimensions(output_node_index);
		size_t output_data_length = output_node_dim.d[1] * output_node_dim.d[2];
		cudaMalloc(&(data_buffer[output_node_index]), output_data_length * sizeof(float));
		cv::Mat image = cv::imread(image_path);
		int max_side_length = std::max(image.cols, image.rows);
		cv::Mat max_image = cv::Mat::zeros(cv::Size(max_side_length, max_side_length), CV_8UC3);
		cv::Rect roi(0, 0, image.cols, image.rows);
		image.copyTo(max_image(roi));
		cv::Size input_node_shape(input_node_dim.d[2], input_node_dim.d[3]);
		int64 start = cv::getTickCount();
		cv::Mat BN_image = cv::dnn::blobFromImage(max_image, 1 / 255.0, input_node_shape, cv::Scalar(0, 0, 0), true, false);
		cudaStream_t stream;
		cudaStreamCreate(&stream);
		std::vector<float> input_data(input_data_length);
		memcpy(input_data.data(), BN_image.ptr<float>(), input_data_length * sizeof(float));
		cudaMemcpyAsync(data_buffer[input_node_index], input_data.data(), input_data_length * sizeof(float), cudaMemcpyHostToDevice, stream);
		context->enqueueV2(data_buffer, stream, nullptr);
		float* result_array = new float[output_data_length];
		cudaMemcpyAsync(result_array, data_buffer[output_node_index], output_data_length * sizeof(float), cudaMemcpyDeviceToHost, stream);
		factor = max_side_length / (float)input_node_dim.d[2];
		class_names = read_class_names(label_path);
		cv::Mat det_output = cv::Mat(25200, 85, CV_32F, result_array);
		std::vector<cv::Rect> position_boxes;
		std::vector<int> classIds;
		std::vector<float> confidences;
		std::cout << det_output.rows << std::endl;
		for (int i = 0; i < det_output.rows; i++) {
			float confidence = det_output.at<float>(i, 4);
			if (confidence < 0.2) {
				continue;
			}
			std::cout << "confidence" << confidence << std::endl;
			cv::Mat classes_scores = det_output.row(i).colRange(5, 85);
			cv::Point classIdPoint;
			double score;
			minMaxLoc(classes_scores, 0, &score, 0, &classIdPoint);
			if (score > 0.25)
			{
				float cx = det_output.at<float>(i, 0);
				float cy = det_output.at<float>(i, 1);
				float ow = det_output.at<float>(i, 2);
				float oh = det_output.at<float>(i, 3);
				int x = static_cast<int>((cx - 0.5 * ow) * factor);
				int y = static_cast<int>((cy - 0.5 * oh) * factor);
				int width = static_cast<int>(ow * factor);
				int height = static_cast<int>(oh * factor);
				cv::Rect box;
				box.x = x;
				box.y = y;
				box.width = width;
				box.height = height;
				position_boxes.push_back(box);
				classIds.push_back(classIdPoint.x);
				confidences.push_back(score);
			}
		}
		std::vector<int> indexes;
		cv::dnn::NMSBoxes(position_boxes, confidences, 0.25, 0.45, indexes);
		for (size_t i = 0; i < indexes.size(); i++) {
			int index = indexes[i];
			int idx = classIds[index];
			cv::rectangle(image, position_boxes[index], cv::Scalar(0, 0, 255), 2, 8);
			cv::rectangle(image, cv::Point(position_boxes[index].tl().x, position_boxes[index].tl().y - 20),
				cv::Point(position_boxes[index].br().x, position_boxes[index].tl().y), cv::Scalar(0, 255, 255), -1);
			cv::putText(image, class_names[idx], cv::Point(position_boxes[index].tl().x, position_boxes[index].tl().y), cv::FONT_HERSHEY_PLAIN, 2.0, cv::Scalar(255, 0, 0), 2, 8);
		}
		float t = (cv::getTickCount() - start) / static_cast<float>(cv::getTickFrequency());
	   	cv::putText(image, cv::format("FPS: %.2f", 1 / t), cv::Point(20, 40), cv::FONT_HERSHEY_PLAIN, 2.0, cv::Scalar(255, 0, 0), 2, 8);
		cv::imshow("C++ + Tensorrt + Yolov5 推理结果", image);
		cv::waitKey(0);
	}
	else
	{
		onnx_to_engine(model_path_onnx, model_path_engine, "fp16");
	}
	return 0;
}

CMakeLists.txt

cmake_minimum_required(VERSION 3.18)
project(Yolov5)

# Machine-specific OpenCV location; adjust to the local installation.
set(OpenCV_DIR "E:\\Opencv\\opencv_vs\\build")
set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include)
set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc16\\lib)
# Link the debug library only in debug configurations and the release library
# otherwise; listing both unconditionally links both into every configuration.
set(OpenCV_LIBS debug "opencv_world480d.lib" optimized "opencv_world480.lib")

# Target GPU architecture (compute capability 8.6); change for other GPUs.
set(CMAKE_CUDA_ARCHITECTURES 86)
find_package(CUDA REQUIRED)
enable_language(CUDA)

# NOTE(review): when this succeeds it presumably replaces the OpenCV_*
# variables set above with the values from OpenCV's own package
# configuration — confirm which set is actually used on the target machine.
find_package(OpenCV QUIET)

include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${OpenCV_INCLUDE_DIRS})
link_directories(${OpenCV_LIB_DIRS})

add_executable(Yolov5 ${PROJECT_SOURCE_DIR}/Yolov5.cpp)

# NOTE(review): no TensorRT include or library directory is added in this
# file; the TensorRT headers and .lib files are presumably reachable through
# the CUDA or default search paths — confirm.
target_link_libraries(Yolov5 "nvinfer.lib" "nvinfer_plugin.lib" "nvonnxparser.lib")
target_link_libraries(Yolov5 ${OpenCV_LIBS})
target_link_libraries(Yolov5 ${CUDA_LIBRARIES})