Python
import os
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
class HostDeviceMem(object):
    """Pair a host-side buffer with the device-side buffer that mirrors it.

    Attributes are plain and writable:
        host:   the host buffer object handed to the constructor.
        device: the device buffer object handed to the constructor.
    """

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        # Same layout as before: a label line followed by each buffer's str().
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

    def __repr__(self):
        return self.__str__()
def onnx_to_engine(onnx_file_path, engine_file_path, precision_type=None):
    """Build a TensorRT engine from an ONNX file and write it to disk.

    Args:
        onnx_file_path: path of the ONNX model to parse.
        engine_file_path: path the serialized engine is written to.
        precision_type: ``'fp16'`` enables the FP16 builder flag; any other
            value leaves the builder at its default and prints a warning.

    Returns:
        None. On a parse or build failure an error is printed and the
        function returns without creating or touching ``engine_file_path``.
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)

    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            # Building from a network that failed to parse cannot succeed.
            return

    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30
    if precision_type == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    else:
        print('WARNING: FP32 is used by default.')

    profile = builder.create_optimization_profile()
    config.add_optimization_profile(profile)

    engine = builder.build_engine(network, config)
    if engine is None:
        print('ERROR: Failed to build the TensorRT engine.')
        return

    # Serialize before opening the output file so that a failure here never
    # leaves an empty engine file behind (the caller treats the file's
    # existence as "engine is ready").
    serialized_engine = engine.serialize()
    with open(engine_file_path, 'wb') as f:
        f.write(serialized_engine)
def readClassesNames(file_path):
    """Read class labels from a UTF-8 text file, one label per line.

    Args:
        file_path: path of the label file.

    Returns:
        A list with one entry per line of the file, in file order, each with
        leading and trailing whitespace removed. Blank lines are kept as
        empty strings so that list positions still match line numbers.
    """
    with open(file_path, encoding='utf-8') as f:
        return [line.strip() for line in f]
# Detection thresholds. The identifier spellings are kept as-is because the
# inference code further down refers to these exact names.
conf_thresold = 0.25    # passed to NMSBoxes as its score threshold
iou_threshold = 0.45    # passed to NMSBoxes as its NMS threshold
score_thresold = 0.25   # candidates scoring at or below this are dropped

# Input files, resolved relative to the current working directory.
classes_names = 'coco.names'
onnx_path = 'yolov5s.onnx'
engine_path = 'yolov5s.engine'
image_path = "bus.jpg"

classes = readClassesNames(classes_names)

image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports an unreadable or missing file by returning None;
    # fail here with a clear message instead of on `.shape` below.
    raise FileNotFoundError(f"Could not read image: {image_path}")
image_height, image_width = image.shape[:2]

TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
def xywh2xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2).

    Works on any array whose last axis has at least four entries; the input
    is not modified.
    """
    y = np.copy(x)
    y[..., 0] = x[..., 0] - x[..., 2] / 2
    y[..., 1] = x[..., 1] - x[..., 3] / 2
    y[..., 2] = x[..., 0] + x[..., 2] / 2
    y[..., 3] = x[..., 1] + x[..., 3] / 2
    return y


if os.path.exists(engine_path):
    # ---- Load the serialized engine --------------------------------------
    with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    if engine is None:
        raise RuntimeError(f"Failed to deserialize TensorRT engine: {engine_path}")

    input_index = engine.get_binding_index('images')
    input_shape = engine.get_binding_shape(input_index)
    # The input binding is indexed as (N, C, H, W): height comes before width.
    input_height, input_width = input_shape[2:]

    # ---- Pre-process: resize, BGR->RGB, scale to [0, 1], HWC -> NCHW -----
    resized = cv2.resize(image, (input_width, input_height))
    # cv2.imread yields BGR; the C++ path in this project swaps to RGB
    # (blobFromImage swapRB=true), so do the same here.
    resized = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    input_image = (resized / 255.0).transpose(2, 0, 1)
    input_tensor = np.ascontiguousarray(input_image[np.newaxis], dtype=np.float32)

    start_time = cv2.getTickCount()

    # ---- Allocate one host/device buffer pair per binding ----------------
    inputs_alloc_buf = []
    outputs_alloc_buf = []
    bindings_alloc_buf = []
    output_shapes = []
    stream_alloc_buf = cuda.Stream()
    context = engine.create_execution_context()

    for binding in engine:
        # Each buffer uses its own binding's dtype.
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        if engine.binding_is_input(binding):
            size = input_tensor.size
        else:
            shape = tuple(engine.get_binding_shape(binding))
            output_shapes.append(shape)
            size = trt.volume(shape)
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings_alloc_buf.append(int(device_mem))
        pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs_alloc_buf.append(pair)
        else:
            outputs_alloc_buf.append(pair)

    # ---- Run inference ----------------------------------------------------
    # Copy INTO the page-locked buffer rather than replacing it, so the
    # asynchronous copy below really reads from pinned memory.
    np.copyto(inputs_alloc_buf[0].host, input_tensor.ravel())
    for inp in inputs_alloc_buf:
        cuda.memcpy_htod_async(inp.device, inp.host, stream_alloc_buf)
    context.set_binding_shape(input_index, input_tensor.shape)
    # The network is created with EXPLICIT_BATCH, so use the v2 call (no
    # batch_size argument), matching enqueueV2 in the C++ version.
    context.execute_async_v2(bindings=bindings_alloc_buf,
                             stream_handle=stream_alloc_buf.handle)
    for out in outputs_alloc_buf:
        cuda.memcpy_dtoh_async(out.host, out.device, stream_alloc_buf)
    stream_alloc_buf.synchronize()

    # ---- Post-process -----------------------------------------------------
    # Rows are candidates; the column count is taken from the output
    # binding's last dimension instead of a hard-coded 85.
    predictions = outputs_alloc_buf[0].host.reshape(-1, output_shapes[0][-1])
    scores = predictions[:, 4]  # column 4 is used as the detection score
    keep_mask = scores > score_thresold
    predictions = predictions[keep_mask]
    scores = scores[keep_mask]
    class_ids = np.argmax(predictions[:, 5:], axis=1)

    # Rescale (cx, cy, w, h) from network-input pixels to source-image pixels.
    scale = np.array([image_width / input_width, image_height / input_height] * 2,
                     dtype=np.float32)
    boxes_cxcywh = predictions[:, :4].astype(np.float32) * scale
    boxes_xyxy = xywh2xyxy(boxes_cxcywh)
    # NMSBoxes takes (x, y, w, h) with x, y the TOP-LEFT corner.
    boxes_tlwh = np.concatenate([boxes_xyxy[:, :2], boxes_cxcywh[:, 2:]], axis=1)

    if len(boxes_tlwh):
        indices = cv2.dnn.NMSBoxes(boxes_tlwh.tolist(), scores.tolist(),
                                   score_threshold=conf_thresold,
                                   nms_threshold=iou_threshold)
    else:
        indices = []
    # Flatten so both flat and column-shaped index results (and the empty
    # case) can be used for fancy indexing.
    keep = np.asarray(indices, dtype=np.int64).reshape(-1)

    for bbox, label in zip(boxes_xyxy[keep], class_ids[keep]):
        bbox = bbox.round().astype(np.int32).tolist()
        cls_id = int(label)
        cls = classes[cls_id]
        cv2.rectangle(image, tuple(bbox[:2]), tuple(bbox[2:]), (0, 0, 255), 2, 8)
        cv2.rectangle(image, (bbox[0], (bbox[1] - 20)), (bbox[2], bbox[1]), (0, 255, 255), -1)
        cv2.putText(image, f'{cls}', (bbox[0], bbox[1] - 5),
                    cv2.FONT_HERSHEY_PLAIN, 2, [225, 0, 0], thickness=2)

    end_time = cv2.getTickCount()
    t = (end_time - start_time) / cv2.getTickFrequency()
    fps = 1 / t
    print(f"EStimated FPS: {fps:.2f}")
    cv2.putText(image, 'FPS: {:.2f}'.format(fps), (20, 40), cv2.FONT_HERSHEY_PLAIN, 2, [225, 0, 0], 2, 8)
    cv2.imshow("Python + Tensorrt + Yolov5 推理结果", image)
    cv2.waitKey(0)
else:
    # No engine on disk yet: build one from the ONNX model. Inference is not
    # run in this pass; run the script again once the engine file exists.
    onnx_to_engine(onnx_path, engine_path, 'fp16')
Cpp
#include <windows.h>

#include <algorithm>
#include <cassert>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <opencv2/opencv.hpp>
// Reads class labels from a text file, one label per line.
//
// path_name: path of the label file.
// Returns the lines in file order. A trailing '\r' (left over when a CRLF
// file is read without newline translation) is removed from each line.
// Empty lines are kept so that indices still match line numbers.
// Throws std::runtime_error if the file cannot be opened; the previous
// assert() disappeared under NDEBUG and then silently returned an empty list.
std::vector<std::string> read_class_names(std::string path_name)
{
    std::ifstream infile(path_name.c_str());
    if (!infile.is_open()) {
        throw std::runtime_error("could not open class names file: " + path_name);
    }

    std::vector<std::string> class_names;
    for (std::string line; std::getline(infile, line);) {
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        class_names.push_back(line);
    }
    return class_names;
}
class Logger : public nvinfer1::ILogger
{
void log(Severity severity, const char* message) noexcept
{
if (severity != Severity::kINFO)
std::cout << message << std::endl;
}
} gLogger;
// Builds a TensorRT engine from an ONNX model and writes the serialized
// engine to disk.
//
// onnx_file_path:   ONNX model to parse.
// engine_file_path: destination of the serialized engine.
// type:             "fp16" sets the FP16 builder flag; anything else leaves
//                   the builder at its default and prints a warning.
//
// On any failure an error is printed and the function returns without
// writing the engine. Every TensorRT object created here is destroyed on
// every exit path.
void onnx_to_engine(std::string onnx_file_path, std::string engine_file_path, std::string type) {
    nvinfer1::IBuilder* builder = nullptr;
    nvinfer1::INetworkDefinition* network = nullptr;
    nvonnxparser::IParser* parser = nullptr;
    nvinfer1::IBuilderConfig* config = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IHostMemory* model_stream = nullptr;

    // Single cleanup routine so early returns cannot leak. The parser is
    // destroyed before the network it was created from, the builder last.
    auto release_all = [&]() {
        if (model_stream) model_stream->destroy();
        if (engine) engine->destroy();
        if (parser) parser->destroy();
        if (network) network->destroy();
        if (config) config->destroy();
        if (builder) builder->destroy();
    };

    builder = nvinfer1::createInferBuilder(gLogger);
    if (!builder) {
        std::cerr << "could not create TensorRT builder" << std::endl;
        return;
    }

    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    network = builder->createNetworkV2(explicitBatch);
    if (!network) {
        std::cerr << "could not create TensorRT network" << std::endl;
        release_all();
        return;
    }

    parser = nvonnxparser::createParser(*network, gLogger);
    if (!parser) {
        std::cerr << "could not create ONNX parser" << std::endl;
        release_all();
        return;
    }

    // Verbosity 2 is the integer value the original call passed.
    const bool parsed = parser->parseFromFile(onnx_file_path.c_str(), 2);
    for (int i = 0; i < parser->getNbErrors(); ++i)
    {
        std::cout << "load error: " << parser->getError(i)->desc() << std::endl;
    }
    if (!parsed) {
        std::cerr << "could not parse ONNX file: " << onnx_file_path << std::endl;
        release_all();
        return;
    }
    std::cout << "tensorRT load mask onnx model successfully!!!...\n";

    config = builder->createBuilderConfig();
    if (!config) {
        std::cerr << "could not create builder config" << std::endl;
        release_all();
        return;
    }
    config->setMaxWorkspaceSize(1 << 30);
    if (type == "fp16") {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
    }
    else {
        std::cout << "WARNING: FP32 is used by default." << std::endl;
    }

    engine = builder->buildEngineWithConfig(*network, *config);
    if (!engine) {
        std::cerr << "could not build TensorRT engine" << std::endl;
        release_all();
        return;
    }

    model_stream = engine->serialize();
    if (!model_stream) {
        std::cerr << "could not serialize TensorRT engine" << std::endl;
        release_all();
        return;
    }

    // The output file is opened only once there is something to write, so a
    // failed build never leaves an empty engine file behind.
    std::cout << "try to save engine file now~~~" << std::endl;
    std::ofstream file_ptr(engine_file_path, std::ios::binary);
    if (!file_ptr) {
        std::cerr << "could not open plan output file" << std::endl;
        release_all();
        return;
    }
    file_ptr.write(reinterpret_cast<const char*>(model_stream->data()), model_stream->size());
    const bool written = file_ptr.good();
    file_ptr.close();

    release_all();

    if (!written) {
        std::cerr << "could not write plan output file" << std::endl;
        return;
    }
    std::cout << "convert onnx model to TensorRT engine model successfully!" << std::endl;
}
int main()
{
const char* model_path_onnx = "yolov5s.onnx";
const char* model_path_engine = "yolov5s.engine";
const char* image_path = "bus.jpg";
std::string label_path = "coco.names";
const char* input_node_name = "images";
const char* output_node_name = "output0";
int num_ionode = 2;
std::vector<std::string> class_names;
float factor;
std::ifstream f(model_path_engine);
bool engine_file_exist = f.good();
std::ifstream file_ptr(model_path_engine, std::ios::binary);
if (engine_file_exist) {
size_t size = 0;
file_ptr.seekg(0, file_ptr.end);
size = file_ptr.tellg();
file_ptr.seekg(0, file_ptr.beg);
char* model_stream = new char[size];
file_ptr.read(model_stream, size);
file_ptr.close();
Logger logger;
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(model_stream, size);
nvinfer1::IExecutionContext* context = engine->createExecutionContext();
delete[] model_stream;
void** data_buffer = new void* [num_ionode];
int input_node_index = engine->getBindingIndex(input_node_name);
nvinfer1::Dims input_node_dim = engine->getBindingDimensions(input_node_index);
size_t input_data_length = input_node_dim.d[1] * input_node_dim.d[2] * input_node_dim.d[3];
cudaMalloc(&(data_buffer[input_node_index]), input_data_length * sizeof(float));
int output_node_index = engine->getBindingIndex(output_node_name);
nvinfer1::Dims output_node_dim = engine->getBindingDimensions(output_node_index);
size_t output_data_length = output_node_dim.d[1] * output_node_dim.d[2];
cudaMalloc(&(data_buffer[output_node_index]), output_data_length * sizeof(float));
cv::Mat image = cv::imread(image_path);
int max_side_length = std::max(image.cols, image.rows);
cv::Mat max_image = cv::Mat::zeros(cv::Size(max_side_length, max_side_length), CV_8UC3);
cv::Rect roi(0, 0, image.cols, image.rows);
image.copyTo(max_image(roi));
cv::Size input_node_shape(input_node_dim.d[2], input_node_dim.d[3]);
int64 start = cv::getTickCount();
cv::Mat BN_image = cv::dnn::blobFromImage(max_image, 1 / 255.0, input_node_shape, cv::Scalar(0, 0, 0), true, false);
cudaStream_t stream;
cudaStreamCreate(&stream);
std::vector<float> input_data(input_data_length);
memcpy(input_data.data(), BN_image.ptr<float>(), input_data_length * sizeof(float));
cudaMemcpyAsync(data_buffer[input_node_index], input_data.data(), input_data_length * sizeof(float), cudaMemcpyHostToDevice, stream);
context->enqueueV2(data_buffer, stream, nullptr);
float* result_array = new float[output_data_length];
cudaMemcpyAsync(result_array, data_buffer[output_node_index], output_data_length * sizeof(float), cudaMemcpyDeviceToHost, stream);
factor = max_side_length / (float)input_node_dim.d[2];
class_names = read_class_names(label_path);
cv::Mat det_output = cv::Mat(25200, 85, CV_32F, result_array);
std::vector<cv::Rect> position_boxes;
std::vector<int> classIds;
std::vector<float> confidences;
std::cout << det_output.rows << std::endl;
for (int i = 0; i < det_output.rows; i++) {
float confidence = det_output.at<float>(i, 4);
if (confidence < 0.2) {
continue;
}
std::cout << "confidence" << confidence << std::endl;
cv::Mat classes_scores = det_output.row(i).colRange(5, 85);
cv::Point classIdPoint;
double score;
minMaxLoc(classes_scores, 0, &score, 0, &classIdPoint);
if (score > 0.25)
{
float cx = det_output.at<float>(i, 0);
float cy = det_output.at<float>(i, 1);
float ow = det_output.at<float>(i, 2);
float oh = det_output.at<float>(i, 3);
int x = static_cast<int>((cx - 0.5 * ow) * factor);
int y = static_cast<int>((cy - 0.5 * oh) * factor);
int width = static_cast<int>(ow * factor);
int height = static_cast<int>(oh * factor);
cv::Rect box;
box.x = x;
box.y = y;
box.width = width;
box.height = height;
position_boxes.push_back(box);
classIds.push_back(classIdPoint.x);
confidences.push_back(score);
}
}
std::vector<int> indexes;
cv::dnn::NMSBoxes(position_boxes, confidences, 0.25, 0.45, indexes);
for (size_t i = 0; i < indexes.size(); i++) {
int index = indexes[i];
int idx = classIds[index];
cv::rectangle(image, position_boxes[index], cv::Scalar(0, 0, 255), 2, 8);
cv::rectangle(image, cv::Point(position_boxes[index].tl().x, position_boxes[index].tl().y - 20),
cv::Point(position_boxes[index].br().x, position_boxes[index].tl().y), cv::Scalar(0, 255, 255), -1);
cv::putText(image, class_names[idx], cv::Point(position_boxes[index].tl().x, position_boxes[index].tl().y), cv::FONT_HERSHEY_PLAIN, 2.0, cv::Scalar(255, 0, 0), 2, 8);
}
float t = (cv::getTickCount() - start) / static_cast<float>(cv::getTickFrequency());
cv::putText(image, cv::format("FPS: %.2f", 1 / t), cv::Point(20, 40), cv::FONT_HERSHEY_PLAIN, 2.0, cv::Scalar(255, 0, 0), 2, 8);
cv::imshow("C++ + Tensorrt + Yolov5 推理结果", image);
cv::waitKey(0);
}
else
{
onnx_to_engine(model_path_onnx, model_path_engine, "fp16");
}
return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.18)
project(Yolov5)

# Machine-specific location of the prebuilt OpenCV package; adjust locally.
set(OpenCV_DIR "E:\\Opencv\\opencv_vs\\build")
set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include)
set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc16\\lib)
# Link the debug import library only in Debug configurations and the release
# one in all others. Listing both unconditionally put both on every
# configuration's link line.
set(OpenCV_LIBS debug "opencv_world480d.lib" optimized "opencv_world480.lib")

set(CMAKE_CUDA_ARCHITECTURES 86)
find_package(CUDA REQUIRED)
enable_language(CUDA)

# NOTE(review): if an OpenCV package config is found under OpenCV_DIR, this
# call presumably replaces the OpenCV_* values set above; confirm which path
# is actually taken on the build machine.
find_package(OpenCV QUIET)

include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${OpenCV_INCLUDE_DIRS})
link_directories(${OpenCV_LIB_DIRS})

add_executable(Yolov5 ${PROJECT_SOURCE_DIR}/Yolov5.cpp)

# NOTE(review): no TensorRT include or library directory is configured in
# this file, so these names must be resolvable through the toolchain's
# default search paths; confirm.
target_link_libraries(Yolov5 "nvinfer.lib" "nvinfer_plugin.lib" "nvonnxparser.lib")
target_link_libraries(Yolov5 ${OpenCV_LIBS})
target_link_libraries(Yolov5 ${CUDA_LIBRARIES})