一、源代码和相关CMakeLists.txt
(1) 头文件(Tensor.h)
#include <time.h>
#include <exception>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <NvInferRuntime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/core/utils/logger.hpp>
#include<opencv2/dnn/dnn.hpp>
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace cv;
using namespace std;
// Logging: ILogger implementation that writes TensorRT messages to stdout.
class Logger : public ILogger
{
    // Prints every message except info-level ones.
    virtual void log(Severity severity, const char* msg) noexcept override
    {
        const bool is_info = (severity == Severity::kINFO);
        if (is_info)
        {
            // suppress info-level messages
            return;
        }
        std::cout << msg << std::endl;
    }
} gLogger;
// Runs a serialized TensorRT detection engine on one test image.
// The call order used by main() is: construct, initialize(), Create_Buffer(),
// Padding_Resize(), then Pre_processing() / detect() / Post_processing().
//
// The object holds raw host and device resources, so it is not safe to copy.
class TensorRT_detect
{
public:
    // Constructor: loads the label file, the test image and the engine file.
    // The const char* arguments are stored as pointers (not copied), so the
    // strings they point to must outlive this object.
    TensorRT_detect(const char* model_path_engine, const char* image_path, const char* input_node_name, const char* output_node_name, string classesFile);
    // Queries the engine bindings and allocates the host-side buffers.
    void initialize();
    // Aspect-ratio-preserving resize of srcimg with constant padding.
    Mat Padding_Resize(Mat srcimg, int t_height, int t_width);
    // Overload that resizes the stored test image into dstimg.
    void Padding_Resize();
    // Destructor.
    ~TensorRT_detect();
    // Height / width of the image the model expects as input.
    int getinputh();
    int getinputw();
    // Creates the device-side input/output buffers and the CUDA stream.
    void Create_Buffer();
    // Copies the input to the GPU, runs inference and copies the output back.
    void detect();
    // Decodes the raw output, applies NMS and draws the detections on dstimg.
    void Post_processing();
    // Converts dstimg into the normalized planar float input tensor.
    void Pre_processing();
    Mat dstimg;                     // resized / padded image
    std::vector<float> input_data;  // host-side input tensor
    const char* image_path;         // path of the test image
private:
    const char* model_path_engine;  // path of the serialized engine file
    std::string classesFile;        // path of the label file
    const char* input_node_name;    // name of the model input binding
    const char* output_node_name;   // name of the model output binding
    std::vector<std::string> class_names;  // class labels, one per line of classesFile
    int num_ionode = 0;             // total number of input and output bindings
    Logger logger;
    // Every raw pointer and handle starts out null so that the destructor and
    // any error path never touch an indeterminate value.
    void** data_buffer = nullptr;                   // per-binding device buffer pointers
    nvinfer1::IRuntime* runtime = nullptr;          // runtime used to deserialize the engine
    nvinfer1::ICudaEngine* engine = nullptr;        // inference engine
    nvinfer1::IExecutionContext* context = nullptr; // execution context
    int input_node_index = -1;                      // binding index of the input node
    nvinfer1::Dims input_node_dim{};                // dimensions of the input binding
    size_t input_data_length = 0;                   // number of input elements
    int output_node_index = -1;                     // binding index of the output node
    nvinfer1::Dims output_node_dim{};               // dimensions of the output binding
    size_t output_data_length = 0;                  // number of output elements
    cudaStream_t stream = nullptr;                  // CUDA stream used for copies and inference
    float* result_array = nullptr;                  // host-side copy of the output
    Mat test_img;                                   // test image as loaded from image_path
};
(2) 代码文件(Tensor.cpp)
#include "Tensor.h"
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace cv;
using namespace std;
// Loads the class labels, the test image and the serialized engine, then
// creates the TensorRT runtime, engine and execution context.
//
// model_path_engine : path of the serialized engine file
// image_path        : path of the test image
// input_node_name   : name of the input binding
// output_node_name  : name of the output binding
// classesFile       : path of a text file with one class label per line
//
// Throws std::runtime_error when the engine file cannot be read or when
// TensorRT fails to create the runtime, engine or execution context.
TensorRT_detect::TensorRT_detect(const char* model_path_engine, const char* image_path, const char* input_node_name, const char* output_node_name, string classesFile):
    // Listed in member declaration order, which is the order they are initialized in.
    image_path(image_path), model_path_engine(model_path_engine), classesFile(classesFile), input_node_name(input_node_name), output_node_name(output_node_name)
{
    // Load the class labels, one per line.
    std::ifstream ifs(classesFile.c_str());
    std::string line;
    while (getline(ifs, line)) class_names.push_back(line);
    cout<<"标签的数量::::"<<class_names.size()<<endl;

    // Open the serialized engine. Going on after a failed open would make
    // tellg() return -1 and request an absurdly large allocation.
    std::ifstream file_ptr(model_path_engine, std::ios::binary);
    if (!file_ptr.good()) {
        std::cerr << "模型文件无法打开!" << std::endl;
        throw std::runtime_error("TensorRT_detect: cannot open engine file");
    }

    // Read the test image. A missing image is only reported here, because the
    // Padding_Resize(Mat, int, int) overload can still be used with other images.
    test_img = imread(image_path);
    if (test_img.empty()) {
        std::cerr << "Failed to read test image" << std::endl;
    }

    // Determine the file size and read the whole engine into memory.
    file_ptr.seekg(0, file_ptr.end);
    const std::streamoff end_pos = static_cast<std::streamoff>(file_ptr.tellg());
    file_ptr.seekg(0, file_ptr.beg);
    if (end_pos <= 0) {
        throw std::runtime_error("TensorRT_detect: engine file is empty or unreadable");
    }
    const size_t size = static_cast<size_t>(end_pos);
    // std::vector releases the blob on every exit path (the raw new[] leaked it).
    std::vector<char> model_stream(size);
    file_ptr.read(model_stream.data(), static_cast<std::streamsize>(size));
    if (!file_ptr) {
        throw std::runtime_error("TensorRT_detect: failed to read engine file");
    }
    file_ptr.close();

    // Initialize the engine. Objects created so far are released before
    // throwing, because the destructor does not run for a failed constructor.
    runtime = nvinfer1::createInferRuntime(logger);
    if (runtime == nullptr) {
        throw std::runtime_error("TensorRT_detect: createInferRuntime failed");
    }
    engine = runtime->deserializeCudaEngine(model_stream.data(), size);
    if (engine == nullptr) {
        delete runtime;
        runtime = nullptr;
        throw std::runtime_error("TensorRT_detect: deserializeCudaEngine failed");
    }
    context = engine->createExecutionContext();
    if (context == nullptr) {
        delete engine;
        engine = nullptr;
        delete runtime;
        runtime = nullptr;
        throw std::runtime_error("TensorRT_detect: createExecutionContext failed");
    }
}
void TensorRT_detect::initialize()
{
num_ionode = engine->getNbBindings();
data_buffer = new void*[num_ionode];
input_node_index = engine->getBindingIndex(input_node_name);
input_node_dim = engine->getBindingDimensions(input_node_index);
input_data_length = input_node_dim.d[1] * input_node_dim.d[2] * input_node_dim.d[3];
output_node_index = engine->getBindingIndex(output_node_name);
output_node_dim = engine->getBindingDimensions(output_node_index);
output_data_length = output_node_dim.d[1] * output_node_dim.d[2];
input_data.resize(input_data_length);
result_array = new float[output_data_length];
}
// Resizes srcimg to fit inside a t_width x t_height canvas while keeping its
// aspect ratio, then pads the remaining area with black (constant 0) borders.
//
// srcimg   : source image
// t_height : height of the output canvas in pixels
// t_width  : width of the output canvas in pixels
// Returns the padded image, or an empty Mat when srcimg is empty or the
// target size is not positive.
Mat TensorRT_detect::Padding_Resize(Mat srcimg, int t_height, int t_width)
{
    Mat padded;
    const int srch = srcimg.rows;
    const int srcw = srcimg.cols;
    if (srch <= 0 || srcw <= 0 || t_height <= 0 || t_width <= 0) {
        std::cerr << "Padding_Resize: empty image or invalid target size" << std::endl;
        return padded;
    }

    const float scale = static_cast<float>(srch) / srcw;
    int newh = 0;
    int neww = 0;
    if (scale > 1) {
        // Taller than wide: fill the height first.
        newh = t_height;
        neww = static_cast<int>(t_height / scale);
    }
    else {
        // Wider than tall, or square (the square case used to read
        // uninitialized newh/neww): fill the width first.
        neww = t_width;
        newh = static_cast<int>(t_width * scale);
    }
    // On a non-square canvas the first choice can overflow the other axis,
    // which would give copyMakeBorder a negative border; fit that axis instead.
    if (neww > t_width) {
        neww = t_width;
        newh = static_cast<int>(t_width * scale);
    }
    else if (newh > t_height) {
        newh = t_height;
        neww = static_cast<int>(t_height / scale);
    }
    // Extreme aspect ratios can truncate to 0, which resize() rejects.
    if (neww < 1) neww = 1;
    if (newh < 1) newh = 1;

    // The 4th and 5th arguments of cv::resize are fx/fy; the interpolation flag
    // is the 6th. Passing INTER_AREA as the 4th argument silently ignored it.
    Mat resized;
    resize(srcimg, resized, Size(neww, newh), 0, 0, INTER_AREA);

    const int top = (t_height - newh) / 2;
    const int left = (t_width - neww) / 2;
    copyMakeBorder(resized, padded, top, t_height - newh - top, left, t_width - neww - left, BORDER_CONSTANT, Scalar(0));
    return padded;
}
void TensorRT_detect::Padding_Resize()
{
int t_height = input_node_dim.d[3];
int t_width = input_node_dim.d[2];
//resize(test_img, dstimg, Size(t_width, t_height), INTER_AREA);
int srch = test_img.rows;
int srcw = test_img.cols;
int newh, neww;
float scale = (float)srch / srcw;
if(scale > 1){
newh = t_height;
neww = int(t_height / scale);
resize(test_img, dstimg, Size(neww, newh), INTER_AREA);
int left = int(t_width - neww)*0.5;
copyMakeBorder(dstimg, dstimg, 0, 0, left, t_width - neww - left, BORDER_CONSTANT, 0);
}
else if(scale < 1){
neww = t_width;
newh = (int)t_width * scale;
clock_t start_time1 = clock();
resize(test_img, dstimg, Size(neww, newh), INTER_AREA);
clock_t end_time1 = clock();
double exec_time1 = static_cast<double>(end_time1 - start_time1) / CLOCKS_PER_SEC;
std::cout << "REsize Execution time: " << exec_time1<< " seconds" << std::endl;
//resize(test_img, dstimg, Size(neww, newh), INTER_AREA);
int top = (t_height - newh) * 0.5;
clock_t start_time2 = clock();
copyMakeBorder(dstimg, dstimg, top, t_height - newh - top, 0, 0, BORDER_CONSTANT, 0);
clock_t end_time2 = clock();
double exec_time2 = static_cast<double>(end_time2 - start_time2) / CLOCKS_PER_SEC;
std::cout << "copyMakeBorder Execution time: " << exec_time2<< " seconds" << std::endl;
//copyMakeBorder(dstimg, dstimg, top, t_height - newh - top, 0, 0, BORDER_CONSTANT, 0);
}
else{
resize(test_img, dstimg, Size(neww, newh), INTER_AREA);
}
//imwrite("test.jpg",dstimg);
}
// Returns the width, in pixels, of the image the model expects.
int TensorRT_detect::getinputw()
{
    // Pre_processing() iterates axis 2 as image rows and axis 3 as image
    // columns, so the width is axis 3 (axis 2 was returned before).
    return input_node_dim.d[3];
}
// Returns the height, in pixels, of the image the model expects.
int TensorRT_detect::getinputh()
{
    // Pre_processing() iterates axis 2 as image rows and axis 3 as image
    // columns, so the height is axis 2 (axis 3 was returned before).
    return input_node_dim.d[2];
}
// Releases the host-side buffers and the TensorRT context, engine and runtime.
TensorRT_detect::~TensorRT_detect()
{
    // Let any copy or inference still queued on the GPU finish before the
    // memory it reads or writes is released.
    cudaDeviceSynchronize();

    // num_ionode starts at 0 and is only set by initialize(), which is also
    // what allocates these two arrays. Deleting them unconditionally was
    // undefined behavior when initialize() had never been called.
    if (num_ionode > 0) {
        delete []data_buffer;
        delete []result_array;
    }
    data_buffer = nullptr;
    result_array = nullptr;

    // The constructor always creates these three and they were never released.
    // Destroy them in reverse order of creation.
    delete context;
    context = nullptr;
    delete engine;
    engine = nullptr;
    delete runtime;
    runtime = nullptr;

    // NOTE(review): the device buffers and the CUDA stream created by
    // Create_Buffer() are still not released here, because the class does not
    // record whether Create_Buffer() ran to completion. Track that and call
    // cudaFree / cudaStreamDestroy accordingly.
}
void TensorRT_detect::Create_Buffer()
{
//创建输入缓存区
cudaError_t err1 = cudaMalloc(&(data_buffer[input_node_index]), input_data_length * sizeof(float));
if (err1 != cudaSuccess) {
std::cout << "Failed to allocate memory for input data: " << cudaGetErrorString(err1) << std::endl;
return;
}
//创建输出缓存区
cudaError_t err2 = cudaMalloc(&(data_buffer[output_node_index]), output_data_length * sizeof(float));
if (err1 != cudaSuccess) {
std::cout << "Failed to allocate memory for input data: " << cudaGetErrorString(err2) << std::endl;
return;
}
//创建其他三个维度的输出缓存区
nvinfer1::Dims output_node_dim1 = engine->getBindingDimensions(output_node_index-1);
size_t output_data_length1 = output_node_dim1.d[1] * output_node_dim1.d[2]* output_node_dim1.d[3]* output_node_dim1.d[4];
//std::cout << "output_data_length1" << output_data_length1 << std::endl;
cudaMalloc(&(data_buffer[output_node_index-1]), output_data_length1 * sizeof(float));
nvinfer1::Dims output_node_dim2 = engine->getBindingDimensions(output_node_index - 2);
size_t output_data_length2 = output_node_dim2.d[1] * output_node_dim2.d[2] * output_node_dim2.d[3] * output_node_dim2.d[4];
//std::cout << "output_data_length2" << output_data_length2 << std::endl;
cudaMalloc(&(data_buffer[output_node_index - 2]), output_data_length2 * sizeof(float));
nvinfer1::Dims output_node_dim3 = engine->getBindingDimensions(output_node_index - 3);
size_t output_data_length3 = output_node_dim3.d[1] * output_node_dim3.d[2] * output_node_dim3.d[3] * output_node_dim3.d[4];
//std::cout << "output_data_length3" << output_data_length3 << std::endl;
cudaMalloc(&(data_buffer[output_node_index - 3]), output_data_length3 * sizeof(float));
cudaStreamCreate(&stream);
}
void TensorRT_detect::Pre_processing()
{
int c1 = input_node_dim.d[1];
int row = input_node_dim.d[2];
int col = input_node_dim.d[3];
int temp = row * col;
for (int c = 0; c < c1; c++)
{
for (int i = 0; i < row; i++)
{
for (int j = 0; j < col; j++)
{
float pix = dstimg.ptr<uchar>(i)[j * 3 + 2 - c];//
input_data[c * temp + i * row + size_t(j)] = pix / 255.0;//
}
}
}
}
void TensorRT_detect::detect()
{
//将数据拷贝至显卡
cudaError_t err3 = cudaMemcpyAsync(data_buffer[input_node_index], input_data.data(), input_data_length * sizeof(float), cudaMemcpyHostToDevice, stream);
if (err3 != cudaSuccess) {
std::cout << "Failed to transfer input data to GPU1: " << cudaGetErrorString(err3) << std::endl;
return;
}
//进行推理
context->enqueueV2(data_buffer, stream, nullptr);
//将数据拷贝至主机
cudaError_t err4 = cudaMemcpyAsync(result_array, data_buffer[output_node_index], output_data_length * sizeof(float), cudaMemcpyDeviceToHost, stream);
if (err4 != cudaSuccess) {
std::cout << "Failed to transfer input data to HOST: " << cudaGetErrorString(err4) << std::endl;
return;
}
}
void TensorRT_detect::Post_processing()
{
std::vector<float> output(result_array, result_array + output_data_length);
std::vector<cv::Rect> boxes;
std::vector<float> confs;
std::vector<int> classIds;
//cout<<"输出矩阵的大小:"<<output.size()<<endl;
int numClasses = (int)output_node_dim.d[2] - 5;
float confThreshold = 0.5;
for (auto it = output.begin(); it != output.begin() + output_data_length; it += output_node_dim.d[2])
{
float clsConf = *(it + 4);//object scores
if (clsConf > confThreshold)
{
int centerX = (int)(*it);
int centerY = (int)(*(it + 1));
int width = (int)(*(it + 2));
int height = (int)(*(it + 3));
int x1 = centerX - width / 2;
int y1 = centerY - height / 2;
boxes.emplace_back(cv::Rect(x1, y1, width, height));
// first 5 element are x y w h and obj confidence
int bestClassId = -1;
float bestConf = 0.0;
for (int i = 5; i < numClasses + 5; i++)
{
if ((*(it + i)) > bestConf)
{
bestConf = it[i];
bestClassId = i - 5;
}
}
//confs.emplace_back(bestConf * clsConf);
confs.emplace_back(clsConf);
classIds.emplace_back(bestClassId);
}
}
//std::cout<<"11111111"<<std::endl;
float iouThreshold = 0.5;
std::vector<int> indices;
// Perform non maximum suppression to eliminate redundant overlapping boxes with
// lower confidences����ֵ����
cv::dnn::NMSBoxes(boxes, confs, confThreshold, iouThreshold, indices);
RNG rng((unsigned)time(NULL));
for (size_t i = 0; i < indices.size(); ++i)
{
int index = indices[i];
int colorR = rng.uniform(0, 255);
int colorG = rng.uniform(0, 255);
int colorB = rng.uniform(0, 255);
//
float scores = round(confs[index] * 100) / 100;
std::ostringstream oss;
oss << scores;
rectangle(dstimg, Point(boxes[index].tl().x, boxes[index].tl().y), Point(boxes[index].br().x, boxes[index].br().y), Scalar(colorR, colorG, colorB), 1.5);
putText(dstimg, class_names[classIds[index]] + " " + oss.str(), Point(boxes[index].tl().x, boxes[index].tl().y - 5), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(colorR, colorG, colorB), 2);
}
}
// Runs the detection pipeline 20 times on one image and prints, for each
// iteration, the pre-processing time and the total time of the iteration.
// Returns 0 on success and 1 when setup or processing throws.
int main()
{
    try
    {
        // Relative paths are resolved against the working directory of the run.
        TensorRT_detect TD("best.engine", "test.png", "images", "output", "best.txt");
        TD.initialize();
        TD.Create_Buffer();
        TD.Padding_Resize();
        for(int i = 0; i < 20; i++)
        {
            clock_t start_time = clock();
            TD.Pre_processing();
            clock_t end_time = clock();
            double exec_time = static_cast<double>(end_time - start_time) / CLOCKS_PER_SEC;
            std::cout << "Precessing Execution time: " << exec_time<< " seconds" << std::endl;
            TD.detect();
            TD.Post_processing();
            clock_t end_time1 = clock();
            double exec_time1 = static_cast<double>(end_time1 - start_time) / CLOCKS_PER_SEC;
            std::cout << "Execution time: " << exec_time1<< " seconds" << std::endl;
        }
    }
    catch (const std::exception& e)
    {
        // Report the cause rather than letting an uncaught exception (for
        // example from OpenCV or a failed allocation) terminate silently.
        std::cerr << "Fatal error: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}
(3)CMakeLists.txt文件
# Minimum required CMake version
cmake_minimum_required(VERSION 3.10)
# Project name
project(Tensor_Detect)
# Optimization level (plus debug symbols)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g")
# Target: executable
add_executable(Tensor_Detect "Tensor.cpp")
#add_library(Tensor_Detect SHARED "Tensor_dir/Tensor.cpp" "Tensor_dir/Tensor.h")
# Directory holding the project's own headers
target_include_directories(Tensor_Detect PUBLIC "Tensor")
#SET(HELLO_SO "/home/ywy/Tensor/libTensor_Detect.so")
# add OpenCV
find_package(OpenCV REQUIRED)
# This line prints the include directories (it used to be labelled "OPENCV_LIBRARIES").
message("OpenCV_INCLUDE_DIRS:${OpenCV_INCLUDE_DIRS}")
INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})
message("OpenCV_LIBRARIES:${OpenCV_LIBRARIES}")
# add TensorRT8.6.1
# The install location defaults to the original path and can be overridden
# with -DTENSORRT_ROOT=<path> without editing this file.
set(TENSORRT_ROOT "/home/ywy/TensorRT/TensorRT-8.6.1.6" CACHE PATH "TensorRT installation root")
target_include_directories(Tensor_Detect PRIVATE "${TENSORRT_ROOT}/include")
target_include_directories(Tensor_Detect PRIVATE "${TENSORRT_ROOT}/samples/common")
set(TENSORRT_LIB_PATH "${TENSORRT_ROOT}/lib")
file(GLOB LIBS "${TENSORRT_LIB_PATH}/*.so")
# Shared-library variant
#add_library(Tensor_Detect SHARED "Tensor_dir/Tensor.cpp" "Tensor_dir/Tensor.h")
# add CUDA 11.8
find_package(CUDA 11.8 REQUIRED)
message("CUDA_LIBRARIES:${CUDA_LIBRARIES}")
message("CUDA_INCLUDE_DIRS:${CUDA_INCLUDE_DIRS}")
target_include_directories(Tensor_Detect PRIVATE ${CUDA_INCLUDE_DIRS})
# link
target_link_libraries(Tensor_Detect ${LIBS} ${CUDA_LIBRARIES} ${OpenCV_LIBS})
二、遇到的相关问题
(1) 数据前处理耗时太长(三层 for 循环)。奇怪的是:把缩放图片的步骤放在主函数的 for 循环里面时,耗时约为 20 ms;放在 for 循环外面时,耗时约为 4 ms,原因尚不清楚。
(2) 安装 CUDA 驱动时总是报错,改用 run 文件的方式安装可以解决;此外,如果已经安装了 NVIDIA 驱动,安装时就不要再勾选驱动选项。
(3) TensorRT 的模型文件(engine)不能跨平台使用,并且与 CUDA、cuDNN 和 TensorRT 的版本严格对应;在不同环境下生成的 engine 文件可能无法正常使用。