Inference with C++ and TensorRT runs in a C/C++ environment, so it can run on any edge device where that environment is set up. The device needs an NVIDIA programmable GPU, with CUDA and cuDNN installed for GPU support, plus TensorRT itself. The TensorRT installation steps are covered in another article of mine: Installing the TensorRT Inference Engine and Multi-threaded Inference (Python) (推理引擎TensorRT安装与多线程推理(Python)_tensorrt多个任务并行推理-CSDN博客)
For C++ multi-threaded model inference, see another author's article, "TensorRT部署yolov8目标检测任务" (Deploying a YOLOv8 object detection task with TensorRT), at https://zhuanlan.zhihu.com/p/681591561?utm_id=0, with code at https://github.com/cyberyang123/Learning-TensorRT
I installed CLion on Ubuntu and configured and ran the project there.
Create a C++ Executable project in CLion; this is a CMake-based project.
Configure CMake options:
In the CMake options field shown in the screenshot below, add this flag so CMake can find the nvcc compiler:
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
If OpenCV is installed on the system and is needed, also add the following option (note that the standard variable name is OpenCV_DIR):
-DOpenCV_DIR=/opt/software/build
Separate the two options with a space, as in the screenshot below:
Configure CMakeLists.txt:
My CMakeLists.txt is as follows:
cmake_minimum_required(VERSION 3.28)
project(Demo)
set(CMAKE_CXX_STANDARD 17)
set(CUDA_NVCC_FLAGS -g;-G) # nvcc debug flags; -G is nvcc-only (g++ fails with: c++: error: unrecognized command-line option '-G')
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)
set(OPENCV_INCLUDE /opt/software/opencv-4.x/include)
set(OPENCV_LIB /opt/software/opencv-4.x/lib)
set(TENSORRT_INCLUDE /opt/software/TensorRT-10.2.0.19/include)
set(TENSORRT_LIB /opt/software/TensorRT-10.2.0.19/lib)
include_directories(${TENSORRT_INCLUDE} ${OPENCV_INCLUDE} ${CUDA_TOOLKIT_ROOT_DIR}/include )
# Collect all OpenCV shared libraries
file(GLOB OPENCV_LIBS ${OPENCV_LIB}/*.so)
# Collect the CUDA and TensorRT shared libraries
file(GLOB CUDA_LIBS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/*.so)
file(GLOB TENSORRT_LIBS ${TENSORRT_LIB}/*.so)
add_executable(Demo main.cpp
detect.cpp
preprocessing.hpp
yolov8_utils.cpp
)
target_link_libraries(Demo ${OPENCV_LIBS} ${TENSORRT_LIBS} ${CUDA_LIBS})
The remaining code is similar to the GitHub repository above. Then, in the run settings, add a configuration of type CMake Application and click the save button.
If the project has no compilation errors, selecting this configuration should normally run successfully.
This is a YOLOv8 object detection model; the detection results are shown in the image below:
How can a thread pool be combined with TensorRT for multi-threaded inference in C++? The relevant changes are shown below:
Add a ThreadPool.h file in CLion with the following code:
#pragma once
#include <iostream>
#include <atomic>
#include <thread>
#include <chrono>
#include <mutex>
#include <condition_variable>
#include <functional>
#include <vector>
#include <queue>
// Function will be running inside thread pool
using ThreadTask = std::function<void()>;
class ThreadPool
{
public:
// If threads_num is 0, it will use the same number of CPU cores
// If tasks_num is -1, the number of tasks will be unlimited
ThreadPool(int threads_num = 0, int tasks_num = -1)
{
if(threads_num == 0)
{
max_threads = std::thread::hardware_concurrency();
}
else
{
max_threads = threads_num;
}
max_tasks = tasks_num;
is_running = false;
}
~ThreadPool()
{
WaitForStop();
}
bool AddTask(ThreadTask task)
{
// Scope for lock
{
std::unique_lock<std::mutex> lock(tasks_guard);
if(max_tasks == -1)
{
// Unlimited
tasks.push(task);
}
else
{
if(tasks.size() >= static_cast<size_t>(max_tasks))
{
return false;
}
else
{
tasks.push(task);
}
}
}
// Notify thread
tasks_event.notify_one();
return true;
}
bool Start()
{
if(is_running)
{
// Running already
return false;
}
is_running = true;
if(threads.empty())
{
CreateThreads();
}
return true;
}
void WaitForStop()
{
if(!is_running)
{
// I am not running
return;
}
is_running = false;
tasks_event.notify_all();
for(auto &t : threads)
{
// Wait for all threads to exit
t.join();
}
threads.clear();
}
private:
void CreateThreads()
{
for(int i = 0; i < max_threads; i++)
{
threads.push_back(std::thread(&ThreadPool::ThreadRoutine, this));
}
}
// Thread worker function
// Take task from queue, and run it
static void ThreadRoutine(ThreadPool* ptr)
{
if(ptr == nullptr)
{
return;
}
while(true) // loop until shutdown is signalled and the queue is drained
{
ThreadTask task;
// Scope for lock
{
// Get task to run
std::unique_lock<std::mutex> lock(ptr->tasks_guard);
// Wait until a task is ready or the pool is shutting down
while(ptr->tasks.empty() && ptr->is_running)
{
ptr->tasks_event.wait(lock);
}
if(ptr->tasks.empty())
{
// Woken for shutdown with no work left
return;
}
// OK, now there is a task ready to run
task = ptr->tasks.front();
ptr->tasks.pop();
}
// Run it
task();
}
}
private:
// Max threads allowed
int max_threads;
// Max tasks inside queue
int max_tasks;
// Vector of threads
std::vector<std::thread> threads;
// Queue of tasks
std::queue<ThreadTask> tasks;
// Flag of running status (atomic because it is read and written by multiple threads)
std::atomic<bool> is_running;
// Mutex to protect the tasks queue
std::mutex tasks_guard;
// Condition of tasks event
std::condition_variable tasks_event;
};
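To sanity-check the pool in isolation, here is a minimal usage sketch (the accumulator name total is illustrative, not from the original project): tasks are queued first, Start() spins up the worker threads, and WaitForStop() drains the queue before joining them.
#include "ThreadPool.h"
#include <atomic>
#include <iostream>
int main()
{
std::atomic<int> total{0};
ThreadPool pool(4, -1); // 4 worker threads, unlimited task queue
for(int i = 0; i < 100; i++)
{
pool.AddTask([&total, i]() { total += i; });
}
pool.Start();
pool.WaitForStop(); // runs all queued tasks, then joins the workers
std::cout << "sum = " << total << std::endl; // expected: 4950
return 0;
}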
Then modify the original runtime.cu file; the modified code is as follows:
#include <stdio.h>
#include <math.h>
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <memory>
#include <functional>
#include <unistd.h>
#include <chrono>
#include <assert.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <NvInferRuntime.h>
#include <opencv2/opencv.hpp>
#include "yolov8_utils.h"
#include <filesystem>
#include "ThreadPool.h"
// The following example captures all warning messages but ignores informational messages
class Logger : public nvinfer1::ILogger {
void log(Severity severity, const char *msg) noexcept override {
// Suppress info-level messages
if (severity <= Severity::kWARNING)
std::cout << msg << std::endl;
}
};
// Load the serialized engine file
std::vector<unsigned char> load_engine_file(const std::string &file_name) {
std::vector<unsigned char> engine_data;
std::ifstream engine_file(file_name, std::ios::binary);
assert(engine_file.is_open() && "Unable to load engine file.");
engine_file.seekg(0, engine_file.end);
int length = engine_file.tellg();
engine_data.resize(length);
engine_file.seekg(0, engine_file.beg);
engine_file.read(reinterpret_cast<char *>(engine_data.data()), length);
return engine_data;
}
const int inputIndex = 0;
const int outputIndex = 1;
namespace fs = std::filesystem;
// Get the current working directory
std::filesystem::path current_path = std::filesystem::current_path();
void processImage(std::mutex *task_protect, int i, void *buffers[2], cudaStream_t *stream, std::unique_ptr<nvinfer1::IExecutionContext> *context) {
std::lock_guard<std::mutex> guard(*task_protect);
// Read the image
cv::Mat img;
std::cout << current_path;
std::string file_name =(current_path.parent_path().string())+ ("/img/img"+std::to_string(i)+".jpg");
img = cv::imread(file_name);
if (img.empty()) // check whether the image contains data; img.empty() returns true when it does not
{
std::cout << "Could not open or find the image" << std::endl;
return;
}
cv::Mat LetterBoxImg;
cv::Vec4d params;
LetterBox(img, LetterBoxImg, params, cv::Size(640, 640));
cv::Mat blob;
// Use the letterboxed image here so that the transform described by params matches the input actually fed to the network
cv::dnn::blobFromImage(LetterBoxImg, blob, 1 / 255.0, cv::Size(640, 640), cv::Scalar(0, 0, 0), true, false, CV_32F);
// Copy the image to the GPU
cudaMemcpyAsync(buffers[inputIndex], blob.data, 3 * 640 * 640 * sizeof(float), cudaMemcpyHostToDevice, *stream);
// Run inference
if ((*context)->enqueueV3(*stream)) {
std::cout << "enqueued successfully!" << std::endl;
}
float rst[1][84][8400];
cudaMemcpyAsync(&rst, buffers[outputIndex], 1 * 84 * 8400 * sizeof(float), cudaMemcpyDeviceToHost, *stream);
// Synchronize after the device-to-host copy so rst is fully populated before postprocessing
cudaStreamSynchronize(*stream);
postprocess(rst, img, params);
}
#include "ThreadPool.h"
int main(int argc, char **argv) {
// Instantiate the ILogger
Logger logger;
std::unique_ptr<nvinfer1::IRuntime> runtime = std::unique_ptr<nvinfer1::IRuntime>(
nvinfer1::createInferRuntime(logger));
if (runtime == nullptr) { return -1; }
std::string file_path = "/home/heyiqiu/PycharmProjects/ultralytics-main/yolov8n_engine_intro.trt";
auto plan = load_engine_file(file_path);
auto engine = std::unique_ptr<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(plan.data(), plan.size()));
if (engine == nullptr) { return -1; }
using namespace nvinfer1;
std::unique_ptr<nvinfer1::IExecutionContext> context = std::unique_ptr<nvinfer1::IExecutionContext>(engine->createExecutionContext());
if (context == nullptr) { return -1; }
auto idims = engine->getTensorShape("images");
auto odims = engine->getTensorShape("output0");
nvinfer1::Dims4 inputDims = {idims.d[0], idims.d[1], idims.d[2], idims.d[3]};
nvinfer1::Dims3 outputDims = {odims.d[0], odims.d[1], odims.d[2]};
context->setInputShape("images", inputDims);
void *buffers[2];
cudaMalloc(&buffers[inputIndex], idims.d[0] * idims.d[1] * idims.d[2] * idims.d[3] * sizeof(float));
cudaMalloc(&buffers[outputIndex], odims.d[0] * odims.d[1] * odims.d[2] * sizeof(float));
// Set the tensor addresses
context->setTensorAddress("images", buffers[inputIndex]);
context->setTensorAddress("output0", buffers[outputIndex]);
// Create a CUDA stream
cudaStream_t stream;
cudaStreamCreate(&stream);
std::mutex protect_task;
ThreadPool pool(3, -1);
for (int i = 0; i < 2; i++) {
int j=i;
pool.AddTask(std::bind(processImage, &protect_task, j, buffers, &stream, &context));
}
pool.Start();
pool.WaitForStop();
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
}
In the processImage function above, the locking line below is important. Without it, multiple threads would operate on the same buffers, stream, and execution context at the same time and the results would not be as expected; with the lock, the results are correct:
std::lock_guard<std::mutex> guard(*task_protect);
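Note that this coarse-grained lock serializes the inference itself, so the pool's threads only overlap on work outside the lock. As a possible improvement, here is a minimal sketch, assuming enough GPU memory: TensorRT allows one ICudaEngine to create several IExecutionContexts, so each worker thread can own its own context, stream, and buffers, and no lock around enqueueV3 is needed. The WorkerResources struct is hypothetical; the buffer sizes and tensor names mirror the code above, and processImage would take the per-thread resources as parameters instead of the shared ones:
// Hypothetical per-thread resources; a sketch, not the original code above
struct WorkerResources {
std::unique_ptr<nvinfer1::IExecutionContext> context;
cudaStream_t stream;
void *buffers[2];
};
// One context/stream/buffer set per worker thread
std::vector<WorkerResources> workers(3);
for (auto &w : workers) {
w.context.reset(engine->createExecutionContext());
cudaStreamCreate(&w.stream);
cudaMalloc(&w.buffers[inputIndex], 1 * 3 * 640 * 640 * sizeof(float));
cudaMalloc(&w.buffers[outputIndex], 1 * 84 * 8400 * sizeof(float));
w.context->setTensorAddress("images", w.buffers[inputIndex]);
w.context->setTensorAddress("output0", w.buffers[outputIndex]);
}
// Each task then uses one WorkerResources entry exclusively, so the
// std::lock_guard around the whole of processImage is no longer required.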