Model Inference in C++ with TensorRT and Multithreading

C++ inference with TensorRT runs in a C/C++ environment and can be deployed on any edge device that provides one. The device needs an NVIDIA programmable graphics card (GPU), with CUDA and cuDNN installed for GPU support, plus TensorRT itself. TensorRT installation is covered in my other article: 推理引擎TensorRT安装与多线程推理(Python) (on CSDN).

For C++ multithreaded model inference, see another author's article on deploying a YOLOv8 object-detection task with TensorRT: https://zhuanlan.zhihu.com/p/681591561?utm_id=0, with code at https://github.com/cyberyang123/Learning-TensorRT

I installed CLion on Ubuntu and configure and run the project inside CLion.

Create a C++ Executable project in CLion; this is a CMake-based project type.

Configure CMake options

In the CMake options field of the project settings, add the following so that CMake can find the nvcc compiler:

-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc

If OpenCV is installed on the system and the project needs it, also add the following option:

-DOpenCV_DIR=/opt/software/build

Separate the two options with a single space, so the complete CMake options field reads:
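-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DOpenCV_DIR=/opt/software/build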

Configure CMakeLists.txt

My CMakeLists.txt contains the following:

cmake_minimum_required(VERSION 3.28)
project(Demo)

set(CMAKE_CXX_STANDARD 17)
set(CUDA_NVCC_FLAGS -g;-G) # c++: error: unrecognized command-line option '-G'
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)

set(OPENCV_INCLUDE /opt/software/opencv-4.x/include)
set(OPENCV_LIB /opt/software/opencv-4.x/lib)
set(TENSORRT_INCLUDE /opt/software/TensorRT-10.2.0.19/include)
set(TENSORRT_LIB /opt/software/TensorRT-10.2.0.19/lib)

include_directories(${TENSORRT_INCLUDE} ${OPENCV_INCLUDE} ${CUDA_TOOLKIT_ROOT_DIR}/include)

# Collect all OpenCV library files
file(GLOB OPENCV_LIBS ${OPENCV_LIB}/*.so)
# Collect the CUDA and TensorRT library files
file(GLOB CUDA_LIBS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/*.so)
file(GLOB TENSORRT_LIBS ${TENSORRT_LIB}/*.so)

add_executable(Demo main.cpp
        detect.cpp
        preprocessing.hpp
        yolov8_utils.cpp
)

target_link_libraries(Demo ${OPENCV_LIBS} ${TENSORRT_LIBS} ${CUDA_LIBS})

The rest of the code follows the GitHub repository above. Then add a run configuration of type CMake Application in the run settings and click the save button.

If the project has no compile errors, selecting this configuration should normally run successfully.

This runs a YOLOv8 object-detection model, drawing the detected boxes on the output image.

How can a thread pool and TensorRT be combined for multithreaded inference in C++? The relevant changes are shown below.

Add a ThreadPool.h file to the CLion project with the following code:

#pragma once

#include <iostream>
#include <atomic>
#include <thread>
#include <chrono>
#include <mutex>
#include <condition_variable>
#include <functional>
#include <vector>
#include <queue>

// Function that will be run inside the thread pool
using ThreadTask = std::function<void()>;

class ThreadPool
{
public:
    // If threads_num is 0, use as many threads as there are CPU cores
    // If tasks_num is -1, the number of queued tasks is unlimited
    ThreadPool(int threads_num = 0, int tasks_num = -1)
    {
        if(threads_num == 0)
        {
            max_threads = std::thread::hardware_concurrency();
        }
        else
        {
            max_threads = threads_num;
        }
        max_tasks = tasks_num;
        is_running = false;
    }

    ~ThreadPool()
    {
        WaitForStop();
    }

    bool AddTask(ThreadTask task)
    {
        // Scope for lock
        {
            std::unique_lock<std::mutex> lock(tasks_guard);
            if(max_tasks == -1)
            {
                // Unlimited
                tasks.push(task);
            }
            else if((int)tasks.size() >= max_tasks)
            {
                // Queue is full
                return false;
            }
            else
            {
                tasks.push(task);
            }
        }
        // Notify one waiting thread
        tasks_event.notify_one();
        return true;
    }

    bool Start()
    {
        if(is_running)
        {
            // Running already
            return false;
        }
        is_running = true;
        if(threads.empty())
        {
            CreateThreads();
        }
        return true;
    }

    void WaitForStop()
    {
        if(!is_running)
        {
            // Not running
            return;
        }
        is_running = false;
        tasks_event.notify_all();
        for(auto &t : threads)
        {
            // Wait for all threads to exit
            t.join();
        }
        threads.clear();
    }

private:
    void CreateThreads()
    {
        for(int i = 0; i < max_threads; i++)
        {
            threads.push_back(std::thread(&ThreadPool::ThreadRoutine, this));
        }
    }

    // Thread worker function: take a task from the queue and run it
    static void ThreadRoutine(ThreadPool *ptr)
    {
        if(ptr == nullptr)
        {
            return;
        }
        while(ptr->is_running || !ptr->tasks.empty())
        {
            ThreadTask task;
            // Scope for lock
            {
                std::unique_lock<std::mutex> lock(ptr->tasks_guard);
                // Wait until a task is ready or the pool is stopping; without
                // the stop condition in the predicate, a thread could sleep
                // forever when WaitForStop() is called with an empty queue
                ptr->tasks_event.wait(lock, [ptr] {
                    return !ptr->is_running || !ptr->tasks.empty();
                });
                if(ptr->tasks.empty())
                {
                    // Woken up for shutdown with no work left
                    continue;
                }
                // OK, now there is a task ready to run
                task = ptr->tasks.front();
                ptr->tasks.pop();
            }
            // Run it outside the lock
            task();
        }
    }

private:
    // Max threads allowed
    int max_threads;
    // Max tasks allowed inside the queue
    int max_tasks;
    // Worker threads
    std::vector<std::thread> threads;
    // Queue of pending tasks
    std::queue<ThreadTask> tasks;
    // Running flag, shared across threads
    std::atomic<bool> is_running;
    // Mutex protecting the tasks queue
    std::mutex tasks_guard;
    // Condition variable signalling new tasks or shutdown
    std::condition_variable tasks_event;
};
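Before wiring the pool into TensorRT, it can help to sanity-check it in isolation. Below is a minimal standalone sketch (the lambda body and task count are just illustrative):

#include <iostream>
#include "ThreadPool.h"

int main() {
    ThreadPool pool(2, -1);  // 2 worker threads, unlimited queue
    for (int i = 0; i < 4; i++) {
        pool.AddTask([i] { std::cout << "task " << i << " done\n"; });
    }
    pool.Start();        // workers begin draining the queue
    pool.WaitForStop();  // blocks until all queued tasks have run
    return 0;
}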

Then modify the original runtime.cu file; the modified code follows:

#include <stdio.h>
#include <math.h>
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <memory>
#include <functional>
#include <unistd.h>
#include <chrono>
#include <assert.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <NvInferRuntime.h>
#include <opencv2/opencv.hpp>
#include <filesystem>
#include "yolov8_utils.h"
#include "ThreadPool.h"

// This logger captures all warning messages but ignores informational ones
class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char *msg) noexcept override {
        // Suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

// Load the serialized engine file
std::vector<unsigned char> load_engine_file(const std::string &file_name) {
    std::vector<unsigned char> engine_data;
    std::ifstream engine_file(file_name, std::ios::binary);
    assert(engine_file.is_open() && "Unable to load engine file.");
    engine_file.seekg(0, engine_file.end);
    int length = engine_file.tellg();
    engine_data.resize(length);
    engine_file.seekg(0, engine_file.beg);
    engine_file.read(reinterpret_cast<char *>(engine_data.data()), length);
    return engine_data;
}

const int inputIndex = 0;
const int outputIndex = 1;

namespace fs = std::filesystem;

// Directory the program is running from
std::filesystem::path current_path = std::filesystem::current_path();

void processImage(std::mutex *task_protect, int i, void *buffers[2], cudaStream_t *stream,
                  std::unique_ptr<nvinfer1::IExecutionContext> *context) {
    // All tasks share one context, stream and buffer pair, so the whole
    // body must run under the lock (see the note after the code)
    std::lock_guard<std::mutex> guard(*task_protect);
    // Read the image
    cv::Mat img;
    std::cout << current_path;
    std::string file_name = (current_path.parent_path().string()) + ("/img/img" + std::to_string(i) + ".jpg");
    img = cv::imread(file_name);
    if (img.empty()) // img.empty() returns true when no data was read
    {
        std::cout << "Could not open or find the image" << std::endl;
        return;
    }
    cv::Mat LetterBoxImg;
    cv::Vec4d params;
    LetterBox(img, LetterBoxImg, params, cv::Size(640, 640));
    cv::Mat blob;
    // Build the blob from the letterboxed image so that `params` matches the input
    cv::dnn::blobFromImage(LetterBoxImg, blob, 1 / 255.0, cv::Size(640, 640), cv::Scalar(0, 0, 0), true, false, CV_32F);
    // Copy the image to the GPU
    cudaMemcpyAsync(buffers[inputIndex], blob.data, 3 * 640 * 640 * sizeof(float), cudaMemcpyHostToDevice, *stream);
    // Run inference
    if ((*context)->enqueueV3(*stream)) {
        std::cout << "enqueued successfully!" << std::endl;
    }
    cudaStreamSynchronize(*stream);
    float rst[1][84][8400];
    cudaMemcpyAsync(rst, buffers[outputIndex], 1 * 84 * 8400 * sizeof(float), cudaMemcpyDeviceToHost, *stream);
    // Wait for the copy to finish before reading rst on the host
    cudaStreamSynchronize(*stream);
    postprocess(rst, img, params);
}

int main(int argc, char **argv) {
    // Instantiate the ILogger
    Logger logger;
    std::unique_ptr<nvinfer1::IRuntime> runtime = std::unique_ptr<nvinfer1::IRuntime>(
            nvinfer1::createInferRuntime(logger));
    if (runtime == nullptr) { return -1; }
    std::string file_path = "/home/heyiqiu/PycharmProjects/ultralytics-main/yolov8n_engine_intro.trt";
    auto plan = load_engine_file(file_path);
    auto engine = std::unique_ptr<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(plan.data(), plan.size()));
    if (engine == nullptr) { return -1; }
    using namespace nvinfer1;
    std::unique_ptr<nvinfer1::IExecutionContext> context = std::unique_ptr<nvinfer1::IExecutionContext>(
            engine->createExecutionContext());
    if (context == nullptr) { return -1; }
    auto idims = engine->getTensorShape("images");
    auto odims = engine->getTensorShape("output0");
    nvinfer1::Dims4 inputDims = {idims.d[0], idims.d[1], idims.d[2], idims.d[3]};
    nvinfer1::Dims3 outputDims = {odims.d[0], odims.d[1], odims.d[2]}; // note: odims, not idims
    context->setInputShape("images", inputDims);
    void *buffers[2];
    cudaMalloc(&buffers[inputIndex], idims.d[0] * idims.d[1] * idims.d[2] * idims.d[3] * sizeof(float));
    cudaMalloc(&buffers[outputIndex], odims.d[0] * odims.d[1] * odims.d[2] * sizeof(float));
    // Bind the tensor addresses
    context->setTensorAddress("images", buffers[inputIndex]);
    context->setTensorAddress("output0", buffers[outputIndex]);
    // Create the CUDA stream
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    std::mutex protect_task;
    ThreadPool pool(3, -1);
    for (int i = 0; i < 2; i++) {
        pool.AddTask(std::bind(processImage, &protect_task, i, buffers, &stream, &context));
    }
    pool.Start();
    pool.WaitForStop();
    cudaStreamDestroy(stream);
    cudaFree(buffers[inputIndex]);
    cudaFree(buffers[outputIndex]);
    return 0;
}

In the processImage function above, the locking line below is essential. Without it, multiple threads would operate on the same shared state (the buffers, the CUDA stream, and the execution context) at the same time and produce wrong results; with the lock the output is as expected:

std::lock_guard<std::mutex> guard(*task_protect);
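Note that this lock serializes the actual inference, since every task shares a single execution context, stream, and buffer pair. A common alternative for true parallelism is to give each pool thread its own IExecutionContext, CUDA stream, and device buffers created from the one shared engine, so enqueueV3 needs no lock. Below is a sketch under the assumption of the same tensor names and shapes as above; it is not the original post's code, and the Worker struct and makeWorker helper are hypothetical:

#include <memory>
#include <cuda_runtime_api.h>
#include <NvInfer.h>

// Hypothetical per-thread resource bundle; one of these per pool thread
struct Worker {
    std::unique_ptr<nvinfer1::IExecutionContext> context;
    cudaStream_t stream;
    void *buffers[2];
};

// Build a Worker from the shared engine; TensorRT allows multiple
// execution contexts per engine, one per concurrent stream
Worker makeWorker(nvinfer1::ICudaEngine &engine) {
    Worker w;
    w.context.reset(engine.createExecutionContext());
    cudaStreamCreate(&w.stream);
    cudaMalloc(&w.buffers[0], 1 * 3 * 640 * 640 * sizeof(float)); // input "images"
    cudaMalloc(&w.buffers[1], 1 * 84 * 8400 * sizeof(float));     // output "output0"
    w.context->setTensorAddress("images", w.buffers[0]);
    w.context->setTensorAddress("output0", w.buffers[1]);
    return w;
}

Each task then checks out an idle Worker (for example from a small mutex-guarded free list), so the lock only protects the checkout, not the GPU work itself.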
