Inference with C++ and TensorRT runs in a C/C++ environment, so it can run on any edge device where that environment is set up. The device needs an NVIDIA programmable GPU, with CUDA and cuDNN installed for GPU support, plus TensorRT itself. The TensorRT installation steps are covered in another article of mine: Installing the TensorRT Inference Engine and Multi-threaded Inference (Python) (推理引擎TensorRT安装与多线程推理(Python)_tensorrt多个任务并行推理-CSDN博客)
For C++ multi-threaded model inference, see another author's article, "TensorRT部署yolov8目标检测任务" (Deploying a YOLOv8 object detection task with TensorRT), at https://zhuanlan.zhihu.com/p/681591561?utm_id=0, with code at https://github.com/cyberyang123/Learning-TensorRT
I installed CLion on Ubuntu and configured and ran the project there.
Create a C++ Executable project in CLion; this is a CMake-based project.
Configure CMake options:
In the CMake options field shown in the screenshot below, add this flag so CMake can find the nvcc compiler:
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
If OpenCV is installed on the system and is needed, also add the following option (note that the standard variable name is OpenCV_DIR):
-DOpenCV_DIR=/opt/software/build
Separate the two options with a space, as in the screenshot below:
Configure CMakeLists.txt:
My CMakeLists.txt is as follows:
cmake_minimum_required(VERSION 3.28)
project(Demo)
set(CMAKE_CXX_STANDARD 17)
set(CUDA_NVCC_FLAGS -g;-G) # nvcc debug flags; -G is nvcc-only (g++ fails with: c++: error: unrecognized command-line option '-G')
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)
set(OPENCV_INCLUDE /opt/software/opencv-4.x/include)
set(OPENCV_LIB /opt/software/opencv-4.x/lib)
set(TENSORRT_INCLUDE /opt/software/TensorRT-10.2.0.19/include)
set(TENSORRT_LIB /opt/software/TensorRT-10.2.0.19/lib)
include_directories(${TENSORRT_INCLUDE} ${OPENCV_INCLUDE} ${CUDA_TOOLKIT_ROOT_DIR}/include )
# Collect all OpenCV shared libraries
file(GLOB OPENCV_LIBS ${OPENCV_LIB}/*.so)
# Collect the CUDA and TensorRT shared libraries
file(GLOB CUDA_LIBS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/*.so)
file(GLOB TENSORRT_LIBS ${TENSORRT_LIB}/*.so)
add_executable(Demo main.cpp
detect.cpp
preprocessing.hpp
yolov8_utils.cpp
)
target_link_libraries(Demo ${OPENCV_LIBS} ${TENSORRT_LIBS} ${CUDA_LIBS})
The remaining code is similar to the GitHub repository above. Then, in the run settings, add a configuration of type CMake Application and click the save button.
If the project has no compilation errors, selecting this configuration should normally run successfully.
This is a YOLOv8 object detection model; the detection results are shown in the image below:
How can a thread pool be combined with TensorRT for multi-threaded inference in C++? The relevant changes are shown below:
Add a ThreadPool.h file in CLion with the following code:
#pragma once
#include <iostream>
#include <atomic>
#include <thread>
#include <chrono>
#include <mutex>
#include <condition_variable>
#include <functional>
#include <vector>
#include <queue>
// Function will be running inside thread pool
using ThreadTask = std::function<void()>;
class ThreadPool
{
public:
// If threads_num is 0, it will use the same number of CPU cores
// If tasks_num is -1, the number of tasks will be unlimited
ThreadPool(int threads_num = 0, int tasks_num = -1)
{
if(threads_num == 0)
{
max_threads = std::thread::hardware_concurrency();
}
else
{
max_threads = threads_num;
}
max_tasks = tasks_num;
is_running = false;
}
~ThreadPool()
{
WaitForStop();
}
bool AddTask(ThreadTask task)
{
// Scope for lock
{
std::unique_lock<std::mutex> lock(tasks_guard);
if(max_tasks == -1)
{
// Unlimited
tasks.push(task);
}
else
{
if(tasks.size() >= static_cast<size_t>(max_tasks))
{
return false;
}
else
{
tasks.push(task);
}
}
}
// Notify thread
tasks_event.notify_one();
return true;
}
bool Start()
{
if(is_running)
{
// Running already
return false;
}
is_running = true;
if(threads.empty())
{
CreateThreads();
}
return true;
}
void WaitForStop()
{
if(!is_running)
{
// I am not running
return;
}
is_running = false;
tasks_event.notify_all();
for(auto &t : threads)
{
// Wait for all threads to exit
t.join();
}
threads.clear();
}
private:
void CreateThreads()
{
for(int i = 0; i < max_threads; i++)
{
threads.push_back(std::thread(&ThreadPool::ThreadRoutine, this));
}
}
// Thread worker function
// Take task from queue, and run it
static void ThreadRoutine(ThreadPool* ptr)
{
if(ptr == nullptr)
{
return;
}
while(true) // loop until shutdown is signalled and the queue is drained
{
ThreadTask task;
// Scope for lock
{
// Get task to run
std::unique_lock<std::mutex> lock(ptr->tasks_guard);
// Wait until a task is ready or the pool is shutting down
while(ptr->tasks.empty() && ptr->is_running)
{
ptr->tasks_event.wait(lock);
}
if(ptr->tasks.empty())
{
// Woken for shutdown with no work left
return;
}
// OK, now there is a task ready to run
task = ptr->tasks.front();
ptr->tasks.pop();
}
// Run it
task();
}
}
private:
// Max threads allowed
int max_threads;
// Max tasks inside queue
int max_tasks;
// Vector of threads
std::vector<std::thread> threads;
// Queue of tasks
std::queue<ThreadTask> tasks;
// Flag of running status (atomic because it is read and written by multiple threads)
std::atomic<bool> is_running;
// Mutex to protect the tasks queue
std::mutex tasks_guard;
// Condition of tasks event
std::condition_variable tasks_event;
};
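To sanity-check the pool in isolation, here is a minimal usage sketch (the accumulator name total is illustrative, not from the original project): tasks are queued first, Start() spins up the worker threads, and WaitForStop() drains the queue before joining them.
#include "ThreadPool.h"
#include <atomic>
#include <iostream>
int main()
{
std::atomic<int> total{0};
ThreadPool pool(4, -1); // 4 worker threads, unlimited task queue
for(int i = 0; i < 100; i++)
{
pool.AddTask([&total, i]() { total += i; });
}
pool.Start();
pool.WaitForStop(); // runs all queued tasks, then joins the workers
std::cout << "sum = " << total << std::endl; // expected: 4950
return 0;
}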
Then modify the original runtime.cu file; the modified code is as follows:
#include <stdio.h>
#include <math.h>
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <memory>
#include <functional>
#include <unistd.h>
#include <chrono>
#include <assert.h>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <NvInferRuntime.h>
#include <opencv2/opencv.hpp>
#include "yolov8_utils.h"
#include <filesystem>
#include "ThreadPool.h"
// The following example captures all warning messages but ignores informational messages
class Logger : public nvinfer1::ILogger {
void log(Severity severity, const char *msg) noexcept override {
// Suppress info-level messages
if (severity <= Severity::kWARNING)
std::cout << msg << std::endl;
}
};
// Load the serialized engine file
std::vector<unsigned char> load_engine_file(const std::string &file_name) {
std::vector<unsigned char> engine_data;
std::ifstream engine_file(file_name, std::ios::binary);
assert(engine_file.is_open() && "Unable to load engine file.");
engine_file.seekg(0, engine_file.end);
int length = engine_file.tellg();
engine_data.resize(length);
engine_file.seekg(0, engine_file.beg);
engine_file.read(reinterpret_cast<char *>(engine_data.data()), length);
return engine_data;
}
const int inputIndex = 0;
const int outputIndex = 1;
namespace fs = std::filesystem;
// Get the current working directory
std::filesystem::path current_path = std::filesystem::current_path();
void processImage(std::mutex *task_protect, int i, void *buffers[2], cudaStream_t *stream, std::unique_ptr<nvinfer1::IExecutionContext> *context) {
std::lock_guard<std::mutex> guard(*task_protect);
// Read the image
cv::Mat img;
std::cout << current_path;
std::string file_name =(current_path.parent_path().string())+ ("/img/img"+std::to_string(i)+".jpg");
img = cv::imread(file_name);
if (img.empty()) // check whether the image contains data; img.empty() returns true when it does not
{
std::cout << "Could not open or find the image" << std::endl;
return;
}
cv::Mat LetterBoxImg;
cv::Vec4d params;
LetterBox(img, LetterBoxImg, params, cv::Size(640, 640));
cv::Mat blob;
// Use the letterboxed image here so that the transform described by params matches the input actually fed to the network
cv::dnn::blobFromImage(LetterBoxImg, blob, 1 / 255.0, cv::Size(640, 640), cv::Scalar(0, 0, 0), true, false, CV_32F);
// Copy the image to the GPU
cudaMemcpyAsync(buffers[inputIndex], blob.data, 3 * 640 * 640 * sizeof(float), cudaMemcpyHostToDevice, *stream);
// Run inference
if ((*context)->enqueueV3(*stream)) {
std::cout << "enqueued successfully!" << std::endl;
}
float rst[1][84][8400];
cudaMemcpyAsync(&rst, buffers[outputIndex], 1 * 84 * 8400 * sizeof(float), cudaMemcpyDeviceToHost, *stream);
// Synchronize after the device-to-host copy so rst is fully populated before postprocessing
cudaStreamSynchronize(*stream);
postprocess(rst, img, params);
}
#include "ThreadPool.h"
int main(int argc, char **argv) {
// Instantiate the ILogger
Logger logger;
std::unique_ptr<nvinfer1::IRuntime> runtime = std::unique_ptr<nvinfer1::IRuntime>(
nvinfer1::createInferRuntime(logger));
if (runtime == nullptr) { return -1; }
std::string file_path = "/home/heyiqiu/PycharmProjects/ultralytics-main/yolov8n_engine_intro.trt";
auto plan = load_engine_file(file_path);
auto engine = std::unique_ptr<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(plan.data(), plan.size()));
if (engine == nullptr) { return -1; }
using namespace nvinfer1;
std::unique_ptr<nvinfer1::IExecutionContext> context = std::unique_ptr<nvinfer1::IExecutionContext>(engine->createExecutionContext());
if (context == nullptr) { return -1; }
auto idims = engine->getTensorShape("images");
auto odims = engine->getTensorShape("output0");
nvinfer1::Dims4 inputDims = {idims.d[0], idims.d[1], idims.d[2], idims.d[3]};
nvinfer1::Dims3 outputDims = {odims.d[0], odims.d[1], odims.d[2]};
context->setInputShape("images", inputDims);
void *buffers[2];
cudaMalloc(&buffers[inputIndex], idims.d[0] * idims.d[1] * idims.d[2] * idims.d[3] * sizeof(float));
cudaMalloc(&buffers[outputIndex], odims.d[0] * odims.d[1] * odims.d[2] * sizeof(float));
// Set the tensor addresses
context->setTensorAddress("images", buffers[inputIndex]);
context->setTensorAddress("output0", buffers[outputIndex]);
// Create a CUDA stream
cudaStream_t stream;
cudaStreamCreate(&stream);
std::mutex protect_task;
ThreadPool pool(3, -1);
for (int i = 0; i < 2; i++) {
int j=i;
pool.AddTask(std::bind(processImage, &protect_task, j, buffers, &stream, &context));
}
pool.Start();
pool.WaitForStop();
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
}
In the processImage function above, the locking line below is important. Without it, multiple threads would operate on the same buffers, stream, and execution context at the same time and the results would not be as expected; with the lock, the results are correct:
std::lock_guard<std::mutex> guard(*task_protect);
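Note that this coarse-grained lock serializes the inference itself, so the pool's threads only overlap on work outside the lock. As a possible improvement, here is a minimal sketch, assuming enough GPU memory: TensorRT allows one ICudaEngine to create several IExecutionContexts, so each worker thread can own its own context, stream, and buffers, and no lock around enqueueV3 is needed. The WorkerResources struct is hypothetical; the buffer sizes and tensor names mirror the code above, and processImage would take the per-thread resources as parameters instead of the shared ones:
// Hypothetical per-thread resources; a sketch, not the original code above
struct WorkerResources {
std::unique_ptr<nvinfer1::IExecutionContext> context;
cudaStream_t stream;
void *buffers[2];
};
// One context/stream/buffer set per worker thread
std::vector<WorkerResources> workers(3);
for (auto &w : workers) {
w.context.reset(engine->createExecutionContext());
cudaStreamCreate(&w.stream);
cudaMalloc(&w.buffers[inputIndex], 1 * 3 * 640 * 640 * sizeof(float));
cudaMalloc(&w.buffers[outputIndex], 1 * 84 * 8400 * sizeof(float));
w.context->setTensorAddress("images", w.buffers[inputIndex]);
w.context->setTensorAddress("output0", w.buffers[outputIndex]);
}
// Each task then uses one WorkerResources entry exclusively, so the
// std::lock_guard around the whole of processImage is no longer required.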