TensorRT Deployment of YOLOv11 (C++ Version)
1. Environment Setup
You need working CUDA and cuDNN installations, plus a matching TensorRT installation.
2. Model Preparation
First you need an ONNX model file; I export mine from PyTorch to ONNX:
from ultralytics import YOLO

model = YOLO("./yolov11m.pt")

if __name__ == '__main__':
    model.export(format="onnx")
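By default this exports a static ONNX model at the training image size (640 for the pretrained weights). If you need a different input shape, export() also accepts arguments such as imgsz and dynamic.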
This produces a yolov11m.onnx file. The next step is converting ONNX to a TensorRT engine.
Locate your TensorRT installation and run trtexec:
TensorRT-8.6.4.3/bin/trtexec --onnx=yolov11m.onnx --saveEngine=yolov11m.trt
This generates the serialized engine file, which will be deserialized at runtime.
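If your GPU supports it, appending --fp16 to the trtexec command builds a half-precision engine, which usually runs noticeably faster at a small accuracy cost.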
3. Model Deployment
3.1 Deserializing the Model
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    // Read the serialized engine from disk
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    file.seekg(0, file.end);
    size_t size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();
    // Deserialize into runtime, engine, and execution context
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    // deserializeCudaEngine copies the data, so the buffer can be freed now
    delete[] serialized_engine;
}
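The gLogger passed to createInferRuntime is defined elsewhere in the project; it is assumed to be a global ILogger instance along these lines (a minimal sketch, class and variable names illustrative):

#include <NvInfer.h>
#include <iostream>

class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        // Surface only warnings and errors; drop INFO/VERBOSE chatter
        if (severity <= Severity::kWARNING)
            std::cerr << msg << std::endl;
    }
};
static Logger gLogger;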
3.2 Initializing the CUDA Buffers
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // Look up binding slots by tensor name so the buffer order matches the engine
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    // Host-side buffer that receives the raw output for CPU post-processing
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}
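The k* constants (kBatchSize, kInputH, kInputW, kOutputSize, the tensor names, and the thresholds used later) come from the project's config header, which is not shown here. A sketch with plausible values for a 640x640 YOLOv11 detection model; adjust them to match your own engine:

constexpr char kInputTensorName[] = "images";   // default ultralytics ONNX input name
constexpr char kOutputTensorName[] = "output0"; // default ultralytics ONNX output name
constexpr int kGpuId = 0;
constexpr int kBatchSize = 1;
constexpr int kInputH = 640;
constexpr int kInputW = 640;
constexpr int kNumClass = 80;
// Raw YOLOv11 output: (4 box coordinates + per-class scores) for each of the
// 8400 candidate boxes produced at 640x640 (80x80 + 40x40 + 20x20 grid cells)
constexpr int kOutputSize = (4 + kNumClass) * 8400;
constexpr float kConfThresh = 0.5f;
constexpr float kNmsThresh = 0.45f;
constexpr int kMaxInputImageSize = 4096 * 4096;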
3.3 Inference
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    auto start = std::chrono::system_clock::now();
    // Enqueue inference on the stream (asynchronous)
    context.enqueueV2(buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        // Copy raw predictions back to the host; NMS will run on the CPU
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream));
    }
    // Wait for the stream to finish before reading results or stopping the clock
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::system_clock::now();
    std::cout << "inference time: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
}
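CUDA_CHECK is the usual CUDA error-checking macro; if your helper headers do not already provide one, a minimal version (my assumption of what the project's macro does) is:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)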
3.4 Main Function
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    // The .wts path and model-scale parameters are only needed when building an
    // engine from weights; they are unused when loading a prebuilt engine.
    std::string wts_name = "D:/py/ultralytics-main/runs/train/exp/weights/best.wts";
    std::string engine_name = "E:/yolov11/bin64/release/best-1024.engines";
    std::string img_dir = "D:/HK/fh/images/";
    std::string cuda_post_process = "c";  // "c" = post-process (NMS) on the CPU
    std::string type = "m";
    int model_bboxes;
    float gd = 0.50f, gw = 1.00f;
    int max_channels = 512;

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);

    // The number of candidate boxes comes from the output binding's dimensions
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    std::cout << "model_bboxes: " << model_bboxes << std::endl;

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // Get a batch of images; this demo runs a single hard-coded image, but
    // file_names could be iterated to process the whole directory in batches.
    std::vector<cv::Mat> img_batch;
    cv::Mat img = cv::imread("D:/images/hz195-g7_0_20240826_124545_2_2048_2448.jpg");
    img_batch.push_back(img);

    // Preprocess on the GPU (resize and normalize to the network input)
    cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);

    // Run inference
    infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
          decode_ptr_device, model_bboxes, cuda_post_process);
    std::vector<std::vector<Detection>> res_batch;
    if (cuda_post_process == "c") {
        // NMS on the CPU
        batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
    }

    // Draw bounding boxes and display the results
    draw_bbox(img_batch, res_batch);
    for (size_t j = 0; j < img_batch.size(); j++) {
        cv::Mat tmp;
        cv::resize(img_batch[j], tmp, cv::Size(960, 960));
        cv::imshow("result", tmp);
        cv::waitKey(0);
    }

    // Release stream and buffers
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();

    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    return 0;
}
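Detection, batch_nms, draw_bbox, read_files_in_dir, and the cuda_*_preprocess functions come from the project's helper sources and are not reproduced here. For orientation, the Detection struct that batch_nms fills in is assumed to look roughly like this (a sketch; the real struct may carry extra fields such as masks or keypoints):

// Plausible sketch of the struct consumed by batch_nms/draw_bbox
struct Detection {
    float bbox[4];   // cx, cy, w, h in network-input coordinates
    float conf;      // confidence score
    float class_id;  // index of the predicted class
};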