TensorRT Deployment of YOLOv11 (C++ Version)
1. Environment Setup
You need working CUDA and cuDNN installations, plus a matching TensorRT installation.
2. Model Preparation
First you need an ONNX model file; I export mine from PyTorch to ONNX:
from ultralytics import YOLO

model = YOLO("./yolov11m.pt")

if __name__ == '__main__':
    model.export(format="onnx")
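By default this exports a static ONNX model at the training image size (640 for the pretrained weights). If you need a different input shape, export() also accepts arguments such as imgsz and dynamic.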
This produces a yolov11m.onnx file. The next step is converting ONNX to a TensorRT engine.
Locate your TensorRT installation and run trtexec:
TensorRT-8.6.4.3/bin/trtexec --onnx=yolov11m.onnx --saveEngine=yolov11m.trt
This generates the serialized engine file, which will be deserialized at runtime.
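If your GPU supports it, appending --fp16 to the trtexec command builds a half-precision engine, which usually runs noticeably faster at a small accuracy cost.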
3. Model Deployment
3.1 Deserializing the Model
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    // Read the serialized engine from disk
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    file.seekg(0, file.end);
    size_t size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();
    // Deserialize into runtime, engine, and execution context
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    // deserializeCudaEngine copies the data, so the buffer can be freed now
    delete[] serialized_engine;
}
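The gLogger passed to createInferRuntime is defined elsewhere in the project; it is assumed to be a global ILogger instance along these lines (a minimal sketch, class and variable names illustrative):

#include <NvInfer.h>
#include <iostream>

class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        // Surface only warnings and errors; drop INFO/VERBOSE chatter
        if (severity <= Severity::kWARNING)
            std::cerr << msg << std::endl;
    }
};
static Logger gLogger;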
3.2 Initializing the CUDA Buffers
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // Look up binding slots by tensor name so the buffer order matches the engine
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    // Host-side buffer that receives the raw output for CPU post-processing
    *output_buffer_host = new float[kBatchSize * kOutputSize];
}
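The k* constants (kBatchSize, kInputH, kInputW, kOutputSize, the tensor names, and the thresholds used later) come from the project's config header, which is not shown here. A sketch with plausible values for a 640x640 YOLOv11 detection model; adjust them to match your own engine:

constexpr char kInputTensorName[] = "images";   // default ultralytics ONNX input name
constexpr char kOutputTensorName[] = "output0"; // default ultralytics ONNX output name
constexpr int kGpuId = 0;
constexpr int kBatchSize = 1;
constexpr int kInputH = 640;
constexpr int kInputW = 640;
constexpr int kNumClass = 80;
// Raw YOLOv11 output: (4 box coordinates + per-class scores) for each of the
// 8400 candidate boxes produced at 640x640 (80x80 + 40x40 + 20x20 grid cells)
constexpr int kOutputSize = (4 + kNumClass) * 8400;
constexpr float kConfThresh = 0.5f;
constexpr float kNmsThresh = 0.45f;
constexpr int kMaxInputImageSize = 4096 * 4096;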
3.3 Inference
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
    auto start = std::chrono::system_clock::now();
    // Enqueue inference on the stream (asynchronous)
    context.enqueueV2(buffers, stream, nullptr);
    if (cuda_post_process == "c") {
        // Copy raw predictions back to the host; NMS will run on the CPU
        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float),
                                   cudaMemcpyDeviceToHost, stream));
    }
    // Wait for the stream to finish before reading results or stopping the clock
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::system_clock::now();
    std::cout << "inference time: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
}
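CUDA_CHECK is the usual CUDA error-checking macro; if your helper headers do not already provide one, a minimal version (my assumption of what the project's macro does) is:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)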
3.4 Main Function
int main(int argc, char** argv) {
    cudaSetDevice(kGpuId);
    // The .wts path and model-scale parameters are only needed when building an
    // engine from weights; they are unused when loading a prebuilt engine.
    std::string wts_name = "D:/py/ultralytics-main/runs/train/exp/weights/best.wts";
    std::string engine_name = "E:/yolov11/bin64/release/best-1024.engines";
    std::string img_dir = "D:/HK/fh/images/";
    std::string cuda_post_process = "c";  // "c" = post-process (NMS) on the CPU
    std::string type = "m";
    int model_bboxes;
    float gd = 0.50f, gw = 1.00f;
    int max_channels = 512;

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);

    // The number of candidate boxes comes from the output binding's dimensions
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    std::cout << "model_bboxes: " << model_bboxes << std::endl;

    // Prepare cpu and gpu buffers
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                   &decode_ptr_device, cuda_post_process);

    // Get a batch of images; this demo runs a single hard-coded image, but
    // file_names could be iterated to process the whole directory in batches.
    std::vector<cv::Mat> img_batch;
    cv::Mat img = cv::imread("D:/images/hz195-g7_0_20240826_124545_2_2048_2448.jpg");
    img_batch.push_back(img);

    // Preprocess on the GPU (resize and normalize to the network input)
    cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);

    // Run inference
    infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
          decode_ptr_device, model_bboxes, cuda_post_process);
    std::vector<std::vector<Detection>> res_batch;
    if (cuda_post_process == "c") {
        // NMS on the CPU
        batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
    }

    // Draw bounding boxes and display the results
    draw_bbox(img_batch, res_batch);
    for (size_t j = 0; j < img_batch.size(); j++) {
        cv::Mat tmp;
        cv::resize(img_batch[j], tmp, cv::Size(960, 960));
        cv::imshow("result", tmp);
        cv::waitKey(0);
    }

    // Release stream and buffers
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();

    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;
    return 0;
}
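Detection, batch_nms, draw_bbox, read_files_in_dir, and the cuda_*_preprocess functions come from the project's helper sources and are not reproduced here. For orientation, the Detection struct that batch_nms fills in is assumed to look roughly like this (a sketch; the real struct may carry extra fields such as masks or keypoints):

// Plausible sketch of the struct consumed by batch_nms/draw_bbox
struct Detection {
    float bbox[4];   // cx, cy, w, h in network-input coordinates
    float conf;      // confidence score
    float class_id;  // index of the predicted class
};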