模型部署流程
1. 模型准备
pytorch -> (onnx) -> trt engine
trtexec --onnx=output.onnx --saveEngine=outfp32.engine --workspace=2048 --minShapes=x:1x3x224x224 --optShapes=x:1x3x224x224 --maxShapes=x:1x3x224x224
trtexec --onnx=output.onnx --saveEngine=outfp16.engine --workspace=2048 --minShapes=x:1x3x224x224 --optShapes=x:1x3x224x224 --maxShapes=x:1x3x224x224 --fp16
trtexec --onnx=output.onnx --saveEngine=outint8.engine --workspace=2048 --minShapes=x:1x3x224x224 --optShapes=x:1x3x224x224 --maxShapes=x:1x3x224x224 --int8
trtexec --onnx=output.onnx --saveEngine=outfpbest.engine --workspace=2048 --minShapes=x:1x3x224x224 --optShapes=x:1x3x224x224 --maxShapes=x:1x3x224x224 --best
2. 准备图片输入
- 尺寸适配:图片固定长宽比 resize + padding 到模型要求的输入尺寸
- 归一化:减均值,除方差 -> float 格式
- 展开:按channel展开成一维float数组 (size = 3×w×h)
- 最终输入就是一维数组
3. 结果输出
- 根据网络结构有几个output head,就绑定几个buffer
// Runs one synchronous inference pass: H2D copy -> enqueue -> D2H copy -> sync.
//
// Params:
//   context     - TensorRT execution context (implicit-batch API; batch size 1 is enqueued).
//   input       - host pointer to 3 * input_shape.height * input_shape.width floats (CHW layout
//                 per the preprocessing notes above — TODO confirm layout matches the engine).
//   output      - host pointer with room for output_size floats.
//   output_size - number of output floats to copy back.
//   input_shape - spatial dimensions of the (single) input tensor.
//
// Precondition: the engine has exactly 2 bindings (one input, one output), both kFLOAT.
// NOTE(review): cudaMalloc/cudaFree and stream create/destroy on every call are expensive;
// for a hot loop, allocate the device buffers and stream once and reuse them.
void doInference(IExecutionContext& context, float* input, float* output, const int output_size, Size input_shape) {
    const ICudaEngine& engine = context.getEngine();

    // Engine requires exactly IEngine::getNbBindings() buffers — here 1 input + 1 output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // Resolve binding slots by tensor name; indices are guaranteed < getNbBindings().
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);

    // Byte sizes computed once; 3 channels * H * W floats for the input.
    const size_t inputBytes = 3 * input_shape.height * input_shape.width * sizeof(float);
    const size_t outputBytes = output_size * sizeof(float);

    // Create GPU buffers on device.
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input data to device, infer asynchronously (implicit-batch enqueue, batch=1),
    // and DMA the output back to host — all ordered on the same stream.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    context.enqueue(1, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Was unchecked: cudaStreamSynchronize also surfaces asynchronous execution errors,
    // so dropping its return value silently hides kernel/copy failures.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers (destroy return code now checked for consistency).
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
- decode输出:阈值处理、非极大值抑制、还原Box位置和尺寸、还原关键点位置
参考
https://blog.csdn.net/HaoZiHuang/article/details/125859167
https://blog.csdn.net/weixin_42492254/article/details/126028199
https://github.com/ifzhang/ByteTrack/blob/main/deploy/TensorRT/cpp/src/bytetrack.cpp