使用C++API部署推理(重点)
step1:创建runtime
step2:反序列化创建engine
step3:创建context
step4:获取输入输出索引
step5:创建buffers
step6:为输入输出开辟GPU显存
step7:创建cuda流
step8:从CPU到GPU----拷贝input数据
step9:异步推理
step10:从GPU到CPU----拷贝output数据
step11:同步cuda流
step12:释放资源
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(modelData, modelSize, nullptr);
assert(engine != nullptr);
printf("Bindings after deserializing:\n");
for (int bi = 0; bi < engine->getNbBindings(); bi++)
{
if (engine->bindingIsInput(bi) == true)
{
printf("Binding %d (%s): Input.\n", bi, engine->getBindingName(bi));
}
else
{
printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
}
}
IExecutionContext *context = engine->createExecutionContext();
assert(context != nullptr);
int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
void* buffers[2];
buffers[inputIndex] = inputBuffer;
buffers[outputIndex] = outputBuffer;
CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * inputDim.c() * inputDim.h() * inputDim.w() * sizeof(float)));
CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * outputDim.c() * outputDim.h() * outputDim.w() * sizeof(float)));
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex],
input,
batchSize * inputDim.c() * inputDim.h() * inputDim.w() * sizeof(float),
cudaMemcpyHostToDevice,
stream));
context->enqueueV2(buffers, stream, nullptr);
CUDA_CHECK(cudaMemcpyAsync(output,
buffers[outputIndex],
batchSize * outputDim.c() * outputDim.h() * outputDim.w() * sizeof(float),
cudaMemcpyDeviceToHost,
stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
cudaStreamDestroy(stream);
context->destroy();
engine->destroy();
runtime->destroy();
CUDA_CHECK(cudaFree(buffers[inputIndex]));
CUDA_CHECK(cudaFree(buffers[outputIndex]));