Chapter 6: TensorRT Model Building and Inference — mmdeploy 1.2.0 documentation
Introduction to TensorRT
TensorRT supports quantization-aware training and post-training (offline) quantization, and users can choose between the INT8 and FP16 optimization modes. It is currently probably the fastest way to run inference models on NVIDIA GPUs.
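As a minimal sketch (not tied to any particular model; the builder and config are created the same way as in the examples later in this chapter), FP16 and INT8 modes are requested through builder flags, and INT8 additionally needs a calibrator or a quantization-aware-trained network with Q/DQ nodes:
import tensorrt as trt
logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
config = builder.create_builder_config()
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)  # use FP16 kernels where the hardware supports them
if builder.platform_has_fast_int8:
    config.set_flag(trt.BuilderFlag.INT8)  # INT8 also requires a calibrator or Q/DQ nodes in the network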
Installing TensorRT
Download and install CUDA and cuDNN first (a web search for "TensorRT install" covers the steps; remember to also install the Python wheel (.whl) bundled in the TensorRT package).
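After installing the wheel, a quick sanity check (a minimal sketch) verifies that the Python bindings load and that a builder can be created on the current machine:
import tensorrt as trt
print(trt.__version__)  # e.g. 8.x.y
assert trt.Builder(trt.Logger()) is not None, 'failed to create a TensorRT builder'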
Model Building
There are two main ways to build a model with TensorRT:
1. Build the network layer by layer directly through the TensorRT API (each layer is added one at a time, corresponding one-to-one with the layers in PyTorch).
2. Convert a model in an intermediate representation (ONNX) into a TensorRT model.
Building Directly
Building with the Python API
import tensorrt as trt

verbose = True
IN_NAME = 'input'
OUT_NAME = 'output'
IN_H = 224
IN_W = 224
BATCH_SIZE = 1

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger()

with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_builder_config() as config, \
        builder.create_network(EXPLICIT_BATCH) as network:
    # tensorrt.Builder.create_builder_config builds the config,
    # tensorrt.Builder.create_network builds the network

    # define the body of the network
    input_tensor = network.add_input(
        name=IN_NAME, dtype=trt.float32, shape=(BATCH_SIZE, 3, IN_H, IN_W))  # declare the input tensor by name
    pool = network.add_pooling(
        input=input_tensor, type=trt.PoolingType.MAX, window_size=(2, 2))
    pool.stride = (2, 2)
    pool.get_output(0).name = OUT_NAME
    network.mark_output(pool.get_output(0))  # declare the output tensor by name

    # serialize the model to an engine file
    profile = builder.create_optimization_profile()  # set min/opt/max shapes so inputs of different resolutions can be accepted
    profile.set_shape_input('input', *[[BATCH_SIZE, 3, IN_H, IN_W]] * 3)
    builder.max_batch_size = 1
    config.max_workspace_size = 1 << 30  # maximum workspace size
    engine = builder.build_engine(network, config)
    with open('model_python_trt.engine', mode='wb') as f:
        f.write(bytearray(engine.serialize()))  # serialize the built network and save it to a local file
        print("generating file done!")
Building with the C++ API
First add a cmake folder to the project so that find_package can be used directly:
https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake
# This module defines the following variables:
#
# ::
#
# TensorRT_INCLUDE_DIRS
# TensorRT_LIBRARIES
# TensorRT_FOUND
#
# ::
#
# TensorRT_VERSION_STRING - version (x.y.z)
# TensorRT_VERSION_MAJOR - major version (x)
# TensorRT_VERSION_MINOR - minor version (y)
# TensorRT_VERSION_PATCH - patch version (z)
#
# Hints
# ^^^^^
# A user may set ``TensorRT_ROOT`` to an installation root to tell this module where to look.
#
set(_TensorRT_SEARCHES)

if(TensorRT_ROOT)
  set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_ROOT} NO_DEFAULT_PATH)
  list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT)
endif()

# appends some common paths
set(_TensorRT_SEARCH_NORMAL
  PATHS "/usr"
)
list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL)

# Include dir
foreach(search ${_TensorRT_SEARCHES})
  find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include)
endforeach()

if(NOT TensorRT_LIBRARY)
  foreach(search ${_TensorRT_SEARCHES})
    find_library(INF_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib)
  endforeach()
  foreach(search ${_TensorRT_SEARCHES})
    find_library(PAR_LIBRARY NAMES nvonnxparser ${${search}} PATH_SUFFIXES lib)
  endforeach()
  set(TensorRT_LIBRARY ${INF_LIBRARY} "${PAR_LIBRARY}")
endif()

#message("TensorRT_LIBRARY ${TensorRT_LIBRARY}")
#message("TensorRT_LIBRARIES ${TensorRT_LIBRARIES}")

mark_as_advanced(TensorRT_INCLUDE_DIR)

if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h")
  file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
  file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
  file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")

  string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}")
  string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}")
  string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}")
  set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}")
endif()

include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING)

if(TensorRT_FOUND)
  set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR})

  if(NOT TensorRT_LIBRARIES)
    set(TensorRT_LIBRARIES ${TensorRT_LIBRARY})
  endif()

  if(NOT TARGET TensorRT::TensorRT)
    add_library(TensorRT::TensorRT UNKNOWN IMPORTED)
    set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}")
    set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}")
  endif()
endif()
Then add a CMakeLists.txt to the top-level project:
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(trtDemo)

# set project output directory
string(REGEX REPLACE "/code" "" PROJECT_OUTPUT_DIR ${PROJECT_SOURCE_DIR})
set(PROJECT_OUTPUT_DIR "${PROJECT_OUTPUT_DIR}/output")
set(EXECUTABLE_OUTPUT_PATH "${PROJECT_OUTPUT_DIR}")
set(LIBRARY_OUTPUT_PATH "${PROJECT_OUTPUT_DIR}")
message("output path: ${PROJECT_OUTPUT_DIR}")

# debug build
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS "-Wall")            # turn on all warnings
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")     # enable debug information and disable optimization
set(CMAKE_CXX_FLAGS_RELEASE "-g -O3")   # enable advanced optimization
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -D_MWAITXINTRIN_H_INCLUDED")

# Use C++11
set(CMAKE_CXX_STANDARD 11)

# TensorRT
# CMake module path, so the FindTensorRT script above can be found
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake")

option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static CUDA runtime" OFF)
find_package(CUDA REQUIRED)
find_package(TensorRT REQUIRED)

include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${TensorRT_INCLUDE_DIRS})
include(${PROJECT_SOURCE_DIR}/common/src.cmake)
include_directories(${PROJECT_SOURCE_DIR}/common)

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)

message("TensorRT_LIBRARIES: ${TensorRT_LIBRARIES}")
message("TensorRT_LIBRARY: ${TensorRT_LIBRARY}")

set(CUDA_MODULE_LOADING "LAZY" CACHE STRING "CUDA module loading mode")

add_subdirectory(common)
add_executable(trtDemo main.cpp)
target_link_libraries(trtDemo PRIVATE common ${CUDA_LIBRARIES} ${TensorRT_LIBRARY})
The common/src.cmake included above lists the logger headers and sources taken from the TensorRT samples:
set(dir "${PROJECT_SOURCE_DIR}/common")
set(trt_common_header
  ${dir}/logger.h
  ${dir}/ErrorRecorder.h
  ${dir}/sampleOptions.h
)
set(trt_common_src ${trt_common_header}
  ${dir}/logger.cpp
)
With the build configuration in place, the same network can be built layer by layer and serialized using the C++ API:
#include <cassert>
#include <fstream>
#include <iostream>

#include "NvInfer.h"
#include "common/logger.h"

using namespace nvinfer1;
using namespace sample;

const char* IN_NAME = "input";
const char* OUT_NAME = "output";
static const int IN_H = 224;
static const int IN_W = 224;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

int main(int argc, char** argv)
{
    // Create the builder.
    // createInferBuilder needs an ILogger instance; ILogger is abstract, so users have to derive from it
    // and implement its virtual functions. The Logger class in the TensorRT samples
    // (samples/common/logger.h) already provides such a subclass.
    Logger m_logger;
    IBuilder* builder = createInferBuilder(m_logger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create the model to populate the network
    INetworkDefinition* network = builder->createNetworkV2(EXPLICIT_BATCH);
    ITensor* input_tensor = network->addInput(IN_NAME, DataType::kFLOAT, Dims4{ BATCH_SIZE, 3, IN_H, IN_W });
    IPoolingLayer* pool = network->addPoolingNd(*input_tensor, PoolingType::kMAX, DimsHW{ 2, 2 });
    pool->setStrideNd(DimsHW{ 2, 2 });
    pool->getOutput(0)->setName(OUT_NAME);
    network->markOutput(*pool->getOutput(0));

    // Build the engine. setDimensions must be called once per selector (kMIN/kOPT/kMAX)
    // to describe the allowed input shapes.
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions(IN_NAME, OptProfileSelector::kMIN, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    profile->setDimensions(IN_NAME, OptProfileSelector::kOPT, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    profile->setDimensions(IN_NAME, OptProfileSelector::kMAX, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // Serialize the model to an engine file
    IHostMemory* modelStream{ nullptr };
    assert(engine != nullptr);
    modelStream = engine->serialize();

    std::ofstream p("model.engine", std::ios::binary);
    if (!p) {
        std::cerr << "could not open output file to save model" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    std::cout << "generating file done!" << std::endl;

    // Release resources (destroying them in the wrong order causes errors)
    modelStream->destroy();
    engine->destroy();
    network->destroy();
    config->destroy();
    builder->destroy();
    return 0;
}
Converting an IR Model
Besides building the network layer by layer through the TensorRT API and serializing it, TensorRT also supports converting a model in an intermediate representation (ONNX) into a TensorRT model.
Converting with the Python API
import torch
import onnx
import tensorrt as trt

onnx_model = 'model.onnx'


class NaiveModel(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.pool = torch.nn.MaxPool2d(2, 2)

    def forward(self, x):
        return self.pool(x)


device = torch.device('cuda:0')

# generate the ONNX model
torch.onnx.export(NaiveModel(), torch.randn(1, 3, 224, 224), onnx_model,
                  input_names=['input'], output_names=['output'], opset_version=11)
onnx_model = onnx.load(onnx_model)

# create builder and network
logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(EXPLICIT_BATCH)

# parse onnx
parser = trt.OnnxParser(network, logger)
if not parser.parse(onnx_model.SerializeToString()):
    error_msgs = ''
    for error in range(parser.num_errors):
        error_msgs += f'{parser.get_error(error)}\n'
    raise RuntimeError(f'Failed to parse onnx, {error_msgs}')

config = builder.create_builder_config()
config.max_workspace_size = 1 << 20
profile = builder.create_optimization_profile()

# set_shape takes the input tensor name followed by the minimum, optimal and maximum
# accepted input shapes (usually non-decreasing). For multiple batch sizes, multiple
# inputs or dynamic shapes, call set_shape once per input.
profile.set_shape('input', [1, 3, 224, 224], [1, 3, 224, 224], [1, 3, 224, 224])
config.add_optimization_profile(profile)

# create engine
with torch.cuda.device(device):
    engine = builder.build_engine(network, config)

with open('model.engine', mode='wb') as f:
    f.write(bytearray(engine.serialize()))
    print("generating file done!")
Converting with the C++ API
#include <cassert>
#include <fstream>
#include <iostream>

#include <NvInfer.h>
#include <NvOnnxParser.h>
#include "../common/logger.h"

using namespace nvinfer1;
using namespace nvonnxparser;
using namespace sample;

int main(int argc, char** argv)
{
    // Create the builder
    Logger m_logger;
    IBuilder* builder = createInferBuilder(m_logger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create the model to populate the network
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    // Parse the ONNX file
    IParser* parser = nvonnxparser::createParser(*network, m_logger);
    bool parser_status = parser->parseFromFile("F:/localwork/cpp/trtDemo/output/Debug/model.onnx", static_cast<int>(ILogger::Severity::kWARNING));
    if (!parser_status) {
        std::cerr << "failed to parse the ONNX file" << std::endl;
        return -1;
    }

    // Get the name of the network input
    Dims dim = network->getInput(0)->getDimensions();
    if (dim.d[0] == -1)  // -1 means it is a dynamic model
    {
        const char* name = network->getInput(0)->getName();
        IOptimizationProfile* profile = builder->createOptimizationProfile();
        profile->setDimensions(name, OptProfileSelector::kMIN, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        profile->setDimensions(name, OptProfileSelector::kOPT, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        profile->setDimensions(name, OptProfileSelector::kMAX, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        config->addOptimizationProfile(profile);
    }

    // Build engine
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // Serialize the model to an engine file
    IHostMemory* modelStream{ nullptr };
    assert(engine != nullptr);
    modelStream = engine->serialize();

    std::ofstream p("model.engine", std::ios::binary);
    if (!p) {
        std::cerr << "could not open output file to save model" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    std::cout << "generate file success!" << std::endl;

    // Release resources
    modelStream->destroy();
    engine->destroy();
    network->destroy();
    config->destroy();
    builder->destroy();
    return 0;
}
Model Inference
Inference with the Python API
from typing import Union, Optional, Sequence, Dict, Any

import torch
import tensorrt as trt


class TRTWrapper(torch.nn.Module):

    def __init__(self, engine: Union[str, trt.ICudaEngine],
                 output_names: Optional[Sequence[str]] = None) -> None:
        super().__init__()
        self.engine = engine
        if isinstance(self.engine, str):
            with trt.Logger() as logger, trt.Runtime(logger) as runtime:
                with open(self.engine, mode='rb') as f:
                    engine_bytes = f.read()
                self.engine = runtime.deserialize_cuda_engine(engine_bytes)
        self.context = self.engine.create_execution_context()
        names = [_ for _ in self.engine]
        input_names = list(filter(self.engine.binding_is_input, names))
        self._input_names = input_names
        self._output_names = output_names
        if self._output_names is None:
            output_names = list(set(names) - set(input_names))
            self._output_names = output_names

    def forward(self, inputs: Dict[str, torch.Tensor]):
        assert self._input_names is not None
        assert self._output_names is not None
        bindings = [None] * (len(self._input_names) + len(self._output_names))
        profile_id = 0
        for input_name, input_tensor in inputs.items():
            # check if the input shape is valid
            profile = self.engine.get_profile_shape(profile_id, input_name)
            assert input_tensor.dim() == len(profile[0]), 'Input dim is different from engine profile.'
            for s_min, s_input, s_max in zip(profile[0], input_tensor.shape, profile[2]):
                assert s_min <= s_input <= s_max, \
                    'Input shape should be between ' \
                    f'{profile[0]} and {profile[2]}' \
                    f' but get {tuple(input_tensor.shape)}.'
            idx = self.engine.get_binding_index(input_name)
            # all input tensors must live on the GPU
            assert 'cuda' in input_tensor.device.type
            input_tensor = input_tensor.contiguous()
            if input_tensor.dtype == torch.long:
                input_tensor = input_tensor.int()
            self.context.set_binding_shape(idx, tuple(input_tensor.shape))
            bindings[idx] = input_tensor.contiguous().data_ptr()

        # create output tensors
        outputs = {}
        for output_name in self._output_names:
            idx = self.engine.get_binding_index(output_name)
            dtype = torch.float32
            shape = tuple(self.context.get_binding_shape(idx))
            device = torch.device('cuda')
            output = torch.empty(size=shape, dtype=dtype, device=device)
            outputs[output_name] = output
            bindings[idx] = output.data_ptr()

        self.context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream)
        return outputs


model = TRTWrapper('model.engine', ['output'])
output = model(dict(input=torch.randn(1, 3, 224, 224).cuda()))  # input of shape 1x3x224x224
print(output)  # the output tensor has shape 1x3x112x112
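Because the engine was built from the NaiveModel max-pooling network, the result can be cross-checked against PyTorch (a minimal sketch, assuming the model.engine file and the TRTWrapper instance above):
x = torch.randn(1, 3, 224, 224).cuda()
trt_out = model(dict(input=x))['output']
torch_out = torch.nn.MaxPool2d(2, 2)(x)
print(torch.allclose(trt_out, torch_out, atol=1e-5))  # should print True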
Inference with the C++ API
#include <cassert>
#include <fstream>
#include <iostream>

#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include "../common/logger.h"

#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

using namespace nvinfer1;
using namespace sample;

const char* IN_NAME = "input";
const char* OUT_NAME = "output";
static const int IN_H = 224;
static const int IN_W = 224;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(IN_NAME);
    const int outputIndex = engine.getBindingIndex(OUT_NAME);

    // Create GPU buffers on the device (2x2 max pooling shrinks the output to 1/4 of the input size)
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * IN_H * IN_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * 3 * IN_H * IN_W / 4 * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * IN_H * IN_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * 3 * IN_H * IN_W / 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
    // Read the serialized engine built earlier from disk
    char* trtModelStream{ nullptr };
    size_t size{ 0 };

    std::ifstream file("F:/localwork/cpp/trtDemo/output/Debug/model.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    Logger m_logger;
    IRuntime* runtime = createInferRuntime(m_logger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // generate input data
    float data[BATCH_SIZE * 3 * IN_H * IN_W];
    for (int i = 0; i < BATCH_SIZE * 3 * IN_H * IN_W; i++)
        data[i] = 1;

    // Run inference
    float prob[BATCH_SIZE * 3 * IN_H * IN_W / 4];
    doInference(*context, data, prob, BATCH_SIZE);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
Summary
This chapter covered:
1. Building a network layer by layer with the TensorRT API;
2. Converting an ONNX model into a TensorRT model;
3. Building and running TensorRT models with the Python API;
4. Building and running TensorRT models with the C++ API.