Pytorch-Onnx-Tensorrt模型转换教程案例

最新推荐文章于 2025-03-15 18:13:11 发布

沐雲小哥

最新推荐文章于 2025-03-15 18:13:11 发布

阅读量1.1k

点赞数 3

分类专栏： Tensorrt 文章标签： pytorch 深度学习 python

本文链接：https://blog.csdn.net/weixin_44533869/article/details/125223704

版权

Tensorrt 专栏收录该内容

1 篇文章

订阅专栏

该博客介绍了如何将预训练的Resnet-50模型从PyTorch转换为ONNX格式，并进一步优化为TensorRT引擎以实现高效的GPU推理。首先，通过torch.onnx.export将模型导出为ONNX模型，然后使用ONNX Checker验证模型正确性。接着，通过ONNX Runtime进行优化并比较优化前后的输出。最后，详细展示了如何构建和运行TensorRT引擎，包括动态输入的处理，以及与PyTorch模型的速度对比。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Pytorch-Onnx的转换

本案例Resnet-50网络为例：

import onnx
import torch
import torchvision


# 1. 定义模型
model = torchvision.models.resnet50(pretrained=True).cuda()

# 2.定义输入&输出
input_names = ['input']
output_names = ['output']
image = torch.randn(1, 3, 224, 224).cuda()

# 3.pt转onnx
onnx_file = "./resnet50.onnx"
torch.onnx.export(model, image, onnx_file, verbose=False,
                  input_names=input_names, output_names=output_names,
                  opset_version=11,
                  dynamic_axes={"input":{0: "batch_size"}, "output":{0: "batch_size"},})

# 4.检查onnx计算图
net = onnx.load("./resnet50.onnx")
onnx.checker.check_model(net)           # 检查文件模型是否正确

# 5.优化前后对比&验证
# 优化前
model.eval()
with torch.no_grad():
    output1 = model(image)

# 优化后
import onnxruntime

image = torch.randn(4, 3, 256, 224).cuda()
session = onnxruntime.InferenceSession("./resnet50.onnx")
session.get_modelmeta()
output2 = session.run(['output'], {"input": image.cpu().numpy()})
print("{}vs{}".format(output1.mean(), output2[0].mean()))

Onnx -trt 的转换

# coding utf-8
import os
import time
import torch
import torchvision
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

os.environ["CUDA_VISIBLE_DEVICES"] = '0'

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """
        host_mem: cpu memory
        device_mem: gpu memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def build_engine(onnx_file_path, engine_file_path, max_batch_size=1, fp16_mode=False, save_engine=False):
    """
    Args:
      max_batch_size: 预先指定大小好分配显存
      fp16_mode:      是否采用FP16
      save_engine:    是否保存引擎
    return:
      ICudaEngine
    """
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # 反序列化

    # 如果是动态输入，需要显式指定EXPLICIT_BATCH
    EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    # builder创建计算图 INetworkDefinition
    with trt.Builder(TRT_LOGGER) as builder,  builder.create_network(EXPLICIT_BATCH) as network,  builder.create_builder_config() as config, trt.OnnxParser(network, TRT_LOGGER) as parser:  # 使用onnx的解析器绑定计算图
        builder.max_workspace_size = 1 << 60  # ICudaEngine执行时GPU最大需要的空间
        builder.max_batch_size = max_batch_size  # 执行时最大可以使用的batchsize
        builder.fp16_mode = fp16_mode
        config.max_workspace_size = 1 << 30  # 1G

        # 动态输入profile优化
        profile = builder.create_optimization_profile()
        profile.set_shape("input", (1, 3, 224, 224), (8, 3, 224, 224), (8, 3, 224, 224))
        config.add_optimization_profile(profile)

        # 解析onnx文件，填充计算图
        if not os.path.exists(onnx_file_path):
            quit("ONNX file {} not found!".format(onnx_file_path))
        print('loading onnx file from path {} ...'.format(onnx_file_path))
        with open(onnx_file_path, 'rb') as model:
            print("Begining onnx file parsing")
            if not parser.parse(model.read()):  # 解析onnx文件
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))  # 打印解析错误日志
                return None

        last_layer = network.get_layer(network.num_layers - 1)
        # Check if last layer recognizes it's output
        if not last_layer.get_output(0):
            # If not, then mark the output using TensorRT API
            network.mark_output(last_layer.get_output(0))
        print("Completed parsing of onnx file")

        # 使用builder创建CudaEngine
        print("Building an engine from file{}' this may take a while...".format(onnx_file_path))
        # engine=builder.build_cuda_engine(network)    # 非动态输入使用
        engine = builder.build_engine(network, config)  # 动态输入使用
        print("Completed creating Engine")
        if save_engine:
            with open(engine_file_path, 'wb') as f:
                f.write(engine.serialize())
        return engine


def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size  # 非动态输入
        # size = trt.volume(engine.get_binding_shape(binding))                       # 动态输入
        size = abs(size)  # 上面得到的size(0)可能为负数，会导致OOM
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)  # 创建锁业内存
        device_mem = cuda.mem_alloc(host_mem.nbytes)  # cuda分配空间
        bindings.append(int(device_mem))  # binding在计算图中的缓冲地址
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream


def inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    # 如果创建network时显式指定了batchsize，使用execute_async_v2, 否则使用execute_async
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # gpu to cpu
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


if __name__ == '__main__':
    onnx_file_path = "resnet50.onnx"
    fp16_mode = False
    max_batch_size = 1
    trt_engine_path = "resnet50.trt"

    # 1.创建cudaEngine
    engine = build_engine(onnx_file_path, trt_engine_path, max_batch_size, fp16_mode)

    # 2.将引擎应用到不同的GPU上配置执行环境
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    # 3.推理
    output_shape = (max_batch_size, 1000)
    dummy_input = np.ones([1, 3, 224, 224], dtype=np.float32)
    inputs[0].host = dummy_input.reshape(-1)

    # 如果是动态输入，需以下设置
    context.set_binding_shape(0, dummy_input.shape)

    t1 = time.time()
    trt_outputs = inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    t2 = time.time()
    # 由于tensorrt输出为一维向量，需要reshape到指定尺寸
    feat = postprocess_the_outputs(trt_outputs[0], output_shape)

    # 4.速度对比
    model = torchvision.models.resnet50(pretrained=True).cuda()
    model = model.eval()
    dummy_input = torch.zeros((1, 3, 224, 224), dtype=torch.float32).cuda()
    t3 = time.time()
    feat_2 = model(dummy_input)
    t4 = time.time()
    feat_2 = feat_2.cpu().data.numpy()

    mse = np.mean((feat - feat_2) ** 2)
    print("TensorRT engine time cost: {}".format(t2 - t1))
    print("PyTorch model time cost: {}".format(t4 - t3))
    print('MSE Error = {}'.format(mse))