A hands-on walkthrough: converting a PyTorch model to ONNX and then to TensorRT

Environment: CUDA 10.2 / torch 1.6.0

Scripts to run:
-1_torch2onnx.py  convert the PyTorch model to ONNX
-2_onnx2trt.py    convert the ONNX model to a TensorRT engine
-3_inference.py   run inference with the TensorRT engine

Step 1: set up the TensorRT environment; see this link for details:

https://zongxp.blog.csdn.net/article/details/86077553
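Once it is installed, a quick check that the Python bindings and versions match this tutorial's environment (a minimal sketch):

import tensorrt as trt
import torch

print(trt.__version__)            # this walkthrough targets TensorRT 7.x
print(torch.__version__)          # 1.6.0 here
print(torch.version.cuda)         # 10.2 here
print(torch.cuda.is_available())  # must be True for GPU inference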

Step 2: convert the PyTorch model to ONNX (keep the input size identical to training):

Note: you need a trained .pth model in advance, and you must know the model's input and output shapes. Here I use resnet18 with input (1, 3, 32, 32): batch_size = 1, 3 channels, 32*32 resolution. In short, the model takes one 32*32 RGB color image at a time.

import torch
from resnet import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def model_converter():
    model = resnet18(num_classes=2)
    model.load_state_dict(torch.load("resnet18.pth"))  # load the trained weights (a state_dict, not the full model)
    model.to(device)
    model.eval()

    # Dummy input with the training input shape: (batch, channels, height, width)
    dummy_input = torch.randn(1, 3, 32, 32, device=device)
    input_names = ['data']
    output_names = ['fc']
    torch.onnx.export(model, dummy_input, 'resnet18.onnx',
                      export_params=True,
                      verbose=True,
                      input_names=input_names,
                      output_names=output_names)

model_converter()
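Before building the engine, it is worth sanity-checking the exported file. A minimal sketch, assuming the onnx and onnxruntime packages are installed (the file name and the 'data' input name come from the export above):

import numpy as np
import onnx
import onnxruntime as ort

# Structural check: raises an exception if the graph is malformed
onnx.checker.check_model(onnx.load('resnet18.onnx'))

# Run one dummy input through onnxruntime and inspect the output shape
sess = ort.InferenceSession('resnet18.onnx')
dummy = np.random.randn(1, 3, 32, 32).astype(np.float32)
out = sess.run(None, {'data': dummy})
print(out[0].shape)  # expect (1, 2) for this 2-class resnet18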

Step 3: convert ONNX to TensorRT format (boilerplate code; just change the paths):

import os
import tensorrt as trt

TRT_LOGGER = trt.Logger()
model_path = 'resnet18.onnx'
engine_file_path = "resnet18.trt"
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)  # explicit-batch mode, required by the ONNX parser (batch size 1 here)

with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) \
        as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
    builder.max_workspace_size = 1 << 28  # 256 MiB of workspace for tactic selection
    builder.max_batch_size = 1
    if not os.path.exists(model_path):
        print('ONNX file {} not found.'.format(model_path))
        exit(0)
    print('Loading ONNX file from path {}...'.format(model_path))
    with open(model_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            exit(1)  # do not try to build from a partially parsed network

    network.get_input(0).shape = [1, 3, 32, 32]
    print('Completed parsing of ONNX file')
    engine = builder.build_cuda_engine(network)
    if engine is None:
        print('ERROR: engine build failed.')
        exit(1)
    with open(engine_file_path, "wb") as f:
        f.write(engine.serialize())
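Side note: builder.max_workspace_size and build_cuda_engine are marked deprecated in TensorRT 7 in favor of a builder config (the same conversion can also be done from the command line with the trtexec tool that ships with TensorRT). A sketch of the config-based build, assuming TensorRT 7.x:

import tensorrt as trt

TRT_LOGGER = trt.Logger()
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network(EXPLICIT_BATCH) as network, \
        builder.create_builder_config() as config, \
        trt.OnnxParser(network, TRT_LOGGER) as parser:
    config.max_workspace_size = 1 << 28  # workspace now lives on the config
    with open('resnet18.onnx', 'rb') as model:
        if not parser.parse(model.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError('ONNX parse failed')
    engine = builder.build_engine(network, config)  # replaces build_cuda_engine
    with open('resnet18.trt', 'wb') as f:
        f.write(engine.serialize())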

Step 4: run inference with the TensorRT engine (essentially, you only need to write your own preprocessing):

The core is the inference script, 3_inference.py.
TensorRT 7.0 inference steps:
1> Input preprocessing: replicate the training-time preprocessing so the input images match what the model saw during training.
2> Buffer allocation: the allocate_buffers function; no changes needed.
3> Inference: the do_inference_v2 function; no changes needed. For batched (multi-image) inference, use the do_inference variant instead (a sketch of it is included in the script below).
4> Output: TensorRT returns the inference results as a list of flat arrays.

import pycuda.driver as cuda
import pycuda.autoinit
import cv2
import numpy as np
import os
import tensorrt as trt
import time
from PIL import Image

TRT_LOGGER = trt.Logger()
engine_file_path = "resnet18.trt"

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# Inference function (boilerplate; no changes needed)
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
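
# The do_inference variant mentioned above, for running more than one image per
# call. This sketch follows the helper from the official TensorRT samples; it
# uses execute_async, which targets implicit-batch engines (assumption: the
# engine was built with a max_batch_size large enough for your batch).
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    return [out.host for out in outputs]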

# prediction counters for the first directory (i: class 0, j: class 1)
i = 0
j = 0
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print(inputs,outputs,bindings,stream)
    dir = "val/white/"

    for name in os.listdir(dir):
        # 前处理部分
        t1 = time.clock()
        image_path = os.path.join(dir,name)

        img = Image.open(image_path)
        img = np.array(img)
        img = img.transpose((1, 0, 2))
        img = img.transpose((2, 1, 0))
        img = img.astype(np.float32) / 255.0

        img = img[np.newaxis, :, :].astype(np.float32)  # add batch dim -> (1, 3, 32, 32)
        print(img.shape)
        img = np.ascontiguousarray(img)  # TensorRT needs a contiguous buffer
        # end of preprocessing

        # run inference
        inputs[0].host = img
        trt_outputs = do_inference_v2(context, bindings=bindings, \
                                      inputs=inputs, outputs=outputs, stream=stream)
        print(trt_outputs)

        # interpret the raw outputs: the larger logit is the predicted class
        if trt_outputs[0][0] > trt_outputs[0][1]:
            print("0")
            i = i + 1
        else:
            print("1")
            j = j + 1
        print("Time:", time.perf_counter() - t1)

# prediction counters for the second directory (m: class 0, n: class 1)
m = 0
n = 0
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print(inputs,outputs,bindings,stream)
    dir = "val/yellow/"

    for name in os.listdir(dir):
        t1 = time.clock()
        image_path = os.path.join(dir,name)

        image = cv2.imread(image_path)
        img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        img = np.array(img)
        img = img.transpose((1, 0, 2))
        img = img.transpose((2, 1, 0))
        img = img.astype(np.float32) / 255.0

        img = img[np.newaxis, :, :].astype(np.float32)
        print(img.shape)
        img = np.ascontiguousarray(img)
        inputs[0].host = img
        # run inference
        trt_outputs = do_inference_v2(context, bindings=bindings, \
                                      inputs=inputs, outputs=outputs, stream=stream)
        print(trt_outputs)
        if trt_outputs[0][0] > trt_outputs[0][1]:
            print("0")
            m = m + 1
        else:
            print("1")
            n = n + 1
        print("Time:", time.perf_counter() - t1)
# summary: per-directory counts of class-0 / class-1 predictions
print("i = ", i)
print("j = ", j)
print("m = ", m)
print("n = ", n)

If the script prints the output lists normally, the conversion succeeded.
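The class decision above compares raw logits directly. If you want probabilities instead, a minimal postprocessing sketch (assuming, as with this resnet18 head, that the two output values are unnormalized logits):

import numpy as np

def postprocess(logits):
    # Softmax over the flat TensorRT output; subtract the max for numerical stability
    logits = np.asarray(logits, dtype=np.float32)
    exp = np.exp(logits - logits.max())
    probs = exp / exp.sum()
    return probs, int(probs.argmax())

probs, pred = postprocess(trt_outputs[0])
print(probs, pred)  # e.g. [0.9 0.1] 0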
