Environment: CUDA 10.2 / torch 1.6.0
Usage:
- 1_torch2onnx.py  converts the PyTorch model to ONNX format
- 2_onnx2trt.py    converts the ONNX model to a TensorRT engine
- 3_inference.py   runs inference with the TensorRT engine
Step 1: Set up the TensorRT environment; see the following link for details:
https://zongxp.blog.csdn.net/article/details/86077553
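After setup, a quick way to confirm the environment is usable (a minimal sketch; run it in the same Python environment as the scripts below):

import tensorrt
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context; fails if the driver setup is broken
print(tensorrt.__version__)   # the code below targets TensorRT 7.x
print(cuda.Device(0).name())  # the GPU that will run the engine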
Step 2: Convert the PyTorch model to ONNX (keep the input size identical to training):
Note: you need a trained .pth model beforehand, and you must know the model's input and output. In this example the model is resnet18 and the input is (1, 3, 32, 32), i.e. batch_size=1, 3 channels, 32x32 resolution; in short, one 32x32 RGB color image per forward pass.
import torch
from resnet import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def model_converter():
    model = resnet18(num_classes=2)
    # Load the trained weights (a state_dict, not a complete pickled model)
    model.load_state_dict(torch.load("resnet18.pth", map_location=device))
    model.to(device)
    model.eval()
    # The dummy input must match the training input size
    dummy_input = torch.randn(1, 3, 32, 32, device=device)
    input_names = ['data']
    output_names = ['fc']
    torch.onnx.export(model, dummy_input, 'resnet18.onnx',
                      export_params=True,
                      verbose=True,
                      input_names=input_names,
                      output_names=output_names)

model_converter()
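Optionally, you can sanity-check the exported file before building the engine. This is a minimal sketch assuming the onnx and onnxruntime packages are installed (neither is required by the scripts in this repo):

import onnx
import onnxruntime as ort
import numpy as np

onnx_model = onnx.load('resnet18.onnx')
onnx.checker.check_model(onnx_model)  # raises if the graph is malformed

# One forward pass on random data; the names match the export above
sess = ort.InferenceSession('resnet18.onnx')
out = sess.run(['fc'], {'data': np.random.randn(1, 3, 32, 32).astype(np.float32)})
print(out[0].shape)  # expected: (1, 2) for num_classes=2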
Step 3: Convert the ONNX model to a TensorRT engine (boilerplate; only the paths need changing):
import os
import tensorrt as trt

TRT_LOGGER = trt.Logger()
model_path = 'resnet18.onnx'
engine_file_path = "resnet18.trt"
# The ONNX parser requires an explicit-batch network definition
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) \
        as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
    builder.max_workspace_size = 1 << 28  # 256 MiB of builder workspace
    builder.max_batch_size = 1
    if not os.path.exists(model_path):
        print('ONNX file {} not found.'.format(model_path))
        exit(0)
    print('Loading ONNX file from path {}...'.format(model_path))
    with open(model_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            exit(1)
    network.get_input(0).shape = [1, 3, 32, 32]
    print('Completed parsing of ONNX file')
    engine = builder.build_cuda_engine(network)
    assert engine is not None, 'Engine build failed'
    with open(engine_file_path, "wb") as f:
        f.write(engine.serialize())
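If the build succeeds, resnet18.trt is written next to the ONNX file. A quick way to confirm the engine deserializes correctly (a minimal sketch; the full inference script in step 4 does this and more):

import tensorrt as trt

TRT_LOGGER = trt.Logger()
with open("resnet18.trt", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
assert engine is not None, "engine failed to deserialize"
print(engine.get_binding_shape(0))  # expected: (1, 3, 32, 32)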
Step 4: Run inference with the TensorRT engine (essentially, you only need to write your own preprocessing code):
The core is the inference script, 3_inference.py.
TensorRT 7.0 inference steps:
1> Input preprocessing: reproduce the preprocessing used during training, so the input image format matches exactly.
2> Buffer allocation: the allocate_buffers function; no changes needed.
3> Inference: the do_inference_v2 function; no changes needed. For batched inference, use the do_inference function instead (a sketch is included after do_inference_v2 below).
4> Result: TensorRT returns the results as a list of arrays.
import pycuda.driver as cuda
import pycuda.autoinit  # importing this initializes CUDA and creates a context
import cv2
import numpy as np
import os
import tensorrt as trt
import time
from PIL import Image
TRT_LOGGER = trt.Logger()
engine_file_path = "resnet18.trt"
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
# Inference helper; use it as-is.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
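# Note: step 3> above also mentions do_inference for batched inference. What
# follows is a sketch of that implicit-batch variant, modeled on the helper in
# the NVIDIA TensorRT Python samples; the explicit-batch engine built in this
# repo only needs do_inference_v2, so treat this as reference code.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference with an explicit batch_size (implicit-batch API).
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU and wait for completion.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    return [out.host for out in outputs]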
i = 0  # class-0 predictions for val/white/
j = 0  # class-1 predictions for val/white/
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print(inputs, outputs, bindings, stream)
    dir = "val/white/"
    for name in os.listdir(dir):
        # Preprocessing: must reproduce the preprocessing used at training time
        t1 = time.time()  # time.clock() was deprecated and removed in Python 3.8
        image_path = os.path.join(dir, name)
        img = Image.open(image_path)
        img = np.array(img)
        img = img.transpose((2, 0, 1))  # HWC -> CHW
        img = img.astype(np.float32) / 255.0
        img = img[np.newaxis, :, :, :]  # add the batch dimension: (1, 3, 32, 32)
        print(img.shape)
        img = np.ascontiguousarray(img)
        # Preprocessing done; run inference
        inputs[0].host = img
        trt_outputs = do_inference_v2(context, bindings=bindings,
                                      inputs=inputs, outputs=outputs, stream=stream)
        print(trt_outputs)
        # The larger of the two logits gives the predicted class
        if trt_outputs[0][0] > trt_outputs[0][1]:
            print("0")
            i = i + 1
        else:
            print("1")
            j = j + 1
        print("Time:", time.time() - t1)
m = 0  # class-0 predictions for val/yellow/
n = 0  # class-1 predictions for val/yellow/
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print(inputs, outputs, bindings, stream)
    dir = "val/yellow/"
    for name in os.listdir(dir):
        t1 = time.time()
        image_path = os.path.join(dir, name)
        image = cv2.imread(image_path)  # OpenCV loads images as BGR
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img = img.transpose((2, 0, 1))  # HWC -> CHW
        img = img.astype(np.float32) / 255.0
        img = img[np.newaxis, :, :, :]  # add the batch dimension: (1, 3, 32, 32)
        print(img.shape)
        img = np.ascontiguousarray(img)
        inputs[0].host = img
        # Run inference
        trt_outputs = do_inference_v2(context, bindings=bindings,
                                      inputs=inputs, outputs=outputs, stream=stream)
        print(trt_outputs)
        if trt_outputs[0][0] > trt_outputs[0][1]:
            print("0")
            m = m + 1
        else:
            print("1")
            n = n + 1
        print("Time:", time.time() - t1)
# Per-class prediction counts for the two validation folders
print("i = ", i)
print("j = ", j)
print("m = ", m)
print("n = ", n)
If the script prints a list of outputs for each image, everything works.
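The raw outputs are unnormalized logits, so the comparison above is just an argmax over two values. If you want probabilities instead, a small softmax helper works; this is a minimal numpy-only sketch (the softmax function is not part of the original scripts):

import numpy as np

def softmax(logits):
    # Shift by the max for numerical stability before exponentiating
    e = np.exp(logits - np.max(logits))
    return e / e.sum()

# trt_outputs[0] holds the two class logits from the 'fc' binding
probs = softmax(trt_outputs[0][:2])
print("class 0: {:.3f}, class 1: {:.3f}".format(probs[0], probs[1]))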