Classification model: PyTorch -> ONNX -> engine

import torch
from model import resnet18
import os
 
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '1' 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
def model_converter():
    # model = resnet18(num_classes=2)
    # model.load_state_dict(torch.load("resnet18.pth"))
    # model.to(device)  # use this path instead if only a state_dict was saved; here the full model was saved
    
    model = torch.load("resnet18.pth")  # loads the full model object (saved with torch.save(model, ...))
    model.to(device)
    model.eval()
    dummy_input = torch.randn(1, 3, 224, 224).to(device)
    input_names = ['input']
    output_names = ['output']
    torch.onnx.export(model, dummy_input, 'resnet18.onnx',
                      export_params=True,
                      verbose=True,
                      opset_version=11,
                      do_constant_folding=False,
                      input_names=input_names,
                      output_names=output_names,
                      training=torch.onnx.TrainingMode.EVAL)  # the model is in eval mode
                      # dynamic_axes={'input': {0: 'batch_size'},    # variable length axes
                      #               'output': {0: 'batch_size'}})
 
model_converter()
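
Before building the engine it is worth sanity-checking the exported file. A minimal sketch, assuming the onnx and onnxruntime packages are installed; the 'input' key matches the input_names used in the export above:

import numpy as np
import onnx
import onnxruntime as ort

# Structural validation of the exported graph
onnx.checker.check_model(onnx.load('resnet18.onnx'))

# Run a dummy input through onnxruntime ('input' matches input_names above)
dummy = np.random.randn(1, 3, 224, 224).astype(np.float32)
sess = ort.InferenceSession('resnet18.onnx')
ort_out = sess.run(None, {'input': dummy})[0]
print(ort_out.shape)  # expected: (1, num_classes)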

To convert the ONNX model to an engine and run it with TensorRT:

import os
import time

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
from PIL import Image

### get_img_np_nchw and postprocess_the_outputs should be adapted to your own model as needed
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0' 
TRT_LOGGER = trt.Logger()
def softmax(out_np, axis):
    s_value = np.exp(out_np) / np.sum(np.exp(out_np), axis=axis, keepdims=True)
    return s_value

def get_img_np_nchw(img_path):
    image = Image.open(img_path).convert('RGB')
    image = np.asarray(image, dtype='float32')
    image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_CUBIC)
    img_in = np.transpose(image, (2, 0, 1)).astype(np.float32)  # (3, 224, 224)
    img_in /= 255.0  # normalize to [0, 1]
    miu = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    img_in -= miu
    img_in /= std
    img_in = img_in[np.newaxis]
    image = np.tile(img_in, (max_batch_size, 1, 1, 1))  # max_batch_size is a module-level global
    return image

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """host_mom指代cpu内存,device_mem指代GPU内存
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # For an explicit-batch engine, get_binding_shape already includes the
        # batch dimension and max_batch_size is 1, so this sizing works there too.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", fp16_mode=False, int8_mode=False, save_engine=False):
    """
    params max_batch_size:      fixed in advance so device memory can be pre-allocated
    params onnx_file_path:      path to the ONNX model
    params engine_file_path:    path where the serialized engine is (or will be) stored
    params fp16_mode:           whether to build with FP16 enabled
    params int8_mode:           whether to build with INT8 enabled
    params save_engine:         whether to save the engine to disk
    returns:                    ICudaEngine
    """
    # If a serialized engine already exists, deserialize it into an ICudaEngine directly
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, \
            trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize
    else:  # build the engine from the ONNX file
        
        # Create a builder from the logger; the builder creates an
        # INetworkDefinition (the computation graph)
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        # In TensorRT 7.0 the ONNX parser only supports full-dimensions mode, so the
        # network definition must be created with the explicit-batch flag set.

        with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(explicit_batch) as network,  \
            trt.OnnxParser(network, TRT_LOGGER) as parser, \
            builder.create_builder_config() as config:  # the ONNX parser above will populate this network
            profile = builder.create_optimization_profile()
            # profile.set_shape("input", (1, 3, 224, 224), (4, 3, 224, 224), (8, 3, 224, 224))
            # config.add_optimization_profile(profile)

            config.max_workspace_size = 1 << 28  # workspace the engine may use on the GPU at build/run time
            builder.max_batch_size = max_batch_size  # ignored by explicit-batch networks, kept for clarity
            if fp16_mode:
                config.set_flag(trt.BuilderFlag.FP16)  # replaces the deprecated builder.fp16_mode
            # Parse the ONNX file and populate the network definition
            if not os.path.exists(onnx_file_path):
                quit("ONNX file {} not found!".format(onnx_file_path))
            print('loading onnx file from path {} ...'.format(onnx_file_path))
            # Alternative: parse from an already-opened file object
            # with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
            #     parser.parse(model.read())
            if not parser.parse_from_file(onnx_file_path):  # the parser can also read the file directly
                for i in range(parser.num_errors):
                    print(parser.get_error(i))

            print("Completed parsing of onnx file")
            # Once the network is populated, build the ICudaEngine from it
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))

            print(network.get_layer(network.num_layers - 1).get_output(0).shape)
            # network.mark_output(network.get_layer(network.num_layers - 1).get_output(0))
            engine = builder.build_engine(network, config)  # network is the populated INetworkDefinition
            print("Completed creating Engine")
            if save_engine:  # save the engine so later runs can deserialize it directly
                with open(engine_file_path, 'wb') as f:
                    f.write(engine.serialize())  # serialize
            return engine
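
# Note: a hedged sketch for dynamic batch sizes (assuming the TensorRT 7
# dynamic-shape API): export the ONNX with the dynamic_axes argument shown
# (commented out) in the export script, then enable the optimization profile
# inside get_engine:
#     profile.set_shape("input", (1, 3, 224, 224), (4, 3, 224, 224), (8, 3, 224, 224))
#     config.add_optimization_profile(profile)
# and call context.set_binding_shape(0, (batch, 3, 224, 224)) before inference.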

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data from the host (CPU) to the device (GPU).
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference. Explicit-batch engines use the v2 API; the batch size is
    # already baked into the bindings, so batch_size is not passed here.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def postprocess_the_outputs(outputs, shape_of_output):
    outputs = outputs.reshape(*shape_of_output)
    out = np.argmax(softmax(outputs, axis=1)[0, ...], axis=0)
    return out
# Verify that the TensorRT model produces correct results
onnx_model_path = 'resnet18.onnx'
max_batch_size = 1
img_path = '0001.jpg'

image = get_img_np_nchw(img_path)
fp16_mode = False

trt_engine_path = 'resnet18.engine'
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode)
# Create the context for this engine
context = engine.create_execution_context()

# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # host/device buffers and device binding pointers

# Do inference
np.copyto(inputs[0].host, image.reshape(-1))  # copy into the page-locked buffer instead of replacing it
shape_of_output = (max_batch_size, 2)  # (batch, num_classes); must match the exported model (num_classes=2 above)
# np.copyto(inputs[1].host, ...) for a second input
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # list of flat numpy arrays
t2 = time.time()
print('TensorRT inference time: {:.4f}s'.format(t2 - t1))
# feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
labels_sm = softmax(trt_outputs[0].reshape(shape_of_output), axis=1)
print(labels_sm)
labels_max = np.argmax(labels_sm, axis=1)
print(labels_max)
print('TensorRT ok')
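
As a final check, the engine's output can be compared against the original PyTorch model on the same preprocessed image. A minimal sketch, assuming resnet18.pth stores the full model object as in the export script:

# Sketch (assumption): cross-check against the PyTorch model saved by the export script
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.load('resnet18.pth').to(device).eval()
with torch.no_grad():
    # image is the preprocessed NCHW batch produced by get_img_np_nchw above
    torch_out = model(torch.from_numpy(image).to(device)).cpu().numpy()
print('max abs diff vs TensorRT:',
      np.abs(torch_out - trt_outputs[0].reshape(shape_of_output)).max())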



