Example Code for TensorRT Acceleration

For the detailed workflow, please read the blog post "TensorRT加速"; this post adds a few notes on top of it. The example here is semantic segmentation, so the model input has shape (3, 512, 512) and the output has shape (512, 512).

import sys
sys.path.insert(0, '.')
import argparse
import torch
import torch.nn as nn
from PIL import Image
import numpy as np
import cv2

import lib.transform_cv2 as T
from lib.models import model_factory
from configs import set_cfg_from_file
import onnx
import onnxruntime

torch.set_grad_enabled(False)
np.random.seed(123)

# args
parse = argparse.ArgumentParser()
parse.add_argument('--config', dest='config', type=str, default='configs/bisenetv1_steel_t.py',)
parse.add_argument('--weight-path', type=str, default='./res/model_final_120_0.864089846611023.pth',)
parse.add_argument('--img-path', dest='img_path', type=str, default='./datasets/steel_total/image/train/11_7.jpg',)
args = parse.parse_args()
cfg = set_cfg_from_file(args.config)

# define model
net = model_factory[cfg.model_type](cfg.n_cats, aux_mode='pred')
net.load_state_dict(torch.load(args.weight_path, map_location='cpu'), strict=False)
# set the model to evaluation mode before exporting
net.eval()

# input names (a list; a model may have more than one input)
input_names = ['input']
# output names (a list; a model may have more than one output)
output_names = ['output']
# dummy input used for tracing and for verifying the exported ONNX model
dummy_input = torch.rand(1, 3, 512, 512)
output_path = "bisenet.onnx"
# export to ONNX
torch.onnx.export(net, dummy_input, output_path,
                  export_params=True,
                  opset_version=11,
                  do_constant_folding=True,
                  input_names=input_names,
                  output_names=output_names)

# load the exported ONNX model and create an ONNX Runtime session
onnx_model = onnx.load("bisenet.onnx")
onnx_model_graph = onnx_model.graph
onnx_session = onnxruntime.InferenceSession(onnx_model.SerializeToString())

# run the ONNX model on a random input tensor
x = torch.randn(1, 3, 512, 512).numpy()
onnx_output = onnx_session.run(output_names, {input_names[0]: x})[0]

print(f"PyTorch output: {net(torch.from_numpy(x)).detach().numpy()[0, :5]}")
print(f"ONNX output: {onnx_output[0, :5]}")

The following is adapted from the blog post "TensorRT加速方法介绍(python pytorch模型)".
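
The inference code below reads an already serialized engine file (bisenet_engine.trt) but does not show how it is produced. A minimal build sketch is given here, assuming the TensorRT 7/8-era Python API that the rest of the code also relies on (max_batch_size, binding helpers); the workspace size and file names are assumptions, and the trtexec tool (--onnx / --saveEngine) can produce the same file from the command line.

import tensorrt as trt

TRT_LOGGER = trt.Logger()

def build_engine(onnx_path="bisenet.onnx", engine_path="bisenet_engine.trt"):
    builder = trt.Builder(TRT_LOGGER)
    # the ONNX parser requires an explicit-batch network definition
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("failed to parse " + onnx_path)
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1 GiB scratch space (assumed)
    engine = builder.build_engine(network, config)
    with open(engine_path, "wb") as f:
        f.write(engine.serialize())

build_engine()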

import torch
import torchvision
from PIL import Image
from torchvision import transforms
import torchvision.models as models
import matplotlib.pyplot as plt
import time
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import pdb
import os
import numpy as np
import cv2

# This logger is required to build an engine
TRT_LOGGER = trt.Logger()

filename = "./datasets/steel_total/image/train/11_1.jpg"
engine_file_path = "bisenet_engine.trt"


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mom means the cpu memory and device means the GPU memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]

    # Run inference.
    t_model = time.perf_counter()
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    print(f'only one line cost:{time.perf_counter() - t_model:.8f}s')

    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]

    # Synchronize the stream
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]


print("Reading engine from file {}".format(engine_file_path))
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# create the context for this engine
context = engine.create_execution_context()

# allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # input, output: host # bindings

normalize = transforms.Normalize(mean=(0.3442322, 0.3442322, 0.3442322), # city, rgb
    std=(0.21136102, 0.21136102, 0.21136102))

transform = transforms.Compose([
    transforms.Resize(512),
    transforms.ToTensor(),
    normalize,
])

t_model = time.perf_counter()

# read the input image
img = Image.open(filename)
#print(img.size)

# preprocess: resize, convert to tensor, normalize
img_p = transform(img)
#print(img_p.shape)

# add a batch dimension
img_normalize = torch.unsqueeze(img_p, 0)
#print(img_normalize.shape)

# output
#shape_of_output = (512, 512)

# convert to a numpy array
img_normalize_np = img_normalize.cpu().data.numpy()

# Load data to the buffer
inputs[0].host = img_normalize_np
#print(inputs[0].host.shape)

# Do Inference
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
print(f'do inference cost:{time.perf_counter() - t_model:.8f}s')

print(len(trt_outputs))

# binary segmentation: scale class indices {0, 1} to {0, 255} for visualization
pred = trt_outputs[0].reshape(512, 512) * 255
#pred = palette[out]
cv2.imwrite('./res.jpg', pred)
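
For a model with more than two classes, the commented-out palette lookup is the usual route. A minimal sketch, assuming a hypothetical random color table with one entry per class (use cfg.n_cats in practice):

# hypothetical multi-class colorization, mirroring the commented-out `pred = palette[out]`
n_cats = 19  # assumed number of classes
palette = np.random.randint(0, 256, (n_cats, 3), dtype=np.uint8)
out = trt_outputs[0].reshape(512, 512).astype(np.int64)
color = palette[out]          # (512, 512, 3) image, one color per class
cv2.imwrite('./res_color.jpg', color)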






