Example Code for TensorRT Acceleration

For the detailed workflow, please read the blog post "TensorRT加速"; this post adds a few notes on top of it. The example here is semantic segmentation, so the model input has shape (3, 512, 512) and the output has shape (512, 512).

import sys
sys.path.insert(0, '.')
import argparse
import torch
import torch.nn as nn
from PIL import Image
import numpy as np
import cv2

import lib.transform_cv2 as T
from lib.models import model_factory
from configs import set_cfg_from_file
import onnx
import onnxruntime

torch.set_grad_enabled(False)
np.random.seed(123)

# args
parse = argparse.ArgumentParser()
parse.add_argument('--config', dest='config', type=str, default='configs/bisenetv1_steel_t.py',)
parse.add_argument('--weight-path', type=str, default='./res/model_final_120_0.864089846611023.pth',)
parse.add_argument('--img-path', dest='img_path', type=str, default='./datasets/steel_total/image/train/11_7.jpg',)
args = parse.parse_args()
cfg = set_cfg_from_file(args.config)

# define model
net = model_factory[cfg.model_type](cfg.n_cats, aux_mode='pred')
net.load_state_dict(torch.load(args.weight_path, map_location='cpu'), strict=False)
# set the model to evaluation mode before exporting
net.eval()

# input names (a list; a model may have more than one input)
input_names = ['input']
# output names (a list; a model may have more than one output)
output_names = ['output']
# dummy input used for tracing and for verifying the exported ONNX model
dummy_input = torch.rand(1, 3, 512, 512)
output_path = "bisenet.onnx"
# export to ONNX
torch.onnx.export(net, dummy_input, output_path,
                  export_params=True,
                  opset_version=11,
                  do_constant_folding=True,
                  input_names=input_names,
                  output_names=output_names)

# load the exported ONNX model and create an ONNX Runtime session
onnx_model = onnx.load("bisenet.onnx")
onnx_model_graph = onnx_model.graph
onnx_session = onnxruntime.InferenceSession(onnx_model.SerializeToString())

# run the ONNX model on a random input tensor
x = torch.randn(1, 3, 512, 512).numpy()
onnx_output = onnx_session.run(output_names, {input_names[0]: x})[0]

print(f"PyTorch output: {net(torch.from_numpy(x)).detach().numpy()[0, :5]}")
print(f"ONNX output: {onnx_output[0, :5]}")

The following is adapted from the blog post "TensorRT加速方法介绍(python pytorch模型)".
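
The inference code below reads an already serialized engine file (bisenet_engine.trt) but does not show how it is produced. A minimal build sketch is given here, assuming the TensorRT 7/8-era Python API that the rest of the code also relies on (max_batch_size, binding helpers); the workspace size and file names are assumptions, and the trtexec tool (--onnx / --saveEngine) can produce the same file from the command line.

import tensorrt as trt

TRT_LOGGER = trt.Logger()

def build_engine(onnx_path="bisenet.onnx", engine_path="bisenet_engine.trt"):
    builder = trt.Builder(TRT_LOGGER)
    # the ONNX parser requires an explicit-batch network definition
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("failed to parse " + onnx_path)
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1 GiB scratch space (assumed)
    engine = builder.build_engine(network, config)
    with open(engine_path, "wb") as f:
        f.write(engine.serialize())

build_engine()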

import torch
import torchvision
from PIL import Image
from torchvision import transforms
import torchvision.models as models
import matplotlib.pyplot as plt
import time
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import pdb
import os
import numpy as np
import cv2

# This logger is required to build an engine
TRT_LOGGER = trt.Logger()

filename = "./datasets/steel_total/image/train/11_1.jpg"
engine_file_path = "bisenet_engine.trt"


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mom means the cpu memory and device means the GPU memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]

    # Run inference.
    t_model = time.perf_counter()
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    print(f'only one line cost:{time.perf_counter() - t_model:.8f}s')

    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]

    # Synchronize the stream
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]


print("Reading engine from file {}".format(engine_file_path))
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# create the context for this engine
context = engine.create_execution_context()

# allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # input, output: host # bindings

normalize = transforms.Normalize(mean=(0.3442322, 0.3442322, 0.3442322), # city, rgb
    std=(0.21136102, 0.21136102, 0.21136102))

transform = transforms.Compose([
    transforms.Resize(512),
    transforms.ToTensor(),
    normalize,
])

t_model = time.perf_counter()

# read the input image
img = Image.open(filename)
#print(img.size)

# preprocess: resize, convert to tensor, normalize
img_p = transform(img)
#print(img_p.shape)

# add a batch dimension
img_normalize = torch.unsqueeze(img_p, 0)
#print(img_normalize.shape)

# output
#shape_of_output = (512, 512)

# convert to a numpy array
img_normalize_np = img_normalize.cpu().data.numpy()

# Load data to the buffer
inputs[0].host = img_normalize_np
#print(inputs[0].host.shape)

# Do Inference
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
print(f'do inference cost:{time.perf_counter() - t_model:.8f}s')

print(len(trt_outputs))

# binary segmentation: scale class indices {0, 1} to {0, 255} for visualization
pred = trt_outputs[0].reshape(512, 512) * 255
#pred = palette[out]
cv2.imwrite('./res.jpg', pred)
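
For a model with more than two classes, the commented-out palette lookup is the usual route. A minimal sketch, assuming a hypothetical random color table with one entry per class (use cfg.n_cats in practice):

# hypothetical multi-class colorization, mirroring the commented-out `pred = palette[out]`
n_cats = 19  # assumed number of classes
palette = np.random.randint(0, 256, (n_cats, 3), dtype=np.uint8)
out = trt_outputs[0].reshape(512, 512).astype(np.int64)
color = palette[out]          # (512, 512, 3) image, one color per class
cv2.imwrite('./res_color.jpg', color)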






