import torch
import torchvision
from PIL import Image
from torchvision import transforms
import torchvision.models as models
import matplotlib.pyplot as plt
import time
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import pdb
import os
import numpy as np
import cv2
# This logger is required to build an engine
TRT_LOGGER = trt.Logger()
filename = "./datasets/steel_total/image/train/11_1.jpg"
engine_file_path = "bisenet_engine.trt"


class HostDeviceMem(object):
    """Pair a host (CPU) buffer with its corresponding device (GPU) buffer.

    Within this context, host_mem means the CPU memory and device means the
    GPU memory.
    """

    def __init__(self, host_mem, device_mem):
        # host_mem: page-locked numpy array (from cuda.pagelocked_empty)
        # device_mem: pycuda DeviceAllocation of the same byte size
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of a TensorRT engine.

    Args:
        engine: a deserialized trt.ICudaEngine; iterating it yields binding names.

    Returns:
        (inputs, outputs, bindings, stream) where
        inputs/outputs are lists of HostDeviceMem,
        bindings is the list of device pointers (ints) in binding order,
        stream is a new cuda.Stream used for async copies/execution.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # Element count for this binding, scaled to the engine's max batch size.
        # NOTE(review): get_binding_shape/max_batch_size are implicit-batch-era
        # TensorRT APIs (deprecated in TRT 8+) — confirm the installed version.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate page-locked host memory and matching device memory.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one async inference pass and return the host-side outputs.

    Args:
        context: trt.IExecutionContext created from the engine.
        bindings: list of device pointers (ints) in binding order.
        inputs/outputs: lists of HostDeviceMem; inputs[i].host must already
            hold the input data.
        stream: cuda.Stream used for all async operations.
        batch_size: implicit batch size passed to execute_async.

    Returns:
        List of host output arrays (one per output binding).
    """
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    t_model = time.perf_counter()
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    print(f'only one line cost:{time.perf_counter() - t_model:.8f}s')
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream so host buffers are valid before returning.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


print("Reading engine from file {}".format(engine_file_path))
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
# Create the execution context for this engine.
context = engine.create_execution_context()
# Allocate host/device buffers for input and output bindings.
inputs, outputs, bindings, stream = allocate_buffers(engine)

# Per-channel normalization; identical stats per channel (grayscale-like data).
normalize = transforms.Normalize(mean=(0.3442322, 0.3442322, 0.3442322),  # city, rgb
                                 std=(0.21136102, 0.21136102, 0.21136102))
transform = transforms.Compose([
    transforms.Resize(512),
    transforms.ToTensor(),
    normalize])

t_model = time.perf_counter()
# Read the image.
img = Image.open(filename)
# Resize + normalize.
img_p = transform(img)
# Add a batch dimension: (C, H, W) -> (1, C, H, W).
img_normalize = torch.unsqueeze(img_p, 0)
# Convert to numpy.
img_normalize_np = img_normalize.cpu().data.numpy()
# Load data to the input buffer.
# NOTE(review): this rebinds .host to a regular (non-page-locked) array;
# memcpy_htod_async still works but loses the pinned-memory fast path — and
# the array's size/dtype must match the binding. Confirm against the engine.
inputs[0].host = img_normalize_np
# Do inference.
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
print(f'do inference cost:{time.perf_counter() - t_model:.8f}s')
print(len(trt_outputs))
# Scale the flat 512x512 prediction to 0-255 for writing as an image.
pred = trt_outputs[0].reshape(512, 512) * 255
cv2.imwrite('./res.jpg', pred)