python使用TensorRT引擎
安装库
pip install pycuda -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
pip install tensorrt-8.2.3.0-cp36-none-win_amd64.whl
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/e7a4d173e64620e2d90c01e7741e9e39.png)
转ONNX
import torch.nn as nn
import torch
from collections import OrderedDict
import torch
import torchvision.models as models
from torch import nn
import numpy as np
import torch
from torchvision import models
from torch import nn
def bilinear_kernel(in_channels, out_channels, kernel_size):
    """Build an (in_channels, out_channels, k, k) bilinear upsampling weight.

    Only the paired (i, i) channel slots receive the 2-D bilinear filter; all
    other entries stay zero — the standard FCN transposed-convolution init.

    Returns:
        a float32 torch tensor suitable for a ConvTranspose2d weight.
    """
    factor = (kernel_size + 1) // 2
    # Integer center for odd kernels, half-pixel offset for even ones.
    center = factor - 1 if kernel_size % 2 == 1 else factor - 0.5
    rows, cols = np.ogrid[:kernel_size, :kernel_size]
    filt = (1 - abs(rows - center) / factor) * (1 - abs(cols - center) / factor)
    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size), dtype=np.float32)
    weight[range(in_channels), range(out_channels), :, :] = filt
    return torch.from_numpy(weight)
# Backbone for the FCN below: VGG16 with batch-norm. pretrained=False, so
# weights come from a later load_state_dict, not a download.
pretrained_net = models.vgg16_bn(pretrained=False)
class FCN(nn.Module):
    """FCN-8s-style segmentation head over the VGG16-BN backbone above.

    Skip connections from stage3/stage4 are fused with the stage5 output and
    upsampled back to input resolution with fixed bilinear deconvolutions.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Split the VGG feature extractor into its five pooling stages.
        self.stage1 = pretrained_net.features[:7]
        self.stage2 = pretrained_net.features[7:14]
        self.stage3 = pretrained_net.features[14:24]
        self.stage4 = pretrained_net.features[24:34]
        self.stage5 = pretrained_net.features[34:]
        # Kept for checkpoint compatibility even though forward() never uses
        # their outputs (the original computed scores1/scores2 into dead
        # locals; that wasted work is removed below).
        self.scores1 = nn.Conv2d(512, num_classes, 1)
        self.scores2 = nn.Conv2d(512, num_classes, 1)
        self.scores3 = nn.Conv2d(128, num_classes, 1)
        self.conv_trans1 = nn.Conv2d(512, 256, 1)
        self.conv_trans2 = nn.Conv2d(256, num_classes, 1)
        # Upsampling layers initialized with fixed bilinear kernels.
        self.upsample_8x = nn.ConvTranspose2d(num_classes, num_classes, 16, 8, 4, bias=False)
        self.upsample_8x.weight.data = bilinear_kernel(num_classes, num_classes, 16)
        self.upsample_2x_1 = nn.ConvTranspose2d(512, 512, 4, 2, 1, bias=False)
        self.upsample_2x_1.weight.data = bilinear_kernel(512, 512, 4)
        self.upsample_2x_2 = nn.ConvTranspose2d(256, 256, 4, 2, 1, bias=False)
        self.upsample_2x_2.weight.data = bilinear_kernel(256, 256, 4)

    def forward(self, x):
        s1 = self.stage1(x)
        s2 = self.stage2(s1)
        s3 = self.stage3(s2)
        s4 = self.stage4(s3)
        s5 = self.stage5(s4)
        # Fuse upsampled stage5 with stage4, reduce channels, upsample again,
        # fuse with stage3, project to classes, then upsample 8x to full size.
        add1 = self.upsample_2x_1(s5) + s4
        add1 = self.conv_trans1(add1)
        add1 = self.upsample_2x_2(add1)
        add2 = add1 + s3
        output = self.conv_trans2(add2)
        return self.upsample_8x(output)
# Plain VGG16 (no BN) used as the SegNet-style encoder below; VGG16_deconv
# switches its MaxPool layers to return_indices=True in its __init__.
vgg16_pretrained = models.vgg16(pretrained=False)
def decoder(input_channel, output_channel, num=3):
    """Build one SegNet decoder stage of `num` 3x3 same-padding convolutions.

    The first num-1 convs keep `input_channel` channels; the last projects to
    `output_channel`. Only num=3 and num=2 are supported.

    Raises:
        ValueError: if num is not 2 or 3. (The original fell through to an
            UnboundLocalError on the return in that case.)
    """
    if num == 3:
        return nn.Sequential(
            nn.Conv2d(input_channel, input_channel, 3, padding=1),
            nn.Conv2d(input_channel, input_channel, 3, padding=1),
            nn.Conv2d(input_channel, output_channel, 3, padding=1))
    if num == 2:
        return nn.Sequential(
            nn.Conv2d(input_channel, input_channel, 3, padding=1),
            nn.Conv2d(input_channel, output_channel, 3, padding=1))
    raise ValueError("decoder() supports num=2 or num=3, got {}".format(num))
class VGG16_deconv(torch.nn.Module):
    """SegNet-style encoder/decoder segmentation network on a VGG16 backbone.

    Max-pool indices recorded during encoding are replayed by MaxUnpool2d
    during decoding so each stage is restored to its pre-pool resolution.
    """

    def __init__(self, num_classes=8):
        super(VGG16_deconv, self).__init__()
        # Switch every VGG max-pool to also return argmax indices,
        # which the unpooling layers require.
        for pool_idx in [4, 9, 16, 23, 30]:
            vgg16_pretrained.features[pool_idx].return_indices = True
        feats = vgg16_pretrained.features
        self.encoder1 = feats[:4]
        self.pool1 = feats[4]
        self.encoder2 = feats[5:9]
        self.pool2 = feats[9]
        self.encoder3 = feats[10:16]
        self.pool3 = feats[16]
        self.encoder4 = feats[17:23]
        self.pool4 = feats[23]
        self.encoder5 = feats[24:30]
        self.pool5 = feats[30]
        # Decoder stages mirror the encoder, shrinking the channel count.
        self.decoder5 = decoder(512, 512)
        self.unpool5 = nn.MaxUnpool2d(2, 2)
        self.decoder4 = decoder(512, 256)
        self.unpool4 = nn.MaxUnpool2d(2, 2)
        self.decoder3 = decoder(256, 128)
        self.unpool3 = nn.MaxUnpool2d(2, 2)
        self.decoder2 = decoder(128, 64, 2)
        self.unpool2 = nn.MaxUnpool2d(2, 2)
        self.decoder1 = decoder(64, num_classes, 2)
        self.unpool1 = nn.MaxUnpool2d(2, 2)

    def forward(self, x):
        # Encode: record each stage's pre-pool size and pooling indices.
        enc1 = self.encoder1(x)
        size1 = enc1.size()
        pooled1, idx1 = self.pool1(enc1)
        enc2 = self.encoder2(pooled1)
        size2 = enc2.size()
        pooled2, idx2 = self.pool2(enc2)
        enc3 = self.encoder3(pooled2)
        size3 = enc3.size()
        pooled3, idx3 = self.pool3(enc3)
        enc4 = self.encoder4(pooled3)
        size4 = enc4.size()
        pooled4, idx4 = self.pool4(enc4)
        enc5 = self.encoder5(pooled4)
        size5 = enc5.size()
        pooled5, idx5 = self.pool5(enc5)
        # Decode: unpool with the saved indices, then convolve.
        dec5 = self.decoder5(self.unpool5(input=pooled5, indices=idx5, output_size=size5))
        dec4 = self.decoder4(self.unpool4(input=dec5, indices=idx4, output_size=size4))
        dec3 = self.decoder3(self.unpool3(input=dec4, indices=idx3, output_size=size3))
        dec2 = self.decoder2(self.unpool2(input=dec3, indices=idx2, output_size=size2))
        dec1 = self.decoder1(self.unpool1(input=dec2, indices=idx1, output_size=size1))
        return dec1
# --- Export pipeline: checkpoint -> TorchScript trace -> ONNX ---
net = VGG16_deconv(num_classes=3)
net.eval()
# Relative checkpoint path: must be run from the directory holding the file.
net.load_state_dict(torch.load(r'.\best_model.pth'))
# Trace with a fixed 1x3x640x640 example input.
trace = torch.jit.trace(net, torch.randn(1, 3, 640, 640))
torch.jit.save(trace,'FCN_model2.pt')
model = torch.load('FCN_model2.pt')
model.eval()
model.cuda()
input_x = torch.randn(1,3,640,640).cuda()
# Export to ONNX at opset 11 with every input axis marked dynamic.
# NOTE(review): dynamic_axes only covers 'inputs'; the 'outputs' tensor keeps
# a fixed shape in the exported graph — confirm this is intended.
res = torch.onnx.export(model,
                        input_x,
                        'FCN_model2.onnx',
                        input_names=['inputs'],
                        output_names=['outputs'],
                        opset_version=11,
                        dynamic_axes={"inputs": {0: "bs",1: "channel",2: "h",3: "w"}},
                        )
转引擎
import tensorrt as trt
import os
import common
# Flag requesting an explicit-batch network (required by the ONNX parser).
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
# Global TensorRT logger shared by builder/runtime objects below.
TRT_LOGGER = trt.Logger()
# Register TensorRT's built-in plugins.
trt.init_libnvinfer_plugins(TRT_LOGGER,"")
'''
动态大小输入参数设置
'''
# Candidate dynamic-input sizes (min / typical / max).
# NOTE(review): these three tuples appear unused — the optimization profiles
# below hard-code their own shapes.
mix_size = (1, 3, 128, 128)
common_size = (1, 3, 640, 640)
max_size = (1, 3, 2048, 2048)
'''
静态大小输入
onnx模型转TensorRT的engine
'''
def get_engine(onnx_file_path, engine_file_path=""):
    """Load a serialized TensorRT engine if present, otherwise build one from
    the ONNX file, cache it to `engine_file_path`, and return it.

    Returns:
        a deserialized ICudaEngine, or None if parsing/building fails.
    """

    def build_engine():
        """Parse the ONNX file and build + serialize a TensorRT engine."""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(EXPLICIT_BATCH) as network, \
                builder.create_builder_config() as config, \
                trt.OnnxParser(network, TRT_LOGGER) as parser, \
                trt.Runtime(TRT_LOGGER) as runtime:
            config.max_workspace_size = 1 << 32  # 4 GiB of builder scratch
            builder.max_batch_size = 1
            if not os.path.exists(onnx_file_path):
                print(
                    "ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.".format(onnx_file_path)
                )
                exit(0)
            print("Loading ONNX file from path {}...".format(onnx_file_path))
            with open(onnx_file_path, "rb") as model:
                print("Beginning ONNX file parsing")
                if not parser.parse(model.read()):
                    print("ERROR: Failed to parse the ONNX file.")
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    return None
            # Dynamic-shape profile: min / opt / max shapes for input 0.
            profile = builder.create_optimization_profile()
            print("network.get_input(0).name:", network.get_input(0).name)
            profile.set_shape(network.get_input(0).name, (1, 3, 32, 32), (1, 3, 512, 512),
                              (1, 3, 648, 648))
            config.add_optimization_profile(profile)
            inputs = [network.get_input(i) for i in range(network.num_inputs)]
            print("input", inputs)
            outputs = [network.get_output(i) for i in range(network.num_outputs)]
            print("out:", outputs)
            print("Completed parsing of ONNX file")
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            plan = builder.build_serialized_network(network, config)
            # Guard: build_serialized_network returns None on failure.
            if plan is None:
                print("ERROR: Failed to build the engine.")
                return None
            engine = runtime.deserialize_cuda_engine(plan)
            print("Completed creating Engine")
            with open(engine_file_path, "wb") as f:
                f.write(plan)
            return engine

    # BUG FIX: the original ended with two unreachable lines referencing
    # `self` (copied from a class); they have been removed.
    if os.path.exists(engine_file_path):
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine()
def get_DynEngine(onnx_file_path, engine_file_path):
    """Load a cached TensorRT engine if available, otherwise build one from
    the ONNX file with a dynamic-shape optimization profile and cache it.

    Returns:
        the deserialized engine, or None on parse/build failure.
    """

    def build_engine():
        """Parse ONNX and build an engine with a dynamic H/W input profile."""
        builder = trt.Builder(TRT_LOGGER)
        network = builder.create_network(common.EXPLICIT_BATCH)
        config = builder.create_builder_config()
        parser = trt.OnnxParser(network, TRT_LOGGER)
        runtime = trt.Runtime(TRT_LOGGER)
        print("common.EXPLICIT_BATCH:", common.EXPLICIT_BATCH)
        config.max_workspace_size = 10 << 30  # 10 GiB of builder scratch
        print("max_workspace_size:", config.max_workspace_size)
        builder.max_batch_size = 1
        if not os.path.exists(onnx_file_path):
            print(f'onnx file {onnx_file_path} not found,please run torch_2_onnx.py first to generate it')
            exit(0)
        print(f'Loading ONNX file from path {onnx_file_path}...')
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print('ERROR:Failed to parse the ONNX file')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        inputs = [network.get_input(i) for i in range(network.num_inputs)]
        print("input", inputs)
        outputs = [network.get_output(i) for i in range(network.num_outputs)]
        print("out:", outputs)
        print("Network Description")
        for input in inputs:
            batch_size = input.shape[0]
            print("Input '{}' with shape {} and dtype {} . ".format(input.name, input.shape, input.dtype))
        for output in outputs:
            print("Output '{}' with shape {} and dtype {} . ".format(output.name, output.shape, output.dtype))
        # min / opt / max shapes for the dynamic input axes.
        profile = builder.create_optimization_profile()
        print("network.get_input(0).name:", network.get_input(0).name)
        profile.set_shape(network.get_input(0).name, (1, 3, 32, 32), (1, 3, 512, 512),
                          (1, 3, 648, 648))
        config.add_optimization_profile(profile)
        print('Completed parsing the ONNX file')
        print(f'Building an engine from file {onnx_file_path}; this may take a while...')
        engine = builder.build_engine(network, config)
        # Guard: build_engine returns None on failure; avoid serializing None.
        if engine is None:
            print('ERROR:Failed to build the engine')
            return None
        print('Completed creating Engine')
        with open(engine_file_path, 'wb') as f:
            f.write(engine.serialize())
        return engine

    if os.path.exists(engine_file_path):
        # BUG FIX: the original called build_engine() here before reading the
        # cached file, rebuilding (and overwriting) the engine on every call
        # and defeating the cache. Now the cached engine is just deserialized.
        print(f'Reading engine from file {engine_file_path}')
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine()
if __name__ == "__main__":
    """Create a TensorRT engine for seg and run inference."""
    from datetime import datetime
    # Time the engine build/load so long builds are visible.
    startTime = datetime.now()
    onnx_file_path = "FCN_model2.onnx"
    engine_file_path = "model222.engine"
    get_DynEngine(onnx_file_path, engine_file_path)
    endTime = datetime.now()
    duringTime = endTime - startTime
    print(duringTime)
引擎推理
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
import torch
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import torch.nn.functional as F
'''
step1:创建logger:日志记录器
'''
# Global TensorRT logger used by the inference-side runtime below.
TRT_LOGGER = trt.Logger()
import cv2
'''
输入数据-前处理
'''
def preprocess(image):
    """Normalize an RGB image with ImageNet mean/std, channels-first.

    Accepts anything np.asarray can turn into an (H, W, 3) array of values in
    [0, 255]; returns a float32 (3, H, W) array.
    """
    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype('float32')
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype('float32')
    scaled = np.asarray(image).astype('float32') / float(255.0)
    normalized = (scaled - imagenet_mean) / imagenet_std
    # Move the channel axis from last to first (HWC -> CHW).
    return np.moveaxis(normalized, 2, 0)
'''
模型输出数据-后处理
'''
def postprocess(data):
    """Turn an (H, W) class-index map into a palettized PIL image.

    A fixed pseudo-random palette is derived for 21 classes (PASCAL VOC
    count) by scaling a large-prime triple per class modulo 255.
    """
    num_classes = 21
    palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
    color_rows = []
    for cls in range(num_classes):
        color_rows.append(palette * cls % 255)
    colors = np.array(color_rows).astype("uint8")
    result = Image.fromarray(data.astype('uint8'), mode='P')
    result.putpalette(colors)
    return result
'''
#step2:创建runtime并反序列化生成engine
'''
def load_engine(engine_file_path):
    """Deserialize a TensorRT engine from the given file and return it."""
    assert os.path.exists(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with trt.Runtime(TRT_LOGGER) as runtime, open(engine_file_path, "rb") as f:
        return runtime.deserialize_cuda_engine(f.read())
'''
显示图像
'''
def pred2show(mask,iii):
    """Colorize a (640, 640) class-id mask via a CSV lookup table, save and show it.

    NOTE(review): saves to the module-level global `output_file` (not a
    parameter), and then displays "test{iii}.png" from disk — confirm both the
    global and that file exist before calling.
    """
    path_color2class_table = r".\color2class_table.csv"
    dataframe = pd.read_csv(path_color2class_table)
    list_rgb = []
    list_class_id = []
    # Column 0 holds the class id; columns 2+ hold the RGB components.
    for i in range(len(dataframe)):
        rgb = list(dataframe.iloc[i][2:])
        class_id = int(dataframe.iloc[i][0])
        list_rgb.append(rgb)
        list_class_id.append(class_id)
    # Replace each parsed RGB triple with the scalar i*255.
    # NOTE(review): this discards the CSV colors and produces values > 255
    # for i > 1 — verify this is the intended grayscale mapping.
    for i in range(len(list_rgb)):
        list_rgb[i] = i*255
    dict_color2class = dict(zip(list_class_id, list_rgb))
    crop_size = (640, 640)
    pred = np.empty([crop_size[0], crop_size[1]], dtype=int)
    height = mask.shape[0]
    weight = mask.shape[1]
    # Map every pixel's class id to its display value (pixel-by-pixel).
    for row in range(height):
        for col in range(weight):
            pred[row,col] = np.array(dict_color2class[mask[row,col]])
    cv2.imwrite(output_file,pred)
    img_show = cv2.imread("test"+str(iii)+".png")
    cv2.imshow("test",img_show)
    cv2.waitKey(0)
'''
推理
'''
def infer(engine, input_file, output_file):
    """Run single-image inference through a TensorRT engine with raw pycuda.

    Resizes the image to 640x640, copies it to device memory, executes the
    engine asynchronously, and prints the raw output buffer.
    NOTE(review): looks up a binding named "input", but the ONNX export above
    used input_names=['inputs'] — confirm the name matches this engine.
    """
    print("Reading input image from file {}".format(input_file))
    with Image.open(input_file) as img:
        img =img.resize((640, 640), Image.ANTIALIAS)
        input_image = preprocess(img)
        image_width = img.width
        image_height = img.height
    with engine.create_execution_context() as context:
        context.set_binding_shape(engine.get_binding_index("input"), (1, 3, image_height, image_width))
        bindings = []
        # Allocate device memory per binding; keep host buffers for I/O.
        for binding in engine:
            binding_idx = engine.get_binding_index(binding)
            size = trt.volume(context.get_binding_shape(binding_idx))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            if engine.binding_is_input(binding):
                input_buffer = np.ascontiguousarray(input_image)
                input_memory = cuda.mem_alloc(input_image.nbytes)
                bindings.append(int(input_memory))
            else:
                # Page-locked host buffer enables async device-to-host copies.
                output_buffer = cuda.pagelocked_empty(size, dtype)
                output_memory = cuda.mem_alloc(output_buffer.nbytes)
                bindings.append(int(output_memory))
        stream = cuda.Stream()
        # Host -> device copy, timed async execution, device -> host copy.
        cuda.memcpy_htod_async(input_memory, input_buffer, stream)
        import datetime
        startTime = datetime.datetime.now()
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        endTime = datetime.datetime.now()
        durTime = 'funtion time use:%dms' % (
            (endTime - startTime).seconds * 1000 + (endTime - startTime).microseconds / 1000)
        print(durTime)
        cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
        stream.synchronize()
        print(output_buffer)
        # NOTE(review): reshape hard-codes 2 output channels — confirm this
        # matches the engine's actual class count.
        res = np.reshape(output_buffer, (2,image_height, image_width))
        print(res)
'''
动态输入推理
'''
import common
def infer2(engine, input_file, output_file):
    """Run dynamic-shape inference on one image via the `common` helpers.

    NOTE(review): `output_file` is accepted but never used — the raw network
    output is only printed.
    """
    with Image.open(input_file) as img:
        img =img.resize((640, 640), Image.ANTIALIAS)
        input_image = preprocess(img)
        width = img.width
        height = img.height
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = common.allocate_buffers(engine, (height, width))
    context.active_optimization_profile = 0
    # If the engine's input has dynamic H/W (-1), pin it to the image size.
    origin_inputshape = context.get_binding_shape(0)
    if origin_inputshape[-1] == -1:
        origin_inputshape[-2], origin_inputshape[-1] = (height, width)
        context.set_binding_shape(0, (origin_inputshape))
    print(f'Running inference on image {input_file}...')
    # Prepend the batch dimension before handing off to the common helpers.
    tmpImg = input_image[np.newaxis, :, :, :]
    inputs[0].host = np.ascontiguousarray(tmpImg)
    trt_outputs = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)[0]
    # NOTE(review): reshape assumes 3 output channels — confirm against the
    # engine's actual output shape.
    trt_outputs = np.reshape(trt_outputs, (3,height, width))
    print(trt_outputs)
if __name__ == '__main__':
    # flag selects between the static-shape path (infer) and the
    # dynamic-shape path (infer2).
    flag = 0
    if flag:
        engine_file = "model_seg.engine"
        input_file = r".\liver\train\image\0.png"
        output_file = "output.png"
        img = Image.open(input_file)
        print("Running TensorRT inference for Seg")
        with load_engine(engine_file) as engine:
            infer(engine, input_file, output_file)
    else:
        engine_file = "model222.engine"
        input_file = r".\liver\train\image\0.png"
        output_file = "output.png"
        with load_engine(engine_file) as engine:
            infer2(engine,input_file,output_file)