Environment: CUDA 10.2 / torch 1.6.0
Usage:
- 1_torch2onnx.py  converts the PyTorch model to ONNX format
- 2_onnx2trt.py    converts the ONNX model to a TensorRT engine
- 3_inference.py   runs inference with the TensorRT engine
Step 1: Set up the TensorRT environment; see the following link for details:
https://zongxp.blog.csdn.net/article/details/86077553
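After setup, a quick way to confirm the environment is usable (a minimal sketch; run it in the same Python environment as the scripts below):

import tensorrt
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context; fails if the driver setup is broken
print(tensorrt.__version__)   # the code below targets TensorRT 7.x
print(cuda.Device(0).name())  # the GPU that will run the engine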
Step 2: Convert the PyTorch model to ONNX (keep the input size identical to training):
Note: you need a trained .pth model beforehand, and you must know the model's input and output. In this example the model is resnet18 and the input is (1, 3, 32, 32), i.e. batch_size=1, 3 channels, 32x32 resolution; in short, one 32x32 RGB color image per forward pass.
import torch
from resnet import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def model_converter():
    model = resnet18(num_classes=2)
    # Load the trained weights (a state_dict, not a complete pickled model)
    model.load_state_dict(torch.load("resnet18.pth", map_location=device))
    model.to(device)
    model.eval()
    # The dummy input must match the training input size
    dummy_input = torch.randn(1, 3, 32, 32, device=device)
    input_names = ['data']
    output_names = ['fc']
    torch.onnx.export(model, dummy_input, 'resnet18.onnx',
                      export_params=True,
                      verbose=True,
                      input_names=input_names,
                      output_names=output_names)

model_converter()
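Optionally, you can sanity-check the exported file before building the engine. This is a minimal sketch assuming the onnx and onnxruntime packages are installed (neither is required by the scripts in this repo):

import onnx
import onnxruntime as ort
import numpy as np

onnx_model = onnx.load('resnet18.onnx')
onnx.checker.check_model(onnx_model)  # raises if the graph is malformed

# One forward pass on random data; the names match the export above
sess = ort.InferenceSession('resnet18.onnx')
out = sess.run(['fc'], {'data': np.random.randn(1, 3, 32, 32).astype(np.float32)})
print(out[0].shape)  # expected: (1, 2) for num_classes=2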
Step 3: Convert the ONNX model to a TensorRT engine (boilerplate; only the paths need changing):
import os
import tensorrt as trt

TRT_LOGGER = trt.Logger()
model_path = 'resnet18.onnx'
engine_file_path = "resnet18.trt"
# The ONNX parser requires an explicit-batch network definition
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) \
        as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
    builder.max_workspace_size = 1 << 28  # 256 MiB of builder workspace
    builder.max_batch_size = 1
    if not os.path.exists(model_path):
        print('ONNX file {} not found.'.format(model_path))
        exit(0)
    print('Loading ONNX file from path {}...'.format(model_path))
    with open(model_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            exit(1)
    network.get_input(0).shape = [1, 3, 32, 32]
    print('Completed parsing of ONNX file')
    engine = builder.build_cuda_engine(network)
    assert engine is not None, 'Engine build failed'
    with open(engine_file_path, "wb") as f:
        f.write(engine.serialize())
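If the build succeeds, resnet18.trt is written next to the ONNX file. A quick way to confirm the engine deserializes correctly (a minimal sketch; the full inference script in step 4 does this and more):

import tensorrt as trt

TRT_LOGGER = trt.Logger()
with open("resnet18.trt", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
assert engine is not None, "engine failed to deserialize"
print(engine.get_binding_shape(0))  # expected: (1, 3, 32, 32)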
Step 4: Run inference with the TensorRT engine (essentially, you only need to write your own preprocessing code):
The core is the inference script, 3_inference.py.
TensorRT 7.0 inference steps:
1> Input preprocessing: reproduce the preprocessing used during training, so the input image format matches exactly.
2> Buffer allocation: the allocate_buffers function; no changes needed.
3> Inference: the do_inference_v2 function; no changes needed. For batched inference, use the do_inference function instead (a sketch is included after do_inference_v2 below).
4> Result: TensorRT returns the results as a list of arrays.
import pycuda.driver as cuda
import pycuda.autoinit  # importing this initializes CUDA and creates a context
import cv2
import numpy as np
import os
import tensorrt as trt
import time
from PIL import Image
TRT_LOGGER = trt.Logger()
engine_file_path = "resnet18.trt"
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
# Inference helper; use it as-is.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
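# Note: step 3> above also mentions do_inference for batched inference. What
# follows is a sketch of that implicit-batch variant, modeled on the helper in
# the NVIDIA TensorRT Python samples; the explicit-batch engine built in this
# repo only needs do_inference_v2, so treat this as reference code.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference with an explicit batch_size (implicit-batch API).
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU and wait for completion.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    return [out.host for out in outputs]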
i = 0  # class-0 predictions for val/white/
j = 0  # class-1 predictions for val/white/
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print(inputs, outputs, bindings, stream)
    dir = "val/white/"
    for name in os.listdir(dir):
        # Preprocessing: must reproduce the preprocessing used at training time
        t1 = time.time()  # time.clock() was deprecated and removed in Python 3.8
        image_path = os.path.join(dir, name)
        img = Image.open(image_path)
        img = np.array(img)
        img = img.transpose((2, 0, 1))  # HWC -> CHW
        img = img.astype(np.float32) / 255.0
        img = img[np.newaxis, :, :, :]  # add the batch dimension: (1, 3, 32, 32)
        print(img.shape)
        img = np.ascontiguousarray(img)
        # Preprocessing done; run inference
        inputs[0].host = img
        trt_outputs = do_inference_v2(context, bindings=bindings,
                                      inputs=inputs, outputs=outputs, stream=stream)
        print(trt_outputs)
        # The larger of the two logits gives the predicted class
        if trt_outputs[0][0] > trt_outputs[0][1]:
            print("0")
            i = i + 1
        else:
            print("1")
            j = j + 1
        print("Time:", time.time() - t1)
m = 0  # class-0 predictions for val/yellow/
n = 0  # class-1 predictions for val/yellow/
with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    print(inputs, outputs, bindings, stream)
    dir = "val/yellow/"
    for name in os.listdir(dir):
        t1 = time.time()
        image_path = os.path.join(dir, name)
        image = cv2.imread(image_path)  # OpenCV loads images as BGR
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img = img.transpose((2, 0, 1))  # HWC -> CHW
        img = img.astype(np.float32) / 255.0
        img = img[np.newaxis, :, :, :]  # add the batch dimension: (1, 3, 32, 32)
        print(img.shape)
        img = np.ascontiguousarray(img)
        inputs[0].host = img
        # Run inference
        trt_outputs = do_inference_v2(context, bindings=bindings,
                                      inputs=inputs, outputs=outputs, stream=stream)
        print(trt_outputs)
        if trt_outputs[0][0] > trt_outputs[0][1]:
            print("0")
            m = m + 1
        else:
            print("1")
            n = n + 1
        print("Time:", time.time() - t1)
# Per-class prediction counts for the two validation folders
print("i = ", i)
print("j = ", j)
print("m = ", m)
print("n = ", n)
If the script prints a list of outputs for each image, everything works.
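The raw outputs are unnormalized logits, so the comparison above is just an argmax over two values. If you want probabilities instead, a small softmax helper works; this is a minimal numpy-only sketch (the softmax function is not part of the original scripts):

import numpy as np

def softmax(logits):
    # Shift by the max for numerical stability before exponentiating
    e = np.exp(logits - np.max(logits))
    return e / e.sum()

# trt_outputs[0] holds the two class logits from the 'fc' binding
probs = softmax(trt_outputs[0][:2])
print("class 0: {:.3f}, class 1: {:.3f}".format(probs[0], probs[1]))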