使用TensorRT Python API加速MaskRCNN
环境
Jetson AGX Xavier Linux Jetpack 4.5.1
cuda 10.2
cudnn 8.0
TensorRT 7.1.3
构建引擎
有两种方案:一种是直接在代码中使用 uff 模型文件构建引擎并进行推理;另一种是先利用 uff 模型文件生成引擎文件(engine),再加载该文件进行推理。两者原理相同,都要先构建引擎再推理,区别只在于前者是在线构建后直接调用,后者是离线生成后再调用。
方案一 在线构建引擎
可参考 tensorrt/samples/python/end_to_end_tensorflow_mnist/ 下的样例代码。
from PIL import Image
import numpy as np
import os
import tensorrt as trt
import common
# Logger shared by the TensorRT builder and the plugin registry in this script.
# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
class ModelData(object):
    """Static description of the Mask R-CNN UFF model used to build the engine."""

    # UFF model file.
    MODEL_FILE = 'mrcnn_nchw.uff'
    # Network input size, unpacked by the preprocessing code as
    # (channels, height, width).
    INPUT_SHAPE = (3, 1024, 1024)
    # Data type used while normalizing the input image. The builder in this
    # script never enables FP16 and the normalized data is copied into the
    # engine's input buffer, so preprocessing in half precision would only
    # throw away accuracy.
    DTYPE = trt.float32
    # Name of the network input node.
    INPUT_NAME = "input_image"
    # Name of the network output node (mask head).
    OUTPUT_NAME = "mrcnn_mask/Sigmoid"
# Build an engine from a UFF model.
def build_engine_uff(model_file, output_names=None):
    """Parse a UFF model and build a TensorRT engine from it.

    Args:
        model_file: Path of the UFF model to parse.
        output_names: Names of the graph nodes to register as engine
            outputs, in binding order. Defaults to the detection head
            followed by ``ModelData.OUTPUT_NAME`` because the caller in this
            script reads the boxes from output 0 and the masks from output 1.

    Returns:
        The built CUDA engine.

    Raises:
        RuntimeError: If the model cannot be parsed or the engine cannot be
            built.
    """
    if output_names is None:
        output_names = ("mrcnn_detection", ModelData.OUTPUT_NAME)

    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        builder.max_workspace_size = 1 << 30
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE)
        for output_name in output_names:
            parser.register_output(output_name)
        if not parser.parse(model_file, network):
            raise RuntimeError("Failed to parse UFF model: {}".format(model_file))
        # Build exactly once: the original built the engine twice and threw
        # the first result away.
        engine = builder.build_cuda_engine(network)
        if engine is None:
            raise RuntimeError("Failed to build a TensorRT engine from: {}".format(model_file))
        return engine
# Input image preprocessing.
def load_normalized_test_case(test_image, pagelocked_buffer):
    """Load an image, normalize it and copy it into the engine input buffer.

    Args:
        test_image: Path (or file object) accepted by ``PIL.Image.open``.
        pagelocked_buffer: Flat host buffer that receives the normalized
            pixel data via ``np.copyto``.

    Returns:
        ``test_image`` unchanged, so the caller can keep track of which
        input was loaded.
    """

    def normalize_image(image):
        """Resize to the network input size and return flat CHW data."""
        _, height, width = ModelData.INPUT_SHAPE
        # Force three channels so grayscale / RGBA files do not break the
        # HWC -> CHW transpose below. LANCZOS is the filter that the
        # deprecated ANTIALIAS name refers to.
        resized = image.convert("RGB").resize((width, height), Image.LANCZOS)
        chw = np.asarray(resized).transpose([2, 0, 1])
        flat = chw.astype(trt.nptype(ModelData.DTYPE)).ravel()
        return (flat / 255.0 - 0.45) / 0.225

    # Close the image file as soon as the pixels have been copied.
    with Image.open(test_image) as image:
        np.copyto(pagelocked_buffer, normalize_image(image))
    return test_image
if __name__ == '__main__':
    # Local import: only this entry point needs to expand the wildcard.
    import glob

    imagepath = '/Mask_RCNN/samples/images/*.jpg'
    uff_model_file = ModelData.MODEL_FILE

    # imagepath is a wildcard pattern, so expand it instead of handing the
    # pattern itself to the image loader.
    image_files = sorted(glob.glob(imagepath))
    if not image_files:
        raise FileNotFoundError("No image matches {!r}".format(imagepath))

    # Load all custom plugins before building the engine.
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # Build a TensorRT engine.
    engine = build_engine_uff(uff_model_file)
    if engine is None:
        raise RuntimeError("Failed to build a TensorRT engine from {!r}".format(uff_model_file))

    with engine:
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        with engine.create_execution_context() as context:
            for image_file in image_files:
                load_normalized_test_case(image_file, pagelocked_buffer=inputs[0].host)
                # For more information on performing inference, refer to the introductory samples.
                # common.do_inference returns a list of host outputs; this
                # script needs two of them (boxes first, then masks).
                output = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
                if len(output) < 2:
                    raise RuntimeError(
                        "Expected 2 engine outputs (detections and masks), got {}".format(len(output)))
                box = output[0].reshape(100, 6)
                mask = output[1].reshape(100, 15, 28, 28)
                print(box)
                print(mask)
                # Post-processing of box and mask goes here.
方案二 引擎文件调用
使用 trtexec 可执行文件生成引擎文件。直接在终端执行命令 ./trtexec 会打印出详细的参数说明。
./trtexec --uff=/home/siu/tensorrt/bin/maskrcnn/mrcnn_nchw.uff --saveEngine=/home/siu/tensorrt/bin/maskrcnn/mrcnn_nchw.engine --workspace=8192 --uffInput=input_image,3,1024,1024 --output=mrcnn_detection,mrcnn_mask/Sigmoid --plugins=/home/siu/TensorRT/build1/out/libnvinfer_plugin.so
转换后就可以直接使用引擎文件推理了
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import time
import numpy as np
import argparse
import cv2
# Logger passed to the plugin registry and to the runtime in this script; INFO severity is selected here.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
def allocate_buffers(engine, batch_size, is_explicit_batch=False):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Each buffer holds ``volume(binding shape) * engine.max_batch_size``
    elements of the binding's dtype. ``batch_size`` and
    ``is_explicit_batch`` are accepted but not used by the sizing logic.

    Returns:
        A tuple ``(inputs, outputs, bindings)``: two lists of host/device
        pairs (each exposing ``.host`` and ``.device``) and the list of
        device addresses as integers, in binding iteration order.
    """

    class HostDeviceMem(object):
        """Pair of a host buffer and the device allocation that mirrors it."""

        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    inputs, outputs, bindings = [], [], []
    for binding in engine:
        element_count = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Page-locked host memory plus a device allocation of the same size.
        host_buf = cuda.pagelocked_empty(element_count, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        # The execution context takes the raw device addresses.
        bindings.append(int(device_buf))
        # Sort the pair into the input or the output list.
        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_buf, device_buf))
    return inputs, outputs, bindings
# Image preprocessing: the image has to be converted to NCHW layout.
def preprocess_image(imagepath, new_height=1024, new_width=1024):
    """Read an image and turn it into a normalized NCHW float32 batch of one.

    Args:
        imagepath: Path of the image file to read with OpenCV.
        new_height: Target height. Defaults to 1024 to match the
            3x1024x1024 input the engine in this document is built with;
            the previous hard-coded 512 filled only a quarter of that
            input buffer.
        new_width: Target width, see ``new_height``.

    Returns:
        A C-contiguous ``float32`` array of shape
        ``(1, 3, new_height, new_width)`` in RGB order scaled to [0, 1].

    Raises:
        OSError: If OpenCV cannot read the file.
    """
    bgr_image = cv2.imread(imagepath)
    if bgr_image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError("Could not read image: {!r}".format(imagepath))

    # cv2.resize expects dsize as (width, height).
    resized = cv2.resize(bgr_image, (new_width, new_height))
    # BGR -> RGB, then HWC -> CHW.
    chw = resized[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)
    chw /= 255.0
    # Add the batch axis, then make the result contiguous so it can be
    # copied as one flat block.
    return np.ascontiguousarray(np.expand_dims(chw, axis=0))
# Run inference.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    """Copy inputs to the device, run the engine and copy the outputs back.

    Args:
        context: Execution context providing ``execute_async``.
        bindings: Device addresses passed to ``execute_async``.
        inputs: Host/device pairs (``.host`` / ``.device``) to upload.
        outputs: Host/device pairs to download after execution.
        stream: CUDA stream used for the copies and the execution.
        batch_size: Batch size forwarded to ``execute_async``.

    Returns:
        A tuple ``(host_outputs, elapsed_seconds)`` where ``host_outputs``
        is the list of ``out.host`` buffers and ``elapsed_seconds`` covers
        upload, execution, download and stream synchronization.
    """
    # perf_counter is monotonic, so the measured duration cannot go
    # negative when the wall clock is adjusted.
    started = time.perf_counter()

    # htod: host to device, copy the input data from CPU to GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    # Use execute_async_v2 when the network was created with an explicit
    # batch dimension, otherwise execute_async. The requested batch size
    # has to be forwarded, otherwise the call silently runs with batch 1.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)

    # dtoh: device to host, copy the predictions back from GPU to CPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # Wait until all queued work on the stream has finished.
    stream.synchronize()
    elapsed = time.perf_counter() - started

    # Return only the host outputs, together with the elapsed time.
    return [out.host for out in outputs], elapsed
def profile_trt(engine, imagepath, batch_size):
    """Run one timed inference of ``engine`` on the image at ``imagepath``.

    Args:
        engine: Deserialized TensorRT engine.
        imagepath: Path of the image handed to ``preprocess_image``.
        batch_size: Batch size forwarded to buffer allocation and inference.
            Only the first sample of the input buffer is filled, because a
            single image is loaded.

    Returns:
        Whatever ``do_inference`` returns: the tuple
        ``(host_outputs, elapsed_seconds)``.

    Raises:
        ValueError: If the preprocessed image does not have exactly as many
            elements as one sample of the engine input.
    """
    inputs, outputs, bindings = allocate_buffers(engine, batch_size, True)
    stream = cuda.Stream()

    host_input = inputs[0].host
    image = np.asarray(preprocess_image(imagepath), dtype=host_input.dtype).ravel()

    # allocate_buffers sizes every buffer as volume * max_batch_size, so one
    # sample occupies size / max_batch_size elements.
    per_sample = host_input.size // max(engine.max_batch_size, 1)
    if image.size != per_sample:
        raise ValueError(
            "Preprocessed image has {} elements but the engine input expects {} per sample".format(
                image.size, per_sample))

    # Copy into the existing page-locked buffer instead of rebinding
    # inputs[0].host: rebinding would swap the pinned memory for an ordinary
    # array and bypass the size check above.
    host_input[:image.size] = image

    # Release the execution context as soon as inference has finished.
    with engine.create_execution_context() as context:
        return do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs,
                            stream=stream, batch_size=batch_size)
if __name__ == '__main__':
    # Local import: only this entry point needs to expand the wildcard.
    import glob

    parser = argparse.ArgumentParser()
    parser.add_argument("--imagepath", default='/Mask_RCNN/samples/images/*.jpg', type=str,
                        help="please input your imagepath")
    parser.add_argument("--batch_size", default=1, type=int, help="please input inference batchsize")
    args = parser.parse_args()

    engine_file = '/Mask_RCNN/mrcnn_nchw.engine'  # Path of the engine file.
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')  # Load all custom plugins.

    runtime = trt.Runtime(TRT_LOGGER)
    with open(engine_file, "rb") as f:
        trt_engine = runtime.deserialize_cuda_engine(f.read())
    if trt_engine is None:
        raise RuntimeError("Failed to deserialize engine file {!r}".format(engine_file))

    # The default image path is a wildcard pattern, so expand it instead of
    # handing the pattern itself to the image reader.
    image_files = sorted(glob.glob(args.imagepath))
    if not image_files:
        raise FileNotFoundError("No image matches {!r}".format(args.imagepath))

    for image_file in image_files:
        # profile_trt returns (host_outputs, elapsed_seconds); the engine
        # outputs live in the first element, not in the tuple itself.
        result, _elapsed = profile_trt(trt_engine, image_file, args.batch_size)
        box = result[0].reshape(100, 6)
        mask = result[1].reshape(100, 15, 28, 28)
        print(box)
        print(mask)
        # Post-processing of box and mask goes here.