构建TensorRT引擎加速MaskRCNN

最新推荐文章于 2021-10-27 17:31:15 发布

YuSuen

最新推荐文章于 2021-10-27 17:31:15 发布

阅读量897

点赞数 2

文章标签： tensorflow 深度学习 pytorch

本文链接：https://blog.csdn.net/sinat_24424445/article/details/119057606

版权

使用TensorRT Python API加速MaskRCNN

环境
构建引擎
- 方案一在线构建引擎
- 方案二引擎文件调用

环境

Jetson AGX Xavier Linux Jetpack 4.5.1
cuda 10.2
cudnn 8.0
TensorRT 7.1.3

构建引擎

有两种方案：一种是直接在代码中使用 uff 模型文件在线构建引擎并进行推理；另一种是先利用 uff 模型文件离线生成引擎文件（engine），再加载该文件进行推理。两者原理相同，都是先构建引擎再推理，区别只在于引擎是在线构建还是离线生成。

方案一在线构建引擎

可参考 tensorrt/samples/python/end_to_end_tensorflow_mnist/ 下的样例代码

from PIL import Image
import numpy as np
import os
import tensorrt as trt
import common
# Module-wide TensorRT logger; it is handed to the builder and to the plugin
# initialization further down in this script.
# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class ModelData(object):
    """Constants describing the Mask R-CNN UFF model used by this script."""

    # Path of the UFF model file.
    MODEL_FILE = "mrcnn_nchw.uff"

    # Names of the network's input node and output node.
    INPUT_NAME = "input_image"
    OUTPUT_NAME = "mrcnn_mask/Sigmoid"

    # Network input size, unpacked elsewhere in this script as
    # (channels, height, width).
    INPUT_SHAPE = (3, 1024, 1024)

    # TensorRT data type the preprocessed image array is cast to.
    DTYPE = trt.float16

# Build a TensorRT engine from a UFF model file.
def build_engine_uff(model_file, output_names=None):
    """Parse a UFF model and build a CUDA engine from it.

    Args:
        model_file: Path of the UFF model to parse.
        output_names: Names of the graph nodes to expose as engine outputs.
            When omitted, both ``"mrcnn_detection"`` and
            ``ModelData.OUTPUT_NAME`` are registered, because the inference
            code in this script reads two outputs (boxes first, then masks).
            NOTE(review): this presumes outputs are exposed in registration
            order - confirm against the engine's binding order.

    Returns:
        The built CUDA engine.

    Raises:
        RuntimeError: If the UFF file cannot be parsed or the engine cannot
            be built.
    """
    if output_names is None:
        output_names = ("mrcnn_detection", ModelData.OUTPUT_NAME)

    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        builder.max_workspace_size = 1 << 30
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE)
        for output_name in output_names:
            parser.register_output(output_name)

        # parse() signals failure through its return value, so check it
        # instead of building an engine from a half-populated network.
        if not parser.parse(model_file, network):
            raise RuntimeError("failed to parse UFF model: {}".format(model_file))

        # Build exactly once; engine construction is the expensive step.
        engine = builder.build_cuda_engine(network)
        if engine is None:
            raise RuntimeError("failed to build a CUDA engine from: {}".format(model_file))
        return engine

# Preprocess the input image into the engine's host input buffer.
def load_normalized_test_case(test_image, pagelocked_buffer):
    """Load an image, normalize it and copy it into ``pagelocked_buffer``.

    Args:
        test_image: Path (or file object) of the image to load.
        pagelocked_buffer: Host array that receives the preprocessed pixels;
            it must be able to hold the flattened CHW image.

    Returns:
        ``test_image``, unchanged.
    """
    # Convert the picture to a flat, normalized CHW array.
    def normalize_image(image):
        _, h, w = ModelData.INPUT_SHAPE
        # Force three channels so grayscale / RGBA / CMYK files still
        # transpose cleanly to CHW.
        rgb_image = image.convert("RGB")
        # LANCZOS is the same filter as the ANTIALIAS alias, which newer
        # Pillow releases no longer provide.
        resized = rgb_image.resize((w, h), Image.LANCZOS)
        chw = np.asarray(resized).transpose([2, 0, 1])
        image_arr = chw.astype(trt.nptype(ModelData.DTYPE)).ravel()
        return (image_arr / 255.0 - 0.45) / 0.225

    # The context manager closes the underlying file once the pixels are copied.
    with Image.open(test_image) as image:
        np.copyto(pagelocked_buffer, normalize_image(image))
    return test_image

if __name__ == '__main__':
    import glob

    imagepath = '/Mask_RCNN/samples/images/*.jpg'
    # The image loader takes a single file, so expand the wildcard first.
    # If nothing matches, fall back to treating the value as a literal path.
    image_files = sorted(glob.glob(imagepath)) or [imagepath]

    uff_model_file = ModelData.MODEL_FILE
    # Build a TensorRT engine.
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')  # load all custom plugins
    with build_engine_uff(uff_model_file) as engine:
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        with engine.create_execution_context() as context:
            for image_file in image_files:
                load_normalized_test_case(image_file, pagelocked_buffer=inputs[0].host)
                # For more information on performing inference, refer to the introductory samples.
                # common.do_inference returns a list of host outputs. Two are
                # read below, so the engine must expose both the detection
                # output and the mask output.
                output = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
                # NOTE(review): presumably 100 detections with 6 values each,
                # and 100 masks of 28x28 for 15 classes - confirm against the
                # model configuration.
                box = output[0].reshape(100, 6)
                mask = output[1].reshape(100, 15, 28, 28)

                print(box)
                print(mask)

                # Post-processing of box and mask follows from here.

方案二引擎文件调用

使用trtexec可执行文件生成引擎文件，直接在终端执行命令./trtexec会打印出详细的参数说明

./trtexec --uff=/home/siu/tensorrt/bin/maskrcnn/mrcnn_nchw.uff --saveEngine=/home/siu/tensorrt/bin/maskrcnn/mrcnn_nchw.engine --workspace=8192 --uffInput=input_image,3,1024,1024 --output=mrcnn_detection,mrcnn_mask/Sigmoid --plugins=/home/siu/TensorRT/build1/out/libnvinfer_plugin.so

转换后就可以直接使用引擎文件推理了

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import time
import numpy as np
import argparse
import cv2

# Module-wide TensorRT logger (INFO severity); it is handed to the plugin
# initialization and to the runtime that deserializes the engine below.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

def allocate_buffers(engine, batch_size, is_explicit_batch=False):
    """Allocate one host/device buffer pair for every binding of ``engine``.

    ``batch_size`` and ``is_explicit_batch`` are accepted but not used here:
    each buffer is sized as the binding's volume times
    ``engine.max_batch_size``.

    Returns:
        A tuple ``(inputs, outputs, bindings)``. ``inputs`` and ``outputs``
        are lists of objects exposing ``.host`` and ``.device``; ``bindings``
        lists the device allocations as integers, in binding order.
    """

    class HostDeviceMem(object):
        """Pairs a host buffer with its device allocation."""

        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

        def __repr__(self):
            return self.__str__()

    inputs, outputs, bindings = [], [], []

    for binding_name in engine:
        element_count = trt.volume(engine.get_binding_shape(binding_name)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding_name))

        # Page-locked host buffer plus a device allocation of the same byte size.
        host_buf = cuda.pagelocked_empty(element_count, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)

        # The engine is later given the device allocations as plain integers.
        bindings.append(int(device_buf))

        # File the pair under inputs or outputs depending on the binding kind.
        destination = inputs if engine.binding_is_input(binding_name) else outputs
        destination.append(HostDeviceMem(host_buf, device_buf))

    return inputs, outputs, bindings

# Image preprocessing: the picture has to be converted to NCHW layout.
def preprocess_image(imagepath, input_size=(1024, 1024)):
    """Load an image and turn it into a normalized NCHW float32 batch of one.

    Args:
        imagepath: Path of the image file to read.
        input_size: Target ``(height, width)``. Defaults to 1024x1024, which
            matches the ``--uffInput=input_image,3,1024,1024`` shape the
            engine file is generated with.

    Returns:
        A C-contiguous ``float32`` array of shape ``(1, 3, height, width)``
        with channels in RGB order and values scaled to ``[0, 1]``.
        NOTE(review): the online-build script in this article normalizes with
        ``(x / 255 - 0.45) / 0.225`` instead - confirm which one the model
        expects.

    Raises:
        IOError: If the image cannot be read.
    """
    bgr_image = cv2.imread(imagepath)
    # cv2.imread returns None instead of raising when the file is missing or
    # unreadable; fail here with a clear message.
    if bgr_image is None:
        raise IOError("could not read image: {}".format(imagepath))

    new_height, new_width = input_size
    # cv2.resize expects the target size as (width, height).
    resized = cv2.resize(bgr_image, (new_width, new_height))

    # BGR -> RGB, then HWC -> CHW.
    chw = resized[:, :, ::-1].transpose(2, 0, 1)
    # The transposed view is not contiguous in memory; make a contiguous
    # float32 copy before scaling in place.
    chw = np.ascontiguousarray(chw, dtype=np.float32)
    chw /= 255.0

    # Add the leading batch dimension (CHW -> NCHW).
    return np.expand_dims(chw, axis=0)

# Run inference.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    """Copy inputs to the GPU, run the engine and copy the outputs back.

    Args:
        context: Execution context used to enqueue the inference.
        bindings: Device buffer addresses, in binding order.
        inputs: Objects with ``.host`` / ``.device`` buffers for each input.
        outputs: Objects with ``.host`` / ``.device`` buffers for each output.
        stream: CUDA stream on which the copies and the inference are queued.
        batch_size: Batch size passed to the engine.

    Returns:
        A tuple ``(host_outputs, elapsed_seconds)`` where ``host_outputs`` is
        the list of output host buffers and ``elapsed_seconds`` covers the
        copies, the inference and the stream synchronization.

    Raises:
        RuntimeError: If the inference could not be enqueued.
    """
    t1 = time.perf_counter()

    # htod: host to device, i.e. copy the input data from CPU to GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    # Run inference.
    # When the network was created with an explicit batch dimension, use
    # execute_async_v2; otherwise use execute_async with the batch size.
    enqueued = context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    if not enqueued:
        raise RuntimeError("TensorRT failed to enqueue inference (batch_size={})".format(batch_size))

    # dtoh: device to host, i.e. copy the predictions back from GPU to CPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # Wait for everything queued on the stream to finish.
    stream.synchronize()
    t2 = time.perf_counter()

    # Return only the host outputs, together with the elapsed time.
    return [out.host for out in outputs], (t2 - t1)


def profile_trt(engine, imagepath, batch_size):
    """Run one timed inference of ``engine`` on the image at ``imagepath``.

    Returns:
        Whatever ``do_inference`` returns: a tuple of the list of host output
        buffers and the elapsed time in seconds.
    """
    in_buffers, out_buffers, device_bindings = allocate_buffers(engine, batch_size, True)
    cuda_stream = cuda.Stream()
    exec_context = engine.create_execution_context()

    # Swap the first input's host buffer for a copy of the preprocessed image.
    in_buffers[0].host = np.array(preprocess_image(imagepath))

    return do_inference(
        exec_context,
        bindings=device_bindings,
        inputs=in_buffers,
        outputs=out_buffers,
        stream=cuda_stream,
        batch_size=batch_size,
    )

if __name__ == '__main__':
    import glob

    parser = argparse.ArgumentParser()
    parser.add_argument("--imagepath", default= '/Mask_RCNN/samples/images/*.jpg',type=str, help="please input your imagepath")
    parser.add_argument("--batch_size", default= 1, type=int, help="please input inference batchsize")
    args = parser.parse_args()

    engine_file = '/Mask_RCNN/mrcnn_nchw.engine'  # path of the serialized engine file
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')  # load all custom plugins

    runtime = trt.Runtime(TRT_LOGGER)
    with open(engine_file, "rb") as f:
        trt_engine = runtime.deserialize_cuda_engine(f.read())
    if trt_engine is None:
        raise RuntimeError("failed to deserialize engine file: {}".format(engine_file))

    # The image loader reads a single file, so expand a wildcard pattern
    # first. If nothing matches, fall back to treating the value as a
    # literal path.
    image_files = sorted(glob.glob(args.imagepath)) or [args.imagepath]

    for image_file in image_files:
        # profile_trt returns (list of host outputs, elapsed seconds); the
        # arrays to reshape live in the first element.
        host_outputs, _elapsed_seconds = profile_trt(trt_engine, image_file, args.batch_size)

        # NOTE(review): presumably output 0 is the detection tensor and
        # output 1 the mask tensor, following the --output order given to
        # trtexec - confirm against the engine's bindings.
        box = host_outputs[0].reshape(100, 6)
        mask = host_outputs[1].reshape(100, 15, 28, 28)

        print(box)
        print(mask)

        # Post-processing of box and mask follows from here.

YuSuen

关注

2
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
构建TensorRT引擎加速MaskRCNN

使用TensorRT Python API加速MaskRCNN环境构建引擎在线构建引擎引擎文件生成加速MaskRCNN环境Jetson AGX Xavier Linux Jetpack 4.5.1cuda 10.2cudnn 8.0TensorRT 7.1.3构建引擎有两种方案，一种是直接在代码中使用uff模型文件构建引擎进行推理，另一种是利用uff模型文件生成引擎文件engine再调用，其实道理差不多，都是要构建引擎再推理，只是一种在线构建调用，一种离线生成再调用在线构建引擎可参考te
复制链接

扫一扫