Steps to Convert PyTorch to TensorRT

Preface

I have recently been exploring PyTorch-to-TensorRT deployment. After reading a lot of example code, I summarized the steps below. The conversion process is largely the same for all kinds of models, and since I learned a few things along the way, I am writing them down here.

I. Conversion Workflow

Steps for converting PyTorch to TensorRT:

  1. Train the model with PyTorch and save a .pt file;
  2. Convert the .pt file to an intermediate ONNX model;
  3. Simplify the exported model with onnxsim.simplify (steps 2-3 are sketched right after this list);
  4. Parse the ONNX file and build a TRT inference engine;
  5. Load the engine and allocate memory for the engine's inputs, outputs, and the model;
  6. Copy the data to be inferred (the preprocessed image data) into inputs (the engine inputs);
  7. Run inference and fetch outputs;
  8. Post-process outputs: extract each output according to the layout used when building the engine and reshape it to the target format (the same format torch inference produces);
  9. From there, you know what to do.
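
As a minimal sketch of steps 2-3 (illustrative only: 'model.onnx' and the 640x640 input shape are placeholders, `model` is assumed to be the trained torch module from step 1, and the full working version is the export_onnx/simplify_onnx pair in the listing below):

import torch
import onnx
from onnxsim import simplify

model.eval()                                    # trained model from step 1 (assumed)
dummy = torch.zeros(1, 3, 640, 640)             # dummy input fixing the shape
torch.onnx.export(model, dummy, 'model.onnx', opset_version=11,
                  input_names=['data'], output_names=['prediction'])

model_simp, ok = simplify(onnx.load('model.onnx'))   # step 3: onnxsim simplification
assert ok, "Simplified ONNX model could not be validated"
onnx.save(model_simp, 'model.onnx')
# For step 4, parse the ONNX file with trt.OnnxParser as in build_engine() below,
# or use NVIDIA's trtexec CLI: trtexec --onnx=model.onnx --saveEngine=model.engine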

Overall deployment flow:

  1. Export the network definition and the associated weights;
  2. Parse the network definition and the associated weights;
  3. Build the optimal execution plan from the kernels available on the target GPU;
  4. Serialize the execution plan and store it (see the sketch after this list);
  5. Deserialize the execution plan;
  6. Run inference.
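
Steps 4-5 deserve a concrete sketch, because the listing below builds its engine in memory and only ever reads a cached .engine file without writing one. Here is a minimal version using the TensorRT Python API (the helper names are mine; note that a serialized plan is tied to the GPU and TensorRT version it was built with):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)

def save_engine(engine, path):                    # step 4: serialize and store the plan
    with open(path, 'wb') as f:
        f.write(engine.serialize())

def load_engine(path):                            # step 5: deserialize the plan
    with open(path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())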

Below is a complete example of the flow, taken from yolov5-tensorrt:

import sys
sys.path.append('yolov5')

from models.common import *
from utils.torch_utils import *
from utils.datasets import *
from yolo import *
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import time
import copy
import numpy as np
import os
from onnxsim import simplify
import onnx
import struct
import yaml
import torchvision

device = select_device('0')        # CUDA device
weights = 'yolov5s.pt'         # trained weights
model_config = 'yolov5s.yaml'      # model yaml
TRT_LOGGER = trt.Logger(trt.Logger.INFO)        # a logger is required to build an engine
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
image_loader = LoadImages('yolov5/inference/images', img_size=640)      # folder of images to run on
image_loader.__iter__()
_, input_img, _, _ = image_loader.__next__()      # fetch one preprocessed image
input_img = input_img.astype(np.float32)         # np.float is deprecated; use float32 explicitly
input_img /= 255.0
input_img = np.expand_dims(input_img, axis=0)

with open(model_config) as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    num_classes = cfg['nc']

# nms config
conf_thres = 0.4
iou_thres = 0.5
max_det = 300
# nms GPU
topK = 512    # max supported is 4096, if conf_thres is low, such as 0.001, use larger number.
keepTopK = max_det


def GiB(val):
    return val * 1 << 30


#  different from yolov5/utils/non_max_suppression, xywh2xyxy(x[:, :4]) is no longer needed (contained in Detect())
def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, merge=False, classes=None, agnostic=False):
    """Performs Non-Maximum Suppression (NMS) on inference results
	这个是yolov5的nms处理部分,不是trt处理的,可以不用理
    Returns:
         detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
    """
    if prediction.dtype is torch.float16:
        prediction = prediction.float()  # to FP32

    nc = prediction[0].shape[1] - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    max_det = 300  # maximum number of detections per image
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)

    t = time.time()
    output = [None] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = x[:, :4] #xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero().t()
            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # If none remain process next image
        n = x.shape[0]  # number of boxes
        if not n:
            continue

        # Sort by confidence
        # x = x[x[:, 4].argsort(descending=True)]

        # Batched NMS
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.boxes.nms(boxes, scores, iou_thres)
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            try:  # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
                if redundant:
                    i = i[iou.sum(1) > 1]  # require redundancy
            except Exception:  # possible CUDA error https://github.com/ultralytics/yolov3/issues/1139
                print(x, i, x.shape, i.shape)
                pass

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            break  # time limit exceeded

    return output


def load_model():      # load the trained model
    # Load model
    model = Model(model_config)
    ckpt = torch.load(weights, map_location=torch.device('cpu'))
    ckpt['model'] = \
                {k: v for k, v in ckpt['model'].state_dict().items() if model.state_dict()[k].numel() == v.numel()}
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    return model


def export_onnx(model, batch_size):             # export the model to ONNX
    _, _, h, w = input_img.shape
    img = torch.zeros((batch_size, 3, h, w))
    torch.onnx.export(
        model, img, 'yolov5_{}.onnx'.format(batch_size),
        input_names=["data"], output_names=["prediction"], verbose=True,
        opset_version=11, operator_export_type=torch.onnx.OperatorExportTypes.ONNX
    )


def simplify_onnx(onnx_path):             # simplify the ONNX model with onnxsim
    model = onnx.load(onnx_path)
    model_simp, check = simplify(model)
    assert check, "Simplified ONNX model could not be validated"
    onnx.save(model_simp, onnx_path)


def build_engine(onnx_path, using_half):
    """
    Convert an ONNX model into a TensorRT engine.
    Args:
        onnx_path: path to the ONNX file
        using_half: build the engine in FP16

    Returns:
        an ICudaEngine, or None if parsing fails
    """
    trt.init_libnvinfer_plugins(None, '')
    engine_file = onnx_path.replace(".onnx", ".engine")
    if os.path.exists(engine_file):            # if a serialized engine already exists, load and return it; otherwise build one
        with open(engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = 1 # always 1 for explicit batch
        config = builder.create_builder_config()
        config.max_workspace_size = GiB(1)
        if using_half:
            config.set_flag(trt.BuilderFlag.FP16)
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        
        # The ONNX graph's only output is the raw YOLO head, shaped [bs, num_boxes, 5 + nc].
        # Unmark it; the BatchedNMS plugin outputs added below are marked instead.
        previous_output = network.get_output(0)
        network.unmark_output(previous_output)

        # slice boxes, obj_score, class_scores
        strides = trt.Dims([1,1,1])
        starts = trt.Dims([0,0,0])
        bs, num_boxes, _ = previous_output.shape
        shapes = trt.Dims([bs, num_boxes, 4])
        boxes = network.add_slice(previous_output, starts, shapes, strides)
        starts[2] = 4
        shapes[2] = 1
        obj_score = network.add_slice(previous_output, starts, shapes, strides)
        starts[2] = 5
        shapes[2] = num_classes
        scores = network.add_slice(previous_output, starts, shapes, strides)

        # Broadcast obj_score from [bs, num_boxes, 1] to [bs, num_boxes, nc]
        # by gathering index 0 along the last axis num_classes times.
        indices = network.add_constant(trt.Dims([num_classes]), trt.Weights(np.zeros(num_classes, np.int32)))
        gather_layer = network.add_gather(obj_score.get_output(0), indices.get_output(0), 2)

        # scores = obj_score * class_scores => [bs, num_boxes, nc]
        updated_scores = network.add_elementwise(gather_layer.get_output(0), scores.get_output(0), trt.ElementWiseOperation.PROD)

        # reshape box to [bs, num_boxes, 1, 4]
        reshaped_boxes = network.add_shuffle(boxes.get_output(0))
        reshaped_boxes.reshape_dims = trt.Dims([0,0,1,4])

        # add batchedNMSPlugin, inputs:[boxes:(bs, num, 1, 4), scores:(bs, num, 1)]
        trt.init_libnvinfer_plugins(TRT_LOGGER, "")
        registry = trt.get_plugin_registry()
        assert(registry)
        creator = registry.get_plugin_creator("BatchedNMS_TRT", "1")
        assert(creator)
        fc = []
        fc.append(trt.PluginField("shareLocation", np.array([1], dtype=np.int32), trt.PluginFieldType.INT32))
        fc.append(trt.PluginField("backgroundLabelId", np.array([-1], dtype=np.int32), trt.PluginFieldType.INT32))
        fc.append(trt.PluginField("numClasses", np.array([num_classes], dtype=np.int32), trt.PluginFieldType.INT32))
        fc.append(trt.PluginField("topK", np.array([topK], dtype=np.int32), trt.PluginFieldType.INT32))
        fc.append(trt.PluginField("keepTopK", np.array([keepTopK], dtype=np.int32), trt.PluginFieldType.INT32))
        fc.append(trt.PluginField("scoreThreshold", np.array([conf_thres], dtype=np.float32), trt.PluginFieldType.FLOAT32))
        fc.append(trt.PluginField("iouThreshold", np.array([iou_thres], dtype=np.float32), trt.PluginFieldType.FLOAT32))
        fc.append(trt.PluginField("isNormalized", np.array([0], dtype=np.int32), trt.PluginFieldType.INT32))
        fc.append(trt.PluginField("clipBoxes", np.array([0], dtype=np.int32), trt.PluginFieldType.INT32))
        
        fc = trt.PluginFieldCollection(fc) 
        nms_layer = creator.create_plugin("nms_layer", fc)

        layer = network.add_plugin_v2([reshaped_boxes.get_output(0), updated_scores.get_output(0)], nms_layer)
        layer.get_output(0).name = "num_detections"
        layer.get_output(1).name = "nmsed_boxes"
        layer.get_output(2).name = "nmsed_scores"
        layer.get_output(3).name = "nmsed_classes"
        for i in range(4):
            network.mark_output(layer.get_output(i))

        return builder.build_engine(network, config)


def allocate_buffers(engine, is_explicit_batch=False, dynamic_shapes=[]):
    # allocate page-locked host buffers and device buffers; this step is generic for any engine
    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        print(dims)
        if dims[0] == -1:
            assert(len(dynamic_shapes) > 0)
            dims[0] = dynamic_shapes[0]
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings


def profile_trt(engine, batch_size, num_warmups=10, num_iters=100):         # TensorRT inference
    assert(engine is not None)
    input_img_array = np.array([input_img] * batch_size)      # batched input

    yolo_inputs, yolo_outputs, yolo_bindings = allocate_buffers(engine, True)       # buffer allocation

    stream = cuda.Stream()     # pycuda stream for async copies and execution
    with engine.create_execution_context() as context:
        
        total_duration = 0.
        total_compute_duration = 0.
        total_pre_duration = 0.
        total_post_duration = 0.
        for iteration in range(num_iters):
            pre_t = time.time()
            # set host data: copy the input into the page-locked host buffer
            # (rebinding .host to a fresh array would silently drop page-locking)
            img = input_img_array.astype(np.float32)
            np.copyto(yolo_inputs[0].host, img.ravel())
            # Transfer data from CPU to the GPU.
            [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in yolo_inputs]
            # synchronize the stream
            stream.synchronize()
            start_t = time.time()
            # run inference
            context.execute_async_v2(bindings=yolo_bindings, stream_handle=stream.handle)
            stream.synchronize()
            end_t = time.time()
            # Transfer predictions back from the GPU.
            [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in yolo_outputs]
            stream.synchronize()
            post_t = time.time()

            duration = post_t - pre_t
            compute_duration = end_t - start_t
            pre_duration = start_t - pre_t
            post_duration = post_t - end_t
            if iteration >= num_warmups:
                total_duration += duration
                total_compute_duration += compute_duration
                total_post_duration += post_duration
                total_pre_duration += pre_duration
        
        print("avg GPU time: {}".format(total_duration/(num_iters - num_warmups)))
        print("avg GPU compute time: {}".format(total_compute_duration/(num_iters - num_warmups)))
        print("avg pre time: {}".format(total_pre_duration/(num_iters - num_warmups)))
        print("avg post time: {}".format(total_post_duration/(num_iters - num_warmups)))
        
        # unpack the outputs according to the layout used when building the engine and reshape
        num_det = int(yolo_outputs[0].host[0, ...])
        boxes = np.array(yolo_outputs[1].host).reshape(batch_size, -1, 4)[0, 0:num_det, 0:4]
        scores = np.array(yolo_outputs[2].host).reshape(batch_size, -1, 1)[0, 0:num_det, 0:1]
        classes = np.array(yolo_outputs[3].host).reshape(batch_size, -1, 1)[0, 0:num_det, 0:1]
        
        return [np.concatenate([boxes, scores, classes], -1)]

def profile_torch(model, using_half, batch_size, num_warmups=10, num_iters=100):            # plain torch inference, mainly as a baseline to compare against TRT

    model.to(device)
    
    total_duration = 0.
    total_compute_duration = 0.
    total_pre_duration = 0.
    total_post_duration = 0.
    if using_half:
        model.half()
    for iteration in range(num_iters):
        pre_t = time.time()
        # set host data
        img = torch.from_numpy(input_img).float().to(device)
        if using_half:
            img = img.half()
        start_t = time.time()
        pred = model(img)
        output = non_max_suppression(pred[0], conf_thres, iou_thres)
        end_t = time.time()
        [p.cpu() for p in pred if isinstance(p, torch.Tensor)]  # pull outputs back to CPU (forces sync)
        post_t = time.time()

        duration = post_t - pre_t
        compute_duration = end_t - start_t
        pre_duration = start_t - pre_t
        post_duration = post_t - end_t
        if iteration >= num_warmups:
            total_duration += duration
            total_compute_duration += compute_duration
            total_post_duration += post_duration
            total_pre_duration += pre_duration
    
    print("avg GPU time: {}".format(total_duration/(num_iters - num_warmups)))
    print("avg GPU compute time: {}".format(total_compute_duration/(num_iters - num_warmups)))
    print("avg pre time: {}".format(total_pre_duration/(num_iters - num_warmups)))
    print("avg post time: {}".format(total_post_duration/(num_iters - num_warmups)))

    return [output[0].cpu().numpy()]


if __name__ == '__main__':
    batch_size = 1     # batching only applies to the TRT path; the torch numbers are measured on non-batched data
    using_half = False
    onnx_path = 'yolov5_{}.onnx'.format(batch_size)
    
    with torch.no_grad():
        model = load_model()            # load the model
        export_onnx(model, batch_size)   # export to ONNX
        simplify_onnx(onnx_path)     # simplify the ONNX model

        trt_result = profile_trt(build_engine(onnx_path, using_half), batch_size, 10, 100)      # TRT inference
        if using_half:
            model.half()
        torch_result = profile_torch(model, using_half, batch_size, 10, 100)      # torch inference
        
        print(trt_result)
        print(torch_result)
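        # A rough sanity check that the two paths agree (a sketch: the torch NMS
        # and the BatchedNMS plugin are not identical, so expect small numeric
        # differences rather than exact equality).
        trt_det, torch_det = trt_result[0], torch_result[0]
        print('TRT detections: {}, torch detections: {}'.format(trt_det.shape[0], torch_det.shape[0]))
        if trt_det.shape == torch_det.shape:
            print('max abs diff: {}'.format(np.abs(trt_det - torch_det).max()))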

    

II. References

  1. Deploying a PyTorch model with TensorRT (使用TensorRT部署Pytorch模型)
  2. Converting a PyTorch model to a TensorRT model for deployment (pytorch模型转TensorRT模型部署)
  3. PyTorch to ONNX to a TensorRT engine, with YOLOv3 as the example (Pytorch转onnx转tensroRT的Engine,以YOLOV3为例)
  4. How to accelerate a trained PyTorch model with TensorRT? (如何使用TensorRT对训练好的PyTorch模型进行加速?)
  5. Using TensorRT from PyTorch (pytorch中使用TensorRT)

III. Notable GitHub Examples

  1. https://github.com/TrojanXu/yolov5-tensorrt
  2. https://github.com/wang-xinyu/tensorrtx
  3. https://github.com/NVIDIA-AI-IOT/torch2trt