python使用TensorRT引擎
安装库
pip install pycuda -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
pip install tensorrt-8.2.3.0-cp36-none-win_amd64.whl
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/e7a4d173e64620e2d90c01e7741e9e39.png)
转ONNX
import torch.nn as nn
import torch
from collections import OrderedDict
import torch
import torchvision.models as models
from torch import nn
import numpy as np
import torch
from torchvision import models
from torch import nn
def bilinear_kernel(in_channels, out_channels, kernel_size):
    """Build an (in_channels, out_channels, k, k) bilinear upsampling weight.

    Only the paired (i, i) channel slots receive the 2-D bilinear filter; all
    other entries stay zero — the standard FCN transposed-convolution init.

    Returns:
        a float32 torch tensor suitable for a ConvTranspose2d weight.
    """
    factor = (kernel_size + 1) // 2
    # Integer center for odd kernels, half-pixel offset for even ones.
    center = factor - 1 if kernel_size % 2 == 1 else factor - 0.5
    rows, cols = np.ogrid[:kernel_size, :kernel_size]
    filt = (1 - abs(rows - center) / factor) * (1 - abs(cols - center) / factor)
    weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size), dtype=np.float32)
    weight[range(in_channels), range(out_channels), :, :] = filt
    return torch.from_numpy(weight)
# Backbone for the FCN below: VGG16 with batch-norm. pretrained=False, so
# weights come from a later load_state_dict, not a download.
pretrained_net = models.vgg16_bn(pretrained=False)
class FCN(nn.Module):
    """FCN-8s-style segmentation head over the VGG16-BN backbone above.

    Skip connections from stage3/stage4 are fused with the stage5 output and
    upsampled back to input resolution with fixed bilinear deconvolutions.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Split the VGG feature extractor into its five pooling stages.
        self.stage1 = pretrained_net.features[:7]
        self.stage2 = pretrained_net.features[7:14]
        self.stage3 = pretrained_net.features[14:24]
        self.stage4 = pretrained_net.features[24:34]
        self.stage5 = pretrained_net.features[34:]
        # Kept for checkpoint compatibility even though forward() never uses
        # their outputs (the original computed scores1/scores2 into dead
        # locals; that wasted work is removed below).
        self.scores1 = nn.Conv2d(512, num_classes, 1)
        self.scores2 = nn.Conv2d(512, num_classes, 1)
        self.scores3 = nn.Conv2d(128, num_classes, 1)
        self.conv_trans1 = nn.Conv2d(512, 256, 1)
        self.conv_trans2 = nn.Conv2d(256, num_classes, 1)
        # Upsampling layers initialized with fixed bilinear kernels.
        self.upsample_8x = nn.ConvTranspose2d(num_classes, num_classes, 16, 8, 4, bias=False)
        self.upsample_8x.weight.data = bilinear_kernel(num_classes, num_classes, 16)
        self.upsample_2x_1 = nn.ConvTranspose2d(512, 512, 4, 2, 1, bias=False)
        self.upsample_2x_1.weight.data = bilinear_kernel(512, 512, 4)
        self.upsample_2x_2 = nn.ConvTranspose2d(256, 256, 4, 2, 1, bias=False)
        self.upsample_2x_2.weight.data = bilinear_kernel(256, 256, 4)

    def forward(self, x):
        s1 = self.stage1(x)
        s2 = self.stage2(s1)
        s3 = self.stage3(s2)
        s4 = self.stage4(s3)
        s5 = self.stage5(s4)
        # Fuse upsampled stage5 with stage4, reduce channels, upsample again,
        # fuse with stage3, project to classes, then upsample 8x to full size.
        add1 = self.upsample_2x_1(s5) + s4
        add1 = self.conv_trans1(add1)
        add1 = self.upsample_2x_2(add1)
        add2 = add1 + s3
        output = self.conv_trans2(add2)
        return self.upsample_8x(output)
# Plain VGG16 (no BN) used as the SegNet-style encoder below; VGG16_deconv
# switches its MaxPool layers to return_indices=True in its __init__.
vgg16_pretrained = models.vgg16(pretrained=False)
def decoder(input_channel, output_channel, num=3):
    """Build one SegNet decoder stage of `num` 3x3 same-padding convolutions.

    The first num-1 convs keep `input_channel` channels; the last projects to
    `output_channel`. Only num=3 and num=2 are supported.

    Raises:
        ValueError: if num is not 2 or 3. (The original fell through to an
            UnboundLocalError on the return in that case.)
    """
    if num == 3:
        return nn.Sequential(
            nn.Conv2d(input_channel, input_channel, 3, padding=1),
            nn.Conv2d(input_channel, input_channel, 3, padding=1),
            nn.Conv2d(input_channel, output_channel, 3, padding=1))
    if num == 2:
        return nn.Sequential(
            nn.Conv2d(input_channel, input_channel, 3, padding=1),
            nn.Conv2d(input_channel, output_channel, 3, padding=1))
    raise ValueError("decoder() supports num=2 or num=3, got {}".format(num))
class VGG16_deconv(torch.nn.Module):
    """SegNet-style encoder/decoder segmentation network on a VGG16 backbone.

    Max-pool indices recorded during encoding are replayed by MaxUnpool2d
    during decoding so each stage is restored to its pre-pool resolution.
    """

    def __init__(self, num_classes=8):
        super(VGG16_deconv, self).__init__()
        # Switch every VGG max-pool to also return argmax indices,
        # which the unpooling layers require.
        for pool_idx in [4, 9, 16, 23, 30]:
            vgg16_pretrained.features[pool_idx].return_indices = True
        feats = vgg16_pretrained.features
        self.encoder1 = feats[:4]
        self.pool1 = feats[4]
        self.encoder2 = feats[5:9]
        self.pool2 = feats[9]
        self.encoder3 = feats[10:16]
        self.pool3 = feats[16]
        self.encoder4 = feats[17:23]
        self.pool4 = feats[23]
        self.encoder5 = feats[24:30]
        self.pool5 = feats[30]
        # Decoder stages mirror the encoder, shrinking the channel count.
        self.decoder5 = decoder(512, 512)
        self.unpool5 = nn.MaxUnpool2d(2, 2)
        self.decoder4 = decoder(512, 256)
        self.unpool4 = nn.MaxUnpool2d(2, 2)
        self.decoder3 = decoder(256, 128)
        self.unpool3 = nn.MaxUnpool2d(2, 2)
        self.decoder2 = decoder(128, 64, 2)
        self.unpool2 = nn.MaxUnpool2d(2, 2)
        self.decoder1 = decoder(64, num_classes, 2)
        self.unpool1 = nn.MaxUnpool2d(2, 2)

    def forward(self, x):
        # Encode: record each stage's pre-pool size and pooling indices.
        enc1 = self.encoder1(x)
        size1 = enc1.size()
        pooled1, idx1 = self.pool1(enc1)
        enc2 = self.encoder2(pooled1)
        size2 = enc2.size()
        pooled2, idx2 = self.pool2(enc2)
        enc3 = self.encoder3(pooled2)
        size3 = enc3.size()
        pooled3, idx3 = self.pool3(enc3)
        enc4 = self.encoder4(pooled3)
        size4 = enc4.size()
        pooled4, idx4 = self.pool4(enc4)
        enc5 = self.encoder5(pooled4)
        size5 = enc5.size()
        pooled5, idx5 = self.pool5(enc5)
        # Decode: unpool with the saved indices, then convolve.
        dec5 = self.decoder5(self.unpool5(input=pooled5, indices=idx5, output_size=size5))
        dec4 = self.decoder4(self.unpool4(input=dec5, indices=idx4, output_size=size4))
        dec3 = self.decoder3(self.unpool3(input=dec4, indices=idx3, output_size=size3))
        dec2 = self.decoder2(self.unpool2(input=dec3, indices=idx2, output_size=size2))
        dec1 = self.decoder1(self.unpool1(input=dec2, indices=idx1, output_size=size1))
        return dec1
# --- Export pipeline: checkpoint -> TorchScript trace -> ONNX ---
net = VGG16_deconv(num_classes=3)
net.eval()
# Relative checkpoint path: must be run from the directory holding the file.
net.load_state_dict(torch.load(r'.\best_model.pth'))
# Trace with a fixed 1x3x640x640 example input.
trace = torch.jit.trace(net, torch.randn(1, 3, 640, 640))
torch.jit.save(trace,'FCN_model2.pt')
model = torch.load('FCN_model2.pt')
model.eval()
model.cuda()
input_x = torch.randn(1,3,640,640).cuda()
# Export to ONNX at opset 11 with every input axis marked dynamic.
# NOTE(review): dynamic_axes only covers 'inputs'; the 'outputs' tensor keeps
# a fixed shape in the exported graph — confirm this is intended.
res = torch.onnx.export(model,
                        input_x,
                        'FCN_model2.onnx',
                        input_names=['inputs'],
                        output_names=['outputs'],
                        opset_version=11,
                        dynamic_axes={"inputs": {0: "bs",1: "channel",2: "h",3: "w"}},
                        )
转引擎
import tensorrt as trt
import os
import common
# Flag requesting an explicit-batch network (required by the ONNX parser).
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
# Global TensorRT logger shared by builder/runtime objects below.
TRT_LOGGER = trt.Logger()
# Register TensorRT's built-in plugins.
trt.init_libnvinfer_plugins(TRT_LOGGER,"")
'''
动态大小输入参数设置
'''
# Candidate dynamic-input sizes (min / typical / max).
# NOTE(review): these three tuples appear unused — the optimization profiles
# below hard-code their own shapes.
mix_size = (1, 3, 128, 128)
common_size = (1, 3, 640, 640)
max_size = (1, 3, 2048, 2048)
'''
静态大小输入
onnx模型转TensorRT的engine
'''
def get_engine(onnx_file_path, engine_file_path=""):
    """Load a serialized TensorRT engine if present, otherwise build one from
    the ONNX file, cache it to `engine_file_path`, and return it.

    Returns:
        a deserialized ICudaEngine, or None if parsing/building fails.
    """

    def build_engine():
        """Parse the ONNX file and build + serialize a TensorRT engine."""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(EXPLICIT_BATCH) as network, \
                builder.create_builder_config() as config, \
                trt.OnnxParser(network, TRT_LOGGER) as parser, \
                trt.Runtime(TRT_LOGGER) as runtime:
            config.max_workspace_size = 1 << 32  # 4 GiB of builder scratch
            builder.max_batch_size = 1
            if not os.path.exists(onnx_file_path):
                print(
                    "ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.".format(onnx_file_path)
                )
                exit(0)
            print("Loading ONNX file from path {}...".format(onnx_file_path))
            with open(onnx_file_path, "rb") as model:
                print("Beginning ONNX file parsing")
                if not parser.parse(model.read()):
                    print("ERROR: Failed to parse the ONNX file.")
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    return None
            # Dynamic-shape profile: min / opt / max shapes for input 0.
            profile = builder.create_optimization_profile()
            print("network.get_input(0).name:", network.get_input(0).name)
            profile.set_shape(network.get_input(0).name, (1, 3, 32, 32), (1, 3, 512, 512),
                              (1, 3, 648, 648))
            config.add_optimization_profile(profile)
            inputs = [network.get_input(i) for i in range(network.num_inputs)]
            print("input", inputs)
            outputs = [network.get_output(i) for i in range(network.num_outputs)]
            print("out:", outputs)
            print("Completed parsing of ONNX file")
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            plan = builder.build_serialized_network(network, config)
            # Guard: build_serialized_network returns None on failure.
            if plan is None:
                print("ERROR: Failed to build the engine.")
                return None
            engine = runtime.deserialize_cuda_engine(plan)
            print("Completed creating Engine")
            with open(engine_file_path, "wb") as f:
                f.write(plan)
            return engine

    # BUG FIX: the original ended with two unreachable lines referencing
    # `self` (copied from a class); they have been removed.
    if os.path.exists(engine_file_path):
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine()
def get_DynEngine(onnx_file_path, engine_file_path):
    """Load a cached TensorRT engine if available, otherwise build one from
    the ONNX file with a dynamic-shape optimization profile and cache it.

    Returns:
        the deserialized engine, or None on parse/build failure.
    """

    def build_engine():
        """Parse ONNX and build an engine with a dynamic H/W input profile."""
        builder = trt.Builder(TRT_LOGGER)
        network = builder.create_network(common.EXPLICIT_BATCH)
        config = builder.create_builder_config()
        parser = trt.OnnxParser(network, TRT_LOGGER)
        runtime = trt.Runtime(TRT_LOGGER)
        print("common.EXPLICIT_BATCH:", common.EXPLICIT_BATCH)
        config.max_workspace_size = 10 << 30  # 10 GiB of builder scratch
        print("max_workspace_size:", config.max_workspace_size)
        builder.max_batch_size = 1
        if not os.path.exists(onnx_file_path):
            print(f'onnx file {onnx_file_path} not found,please run torch_2_onnx.py first to generate it')
            exit(0)
        print(f'Loading ONNX file from path {onnx_file_path}...')
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print('ERROR:Failed to parse the ONNX file')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        inputs = [network.get_input(i) for i in range(network.num_inputs)]
        print("input", inputs)
        outputs = [network.get_output(i) for i in range(network.num_outputs)]
        print("out:", outputs)
        print("Network Description")
        for input in inputs:
            batch_size = input.shape[0]
            print("Input '{}' with shape {} and dtype {} . ".format(input.name, input.shape, input.dtype))
        for output in outputs:
            print("Output '{}' with shape {} and dtype {} . ".format(output.name, output.shape, output.dtype))
        # min / opt / max shapes for the dynamic input axes.
        profile = builder.create_optimization_profile()
        print("network.get_input(0).name:", network.get_input(0).name)
        profile.set_shape(network.get_input(0).name, (1, 3, 32, 32), (1, 3, 512, 512),
                          (1, 3, 648, 648))
        config.add_optimization_profile(profile)
        print('Completed parsing the ONNX file')
        print(f'Building an engine from file {onnx_file_path}; this may take a while...')
        engine = builder.build_engine(network, config)
        # Guard: build_engine returns None on failure; avoid serializing None.
        if engine is None:
            print('ERROR:Failed to build the engine')
            return None
        print('Completed creating Engine')
        with open(engine_file_path, 'wb') as f:
            f.write(engine.serialize())
        return engine

    if os.path.exists(engine_file_path):
        # BUG FIX: the original called build_engine() here before reading the
        # cached file, rebuilding (and overwriting) the engine on every call
        # and defeating the cache. Now the cached engine is just deserialized.
        print(f'Reading engine from file {engine_file_path}')
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine()
if __name__ == "__main__":
    """Create a TensorRT engine for seg and run inference."""
    from datetime import datetime
    # Time the engine build/load so long builds are visible.
    startTime = datetime.now()
    onnx_file_path = "FCN_model2.onnx"
    engine_file_path = "model222.engine"
    get_DynEngine(onnx_file_path, engine_file_path)
    endTime = datetime.now()
    duringTime = endTime - startTime
    print(duringTime)
引擎推理
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
import torch
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import torch.nn.functional as F
'''
step1:创建logger:日志记录器
'''
# Global TensorRT logger used by the inference-side runtime below.
TRT_LOGGER = trt.Logger()
import cv2
'''
输入数据-前处理
'''
def preprocess(image):
    """Normalize an RGB image with ImageNet mean/std, channels-first.

    Accepts anything np.asarray can turn into an (H, W, 3) array of values in
    [0, 255]; returns a float32 (3, H, W) array.
    """
    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype('float32')
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype('float32')
    scaled = np.asarray(image).astype('float32') / float(255.0)
    normalized = (scaled - imagenet_mean) / imagenet_std
    # Move the channel axis from last to first (HWC -> CHW).
    return np.moveaxis(normalized, 2, 0)
'''
模型输出数据-后处理
'''
def postprocess(data):
    """Turn an (H, W) class-index map into a palettized PIL image.

    A fixed pseudo-random palette is derived for 21 classes (PASCAL VOC
    count) by scaling a large-prime triple per class modulo 255.
    """
    num_classes = 21
    palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
    color_rows = []
    for cls in range(num_classes):
        color_rows.append(palette * cls % 255)
    colors = np.array(color_rows).astype("uint8")
    result = Image.fromarray(data.astype('uint8'), mode='P')
    result.putpalette(colors)
    return result
'''
#step2:创建runtime并反序列化生成engine
'''
def load_engine(engine_file_path):
    """Deserialize a TensorRT engine from the given file and return it."""
    assert os.path.exists(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with trt.Runtime(TRT_LOGGER) as runtime, open(engine_file_path, "rb") as f:
        return runtime.deserialize_cuda_engine(f.read())
'''
显示图像
'''
def pred2show(mask,iii):
    """Colorize a (640, 640) class-id mask via a CSV lookup table, save and show it.

    NOTE(review): saves to the module-level global `output_file` (not a
    parameter), and then displays "test{iii}.png" from disk — confirm both the
    global and that file exist before calling.
    """
    path_color2class_table = r".\color2class_table.csv"
    dataframe = pd.read_csv(path_color2class_table)
    list_rgb = []
    list_class_id = []
    # Column 0 holds the class id; columns 2+ hold the RGB components.
    for i in range(len(dataframe)):
        rgb = list(dataframe.iloc[i][2:])
        class_id = int(dataframe.iloc[i][0])
        list_rgb.append(rgb)
        list_class_id.append(class_id)
    # Replace each parsed RGB triple with the scalar i*255.
    # NOTE(review): this discards the CSV colors and produces values > 255
    # for i > 1 — verify this is the intended grayscale mapping.
    for i in range(len(list_rgb)):
        list_rgb[i] = i*255
    dict_color2class = dict(zip(list_class_id, list_rgb))
    crop_size = (640, 640)
    pred = np.empty([crop_size[0], crop_size[1]], dtype=int)
    height = mask.shape[0]
    weight = mask.shape[1]
    # Map every pixel's class id to its display value (pixel-by-pixel).
    for row in range(height):
        for col in range(weight):
            pred[row,col] = np.array(dict_color2class[mask[row,col]])
    cv2.imwrite(output_file,pred)
    img_show = cv2.imread("test"+str(iii)+".png")
    cv2.imshow("test",img_show)
    cv2.waitKey(0)
'''
推理
'''
def infer(engine, input_file, output_file):
    """Run single-image inference through a TensorRT engine with raw pycuda.

    Resizes the image to 640x640, copies it to device memory, executes the
    engine asynchronously, and prints the raw output buffer.
    NOTE(review): looks up a binding named "input", but the ONNX export above
    used input_names=['inputs'] — confirm the name matches this engine.
    """
    print("Reading input image from file {}".format(input_file))
    with Image.open(input_file) as img:
        img =img.resize((640, 640), Image.ANTIALIAS)
        input_image = preprocess(img)
        image_width = img.width
        image_height = img.height
    with engine.create_execution_context() as context:
        context.set_binding_shape(engine.get_binding_index("input"), (1, 3, image_height, image_width))
        bindings = []
        # Allocate device memory per binding; keep host buffers for I/O.
        for binding in engine:
            binding_idx = engine.get_binding_index(binding)
            size = trt.volume(context.get_binding_shape(binding_idx))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            if engine.binding_is_input(binding):
                input_buffer = np.ascontiguousarray(input_image)
                input_memory = cuda.mem_alloc(input_image.nbytes)
                bindings.append(int(input_memory))
            else:
                # Page-locked host buffer enables async device-to-host copies.
                output_buffer = cuda.pagelocked_empty(size, dtype)
                output_memory = cuda.mem_alloc(output_buffer.nbytes)
                bindings.append(int(output_memory))
        stream = cuda.Stream()
        # Host -> device copy, timed async execution, device -> host copy.
        cuda.memcpy_htod_async(input_memory, input_buffer, stream)
        import datetime
        startTime = datetime.datetime.now()
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        endTime = datetime.datetime.now()
        durTime = 'funtion time use:%dms' % (
            (endTime - startTime).seconds * 1000 + (endTime - startTime).microseconds / 1000)
        print(durTime)
        cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
        stream.synchronize()
        print(output_buffer)
        # NOTE(review): reshape hard-codes 2 output channels — confirm this
        # matches the engine's actual class count.
        res = np.reshape(output_buffer, (2,image_height, image_width))
        print(res)
'''
动态输入推理
'''
import common
def infer2(engine, input_file, output_file):
    """Run dynamic-shape inference on one image via the `common` helpers.

    NOTE(review): `output_file` is accepted but never used — the raw network
    output is only printed.
    """
    with Image.open(input_file) as img:
        img =img.resize((640, 640), Image.ANTIALIAS)
        input_image = preprocess(img)
        width = img.width
        height = img.height
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = common.allocate_buffers(engine, (height, width))
    context.active_optimization_profile = 0
    # If the engine's input has dynamic H/W (-1), pin it to the image size.
    origin_inputshape = context.get_binding_shape(0)
    if origin_inputshape[-1] == -1:
        origin_inputshape[-2], origin_inputshape[-1] = (height, width)
        context.set_binding_shape(0, (origin_inputshape))
    print(f'Running inference on image {input_file}...')
    # Prepend the batch dimension before handing off to the common helpers.
    tmpImg = input_image[np.newaxis, :, :, :]
    inputs[0].host = np.ascontiguousarray(tmpImg)
    trt_outputs = common.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)[0]
    # NOTE(review): reshape assumes 3 output channels — confirm against the
    # engine's actual output shape.
    trt_outputs = np.reshape(trt_outputs, (3,height, width))
    print(trt_outputs)
if __name__ == '__main__':
    # flag selects between the static-shape path (infer) and the
    # dynamic-shape path (infer2).
    flag = 0
    if flag:
        engine_file = "model_seg.engine"
        input_file = r".\liver\train\image\0.png"
        output_file = "output.png"
        img = Image.open(input_file)
        print("Running TensorRT inference for Seg")
        with load_engine(engine_file) as engine:
            infer(engine, input_file, output_file)
    else:
        engine_file = "model222.engine"
        input_file = r".\liver\train\image\0.png"
        output_file = "output.png"
        with load_engine(engine_file) as engine:
            infer2(engine,input_file,output_file)