【玩转Jetson TX2 NX】（十一）TX2 NX 基于UNet网络实现显著目标分割（详细教程+错误解决）

最新推荐文章于 2023-04-05 11:37:39 发布

鲁棒最小二乘支持向量机

最新推荐文章于 2023-04-05 11:37:39 发布

阅读量1.4k

点赞数

分类专栏：一起学Jetson TX2 NX 笔记文章标签：深度学习 pytorch jetson tx2 nx 人工智能 UNet

本文链接：https://blog.csdn.net/qq_42078934/article/details/129962990

版权

笔记同时被 2 个专栏收录

225 篇文章 178 订阅

订阅专栏

一起学Jetson TX2 NX

12 篇文章 38 订阅

订阅专栏

1 写在前面的话

该博客作为一个小Demo，记录深度学习算法UNet部署到Jetson TX2 NX的全部流程，可将此作为参考，继续部署自己的深度学习算法模型。部署总流程为：

1、在服务器端训练模型，一般为pth文件
2、在服务器端将pth文件转为onnx文件
3、在Jetson TX2 NX端将onnx文件转为trt文件
4、在Jetson TX2 NX端推理模型，输出分割结果

2 训练UNet网络

训练和测试数据均为显著性目标检测数据集SOD，image图像如图所示：

在这里插入图片描述
mask图像如图所示：

在这里插入图片描述

该项目的代码基于pytorch-UNet进行修改，源代码链接: https://github.com/milesial/Pytorch-UNet 直接下载即可

在这里插入图片描述
由于源代码功能写得比较完善，对于完成这个小Demo来说稍显复杂，因此对代码进行了简化改写
unet_parts.py代码如下：

# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.nn.functional as F

class DoubleConv(nn.Module):
    """Two stacked ``3x3 conv -> BatchNorm -> ReLU`` stages.

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: number of channels produced by the second convolution.
        mid_channels: number of channels between the two convolutions. A
            falsy value (the default ``None``) means "same as
            ``out_channels``".
    """

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        hidden = mid_channels or out_channels
        stages = []
        # Layers are appended in the order conv, bn, relu, conv, bn, relu so
        # that the Sequential indices (and therefore the state-dict keys) stay
        # 0..5.
        for c_in, c_out in ((in_channels, hidden), (hidden, out_channels)):
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, bias=False))
            stages.append(nn.BatchNorm2d(c_out))
            stages.append(nn.ReLU(inplace=True))
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run both stages; ``padding=1`` keeps the spatial size unchanged."""
        out = self.double_conv(x)
        return out


class Down(nn.Module):
    """Encoder stage: 2x2 max-pooling followed by a ``DoubleConv``.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the ``DoubleConv``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        pool = nn.MaxPool2d(2)
        conv = DoubleConv(in_channels, out_channels)
        self.maxpool_conv = nn.Sequential(pool, conv)

    def forward(self, x):
        """Pool ``x`` by a factor of 2, then apply the double convolution."""
        out = self.maxpool_conv(x)
        return out


class Up(nn.Module):
    """Decoder stage: upsample, concatenate the skip connection, ``DoubleConv``.

    Args:
        in_channels: channels of the concatenated tensor fed to the
            ``DoubleConv`` (skip connection + upsampled features).
        out_channels: channels produced by this stage.
        bilinear: if true, upsample with ``nn.Upsample`` (bilinear); otherwise
            use a stride-2 ``nn.ConvTranspose2d`` that halves the channels.
    """

    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()

        if not bilinear:
            # Learned upsampling: the transposed conv already halves the channels.
            self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)
        else:
            # Bilinear upsampling keeps the channel count, so the double conv
            # reduces the channels through a narrower middle layer instead.
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)

    def forward(self, x1, x2):
        """Upsample ``x1`` and fuse it with the skip tensor ``x2``.

        ``x2`` comes first in the channel concatenation. No padding is
        applied, so ``x2`` and the upsampled ``x1`` must already have the same
        spatial size.
        """
        upsampled = self.up(x1)
        # The size-matching pad below is intentionally disabled (kept for
        # reference).
        # input is CHW
        # diffY = x2.size()[2] - x1.size()[2]
        # diffX = x2.size()[3] - x1.size()[3]

        # x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
        #                 diffY // 2, diffY - diffY // 2])
        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        fused = torch.cat((x2, upsampled), dim=1)
        return self.conv(fused)


class OutConv(nn.Module):
    """Output head: a 1x1 convolution from ``in_channels`` to ``out_channels``."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        """Project the feature map ``x`` to ``out_channels`` channels."""
        projected = self.conv(x)
        return projected

注意红色框内代码，如图所示，最好是注释掉或者改写，具体原因在onnx文件转为trt文件步骤时会解释

在这里插入图片描述
unet_model.py代码如下：

# -*- coding: utf-8 -*-
 
from unet_parts import *

class UNet(nn.Module):
    """U-Net with four encoder stages and four decoder stages.

    Args:
        n_channels: number of channels of the input image.
        n_classes: number of output channels (one logit map per class).
        bilinear: forwarded to every ``Up`` stage; selects bilinear
            upsampling instead of transposed convolutions.
    """

    def __init__(self, n_channels, n_classes, bilinear=False):
        super().__init__()

        self.n_channels = n_channels
        self.n_classes = n_classes
        self.bilinear = bilinear

        # With bilinear upsampling the decoder inputs are half as wide.
        factor = 2 if bilinear else 1

        # Encoder
        self.inc = DoubleConv(n_channels, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        self.down4 = Down(512, 1024 // factor)
        # Decoder
        self.up1 = Up(1024, 512 // factor, bilinear)
        self.up2 = Up(512, 256 // factor, bilinear)
        self.up3 = Up(256, 128 // factor, bilinear)
        self.up4 = Up(128, 64, bilinear)
        # Output head
        self.outc = OutConv(64, n_classes)

    def forward(self, x):
        """Return the raw output of the final ``outc`` layer for ``x``."""
        feat = self.inc(x)

        # Encoder: remember the input of every down stage as a skip tensor.
        skips = []
        for down in (self.down1, self.down2, self.down3, self.down4):
            skips.append(feat)
            feat = down(feat)

        # Decoder: consume the skip tensors deepest-first.
        for up in (self.up1, self.up2, self.up3, self.up4):
            feat = up(feat, skips.pop())

        return self.outc(feat)
    
def print_network(model, name):
    """Print ``name``, the model's repr and its parameter counts.

    Args:
        model: object exposing ``parameters()`` whose items provide
            ``numel()`` and ``requires_grad``.
        name: heading printed before the model.
    """
    # Collect (size, trainable?) once and derive every count from it.
    sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
    total_num = sum(size for size, _ in sizes)
    trainable_num = sum(size for size, trainable in sizes if trainable)

    print(name)
    print(model)
    # "parameters" and "total parameters" are the same number by construction.
    print("The number of parameters: {}".format(total_num))
    print("The number of total parameters: {}".format(total_num))
    print("The number of trainable parameters: {}".format(trainable_num))
if __name__ == '__main__':
    # Smoke test: push a random batch through the network and report the
    # model structure and the output size.
    dummy_batch = torch.randn((4, 3, 352, 352))
    net = UNet(n_channels=3, n_classes=1, bilinear=False)
    prediction = net(dummy_batch)
    print_network(net, 'Model Structure')
    print(prediction.size())

如图所示：

在这里插入图片描述
dataset.py代码如下：

# -*- coding: utf-8 -*-
import cv2
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import glob
    
class LDataset(Dataset):
    """Paired image / mask dataset read from two directories.

    Files in both directories are sorted by path and paired by position, so
    the two directories must contain the same number of files with matching
    sort order.

    Args:
        inp_path: directory holding the input images.
        out_path: directory holding the masks.
        size: ``(width, height)`` every image and mask is resized to (the
            order expected by ``cv2.resize``).

    Raises:
        ValueError: if the two directories hold a different number of files.
    """

    def __init__(self, inp_path='./data/imgs', out_path='./data/masks', size=(480, 320)):
        self.inp_path = inp_path
        self.out_path = out_path
        self.size = tuple(size)

        self.inp_files = sorted(glob.glob(self.inp_path + '/*'))
        self.out_files = sorted(glob.glob(self.out_path + '/*'))

        # Pairing is positional: a count mismatch would silently misalign
        # every image/mask pair after the first extra file.
        if len(self.inp_files) != len(self.out_files):
            raise ValueError(
                'Found {} images in {} but {} masks in {}'.format(
                    len(self.inp_files), self.inp_path,
                    len(self.out_files), self.out_path))

    def __getitem__(self, idx):
        """Return ``(image, mask)`` as float tensors.

        The image is RGB, scaled to [0, 1] and laid out channels-first; the
        mask is divided by its own maximum and gets a leading channel axis.

        Raises:
            IOError: if either file cannot be decoded by ``cv2.imread``.
        """
        inp_img = cv2.imread(self.inp_files[idx])
        if inp_img is None:
            raise IOError('Failed to read image: {}'.format(self.inp_files[idx]))
        inp_img = cv2.cvtColor(inp_img, cv2.COLOR_BGR2RGB)
        inp_img = inp_img.astype('float32')

        mask_img = cv2.imread(self.out_files[idx], 0)  # 0 -> load as grayscale
        if mask_img is None:
            raise IOError('Failed to read mask: {}'.format(self.out_files[idx]))
        mask_img = mask_img.astype('float32')
        # Normalise by the mask's own maximum; the epsilon guards all-black masks.
        mask_img /= (np.max(mask_img) + 1e-15)

        # Resize (not pad) both image and mask to the common target size.
        inp_img = cv2.resize(inp_img, self.size)
        mask_img = cv2.resize(mask_img, self.size)
        inp_img /= 255.0
        inp_img = np.transpose(inp_img, axes=(2, 0, 1))  # HWC -> CHW
        inp_img = torch.from_numpy(inp_img).float()

        mask_img = np.expand_dims(mask_img, axis=0)  # HW -> 1HW

        return inp_img, torch.from_numpy(mask_img).float()

    def __len__(self):
        """Number of image/mask pairs."""
        return len(self.inp_files)

注意设置图像和标签的尺寸，后面要统一，如图所示：

在这里插入图片描述
train.py代码如下：

# -*- coding: utf-8 -*-
import os
import argparse
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from dataset import LDataset
from unet_model import UNet

def _str2bool(value):
    """Convert a command-line string such as 'true'/'False'/'1'/'no' to bool.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    ``'False'``) as ``True``; this converter parses the text instead.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(value))


def parse_arguments(argv=None):
    """Parse the training options.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``epochs``, ``bs``, ``lr``,
        ``wd``, ``n_worker``, ``test_interval``, ``save_interval``,
        ``save_opt``, ``log_interval``, ``res_mod``, ``res_opt``, ``use_gpu``
        and ``base_save_path``.

    Side effects:
        Sets the environment variable ``CUDA_VISIBLE_DEVICES`` to ``'0'``.
    """
    parser = argparse.ArgumentParser(description='Parameters to train your model.')
    parser.add_argument('--epochs', default=60, help='Number of epochs to train the model for', type=int)
    parser.add_argument('--bs', default=2, help='Batch size', type=int)
    parser.add_argument('--lr', default=5e-5, help='Learning Rate', type=float)
    parser.add_argument('--wd', default=0.0005, help='L2 Weight decay', type=float)
    parser.add_argument('--n_worker', default=2, help='Number of workers to use for loading data', type=int)
    parser.add_argument('--test_interval', default=2, help='Number of epochs after which to test the weights', type=int)
    parser.add_argument('--save_interval', default=10, help='Number of epochs after which to save the weights. If None, does not save', type=int)
    # Booleans go through _str2bool so that "--save_opt False" really is False.
    parser.add_argument('--save_opt', default=False, help='Whether to save optimizer along with model weights or not', type=_str2bool)
    parser.add_argument('--log_interval', default=250, help='Logging interval (in #batches)', type=int)
    parser.add_argument('--res_mod', default=None, type=str)
    parser.add_argument('--res_opt', default=None, help='Path to the optimizer to resume from', type=str)
    parser.add_argument('--use_gpu', default=True, help='Flag to use GPU or not', type=_str2bool)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    parser.add_argument('--base_save_path', default='./models', help='Base path for the models to be saved', type=str)

    return parser.parse_args(argv)
 
class Engine:
    """Training / evaluation driver for the UNet segmentation model.

    The model is trained with ``nn.BCEWithLogitsLoss``, i.e. it outputs raw
    logits. All evaluation metrics are therefore computed on
    ``sigmoid(logits)``.

    Args:
        args: namespace providing ``epochs``, ``bs``, ``lr``, ``wd``,
            ``n_worker``, ``test_interval``, ``save_interval``, ``save_opt``,
            ``log_interval``, ``res_mod``, ``res_opt``, ``use_gpu`` and
            ``base_save_path``.
    """

    def __init__(self, args):
        self.epochs = args.epochs
        self.bs = args.bs
        self.lr = args.lr
        self.wd = args.wd
        self.n_worker = args.n_worker
        self.test_interval = args.test_interval
        self.save_interval = args.save_interval
        self.save_opt = args.save_opt
        self.log_interval = args.log_interval
        self.res_mod_path = args.res_mod
        self.res_opt_path = args.res_opt
        self.use_gpu = args.use_gpu

        # 0-based epochs after which the learning rate is multiplied by 0.1.
        self.lr_decay_epoch = [8, 16, 24]

        # NOTE: despite the ".pth" suffix this path is used as a *directory*
        # that contains the "weights" and "optimizers" sub-directories.
        self.model_path = args.base_save_path + '/checkpoint_epoch{}.pth'.format(str(self.epochs))
        print('Models would be saved at : {}\n'.format(self.model_path))
        for sub_dir in ('weights', 'optimizers'):
            os.makedirs(os.path.join(self.model_path, sub_dir), exist_ok=True)

        # Honour --use_gpu; fall back to CPU when CUDA is unavailable.
        if self.use_gpu and torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

        self.model = UNet(n_channels=3, n_classes=1, bilinear=False)
        self.model.to(self.device)
        self.criterion = nn.BCEWithLogitsLoss()

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=self.wd)
        self.print_network(self.model, 'Net Structure')

        # Load model and optimizer if resumed
        if self.res_mod_path is not None:
            chkpt = torch.load(self.res_mod_path, map_location=self.device)
            self.model.load_state_dict(chkpt['model'])
            print("Resuming training with checkpoint : {}\n".format(self.res_mod_path))
        if self.res_opt_path is not None:
            chkpt = torch.load(self.res_opt_path, map_location=self.device)
            self.optimizer.load_state_dict(chkpt['optimizer'])
            print("Resuming training with optimizer : {}\n".format(self.res_opt_path))

        data = LDataset()
        # 80 % / 20 % train/test split (240 / 60 for a 300-sample dataset),
        # derived from the dataset size instead of being hard-coded.
        n_test = len(data) // 5
        n_train = len(data) - n_test
        self.train_data, self.test_data = torch.utils.data.random_split(data, (n_train, n_test))

        self.train_dataloader = DataLoader(self.train_data, batch_size=self.bs, shuffle=True, num_workers=self.n_worker)
        self.test_dataloader = DataLoader(self.test_data, batch_size=self.bs, shuffle=False, num_workers=self.n_worker)

    def print_network(self, model, name):
        """Print ``name``, the model's repr and its parameter counts."""
        total_num = sum(p.numel() for p in model.parameters())
        trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(name)
        print(model)
        print("The number of parameters: {}".format(total_num))
        print("The number of total parameters: {}".format(total_num))
        print("The number of trainable parameters: {}".format(trainable_num))

    def _save_checkpoint(self, prefix, epoch, mod_chkpt, opt_chkpt, mae, loss):
        """Write the model (and, if given, optimizer) checkpoint files.

        ``prefix`` is ``'best-'`` for best-so-far checkpoints and ``''`` for
        regular interval checkpoints.
        """
        suffix = 'epoch-{:03}_mae-{:.4f}_loss-{:.4f}.pth'.format(epoch, mae, loss)
        torch.save(mod_chkpt, self.model_path + '/weights/' + prefix + 'model_' + suffix)
        if opt_chkpt is not None:
            torch.save(opt_chkpt, self.model_path + '/optimizers/' + prefix + 'opt_' + suffix)

    def train(self):
        """Run the training loop with periodic validation and checkpointing."""
        best_test_mae = float('inf')
        for epoch in range(self.epochs):
            self.model.train()
            for batch_idx, (inp_imgs, gt_masks) in enumerate(self.train_dataloader):
                inp_imgs = inp_imgs.to(self.device)
                gt_masks = gt_masks.to(self.device)

                self.optimizer.zero_grad()
                pred_masks = self.model(inp_imgs)
                loss = self.criterion(pred_masks, gt_masks)

                loss.backward()
                self.optimizer.step()

                if batch_idx % self.log_interval == 0:
                    print('TRAIN :: Epoch : {}\tBatch : {}/{} ({:.2f}%)\t\tTot Loss : {:.4f}'
                          .format(epoch + 1,
                                  batch_idx + 1, len(self.train_dataloader),
                                  (batch_idx + 1) * 100 / len(self.train_dataloader),
                                  loss.item()))

            # Learning-rate decay: update the existing optimizer in place so
            # Adam's running moments (and any resumed optimizer state) are kept.
            if epoch in self.lr_decay_epoch:
                self.lr = self.lr * 0.1
                for group in self.optimizer.param_groups:
                    group['lr'] = self.lr

            # save_interval may be None ("never save at regular intervals").
            is_save_epoch = self.save_interval is not None and epoch % self.save_interval == 0

            # Validation
            if epoch % self.test_interval == 0 or is_save_epoch:
                te_avg_loss, te_acc, te_pre, te_rec, te_mae = self.test()
                metrics = {'epoch': epoch,
                           'test_mae': float(te_mae),
                           'test_loss': float(te_avg_loss),
                           'test_acc': float(te_acc),
                           'test_pre': float(te_pre),
                           'test_rec': float(te_rec)}
                mod_chkpt = dict(metrics, model=self.model.state_dict())
                opt_chkpt = None
                if self.save_opt:
                    opt_chkpt = dict(metrics, optimizer=self.optimizer.state_dict())

                # Save the best model
                if te_mae <= best_test_mae:
                    best_test_mae = te_mae
                    self._save_checkpoint('best-', epoch, mod_chkpt, opt_chkpt, te_mae, te_avg_loss)
                    print('Best Model Saved !!!\n')
                    continue

                # Save model at regular intervals
                if is_save_epoch:
                    self._save_checkpoint('', epoch, mod_chkpt, opt_chkpt, te_mae, te_avg_loss)
                    print('Model Saved !!!\n')
                    continue
            print('\n')

    def test(self):
        """Evaluate the model on ``self.test_dataloader``.

        Returns:
            Tuple ``(avg_loss, accuracy, precision, recall, mae)`` of floats.
            Accuracy, precision, recall and MAE are computed on
            ``sigmoid(logits)``; the loss is computed on the raw logits.

        Raises:
            ValueError: if the test dataloader yields no batches.
        """
        self.model.eval()
        tot_loss = 0.0
        num_batches = 0
        num_pixels = 0      # Number of mask elements seen, for accuracy
        correct = 0.0       # TruePositive + TrueNegative, for accuracy
        tp = 0.0            # TruePositive
        pred_true = 0.0     # Number of '1' predictions, for precision
        gt_true = 0.0       # Number of '1's in gt mask, for recall
        mae_list = []       # Mean absolute error of each image
        number = 256        # Number of thresholds on the precision/recall curve
        thresholds = np.linspace(0, 1, number, endpoint=False)
        mean_pr = torch.zeros(number)
        mean_re = torch.zeros(number)

        with torch.no_grad():
            for inp_imgs, gt_masks in self.test_dataloader:
                inp_imgs = inp_imgs.to(self.device)
                gt_masks = gt_masks.to(self.device)

                logits = self.model(inp_imgs)
                tot_loss += self.criterion(logits, gt_masks).item()
                num_batches += 1

                # The network outputs logits; metrics need probabilities.
                probs = torch.sigmoid(logits)
                hard_pred = probs.round()

                # Masks can hold intermediate values after resizing, so they
                # are rounded before the exact-equality comparison.
                correct += (hard_pred == gt_masks.round()).float().sum().item()
                num_pixels += gt_masks.numel()
                tp += torch.mul(hard_pred, gt_masks).sum().item()
                pred_true += hard_pred.sum().item()
                gt_sum = gt_masks.sum()
                gt_true += gt_sum.item()

                ae = torch.mean(torch.abs(probs - gt_masks), dim=(1, 2, 3)).cpu().numpy()
                mae_list.extend(ae)

                # Precision / recall at every threshold, for the F-score.
                precision_curve = torch.zeros(number)
                recall_curve = torch.zeros(number)
                for i in range(number):
                    temp = (probs >= float(thresholds[i])).float()
                    hits = (temp * gt_masks).sum()
                    precision_curve[i] = hits / (temp.sum() + 1e-12)
                    recall_curve[i] = hits / (gt_sum + 1e-12)
                mean_pr += precision_curve
                mean_re += recall_curve

        if num_batches == 0:
            raise ValueError('test dataloader yielded no batches')

        avg_loss = tot_loss / num_batches
        accuracy = correct / num_pixels
        precision = tp / (pred_true + 1e-12)
        recall = tp / (gt_true + 1e-12)
        mae = float(np.mean(mae_list))
        # mean_pr / mean_re are sums over batches, hence the division below.
        fscore = mean_pr * mean_re * (1 + 0.3) / (0.3 * mean_pr + mean_re + 1e-12)
        maxf = fscore.max().item() / num_batches
        print('TEST :: MAE : {:.4f}\tMAXF : {:.4f}\tACC : {:.4f}\tPRE : {:.4f}\tREC : {:.4f}\tAVG-LOSS : {:.4f}\n'.format(mae,
                                                                                             maxf,
                                                                                             accuracy,
                                                                                             precision,
                                                                                             recall,
                                                                                             avg_loss))

        return avg_loss, accuracy, precision, recall, mae
 
if __name__ == '__main__':
    # Parse the command-line options, build the driver and start training.
    cli_args = parse_arguments()
    engine = Engine(cli_args)
    engine.train()
    # engine.test()

由于TensorRT里面还没有实现bilinear双线性插值上采样操作，所以选择使用deconv（转置卷积）作为上采样的UNet网络结构。因此如图所示，注意在创建UNet模型时不能使用双线性插值的方式进行上采样，需设置bilinear=False

在这里插入图片描述
模型训练结束后，开始将pth文件转为onnx文件，代码参考大佬 ubuntu18一文学会Pytorch端到端网络部署Tensorrt模型推理

from unet_model import UNet # 
import torch
import onnx
 
# gloabl variable
model_path = "./models/best-model_epoch-000_mae-0.8259_loss-0.5747.pth"
 
if __name__ == "__main__":
    # Dummy input that fixes the exported input shape: (batch, channels, height, width).
    dummy_input = torch.randn(1, 3, 320, 480, device="cuda")
    # [1] create network
    model = UNet(n_channels=3, n_classes=1, bilinear=False)
    model = model.cuda()
    print("create U-Net model finised ...")
    # [2] load weight
    device = torch.device('cuda')
    chkpt = torch.load(model_path, map_location=device)
    model.load_state_dict(chkpt['model'])
    # Export in inference mode so BatchNorm uses its running statistics.
    model.eval()

    print("load weight to model finised ...")

    # [3] convert torch format to onnx
    input_names = ["input"]
    output_names = ["output"]
    torch.onnx.export(model,
        dummy_input,
        "unet_deconv.onnx",
        verbose=True,
        input_names=input_names,
        output_names=output_names)
    print("convert torch format model to onnx ...")
    # [4] confirm the onnx file
    net = onnx.load("unet_deconv.onnx")
    # check that the IR is well formed
    onnx.checker.check_model(net)
    # printable_graph only returns the text, so it has to be printed explicitly
    print(onnx.helper.printable_graph(net.graph))

注意输入shape以及UNet设置与上述设置相同，直接运行即可，如图所示

在这里插入图片描述
至此，在服务器端的操作已经完成，然后将所有程序以及训练的模型和onnx文件拷贝到Jetson TX2 NX

3 Jetson TX2 NX实现图像分割

首先安装一些包，输入命令

pip3 install pycuda==2019.1.2 --user

如图所示，该版本可以安装成功

在这里插入图片描述
pycuda包安装成功，如图所示，继续安装onnx包，输入命令

pip3 install onnx

最终无法安装成功：

在这里插入图片描述
报错如图所示：

在这里插入图片描述
解决上述错误，首先安装protobuf，输入命令

pip3 install protobuf

然后就安装protobuf-compiler和libprotoc-dev，输入命令

sudo apt-get install protobuf-compiler libprotoc-dev

如图所示：

在这里插入图片描述
然后输入onnx安装命令，注意版本为1.9.0

pip3 install onnx==1.9.0

成功安装，如图所示：

在这里插入图片描述
可以尝试在Jetson TX2 NX端进行pth文件转onnx文件，可能会被killed，无法成功，这里可以尝试增加Swap分区，大家自行实验

在这里插入图片描述
将在服务器端转换完成的onnx文件拷贝到Jetson TX2 NX，如图所示

在这里插入图片描述
开始进行onnx文件转trt文件，输入命令

onnx2trt unet_deconv.onnx -o unet_deconv.trt

这里注意onnx文件位置，以及trt文件需要保存的位置，但是转换出错了，如图所示

在这里插入图片描述
参考大佬文章: pytorch模型转tensorrt，pad报错，opset11。我使用了比较简单的方法，即注释掉该部分代码（unet_parts.py中Up.forward里计算diffY、diffX并调用F.pad的部分），在以后部署其他深度模型时，可能需要考虑在pytorch中不使用pad操作

在这里插入图片描述
于是重复所有步骤，重新在服务器端训练模型，然后将pth文件转为onnx文件，并拷贝到Jetson TX2 NX，如图所示：

在这里插入图片描述
然后继续进行onnx文件转trt文件，输入命令

onnx2trt unet_deconv.onnx -o unet_deconv.trt

如图所示

在这里插入图片描述
出现问题，进程被killed

在这里插入图片描述
我直接关机重启Jetson TX2 NX，然后再次输入命令，成功，大家可以尝试增加Swap分区

在这里插入图片描述
接下来开始测试，inference.py代码如下：

import os
import sys
import time
# from PIL import Image
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
# TensorRT logger singleton
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
 
def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: TensorRT engine; it is iterated for its bindings and queried
            for each binding's shape, dtype and input/output role.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: ``inputs`` and
        ``outputs`` are lists of host/device buffer pairs, ``bindings`` is the
        list of device buffer addresses as ints (in binding order) and
        ``stream`` is a new CUDA stream.
    """

    class HostDeviceMem(object):
        """A host buffer together with its device buffer."""

        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    input_bufs, output_bufs, device_ptrs = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        # Element count of one binding, scaled by the engine's max batch size.
        n_elems = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Page-locked host buffer plus a device buffer of the same byte size.
        host_buf = cuda.pagelocked_empty(n_elems, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        device_ptrs.append(int(device_buf))
        pair = HostDeviceMem(host_buf, device_buf)
        target = input_bufs if engine.binding_is_input(binding) else output_bufs
        target.append(pair)

    return input_bufs, output_bufs, device_ptrs, stream

def load_engine(trt_path):
    """Deserialize the TensorRT engine stored in the file ``trt_path``.

    Returns whatever ``runtime.deserialize_cuda_engine`` returns for the
    file's bytes.
    """
    with open(trt_path, "rb") as engine_file:
        with trt.Runtime(TRT_LOGGER) as runtime:
            serialized_engine = engine_file.read()
            return runtime.deserialize_cuda_engine(serialized_engine)


class TRTInference(object):
    """Manages TensorRT objects for model inference."""

    def __init__(self, trt_engine_path, onnx_model_path, trt_engine_datatype=trt.DataType.FLOAT, batch_size=1):
        """Load a serialized TensorRT engine and prepare it for inference.

        Args:
            trt_engine_path (str): path of the serialized TensorRT engine
                file to load
            onnx_model_path (str): accepted for interface compatibility;
                not used by this class
            trt_engine_datatype (trt.DataType): only echoed in the settings
                printout
            batch_size (int): only echoed in the settings printout
        """
        # Display requested engine settings to stdout
        print("TensorRT inference engine settings:")
        print("  * Inference precision - {}".format(trt_engine_datatype))
        print("  * Max batch size - {}\n".format(batch_size))

        # The engine is always loaded from the given file.
        print("Loading cached TensorRT engine from {}".format(
            trt_engine_path))
        self.trt_engine = load_engine(trt_engine_path)

        # Host/device buffers for every network input and output
        buffers = allocate_buffers(self.trt_engine)
        self.inputs, self.outputs, self.bindings, self.stream = buffers

        # Execution context is needed for inference
        self.context = self.trt_engine.create_execution_context()

    def infer(self, full_img, output_shapes, new_width, new_height):
        """Run the engine on one image.

        Args:
            full_img: image array in height-width-channel layout; it is
                resized to ``(new_width, new_height)``, converted to
                channel-first layout and divided by 255 when its maximum
                exceeds 1
            output_shapes: one target shape per engine output, used to
                reshape the flat output buffers
            new_width (int): network input width
            new_height (int): network input height

        Returns:
            list of the engine outputs reshaped to ``output_shapes``
        """
        assert new_width > 0 and new_height > 0, "Scale is too small"

        resized = cv2.resize(full_img, (new_width, new_height))
        print("scale image shape:{}".format(resized.shape))

        # HWC to CHW
        chw = resized.transpose((2, 0, 1))
        # Scale to [0, 1] when the values look like 0..255 pixel intensities
        if chw.max() > 1:
            chw = chw / 255
        # Contiguous float32 copy, flattened into the input host buffer
        chw = np.array(chw, dtype=np.float32, order='C')
        np.copyto(self.inputs[0].host, chw.ravel())

        inference_start_time = time.time()
        trt_outputs = do_inference(
            self.context, bindings=self.bindings, inputs=self.inputs,
            outputs=self.outputs, stream=self.stream)
        print("network output shape:{}".format(trt_outputs[0].shape))
        elapsed_ms = int(round((time.time() - inference_start_time) * 1000))
        print("TensorRT inference time: {} ms".format(elapsed_ms))

        # do_inference returns flat arrays; restore the requested shapes.
        reshaped = []
        for flat, shape in zip(trt_outputs, output_shapes):
            reshaped.append(flat.reshape(shape))
        return reshaped
 
 
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Copy inputs to the GPU, run the engine and copy the outputs back.

    Args:
        context: execution context providing ``execute_async``.
        bindings: list of device buffer addresses passed to the engine.
        inputs: buffer pairs with ``host`` and ``device`` attributes.
        outputs: buffer pairs with ``host`` and ``device`` attributes.
        stream: CUDA stream on which all copies and the execution are queued.
        batch_size: batch size passed to ``execute_async``.

    Returns:
        List with the ``host`` buffer of every entry in ``outputs``.
    """
    # Host -> device copies for every input.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Device -> host copies for every output.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait until everything queued on the stream has finished.
    stream.synchronize()
    host_outputs = []
    for out in outputs:
        host_outputs.append(out.host)
    return host_outputs

predict.py代码如下：

import tensorrt as trt
import numpy as np
import cv2
import inference as inference_utils  # TRT/TF inference wrappers
 
if __name__ == "__main__":
    # 1. Engine setup
    # Precision command line argument -> TRT Engine datatype
    TRT_PRECISION_TO_DATATYPE = {
        16: trt.DataType.HALF,
        32: trt.DataType.FLOAT
    }
    # NOTE(review): key 16 selects HALF although the original comment said
    # "float 32". Confirm which precision the engine file was actually built
    # with and pick the matching key.
    trt_engine_datatype = TRT_PRECISION_TO_DATATYPE[16]
    # batch size = 1
    max_batch_size = 1
    # NOTE(review): make sure these names match the files actually produced
    # by the onnx -> trt conversion step.
    engine_file_path = "unet_deconv_sim.trt"
    onnx_file_path = "unet_deconv_sim.onnx"
    new_width, new_height = 480, 320
    output_shapes = [(1, new_height, new_width)]
    trt_inference_wrapper = inference_utils.TRTInference(
        engine_file_path, onnx_file_path,
        trt_engine_datatype, max_batch_size,
    )

    # 2. Image preprocessing
    image_path = "example.jpg"
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(image_path))
    # cv2.imread returns BGR; convert to RGB before inference.
    # NOTE(review): assumes the engine was trained on RGB input — confirm
    # against the training pipeline.
    rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # 3. Inference
    trt_outputs = trt_inference_wrapper.infer(rgb_img, output_shapes, new_width, new_height)[0]

    # 4. Post-processing
    out_threshold = 0.5
    print("the size of tensorrt output : {}".format(trt_outputs.shape))
    output = trt_outputs.transpose((1, 2, 0))
    # Threshold in logit space: sigmoid(x) > t  <=>  x > log(t / (1 - t)).
    # NOTE(review): assumes the engine outputs raw logits (no sigmoid in the
    # exported network) — confirm against the exported model.
    logit_threshold = np.log(out_threshold / (1.0 - out_threshold))
    # Binary mask with pixel values 0 / 255
    mask = np.where(output > logit_threshold, 255, 0).astype(np.uint8)

    result = cv2.resize(mask, (img.shape[1], img.shape[0]))
    cv2.imwrite("best_output_deconv.jpg", result)