0. 源码
Yolact_minimal
1. 导出推理文件
使用如下导出方式比官方的快一倍左右
python -m onnxsim ./onnx_files/res50_coco.onnx ./onnx_files/res50_coco_sim.onnx # 进行精简
trtexec --onnx=onnx_files/res50_coco_sim.onnx --workspace=10240 --int8 --saveEngine=trt_files/res50_coco.engine
2. webcam预测
推理脚本 detect_with_trt.py 如下(已添加参数解释):
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import argparse
import cv2
import time
import math
import time
import torch
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from config import get_config
from utils.box_utils import make_anchors
from utils.augmentations import val_aug
from utils.output_utils import nms, after_nms, draw_img
# Command-line options for the TensorRT webcam demo.
parser = argparse.ArgumentParser(description='YOLACT Detection with TensorRT.')
# NOTE(review): this default names a res101 engine while the config below is pinned to
# 'res50_coco' -- confirm which one is intended; pass --weight explicitly to be safe.
parser.add_argument('--weight', type=str, default='trt_files/res101_coco.trt')
parser.add_argument('--img_size', type=int, default=544, help='The image size for validation.')

# Boolean switches, registered in their original order: (flag, default, help text).
# '--real_time' defaults to True with action='store_true', so it is always on.
_SWITCHES = (
    ('--traditional_nms', False, 'Whether to use traditional nms.'),
    ('--hide_mask', False, 'Hide masks in results.'),
    ('--hide_bbox', False, 'Hide boxes in results.'),
    ('--hide_score', False, 'Hide scores in results.'),
    ('--cutout', False, 'Cut out each object and save.'),
    ('--save_lincomb', False, 'Show the generating process of masks.'),
    ('--no_crop', False, 'Do not crop the output masks with the predicted bounding box.'),
    ('--real_time', True, 'Show the detection results real-timely.'),
)
for _flag, _default, _help in _SWITCHES:
    parser.add_argument(_flag, default=_default, action='store_true', help=_help)

parser.add_argument(
    '--visual_thre',
    type=float,
    default=0.5,
    help='Detections with a score under this threshold will be removed.',
)

args = parser.parse_args()
# The model configuration is fixed here rather than read from the command line.
args.cfg = 'res50_coco'
cfg = get_config(args, mode='detect')
# Small record pairing a host buffer with its device buffer; nicer than a bare 2-tuple.
class HostDeviceMem:
    """Hold a host-side buffer and the matching device-side buffer together."""

    def __init__(self, host_mem, device_mem):
        # Both values are stored as given; no copying or validation happens here.
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        """Return a two-section dump: the host buffer, then the device buffer."""
        return f"Host:\n{self.host!s}\nDevice:\n{self.device!s}"

    def __repr__(self):
        """Use the same text as ``str()`` so containers print readably."""
        return str(self)
# Build the anchor list: one feature-map side length per stride, computed as
# ceil(img_size / stride), each paired with the scale entry at the same index.
fpn_fm_shape = [math.ceil(cfg.img_size / s) for s in (8, 16, 32, 64, 128)]
anchors = []
for level, fm_size in enumerate(fpn_fm_shape):
    # Feature maps are treated as square, so the same size is passed twice.
    anchors.extend(make_anchors(cfg, fm_size, fm_size, cfg.scales[level]))
# Load the serialized TensorRT engine from the path in cfg.weight.
with open(cfg.weight, 'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# One host/device buffer pair per engine binding. `bindings` collects the device
# addresses in iteration order; `inputs` / `outputs` keep the pairs for the copies.
inputs, outputs, bindings = [], [], []
stream = cuda.Stream()
for name in engine:
    # Element count of the binding, scaled by the engine's max batch size.
    n_elems = trt.volume(engine.get_binding_shape(name)) * engine.max_batch_size
    np_dtype = trt.nptype(engine.get_binding_dtype(name))
    # Page-locked host buffer plus a device allocation of the same byte size.
    host_buf = cuda.pagelocked_empty(n_elems, np_dtype)
    dev_buf = cuda.mem_alloc(host_buf.nbytes)
    bindings.append(int(dev_buf))
    # File the pair under inputs or outputs depending on the binding direction.
    target = inputs if engine.binding_is_input(name) else outputs
    target.append(HostDeviceMem(host_buf, dev_buf))
# ------------------------------------------------------------------------------------------------------------
# Inference itself runs on the GPU, so avoid any other CUDA work in this process (for example CUDA
# operations in PyTorch); otherwise unexpected errors may occur. Post-processing below uses CPU tensors.
# ------------------------------------------------------------------------------------------------------------
# `anchors` was already built right after the config was loaded, and nothing since then
# modifies `cfg` or `anchors`, so it is not recomputed here.


def _infer(context, batch):
    """Run one inference pass on `batch` and return the host output buffers.

    The data is copied into the existing page-locked input buffer instead of
    rebinding `inputs[0].host`; this keeps the pinned allocation in use and makes
    a size mismatch (e.g. wrong --img_size for the engine) fail loudly with a
    ValueError rather than copying an unexpected number of bytes.
    """
    np.copyto(inputs[0].host, np.ravel(batch))
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait until all queued copies and the inference have finished.
    stream.synchronize()
    return [out.host for out in outputs]


cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Cannot open webcam (device 0).")

max_fps = 0
proto_size = int(cfg.img_size / 4)  # side length used to reshape the prototype output
try:
    # Create the execution context once; creating it per frame is needless overhead.
    with engine.create_execution_context() as context:
        while True:
            ok, img_origin = cap.read()
            if not ok:
                # Camera unplugged or stream ended: stop instead of crashing on a None frame.
                print("\nFailed to read a frame from the webcam; stopping.")
                break

            img_origin = cv2.flip(img_origin, 1)  # mirror the image horizontally
            img_h, img_w = img_origin.shape[0:2]
            img = val_aug(img_origin, cfg.img_size)
            img = img[np.newaxis, :]  # add the batch dimension

            # Timing covers inference and post-processing, not capture or drawing.
            start = time.perf_counter()
            results = _infer(context, img)

            # The output order below depends on the engine's binding order.
            class_p = torch.from_numpy(results[3].reshape(1, -1, cfg.num_classes))
            box_p = torch.from_numpy(results[1].reshape(1, -1, 4))
            proto_p = torch.from_numpy(results[0].reshape(1, proto_size, proto_size, 32))
            coef_p = torch.from_numpy(results[2].reshape(1, -1, 32))

            ids_p, class_p, box_p, coef_p, proto_p = nms(class_p, box_p, coef_p, proto_p, anchors, cfg)
            ids_p, class_p, boxes_p, masks_p = after_nms(
                ids_p, class_p, box_p, coef_p, proto_p, img_h, img_w, cfg, img_name="img_name"
            )
            elapsed = time.perf_counter() - start
            # Guard against a zero-length interval on coarse clocks.
            fps = 1 / max(elapsed, 1e-9)

            img_numpy = draw_img(ids_p, class_p, boxes_p, masks_p, img_origin, cfg, img_name="img_name", fps=fps)
            cv2.imshow("test", img_numpy)

            max_fps = max(max_fps, fps)
            print(f"\rMax FPS: {int(max_fps)}, FPS: {int(fps)}", end="")

            # Press 'q' or Esc in the preview window to quit cleanly.
            if cv2.waitKey(1) & 0xFF in (ord('q'), 27):
                print()
                break
finally:
    # Always release the camera and close the preview window, even on Ctrl-C or errors.
    cap.release()
    cv2.destroyAllWindows()
3. 运行
python detect_with_trt.py --weight='trt_files/res50_coco.engine'