0. Background
This article describes how to deploy, on TensorRT, an engine file exported from the Transfer Learning Toolkit (TLT). The main contents are:
1. Installing TensorRT OSS
2. Writing the network image preprocessing
3. Determining the outputs and writing YOLOv3's postprocessing
4. Building the YOLOv3 engine loader class
1. Installing TensorRT OSS
On the Jetson platform with JetPack 4.4, you need to build and install TensorRT OSS (the TensorRT Open Source Software). This is required because several TensorRT plugins that these models need are only available in the TensorRT OSS repo and are not shipped with the stock TensorRT release.
For detailed installation steps, refer to section 6.2 (using tlt-converter to obtain the engine file) of https://blog.csdn.net/hello_dear_you/article/details/111224823.
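After building and installing the OSS plugins, you can check that TensorRT actually sees them. A minimal sketch, assuming the TensorRT Python bindings are installed (BatchedNMS_TRT is the plugin that YOLOv3's NMS postprocessing relies on here):

import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(logger, '')

# List the registered plugin creators; after a successful OSS install,
# BatchedNMS_TRT should appear in this list.
registry = trt.get_plugin_registry()
print([creator.name for creator in registry.plugin_creator_list])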
2. Writing the Network Image Preprocessing
Based on analysis of the network, YOLOv3's preprocessing is as follows:
import cv2
import numpy as np

def _preprocess_yolov3(img, shape):  # shape: (W, H)
    """Preprocess an image before TRT YOLOv3 inferencing."""
    img = cv2.resize(img, shape)
    # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.transpose((2, 0, 1)).astype(np.float32)
    # img /= 255.0
    return img
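As a quick sanity check of the data layout: the function takes an HWC uint8 image and returns a CHW float32 array. The dummy image below is only for illustration:

dummy = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)  # H, W, C
out = _preprocess_yolov3(dummy, (416, 416))
print(out.shape, out.dtype)  # -> (3, 416, 416) float32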
3. Network Output Postprocessing
The tlt-export tool converts the tlt-format weight file into an etlt-format file, and during this conversion YOLOv3's postprocessing is configured according to parameters in the spec file. Note that, for YOLOv3, the etlt file and the engine file later generated from it produce four outputs in total:
- topK (keep count): how many predicted-box entries are kept in the end (100 or 200)
- the predicted box information: shape = (topK, 4)
- the predicted score information: shape = (topK,)
- the predicted class indices: shape = (topK,)
Note that the box values are given relative to the network input size, so each coordinate must be divided by the input width or height and then multiplied by the original image size. For example, with a 416x416 input and a 1920x1080 original image, x = 208 maps to 208 / 416 * 1920 = 960.
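To confirm the binding layout of your own engine, you can print its bindings. A quick sketch, assuming engine is an already-deserialized ICudaEngine (as built in section 4 below); the exact binding names depend on the spec file used at export time:

# Print every binding's index, name, shape and direction.
for i in range(engine.num_bindings):
    kind = 'input' if engine.binding_is_input(i) else 'output'
    print(i, engine.get_binding_name(i), engine.get_binding_shape(i), kind)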
def _postprocess_trt(img, output, conf_th, input_shape):
    """Postprocess TRT YOLOv3 output (keep_count, boxes, scores, classes)."""
    orig_W, orig_H = img.shape[1], img.shape[0]
    p_keep_count = int(output[0][0])      # number of valid detections
    p_bboxes = output[1].reshape(-1, 4)   # (topK, 4), relative to input_shape
    p_scores = output[2]
    p_classes = output[3]
    clip_fn = lambda a, min_value, max_value: max(min(a, max_value), min_value)
    boxes, confs, clss = [], [], []
    for idx in range(p_keep_count):
        conf = float(p_scores[idx])
        if conf < conf_th:
            continue
        # Skip degenerate boxes (x2 < x1 or y2 < y1).
        if p_bboxes[idx][2] < p_bboxes[idx][0] or p_bboxes[idx][3] < p_bboxes[idx][1]:
            continue
        # Scale from network input coordinates back to the original image.
        x1 = int(clip_fn(p_bboxes[idx][0] / input_shape[0] * orig_W, 0, orig_W - 1))
        y1 = int(clip_fn(p_bboxes[idx][1] / input_shape[1] * orig_H, 0, orig_H - 1))
        x2 = int(clip_fn(p_bboxes[idx][2] / input_shape[0] * orig_W, 0, orig_W - 1))
        y2 = int(clip_fn(p_bboxes[idx][3] / input_shape[1] * orig_H, 0, orig_H - 1))
        boxes.append((x1, y1, x2, y2))
        confs.append(conf)
        clss.append(int(p_classes[idx]))
    return boxes, confs, clss
4. Building the YOLOv3 Engine Loader Class
import pycuda.autoinit  # creates a CUDA context on import (assumed here; any valid context works)
import pycuda.driver as cuda
import tensorrt as trt

class HostDeviceMem(object):
    """Simple helper data class that's a little nicer to use than a 2-tuple."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine):
    """Allocate all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # Volume of one sample; the "* 1" is the (implicit) batch size.
        size = trt.volume(engine.get_binding_shape(binding)) * 1
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host (page-locked) and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to the bindings list.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """do_inference (for TensorRT 6.x or lower).

    This function is generalized for multiple inputs/outputs.
    Inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
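The docstring above notes that this path targets TensorRT 6.x or lower; it also still works for implicit-batch engines on the TensorRT 7.x that ships with JetPack 4.4, though it is deprecated there. If you instead work with an explicit-batch engine on TensorRT 7.x+, a v2 variant along these lines would be needed (a sketch added here, not part of the original code):

def do_inference_v2(context, bindings, inputs, outputs, stream):
    """do_inference_v2 (for explicit-batch engines on TensorRT 7.x+)."""
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference; there is no batch_size argument, since the batch
    # dimension is part of the binding shapes themselves.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]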
# --------------------------- Yolov3 tensorrt ------------------------------
class TrtYOLOv3(object):
    """TrtYOLOv3 class encapsulates things needed to run TRT YOLOv3."""

    def _load_plugins(self):
        trt.init_libnvinfer_plugins(self.trt_logger, '')

    def _load_engine(self, engine_file):
        # Deserialize the YOLOv3 engine file from disk.
        with open(engine_file, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _create_context(self):
        return self.engine.create_execution_context()

    def __init__(self, engine_file, input_shape=(416, 416)):
        """Initialize TensorRT plugins, engine and context."""
        self.input_shape = input_shape
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        self._load_plugins()
        # 1. load engine
        self.engine = self._load_engine(engine_file)
        # 2. create context
        self.context = self._create_context()
        # 3. allocate buffers
        self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.engine)
        # 4. inference function
        self.inference_fn = do_inference

    def __del__(self):
        """Free CUDA memories."""
        del self.stream
        del self.outputs
        del self.inputs

    def detect(self, img, conf_th=0.6):
        """Detect objects in the input image."""
        # preprocess
        img_resized = _preprocess_yolov3(img, self.input_shape)
        # Set host input to the image. The do_inference() function
        # will copy the input to the GPU before executing.
        np.copyto(self.inputs[0].host, img_resized.ravel())
        trt_outputs = self.inference_fn(
            context=self.context,
            bindings=self.bindings,
            inputs=self.inputs,
            outputs=self.outputs,
            stream=self.stream)
        return _postprocess_trt(img, trt_outputs, conf_th, self.input_shape)
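Putting it all together, a minimal usage sketch; 'yolov3.engine' and 'test.jpg' below are placeholder paths for your own engine and test image:

trt_yolov3 = TrtYOLOv3('yolov3.engine', input_shape=(416, 416))
img = cv2.imread('test.jpg')
boxes, confs, clss = trt_yolov3.detect(img, conf_th=0.6)
for (x1, y1, x2, y2), conf, cls in zip(boxes, confs, clss):
    # Draw each kept detection on the original image.
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
    print('class {} at ({}, {}, {}, {}), conf {:.2f}'.format(cls, x1, y1, x2, y2, conf))
cv2.imwrite('result.jpg', img)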
The complete code is available in this share: https://pan.baidu.com/s/1qjBG5RQwv4l0Fw_D-zKamg (extraction code: 19ob)