Batch inference using the TensorRT Python API. (Note: most of the code introduced here is based on the samples provided by NVIDIA, with some personal modifications.)
Batch input
There are several ways to batch inputs. You could use a DataLoader from the Torch package, but here batching is implemented from scratch with a generator.
import os
import glob

import numpy as np
from PIL import Image

BATCH_SIZE = 5
INPUT_SHAPE_W_BS = (BATCH_SIZE, 3, 640, 640)
ALLOWED_EXTENSIONS = (".jpeg", ".jpg", ".png")

# img_path: directory containing the source images
src_files = [
    path for path in glob.iglob(os.path.join(img_path, "**"), recursive=True)
    if os.path.isfile(path) and path.lower().endswith(ALLOWED_EXTENSIONS)
]
if len(src_files) == 0:
    raise Exception(
        "ERROR: src data path [{}] contains no files!".format(img_path)
    )

# Pad the file list so its length becomes a multiple of the batch size
if len(src_files) % BATCH_SIZE != 0:
    src_files += src_files[len(src_files) % BATCH_SIZE : BATCH_SIZE]

# Initialize the batch buffer
init_batch = np.zeros(INPUT_SHAPE_W_BS, dtype=np.float32)

# Generator that fills the batch buffer with preprocessed images
def load_batches(batch, src_files, preprocessing_func):
    for i in range(0, len(src_files), BATCH_SIZE):
        for offset in range(BATCH_SIZE):
            img = Image.open(src_files[i + offset])
            batch[offset] = preprocessing_func(img)
        yield batch

# Fetch the next batch from the generator; returns None when exhausted
def get_batch(batches):
    try:
        batch = next(batches)
        return batch
    except StopIteration:
        return None
Thus, every call to get_batch produces a batched input of shape (BATCH_SIZE, 3, 640, 640).
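As a quick sanity check, the generator can be driven directly; a minimal sketch, reusing src_files and init_batch from above together with the default_preprocessing function shown in the next section:

# Minimal sanity check of the batch generator
batches = load_batches(init_batch, src_files, default_preprocessing)
first_batch = get_batch(batches)
print(first_batch.shape)  # expected: (5, 3, 640, 640), i.e. (BATCH_SIZE, 3, 640, 640)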
Preprocessing
Preprocessing here means the resizing, normalization, and letterboxing that were applied during training.
If you look at the load_batches function, you can see that each image is preprocessed before being placed into the batch. The preprocessing function used here is as follows.
def default_preprocessing(img):
    scr_w, scr_h = img.size
    inp_w, inp_h = INPUT_SHAPE_W_BS[2], INPUT_SHAPE_W_BS[3]
    scale_ratio = min(inp_w / scr_w, inp_h / scr_h)
    nw = int(scr_w * scale_ratio)
    nh = int(scr_h * scale_ratio)
    image = img.resize((nw, nh), Image.BICUBIC).copy()
    new_image = Image.new("RGB", (inp_w, inp_h))
    new_image.paste(image, ((inp_w - nw) // 2, (inp_h - nh) // 2))
    scaled_image = np.asarray(new_image, dtype=np.float32) * 0.0039215697906911373
    whc2cwh = np.swapaxes(scaled_image, 2, 0)
    cwh2chw = np.swapaxes(whc2cwh, 2, 1)
    return cwh2chw
Letterboxing itself is not explained in detail here. For the letterbox preprocessing with step-by-step comments, see the following:
def letterbox_preprocessing(img):
    scr_w, scr_h = img.size
    inp_w, inp_h = INPUT_SHAPE_W_BS[2], INPUT_SHAPE_W_BS[3]
    # Letterboxing: scale while keeping the aspect ratio, then pad to the network input size
    scale_ratio = min(inp_w / scr_w, inp_h / scr_h)
    nw = int(scr_w * scale_ratio)
    nh = int(scr_h * scale_ratio)
    image = img.resize((nw, nh), Image.BICUBIC).copy()
    new_image = Image.new("RGB", (inp_w, inp_h))
    new_image.paste(image, ((inp_w - nw) // 2, (inp_h - nh) // 2))
    scaled_image = (
        np.asarray(new_image, dtype=np.float32) * 0.0039215697906911373
    )  # Rescaling factor: 1/255 = 0.0039215697906911373
    # PIL image pre-processing
    # HWC (PIL -> numpy) -> 1. CWH -> 2. CHW (the batch dim N is added by the batch buffer)
    whc2cwh = np.swapaxes(scaled_image, 2, 0)  # 1. move the channel axis from 2 to 0
    cwh2chw = np.swapaxes(whc2cwh, 2, 1)       # 2. swap the remaining axes to get CHW
    img_data = cwh2chw
    return img_data
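Either preprocessing function can be checked on a single image before wiring it into the batch loop; a minimal sketch (the file name here is just a placeholder):

# Placeholder image path; any RGB image works
img = Image.open("sample.jpg")
chw = letterbox_preprocessing(img)
print(chw.shape, chw.dtype)   # (3, 640, 640) float32
print(chw.min(), chw.max())   # pixel values rescaled into [0, 1]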
Inference
Inference comes in four flavors according to sync vs. async execution and dynamic vs. fixed batching (execute_async, execute_async_v2, execute_v2, execute). The v2 variants support dynamic batch sizes, and sync vs. async determines whether the GPU is used synchronously or asynchronously, so you can pick whichever suits your purpose. Here we use execute_async_v2, which supports both asynchronous execution and dynamic batching. ( https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/ExecutionContext.html )
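For reference, the two v2 variants are invoked slightly differently; a rough sketch, assuming context, bindings, and stream have been prepared as described below:

# Synchronous: blocks the calling thread until inference has finished
context.execute_v2(bindings=bindings)

# Asynchronous: enqueues the work on a CUDA stream and returns immediately;
# the outputs are only valid after the stream is synchronized
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
stream.synchronize()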
This function takes the context, bindings, and stream_handle as inputs. Let's go over the steps needed to prepare these inputs, and then look at the final inference code.
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context on import

def allocate_buffers(engine, batch_size=1):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # volume is negative when the binding has a dynamic (-1) batch dimension
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host (pagelocked) and device buffers
        host_mem = cuda.pagelocked_empty(-size if size < 0 else size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """
        Within this context, host means the CPU (pagelocked) memory and device means the GPU memory.
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
The allocate_buffers function allocates host (CPU) and device (GPU) memory for every binding of the engine, using each binding's data type and memory size.
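As a rough picture of what gets allocated for the engine used in this series (one input plus the bbox/conf/class_id outputs, with a dynamic batch dimension as the code above implies), the following sketch inspects the returned buffers; note that the order of the outputs list follows the engine binding order, not the output names:

# Sketch: inspect what allocate_buffers returns for this engine
inputs, outputs, bindings, stream = allocate_buffers(engine, batch_size=BATCH_SIZE)
print(len(inputs), len(outputs))   # 1 input, 3 outputs (bbox, conf, class_id)
print(inputs[0].host.shape)        # (BATCH_SIZE * 3 * 640 * 640,) flat pagelocked array
print(len(bindings))               # 4 device pointers, one per engine binding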
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Set the actual batch size on the dynamic input binding.
    context.set_binding_shape(0, [batch_size, *context.get_binding_shape(0)[1:]])
    # Transfer input data from the CPU to the GPU.
    for inp in inputs:
        device_ptr = inp.device  # device (GPU) buffer
        host_array = inp.host    # host (CPU) input array
        cuda.memcpy_htod_async(device_ptr, host_array, stream)
    # Run inference.
    # context.execute_v2(bindings=bindings)  # synchronous variant, useful for profiling
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)  # for inferencing
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    host_outputs = []
    for out in outputs:
        host_outputs.append(out.host)
    return host_outputs
The do_inference function runs inference using the device memory allocated for the engine context and returns the outputs.
Postprocessing
The inference results come back as flat vectors (1 x N) and must be reshaped according to the model's batch size and output shapes. (The original YOLOv8 has only a single output, output0; see part 2 of this series for how to modify the model so that it exposes separate conf, class_id, and bbox outputs.)
def reshape_trt_outputs(h_outputs, shape_of_output):
    # print("h_outputs", h_outputs)
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs
reshape_trt_outputs converts an inference output, which is returned in flattened vector form, back to its original shape. For example, with batch size 5 the bbox output comes back with shape (1, 168000) and is reshaped to (5, 8400, 4).
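A quick numpy illustration of that reshape with toy data (the values are not real detections):

import numpy as np

flat_bbox = np.zeros(5 * 8400 * 4, dtype=np.float32)   # 168000 values, flattened as returned by the engine
shaped_bbox = reshape_trt_outputs(flat_bbox, (5, 8400, 4))
print(shaped_bbox.shape)  # (5, 8400, 4): batch, anchors, 4 box values per anchor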
def parse(trt_output, trt_output_shape):
    # To remove any dependency on the output index order, match outputs by name.
    for i in range(len(trt_output_shape)):
        if trt_output_shape[i]["name"] == 'conf':
            shaped_trt_conf = reshape_trt_outputs(trt_output[i], trt_output_shape[i]["shape"])
        elif trt_output_shape[i]["name"] == 'bbox':
            shaped_trt_bbox = reshape_trt_outputs(trt_output[i], trt_output_shape[i]["shape"])
        elif trt_output_shape[i]["name"] == 'class_id':
            shaped_trt_class = reshape_trt_outputs(trt_output[i], trt_output_shape[i]["shape"])
    return shaped_trt_bbox, shaped_trt_conf, shaped_trt_class
The parse function reshapes each output according to its name and returns the final outputs. As a result, shaped_trt_bbox has shape (BATCH_SIZE, 8400, 4), while shaped_trt_conf and shaped_trt_class each have shape (BATCH_SIZE, 8400).
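As an example of consuming these outputs, the raw detections for each image in the batch could be filtered by confidence like this (a rough sketch; the 0.25 threshold is an arbitrary choice and NMS is not applied yet):

CONF_THRESHOLD = 0.25  # arbitrary example threshold
for b in range(BATCH_SIZE):
    keep = shaped_trt_conf[b] > CONF_THRESHOLD        # (8400,) boolean mask
    boxes = shaped_trt_bbox[b][keep]                  # (num_kept, 4)
    scores = shaped_trt_conf[b][keep]                 # (num_kept,)
    classes = shaped_trt_class[b][keep].astype(int)   # (num_kept,)
    print("image {}: {} raw detections before NMS".format(b, len(boxes)))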
Main function
import time

# engine_path: path to the serialized TensorRT engine file
with open(engine_path, 'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

inputs, outputs, bindings, stream = allocate_buffers(engine, batch_size=BATCH_SIZE)
context = engine.create_execution_context()
init_time = time.time()
batches = load_batches(init_batch, src_files, default_preprocessing)

for i in range(len(src_files) // BATCH_SIZE):
    batch = get_batch(batches)
    inputs[0].host = batch.reshape(-1)
    trt_output = do_inference(
        context, bindings=bindings, inputs=inputs, outputs=outputs,
        stream=stream, batch_size=BATCH_SIZE,
    )
    output_shape = []
    # To remove the order dependency of outputs, use the engine binding index to get each name and shape
    for binding_idx in range(1, 4):  # index 0 is the input, indices 1-3 are the outputs
        output_shape.append({
            "name": engine.get_binding_name(binding_idx),
            "shape": (BATCH_SIZE, *engine.get_binding_shape(binding_idx)[1:]),
        })
    shaped_trt_bbox, shaped_trt_conf, shaped_trt_class = parse(trt_output, output_shape)
The main routine covers deserializing the engine and obtaining the final bbox, conf, and class_id outputs.
An example image result from YOLOv8 is shown below.
The result above shows the raw YOLOv8 output, without NMS applied as postprocessing. (The code for NMS postprocessing will be implemented later.)
Conclusion
In this post, we performed batch inference with the TensorRT Python API and a YOLOv8 model. Having usually seen only the refined, postprocessed results, it is interesting to see the raw output of a deep learning network. Next, we will implement NMS postprocessing to get cleaner results.
References
TensorRT inference code samples — https://github.com/NVIDIA/TensorRT/blob/release/8.6/samples/python
Original article
[yolov8] Batch inference implementation using tensorrt#3 — batch inference using TensorRT python api