Batch inference using the TensorRT Python API. (Note: most of the code introduced here is based on the samples provided by NVIDIA, with some personal modifications.)
Batch input
There are several ways to batch inputs. You could use a DataLoader from the Torch package, but here batching is implemented from scratch with a generator.
import os
import glob

import numpy as np
from PIL import Image

BATCH_SIZE = 5
INPUT_SHAPE_W_BS = (BATCH_SIZE, 3, 640, 640)
ALLOWED_EXTENSIONS = (".jpeg", ".jpg", ".png")

# img_path: directory containing the source images
src_files = [
    path for path in glob.iglob(os.path.join(img_path, "**"), recursive=True)
    if os.path.isfile(path) and path.lower().endswith(ALLOWED_EXTENSIONS)
]
if len(src_files) == 0:
    raise Exception(
        "ERROR: src data path [{}] contains no files!".format(img_path)
    )

# Pad the file list so its length becomes a multiple of the batch size
if len(src_files) % BATCH_SIZE != 0:
    src_files += src_files[len(src_files) % BATCH_SIZE : BATCH_SIZE]

# Initialize the batch buffer
init_batch = np.zeros(INPUT_SHAPE_W_BS, dtype=np.float32)

# Generator that fills the batch buffer with preprocessed images
def load_batches(batch, src_files, preprocessing_func):
    for i in range(0, len(src_files), BATCH_SIZE):
        for offset in range(BATCH_SIZE):
            img = Image.open(src_files[i + offset])
            batch[offset] = preprocessing_func(img)
        yield batch

# Fetch the next batch from the generator; returns None when exhausted
def get_batch(batches):
    try:
        batch = next(batches)
        return batch
    except StopIteration:
        return None
Thus, every call to get_batch produces a batched input of shape (BATCH_SIZE, 3, 640, 640).
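As a quick sanity check, the generator can be driven directly; a minimal sketch, reusing src_files and init_batch from above together with the default_preprocessing function shown in the next section:

# Minimal sanity check of the batch generator
batches = load_batches(init_batch, src_files, default_preprocessing)
first_batch = get_batch(batches)
print(first_batch.shape)  # expected: (5, 3, 640, 640), i.e. (BATCH_SIZE, 3, 640, 640)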
Preprocessing
Preprocessing here means the resizing, normalization, and letterboxing that were applied during training.
If you look at the load_batches function, you can see that each image is preprocessed before being placed into the batch. The preprocessing function used here is as follows.
def default_preprocessing(img):
    scr_w, scr_h = img.size
    inp_w, inp_h = INPUT_SHAPE_W_BS[2], INPUT_SHAPE_W_BS[3]
    scale_ratio = min(inp_w / scr_w, inp_h / scr_h)
    nw = int(scr_w * scale_ratio)
    nh = int(scr_h * scale_ratio)
    image = img.resize((nw, nh), Image.BICUBIC).copy()
    new_image = Image.new("RGB", (inp_w, inp_h))
    new_image.paste(image, ((inp_w - nw) // 2, (inp_h - nh) // 2))
    scaled_image = np.asarray(new_image, dtype=np.float32) * 0.0039215697906911373
    whc2cwh = np.swapaxes(scaled_image, 2, 0)
    cwh2chw = np.swapaxes(whc2cwh, 2, 1)
    return cwh2chw
Letterboxing itself is not explained in detail here. For the letterbox preprocessing with step-by-step comments, see the following:
def letterbox_preprocessing(img):
    scr_w, scr_h = img.size
    inp_w, inp_h = INPUT_SHAPE_W_BS[2], INPUT_SHAPE_W_BS[3]
    # Letterboxing: scale while keeping the aspect ratio, then pad to the network input size
    scale_ratio = min(inp_w / scr_w, inp_h / scr_h)
    nw = int(scr_w * scale_ratio)
    nh = int(scr_h * scale_ratio)
    image = img.resize((nw, nh), Image.BICUBIC).copy()
    new_image = Image.new("RGB", (inp_w, inp_h))
    new_image.paste(image, ((inp_w - nw) // 2, (inp_h - nh) // 2))
    scaled_image = (
        np.asarray(new_image, dtype=np.float32) * 0.0039215697906911373
    )  # Rescaling factor: 1/255 = 0.0039215697906911373
    # PIL image pre-processing
    # HWC (PIL -> numpy) -> 1. CWH -> 2. CHW (the batch dim N is added by the batch buffer)
    whc2cwh = np.swapaxes(scaled_image, 2, 0)  # 1. move the channel axis from 2 to 0
    cwh2chw = np.swapaxes(whc2cwh, 2, 1)       # 2. swap the remaining axes to get CHW
    img_data = cwh2chw
    return img_data
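Either preprocessing function can be checked on a single image before wiring it into the batch loop; a minimal sketch (the file name here is just a placeholder):

# Placeholder image path; any RGB image works
img = Image.open("sample.jpg")
chw = letterbox_preprocessing(img)
print(chw.shape, chw.dtype)   # (3, 640, 640) float32
print(chw.min(), chw.max())   # pixel values rescaled into [0, 1]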
Inference
Inference comes in four flavors according to sync vs. async execution and dynamic vs. fixed batching (execute_async, execute_async_v2, execute_v2, execute). The v2 variants support dynamic batch sizes, and sync vs. async determines whether the GPU is used synchronously or asynchronously, so you can pick whichever suits your purpose. Here we use execute_async_v2, which supports both asynchronous execution and dynamic batching. ( https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/ExecutionContext.html )
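For reference, the two v2 variants are invoked slightly differently; a rough sketch, assuming context, bindings, and stream have been prepared as described below:

# Synchronous: blocks the calling thread until inference has finished
context.execute_v2(bindings=bindings)

# Asynchronous: enqueues the work on a CUDA stream and returns immediately;
# the outputs are only valid after the stream is synchronized
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
stream.synchronize()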
This function takes the context, bindings, and stream_handle as inputs. Let's go over the steps needed to prepare these inputs, and then look at the final inference code.
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context on import

def allocate_buffers(engine, batch_size=1):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # volume is negative when the binding has a dynamic (-1) batch dimension
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host (pagelocked) and device buffers
        host_mem = cuda.pagelocked_empty(-size if size < 0 else size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """
        Within this context, host means the CPU (pagelocked) memory and device means the GPU memory.
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
The allocate_buffers function allocates host (CPU) and device (GPU) memory for every binding of the engine, using each binding's data type and memory size.
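As a rough picture of what gets allocated for the engine used in this series (one input plus the bbox/conf/class_id outputs, with a dynamic batch dimension as the code above implies), the following sketch inspects the returned buffers; note that the order of the outputs list follows the engine binding order, not the output names:

# Sketch: inspect what allocate_buffers returns for this engine
inputs, outputs, bindings, stream = allocate_buffers(engine, batch_size=BATCH_SIZE)
print(len(inputs), len(outputs))   # 1 input, 3 outputs (bbox, conf, class_id)
print(inputs[0].host.shape)        # (BATCH_SIZE * 3 * 640 * 640,) flat pagelocked array
print(len(bindings))               # 4 device pointers, one per engine binding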
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Set the actual batch size on the dynamic input binding.
    context.set_binding_shape(0, [batch_size, *context.get_binding_shape(0)[1:]])
    # Transfer input data from the CPU to the GPU.
    for inp in inputs:
        device_ptr = inp.device  # device (GPU) buffer
        host_array = inp.host    # host (CPU) input array
        cuda.memcpy_htod_async(device_ptr, host_array, stream)
    # Run inference.
    # context.execute_v2(bindings=bindings)  # synchronous variant, useful for profiling
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)  # for inferencing
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    host_outputs = []
    for out in outputs:
        host_outputs.append(out.host)
    return host_outputs
The do_inference function runs inference using the device memory allocated for the engine context and returns the outputs.
Postprocessing
The inference results come back as flat vectors (1 x N) and must be reshaped according to the model's batch size and output shapes. (The original YOLOv8 has only a single output, output0; see part 2 of this series for how to modify the model so that it exposes separate conf, class_id, and bbox outputs.)
def reshape_trt_outputs(h_outputs, shape_of_output):
    # print("h_outputs", h_outputs)
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs
reshape_trt_outputs converts an inference output, which is returned in flattened vector form, back to its original shape. For example, with batch size 5 the bbox output comes back with shape (1, 168000) and is reshaped to (5, 8400, 4).
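A quick numpy illustration of that reshape with toy data (the values are not real detections):

import numpy as np

flat_bbox = np.zeros(5 * 8400 * 4, dtype=np.float32)   # 168000 values, flattened as returned by the engine
shaped_bbox = reshape_trt_outputs(flat_bbox, (5, 8400, 4))
print(shaped_bbox.shape)  # (5, 8400, 4): batch, anchors, 4 box values per anchor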
def parse(trt_output, trt_output_shape):
    # To remove any dependency on the output index order, match outputs by name.
    for i in range(len(trt_output_shape)):
        if trt_output_shape[i]["name"] == 'conf':
            shaped_trt_conf = reshape_trt_outputs(trt_output[i], trt_output_shape[i]["shape"])
        elif trt_output_shape[i]["name"] == 'bbox':
            shaped_trt_bbox = reshape_trt_outputs(trt_output[i], trt_output_shape[i]["shape"])
        elif trt_output_shape[i]["name"] == 'class_id':
            shaped_trt_class = reshape_trt_outputs(trt_output[i], trt_output_shape[i]["shape"])
    return shaped_trt_bbox, shaped_trt_conf, shaped_trt_class
The parse function reshapes each output according to its name and returns the final outputs. As a result, shaped_trt_bbox has shape (BATCH_SIZE, 8400, 4), while shaped_trt_conf and shaped_trt_class each have shape (BATCH_SIZE, 8400).
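As an example of consuming these outputs, the raw detections for each image in the batch could be filtered by confidence like this (a rough sketch; the 0.25 threshold is an arbitrary choice and NMS is not applied yet):

CONF_THRESHOLD = 0.25  # arbitrary example threshold
for b in range(BATCH_SIZE):
    keep = shaped_trt_conf[b] > CONF_THRESHOLD        # (8400,) boolean mask
    boxes = shaped_trt_bbox[b][keep]                  # (num_kept, 4)
    scores = shaped_trt_conf[b][keep]                 # (num_kept,)
    classes = shaped_trt_class[b][keep].astype(int)   # (num_kept,)
    print("image {}: {} raw detections before NMS".format(b, len(boxes)))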
Main function
import time

# engine_path: path to the serialized TensorRT engine file
with open(engine_path, 'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

inputs, outputs, bindings, stream = allocate_buffers(engine, batch_size=BATCH_SIZE)
context = engine.create_execution_context()
init_time = time.time()
batches = load_batches(init_batch, src_files, default_preprocessing)

for i in range(len(src_files) // BATCH_SIZE):
    batch = get_batch(batches)
    inputs[0].host = batch.reshape(-1)
    trt_output = do_inference(
        context, bindings=bindings, inputs=inputs, outputs=outputs,
        stream=stream, batch_size=BATCH_SIZE,
    )
    output_shape = []
    # To remove the order dependency of outputs, use the engine binding index to get each name and shape
    for binding_idx in range(1, 4):  # index 0 is the input, indices 1-3 are the outputs
        output_shape.append({
            "name": engine.get_binding_name(binding_idx),
            "shape": (BATCH_SIZE, *engine.get_binding_shape(binding_idx)[1:]),
        })
    shaped_trt_bbox, shaped_trt_conf, shaped_trt_class = parse(trt_output, output_shape)
The main routine covers deserializing the engine and obtaining the final bbox, conf, and class_id outputs.
An example image result from YOLOv8 is shown below.
The result above shows the raw YOLOv8 output, without NMS applied as postprocessing. (The code for NMS postprocessing will be implemented later.)
Conclusion
In this post, we performed batch inference with the TensorRT Python API and a YOLOv8 model. Having usually seen only the refined, postprocessed results, it is interesting to see the raw output of a deep learning network. Next, we will implement NMS postprocessing to get cleaner results.
References
TensorRT inference code samples — https://github.com/NVIDIA/TensorRT/blob/release/8.6/samples/python
Original article
[yolov8] Batch inference implementation using tensorrt#3 — batch inference using TensorRT python api