Cpp DenseNet TensorRT Test

Python version guide

Cpp OpenVINO version guide

Model Testing

Compared with OpenVINO, TensorRT is considerably harder to use: the framework requires the user to manually allocate host and device memory, transfer data between them, trigger the execution function, and copy the results back. Below, some helper functions are factored out to improve the program's readability.
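
To make those steps concrete before wrapping them in helpers, here is a bare-bones sketch of the raw flow for a single-input, single-output, static-shape engine (the engine path and input values are placeholders):

import numpy as np
import pycuda.autoinit  # noqa
import pycuda.driver as cuda
import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
with open("model.engine", "rb") as f:  # placeholder engine file
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
stream = cuda.Stream()

# 1. allocate pinned host memory and device memory for each I/O tensor
inp, out = engine.get_tensor_name(0), engine.get_tensor_name(1)
h_in = cuda.pagelocked_empty(trt.volume(engine.get_tensor_shape(inp)),
                             trt.nptype(engine.get_tensor_dtype(inp)))
h_out = cuda.pagelocked_empty(trt.volume(engine.get_tensor_shape(out)),
                              trt.nptype(engine.get_tensor_dtype(out)))
d_in, d_out = cuda.mem_alloc(h_in.nbytes), cuda.mem_alloc(h_out.nbytes)

# 2. copy input host -> device, 3. launch, 4. copy result device -> host
h_in[:] = np.random.rand(h_in.size).astype(h_in.dtype)  # placeholder input
cuda.memcpy_htod_async(d_in, h_in, stream)
context.set_tensor_address(inp, int(d_in))
context.set_tensor_address(out, int(d_out))
context.execute_async_v3(stream_handle=stream.handle)
cuda.memcpy_dtoh_async(h_out, d_out, stream)
stream.synchronize()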

EngineTools

Note that this code uses the newer V3 API; the V2 and V1 variants differ in both coding style and function names. If you need one of those, consult the documentation and adapt the code accordingly.
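
For quick reference, a sketch of how the launch call differs between the two API generations (the context, engine, bindings, and stream objects are assumed to come from the helpers below):

def launch(context, engine, bindings, stream, use_v3=True):
    if use_v3:
        # V3 (TensorRT 8.5+): every I/O tensor is addressed by name first
        for i in range(engine.num_io_tensors):
            context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
        context.execute_async_v3(stream_handle=stream.handle)
    else:
        # V2 (older releases): device pointers are passed positionally
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

The full helper file, EngineTools.py, follows.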

# Copyright 2019-2020 NVIDIA Corporation.  All rights reserved.
"""Helper functions for loading engine."""
import numpy as np
import pycuda.autoinit  # noqa pylint: disable=unused-import
import pycuda.driver as cuda
import tensorrt as trt


class HostDeviceMem(object):
    """Simple helper data class that's a little nice to use than a 2-tuple."""

    def __init__(self, host_mem, device_mem):
        """Init function."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        """___str___."""
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        """___repr___."""
        return self.__str__()


def do_inference(context, engine, bindings, inputs, outputs, stream):
    """Generalization for multiple inputs/outputs.

    inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
    context.execute_async_v3(stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def allocate_buffers(engine):
    """Allocates host and device buffer for TRT engine inference.

    This function is similair to the one in common.py, but
    converts network outputs (which are np.float32) appropriately
    before writing them to Python buffer. This is needed, since
    TensorRT plugins doesn't support output type description, and
    in our particular case, we use NMS plugin as network output.

    Args:
        engine (trt.ICudaEngine): TensorRT engine
        context (trt.IExecutionContext): Context for dynamic shape engine

    Returns:
        inputs [HostDeviceMem]: engine input memory
        outputs [HostDeviceMem]: engine output memory
        bindings [int]: buffer to device bindings
        stream (cuda.Stream): cuda stream for engine inference synchronization
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()

    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    for binding in tensor_names:
        shape = engine.get_tensor_shape(binding)
        size = trt.volume(shape)
        dtype = trt.nptype(engine.get_tensor_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))

        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            print("Input info: ", str(shape), str(dtype))
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            print("Output info: ", str(shape), str(dtype))
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream


def load_engine(trt_runtime, engine_path):
    """Helper funtion to load an exported engine."""
    with open(engine_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine
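
The serialized engine file loaded by load_engine has to be built beforehand on the target GPU. The export step is not shown in this article; the sketch below is one common route, parsing an ONNX export of the model (file names are placeholders). The command-line equivalent is trtexec --onnx=model.onnx --saveEngine=model.engine.

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def build_engine(onnx_path, engine_path):
    """Parse an ONNX file and write a serialized TensorRT engine."""
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("Failed to parse ONNX file")
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB
    serialized = builder.build_serialized_network(network, config)
    with open(engine_path, 'wb') as f:
        f.write(serialized)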

TestInfer

With these helpers factored out, the program is far less confusing; it can be compared side by side with the OpenVINO version to study the differences between the two frameworks.
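
For contrast, the same forward pass in OpenVINO needs no manual buffer management at all. A minimal sketch (the model path, device, and dummy input are placeholders):

import numpy
from openvino.runtime import Core

core = Core()
# OpenVINO allocates and manages device buffers internally;
# no explicit host/device copies are required.
compiled = core.compile_model("export_dense121.xml", "CPU")  # placeholder path
dummy = numpy.random.rand(1, 3, 224, 224).astype(numpy.float32)
result = compiled([dummy])[compiled.output(0)]

The full TensorRT test script follows.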

# https://github.com/NVIDIA/TensorRT/blob/main/quickstart/SemanticSegmentation/tutorial-runtime.ipynb
# https://github.com/NVIDIA/TensorRT/blob/main/samples/python/common_runtime.py
# https://github.com/NVIDIA/TensorRT/blob/main/samples/python/introductory_parser_samples/onnx_resnet50.py
import time
import cv2
import numpy
import torch
import tensorrt as trt
import torch.nn as nn
from torchvision import models
from torch.nn import functional as F
from torchvision.transforms import transforms
from EngineTools import allocate_buffers, do_inference, load_engine


TRT_LOGGER = trt.Logger(trt.Logger.INFO)


def get_kernel(kernel_len=16, nsig=10):  # nsig: standard deviation, kernel_len: kernel size
    gaussian_kernel = cv2.getGaussianKernel(kernel_len, nsig) \
                      * cv2.getGaussianKernel(kernel_len, nsig).T
    return gaussian_kernel


class Gaussian_kernel(nn.Module):
    def __init__(self,
                 kernel_len, nsig=20):
        super(Gaussian_kernel, self).__init__()
        self.kernel_len = kernel_len
        kernel = get_kernel(kernel_len=kernel_len, nsig=nsig)  # build the Gaussian kernel
        kernel = torch.FloatTensor(kernel).unsqueeze(0).unsqueeze(0)  # expand to shape (1, 1, k, k)
        self.weight = nn.Parameter(data=kernel, requires_grad=False)

        self.padding = torch.nn.ReplicationPad2d(int(self.kernel_len / 2))

    def forward(self, x):
        x = self.padding(x)
        # Convolve each of the three channels with the Gaussian kernel separately.
        # x[:, i, :, :] has shape (1, H, W), which conv2d accepts as an unbatched
        # (C=1, H, W) input, so each result has shape (1, H', W').
        res = []
        for i in range(x.shape[1]):
            res.append(F.conv2d(x[:, i, :, :], self.weight))
        x_output = torch.cat(res, dim=0)  # stack back to (3, H', W')
        return x_output


class DensenetGrade:
    def __init__(self, pth_path: str):
        # We first load all custom plugins shipped with TensorRT;
        # some of them may be needed during inference.
        trt.init_libnvinfer_plugins(TRT_LOGGER, '')

        # Initialize runtime needed for loading TensorRT engine from file
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        self.trt_engine = load_engine(self.trt_runtime, pth_path)

        # Allocate memory once so it can be reused across inferences
        self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.trt_engine)

        # Contexts are used to perform inference.
        self.context = self.trt_engine.create_execution_context()

        self.gaussian_kernel = Gaussian_kernel(11, 30)
        self.transform_kernel = transforms.Compose([
            transforms.ToTensor()
        ])

    def softmax(self, x):
        # subtract the max before exponentiating for numerical stability
        f_x = numpy.exp(x - numpy.max(x))
        return f_x / numpy.sum(f_x)

    @torch.no_grad()
    def infer(self, image_path: str):
        # pre process
        trans_dim = self._preprocess(image_path).numpy()

        # copy to host
        numpy.copyto(self.inputs[0].host, trans_dim.ravel())

        # bind
        result = do_inference(self.context, engine=self.trt_engine, bindings=self.bindings,
                              inputs=self.inputs, outputs=self.outputs, stream=self.stream)

        y = self.softmax(result[0]).tolist()
        return self._postprocess(y)

    def _postprocess(self, pred_arr):
        return {
            'index': pred_arr.index(max(pred_arr)),
            'pie': pred_arr
        }

    def _preprocess(self, data_in: str):
        img_arr = self.circle_crop(data_in) / 255.0
        x = torch.from_numpy(img_arr[:, :, ::-1].astype(numpy.float32).transpose((2, 0, 1))).unsqueeze(0)
        return x

    def circle_crop(self, image_src: str):
        crop_mask = self.crop_image_from_mask(image_src)
        return self.crop_image_with_gaussian(crop_mask)

    def crop_image_from_mask(self, image_src: str):
        # load
        image = cv2.imread(image_src)

        # binary
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary_image = cv2.threshold(gray_image, 7, 255, cv2.THRESH_BINARY)

        # compute the circular ROI from the bounding rect of the mask
        x, y, w, h = cv2.boundingRect(binary_image)
        center = (w // 2), (h // 2)
        radius = min(center)
        y = y + center[1] - radius
        x = x + center[0] - radius
        copy_image = image[y: y + 2 * radius, x: x + 2 * radius]

        # gen mask
        mask = numpy.zeros_like(copy_image)
        cv2.circle(mask, (radius, radius), radius, (1, 1, 1), -1)

        # keep only the circular region (zero out everything outside)
        return copy_image * mask

    def crop_image_with_gaussian(self, data_in: numpy.ndarray):
        ori_image = cv2.resize(data_in, (224, 224)).astype(numpy.float32)
        with torch.no_grad():
            image_cuda = self.transform_kernel(ori_image).unsqueeze(0)
            out = numpy.transpose(self.gaussian_kernel(image_cuda).cpu().numpy(), (1, 2, 0))

        if out.shape != (224, 224, 3):
            out = out[0: 224, 0: 224]
        exposure = cv2.addWeighted(ori_image, 4, out, -4, 128)
        exposure = numpy.clip(exposure, 0, 255).astype(numpy.uint8)
        exposure = cv2.cvtColor(exposure, cv2.COLOR_BGR2RGB)
        return exposure


if __name__ == '__main__':
    # fp32: {'index': 2, 'pie': [0.1414167284965515, 0.05537400394678116, 0.7084451913833618, 0.08920560777187347, 0.005558510776609182]}
    grade = DensenetGrade("export_dense121_gpu.engine")
    t1 = time.perf_counter()
    print(grade.infer("1.jpg"))
    t2 = time.perf_counter()
    print(t2 - t1)
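    # Note: the single measurement above includes first-call overhead such as
    # CUDA context warm-up and lazy allocations. A steadier latency figure
    # (sketch; the iteration count is arbitrary) warms up and averages:
    runs = 20
    t1 = time.perf_counter()
    for _ in range(runs):
        grade.infer("1.jpg")
    t2 = time.perf_counter()
    print((t2 - t1) / runs)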
