TensorRT 模型加速

TensorRT 框架模型加速

  1. TensorRT配置

    #下载pycuda
    conda install pycuda
    
    #下载 TensorRT框架
    pip install /home/s4/Downloads/TensorRT/TensorRT-7.2.3.4/python/tensorrt-7.2.3.4-py38-none-linux_x86_64.whl
    
    # 添加系统路径
    sudo gedit ~/.bashrc
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/s4/Downloads/TensorRT/TensorRT-7.2.3.4/lib
    export LIBRARY_PATH=/home/s4/Downloads/TensorRT/TensorRT-7.2.3.4/lib:$LIBRARY_PATH
    source ~/.bashrc
    
  2. pt模型转通用onnx模型

    import torch
    import torch.onnx as onnx
    
    
    def convert2onnx(model, input_size, batch_size, save_path):
        """
        Export a PyTorch model to an ONNX file.

        A random dummy input of shape ``[batch_size, *input_size]`` is built and
        handed to ``torch.onnx.export`` together with the model. The caller's
        ``input_size`` is not modified, so the same list can be reused across
        calls; any sequence of ints (list or tuple) is accepted.

        :param model: original model
        :param input_size: per-sample input shape required by the model, without the batch
                           dimension (sequence of ints).
        :param batch_size: batch size used for the exported dummy input (int).
        :param save_path: onnx file path
        """
        # Build a new shape instead of inserting into `input_size` in place:
        # mutating the argument would corrupt the caller's list on repeated calls.
        dummy_shape = [batch_size, *input_size]
        dummy_input = torch.randn(dummy_shape)
        torch.onnx.export(model, dummy_input, save_path, verbose=False)
    
  3. onnx 模型转 tensorRT推断用的.engine模型

    使用tensorRT 自带的trtexec工具进行转码。

    trtexec --onnx=xxx.onnx --saveEngine=xxx.engine --fp16
    
  4. 使用trt模型推断

    import torch
    from torchvision.transforms import Normalize
    import numpy as np
    import pycuda.driver as cuda
    
    # 处理读入内存的图像数据
    def preprocess_image(img, f_type=16):
        """
        Normalize a single image and return it as a numpy array.

        The image's axes are reordered from (0, 1, 2) to (2, 0, 1) before the
        per-channel normalization is applied (assumes ``img`` is an H x W x C
        array -- TODO confirm against callers).

        :param img: image data as a numpy array.
        :param f_type: 16 selects float16, 32 selects float32, any other value float64.
        :return: the normalized data as a numpy array of the selected dtype.
        """
        # Choose the output precision once, then convert in a single place.
        if f_type == 16:
            target_dtype = np.float16
        elif f_type == 32:
            target_dtype = np.float32
        else:
            target_dtype = np.float64

        # NOTE(review): these mean/std values look like the usual ImageNet
        # statistics for inputs scaled to [0, 1]; no scaling happens here --
        # confirm that callers pass suitably scaled data.
        normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        tensor = torch.from_numpy(img)
        tensor = tensor.transpose(0, 2)
        tensor = tensor.transpose(1, 2)
        return np.array(normalize(tensor), dtype=target_dtype)
    
    # 使用TensorRT工具进行预测
    def predict(batch, d_input, d_output, output, stream, bindings, context):  # result gets copied into output
        """
        Run one inference pass through a TensorRT execution context.

        The host batch is copied to the device, the context is executed on the
        given stream, the device result is copied back into ``output``, and the
        stream is synchronized before returning.

        :param batch: host-side input data (assumed to be a contiguous numpy array that
                      fits into ``d_input`` -- TODO confirm against callers).
        :param d_input: device allocation receiving ``batch``.
        :param d_output: device allocation holding the result after execution.
        :param output: host-side buffer that receives the result.
        :param stream: pycuda stream on which the copies and the execution are queued.
        :param bindings: binding addresses passed to ``execute_async_v2`` (presumably the
                         device pointers of ``d_input`` and ``d_output`` -- verify against caller).
        :param context: TensorRT execution context.
        :return: ``output``, the same buffer object that was passed in.
        """
        # transfer input data to device
        cuda.memcpy_htod_async(d_input, batch, stream)
        # execute model
        context.execute_async_v2(bindings, stream.handle, None)
        # transfer predictions back
        cuda.memcpy_dtoh_async(output, d_output, stream)
        # synchronize: wait for the work queued on the stream before `output` is read
        stream.synchronize()

        return output
    
    
    # TensorRT 使用demo.展示TensorRT框架加速效果。
    from commo.torchTool import preprocess_image, predict
    import tensorrt as trt
    import pycuda.driver as cuda
    import pycuda.autoinit
    import numpy as np
    import cv2 as cv
    import time
    
    import torchvision.models as models
    import torch
    import torch.onnx
    
    # Demo: time the same batch through a plain PyTorch model on CUDA and
    # through a serialized TensorRT engine, printing the top results of each.
    BATCH_SIZE = 32

    # Build the model on the GPU in eval mode.
    # NOTE(review): stock torchvision `resnet50` may not accept a `channels`
    # keyword -- confirm against the torchvision version in use.
    resnet50_gpu = models.resnet50(num_classes=1000, channels=3).to('cuda').eval()

    # Load one image and repeat it BATCH_SIZE times along a new leading axis.
    dummy_input = torch.randn(BATCH_SIZE, 3, 640, 640)  # not used by the rest of this demo
    url = '../image/1.jpg'
    img = cv.imread(url)
    if img is None:
        # cv.imread signals a missing/unreadable file by returning None
        # instead of raising, which would otherwise fail later in cv.resize.
        raise FileNotFoundError('cannot read image: {}'.format(url))
    img = cv.resize(img, (640, 640), interpolation=cv.INTER_AREA)
    input_batch = np.array(np.repeat(np.expand_dims(np.array(img, dtype=np.float32), axis=0), BATCH_SIZE, axis=0),
                           dtype=np.float32)

    # Reorder axes (0, 1, 2, 3) -> (0, 3, 1, 2) and move the batch to the GPU.
    input_batch_chw = torch.from_numpy(input_batch).transpose(1, 3).transpose(2, 3)
    input_batch_gpu = input_batch_chw.to("cuda")

    # Execute the torch model in CUDA without TensorRT.
    t1 = time.time()
    with torch.no_grad():
        predictions = np.array(resnet50_gpu(input_batch_gpu).cpu())
    t2 = time.time()
    print('pytorch model use {} ms!'.format((t2 - t1) * 1000))
    indices = (-predictions[0]).argsort()[:5]
    print("Class | Likelihood (torch)")
    res = list(zip(indices, predictions[0][indices]))
    print('predict result is :{}'.format(res))

    # Per-image preprocessing for the TensorRT path (preprocess_image defaults to float16).
    preprocessed_images = np.array([preprocess_image(image) for image in input_batch])
    print('preprocessed shape is {}'.format(preprocessed_images.shape))

    # Deserialize the engine; the context manager closes the file handle
    # as soon as its bytes have been read.
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    with open("../trt/resnet_engine_pytorch.trt", "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    # Host buffer receiving the engine output.
    # NOTE(review): the 25200 * 12 output width and the float16 host buffers
    # must match the engine's actual bindings -- confirm against the engine.
    output = np.empty([BATCH_SIZE, 25200 * 12], dtype=np.float16)
    print('Output size is {}'.format(output.shape))
    # allocate device memory
    d_input = cuda.mem_alloc(input_batch.nbytes)
    d_output = cuda.mem_alloc(output.nbytes)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()
    t1 = time.time()
    pred = predict(preprocessed_images, d_input, d_output, output, stream, bindings, context)
    # Use BATCH_SIZE rather than a literal 32, and reshape (which raises on a
    # size mismatch) rather than np.resize (which silently repeats/truncates).
    res = pred.reshape(BATCH_SIZE, 25200, 12)
    t2 = time.time()
    print(res.shape)
    print('trf model use {}ms! '.format((t2 - t1) * 1000))
    indices = (-pred[0]).argsort()[:7]
    print("Class | Probability (trf)")
    print(list(zip(indices, pred[0][indices])))
    
    
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值