TensorRT 模型加速

最新推荐文章于 2024-07-04 15:58:32 发布

帅与

最新推荐文章于 2024-07-04 15:58:32 发布

阅读量384

点赞数

分类专栏：神经网络　文章标签：python 开发语言

本文链接：https://blog.csdn.net/qq_36865682/article/details/126526635

版权

神经网络专栏收录该内容

5 篇文章 0 订阅

订阅专栏

TensorRT 框架模型加速

TensorRT配置

# Install pycuda
conda install pycuda

# Install the TensorRT Python wheel
pip install /home/s4/Downloads/TensorRT/TensorRT-7.2.3.4/python/tensorrt-7.2.3.4-py38-none-linux_x86_64.whl

# Add the TensorRT lib directory to the library search paths
# (put the two export lines into ~/.bashrc so they persist, then reload it).
# ${VAR:+...} emits the ':' separator only when VAR is already set, so no
# empty path entry (which is interpreted as the current directory) is created.
sudo gedit ~/.bashrc
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/home/s4/Downloads/TensorRT/TensorRT-7.2.3.4/lib
export LIBRARY_PATH=/home/s4/Downloads/TensorRT/TensorRT-7.2.3.4/lib${LIBRARY_PATH:+:$LIBRARY_PATH}
source ~/.bashrc

pt模型转通用onnx模型

import torch
import torch.onnx as onnx


def convert2onnx(model, input_size, batch_size, save_path):
    """
    Export a PyTorch model to an ONNX file.

    A random tensor of shape ``(batch_size, *input_size)`` is created and passed
    to ``torch.onnx.export`` as the example input.

    :param model: the PyTorch model to export
    :param input_size: per-sample input shape without the batch dimension
        (list or tuple of ints); it is left unmodified
    :param batch_size: batch size used for the example input (int)
    :param save_path: path of the ONNX file to write
    """
    # Build the full shape as a new tuple rather than inserting into the
    # caller's list: mutating the argument would make a second call with the
    # same list see an extra leading dimension, and would reject tuples.
    full_shape = (batch_size, *input_size)
    dummy_input = torch.randn(full_shape)
    torch.onnx.export(model, dummy_input, save_path, verbose=False)

onnx 模型转 tensorRT推断用的.engine模型

使用 TensorRT 自带的 trtexec 工具进行转换。
```
# Build an FP16 engine from the ONNX model (trtexec options use --key=value form)
trtexec --onnx=xxx.onnx --saveEngine=xxx.engine --fp16
```

使用trt模型推断

import torch
from torchvision.transforms import Normalize
import numpy as np
import pycuda.driver as cuda

# 处理读入内存的图像数据
def preprocess_image(img, f_type=16):
    """
    Normalize one image and return it as a NumPy array with its last axis moved first.

    The array is wrapped as a tensor, its axes are reordered from (0, 1, 2) to
    (2, 0, 1), and the fixed per-channel mean/std below is applied.

    :param img: NumPy array with three axes (presumably height, width, channel — TODO confirm)
    :param f_type: 16 selects float16, 32 selects float32, anything else float64
    :return: the normalized data as a NumPy array of the selected dtype
    """
    normalizer = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    channel_first = torch.from_numpy(img).permute(2, 0, 1)
    normalized = normalizer(channel_first)

    # Pick the output precision; unknown values fall back to float64.
    target_dtype = (
        np.float16 if f_type == 16
        else np.float32 if f_type == 32
        else np.float64
    )
    return np.array(normalized, dtype=target_dtype)

# 使用TensorRT工具进行预测
def predict(batch, d_input, d_output, output, stream, bindings, context):  # the result is copied into `output`
    """
    Run one inference pass on `stream` and return the filled host output buffer.

    :param batch: host-side input data copied to `d_input`
    :param d_input: device buffer that receives `batch`
    :param d_output: device buffer the predictions are copied from
    :param output: host buffer that receives the predictions; it is also the return value
    :param stream: stream on which both copies and the execution are queued
    :param bindings: bindings passed to `context.execute_async_v2`
        (presumably the device addresses of the input/output buffers — verify against caller)
    :param context: execution context providing `execute_async_v2`
    :return: `output`, after the stream has been synchronized
    """
    # transfer input data to device
    cuda.memcpy_htod_async(d_input, batch, stream)
    # execute model
    context.execute_async_v2(bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # synchronize: block until the work queued on the stream above has finished,
    # so `output` is complete before it is returned
    stream.synchronize()

    return output

# TensorRT 使用demo.展示TensorRT框架加速效果。
from commo.torchTool import preprocess_image, predict
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2 as cv
import time

import torchvision.models as models
import torch
import torch.onnx

BATCH_SIZE = 32

# Build the ResNet-50 baseline on the GPU in eval mode.
# `channels` is not an argument of torchvision's ResNet constructor and made
# this call raise TypeError, so it was removed.
# NOTE(review): no pretrained weights are requested here, so the network is
# randomly initialised — pass the weights argument if real predictions are needed.
resnet50_gpu = models.resnet50(num_classes=1000).to('cuda').eval()

# Load one image and repeat it BATCH_SIZE times to form the input batch
# (batch, height, width, channel; float32).
url = '../image/1.jpg'
img = cv.imread(url)
if img is None:
    # cv.imread signals an unreadable/missing file by returning None
    raise FileNotFoundError('cannot read image: {}'.format(url))
img = cv.resize(img, (640, 640), interpolation=cv.INTER_AREA)
input_batch = np.repeat(np.expand_dims(np.asarray(img, dtype=np.float32), axis=0), BATCH_SIZE, axis=0)

# Reorder to (batch, channel, height, width) and move the batch to the GPU.
input_batch_chw = torch.from_numpy(input_batch).transpose(1, 3).transpose(2, 3)
input_batch_gpu = input_batch_chw.to("cuda")

# Time the plain PyTorch model on CUDA (without TensorRT); the .cpu() copy is
# inside the timed region.
t1 = time.time()
with torch.no_grad():
    predictions = np.array(resnet50_gpu(input_batch_gpu).cpu())
t2 = time.time()
print('pytorch model use {} ms!'.format((t2 - t1) * 1000))

# Show the five highest-scoring classes of the first sample.
indices = (-predictions[0]).argsort()[:5]
print("Class | Likelihood (torch)")
res = list(zip(indices, predictions[0][indices]))
print('predict result is :{}'.format(res))

# Normalize every image of the batch (preprocess_image is called with its
# default f_type).
preprocessed_images = np.array([preprocess_image(image) for image in input_batch])
print('preprocessed shape is {}'.format(preprocessed_images.shape))

# Read the serialized engine; the `with` block closes the file after reading.
with open("../trt/resnet_engine_pytorch.trt", "rb") as f:
    engine_bytes = f.read()
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))

engine = runtime.deserialize_cuda_engine(engine_bytes)
context = engine.create_execution_context()

# Host buffer that receives the predictions.
output = np.empty([BATCH_SIZE, 25200 * 12], dtype=np.float16)
print('Output size is {}'.format(output.shape))
# allocate device memory
# NOTE(review): d_input is sized from input_batch, while the array uploaded
# below is preprocessed_images — confirm both match the engine's input binding.
d_input = cuda.mem_alloc(1 * input_batch.nbytes)
d_output = cuda.mem_alloc(1 * output.nbytes)

bindings = [int(d_input), int(d_output)]

stream = cuda.Stream()
t1 = time.time()
pred = predict(preprocessed_images, d_input, d_output, output, stream, bindings, context)
# reshape (not np.resize) with BATCH_SIZE: a size mismatch now raises instead
# of silently repeating or truncating data when the batch size changes
res = pred.reshape(BATCH_SIZE, 25200, 12)
t2 = time.time()
print(res.shape)
print('trf model use {}ms! '.format((t2 - t1) * 1000))

# Show the seven highest values of the first sample's flattened output.
indices = (-pred[0]).argsort()[:7]
print("Class | Probability (trf)")
print(list(zip(indices, pred[0][indices])))