Getting Started with TensorRT

I. Overview

  1. Installation and deployment
  2. Usage methods
  3. Case 1
  4. Case 2: direct conversion with torch2trt

II. Implementation

  1. Installation and deployment
    1.1 Download: https://developer.nvidia.com/tensorrt/download
    EA stands for Early Access (a preview release), while GA stands for General Availability (the stable release). Prefer the GA version.
    1.2 Installation
tar -xzvf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz  # extract the archive
# add the lib directory to LD_LIBRARY_PATH
vim ~/.bashrc
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/TensorRT-10.0.1.6/TensorRT-10.0.1.6/lib
source ~/.bashrc

# or copy TensorRT-10.0.1.6/lib directly into cuda/lib64
cp -r ./lib/* /usr/local/cuda/lib64/

# install the Python package
cd TensorRT-10.0.1.6/python
pip install tensorrt-xxx-none-linux_x86_64.whl

1.3 Verification

# Verify the installation:
python 
>>>import tensorrt 
>>>print(tensorrt.__version__) 
>>>assert tensorrt.Builder(tensorrt.Logger())

Issue: ImportError: libnvinfer.so.10: cannot open shared object file: No such file or directory
Fix: LD_LIBRARY_PATH is not configured correctly; make sure export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/TensorRT-10.0.1.6/TensorRT-10.0.1.6/lib has been added to ~/.bashrc and sourced.

  2. Usage methods
    Method 1: PyTorch -> ONNX -> TensorRT
    Method 2: PyTorch -> TensorRT directly (e.g., via torch2trt, see Case 2)

  3. Case 1
    Official tutorial: https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html#convert-onnx-engine
    Python API reference: https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/gettingStarted.html
    3.1 PyTorch -> ONNX (see the ONNX article)

import torch
import torchvision

model = torchvision.models.resnet18(pretrained=False)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

dummy_input = torch.randn(1, 3, 224, 224, device=device)
model.to(device)
model.eval()
output = model(dummy_input)

print("pytorch result:", torch.argmax(output))

import torch.onnx

torch.onnx.export(model, dummy_input, './model.onnx', input_names=["input"], output_names=["output"], do_constant_folding=True,
                  verbose=True, keep_initializers_as_inputs=True, opset_version=14, dynamic_axes={"input": {0: "nBatchSize"}, "output": {0: "nBatchSize"}})

import onnx
import numpy as np
import onnxruntime as ort

model_onnx_path = './model.onnx'
# Check that the model is well-formed
onnx_model = onnx.load(model_onnx_path)
onnx.checker.check_model(onnx_model)
# Create an ONNX Runtime inference session
ort_session = ort.InferenceSession(model_onnx_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# Prepare the input data
input_data = {
    'input': dummy_input.cpu().numpy()
}
# Run inference
y_pred_onnx = ort_session.run(None, input_data)
print("onnx result:", np.argmax(y_pred_onnx[0]))

3.2 ONNX -> TensorRT

# The deployment code in 3.3 runs batch size 1, so build the engine with an optimization profile that covers it:
/home/TensorRT-10.0.1.6/bin/trtexec --onnx=model.onnx --saveEngine=model.trt --fp16 --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:8x3x224x224
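
Alternatively, the engine can be built from Python with the TensorRT builder API instead of trtexec. A minimal sketch, assuming the TensorRT 10 Python bindings installed above and the input tensor name "input" from the ONNX export; the profile shape ranges are illustrative:

import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(0)          # explicit batch is the only mode in TensorRT 10
parser = trt.OnnxParser(network, logger)

# Parse the ONNX model exported in 3.1
with open("model.onnx", "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("failed to parse model.onnx")

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)        # same as --fp16 above

# Optimization profile for the dynamic batch dimension (min/opt/max are illustrative)
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 224, 224), (1, 3, 224, 224), (8, 3, 224, 224))
config.add_optimization_profile(profile)

# Build and serialize the engine to disk
engine_bytes = builder.build_serialized_network(network, config)
with open("model.trt", "wb") as f:
    f.write(engine_bytes)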

3.3 Deployment and inference

import tensorrt as trt
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

import ctypes
import os
ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
import pycuda.driver as cuda
import pycuda.autoinit
import collections
import numpy as np
import time

# Load the serialized ResNet engine and create an execution context
with open("model.trt", "rb") as f, \
        trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, \
        engine.create_execution_context() as context:
    # Input
    input_ids = np.random.randn(1, 3, 224, 224)
    input_ids = input_ids.astype(np.float32)
    input_shape = (1, 3, 224, 224)

    stream = cuda.Stream()
    input_nbytes = trt.volume(input_shape) * trt.float32.itemsize  # size of the input in bytes

    # Allocate device memory for inputs.
    d_inputs = cuda.mem_alloc(input_nbytes)  # allocate device memory for the input

    # Copy inputs
    input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids.ravel()))  # pin the host memory pages

    cuda.memcpy_htod_async(d_inputs, input_ids, stream)  # copy input_ids to the device buffer d_inputs

    # Output
    # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)
    # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.
    tensor_name = engine.get_tensor_name(0)
    context.set_input_shape(tensor_name, input_shape)  # set the actual input shape for this run

    assert context.all_binding_shapes_specified

    # Set up the output buffer on the GPU
    # Allocate output buffer by querying the size from the context. This may be different for different input shapes.
    h_output = cuda.pagelocked_empty(tuple(context.get_tensor_shape(engine.get_tensor_name(1))), dtype=np.float32)  # tensor 1 is the output tensor ("output")
    d_output = cuda.mem_alloc(h_output.nbytes)


    # Setup tensor address
    bindings = [int(d_inputs)] + [int(d_output)]

    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])

    # Run inference
    context.execute_async_v3(stream_handle=stream.handle)
    stream.synchronize()  # wait for inference to finish


    # Transfer predictions back from GPU
    cuda.memcpy_dtoh_async(h_output, d_output, stream)  # copy predictions from device back to host
    stream.synchronize()

    print(h_output)
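    # Also print the predicted class index, analogous to the PyTorch/ONNX prints above
    # (the input here is a fresh random tensor, so the class itself will differ).
    print("tensorrt result:", np.argmax(h_output))

    # (Optional) Rough latency estimate by re-running the same execution context.
    # This is a minimal sketch, not a rigorous benchmark: host-side timing, no separate warmup.
    n_iters = 100
    start = time.time()
    for _ in range(n_iters):
        context.execute_async_v3(stream_handle=stream.handle)
    stream.synchronize()
    print("average latency: %.3f ms" % ((time.time() - start) / n_iters * 1000))
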
  4. Case 2: direct conversion with torch2trt
    Project page: https://github.com/NVIDIA-AI-IOT/torch2trt?tab=readme-ov-file
# Install
sudo apt-get install libprotobuf* protobuf-compiler ninja-build
 
git clone https://github.com/NVIDIA-AI-IOT/torch2trt
 
cd torch2trt
 
sudo python setup.py install 

Example:

# Convert
import torch
from torch2trt import torch2trt
from torchvision.models.alexnet import alexnet

# create some regular pytorch model...
model = alexnet(pretrained=True).eval().cuda()

# create example data
x = torch.ones((1, 3, 224, 224)).cuda()

# convert to TensorRT feeding sample data as input
model_trt = torch2trt(model, [x])
# Save the converted module's state dict
torch.save(model_trt.state_dict(), 'alexnet_trt.pth')

Loading and inference:

import torch
from torch2trt import TRTModule

model_trt = TRTModule()

model_trt.load_state_dict(torch.load('alexnet_trt.pth'))

x = torch.ones((1, 3, 224, 224)).cuda()
y_trt = model_trt(x)
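
To sanity-check the conversion, compare the TensorRT output against the original PyTorch model on the same input. A minimal sketch, assuming `model` from the conversion snippet above is still in scope:

# Run the original PyTorch model on the same input and report the largest deviation.
y = model(x)
print('max abs error:', torch.max(torch.abs(y - y_trt)))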