一、Outline
- Installation and deployment tutorial
- Usage
- Case 1
- Case 2: direct conversion with torch2trt
二、Implementation
- Installation and deployment tutorial
1.1 Download: https://developer.nvidia.com/tensorrt/download
EA means early access (a preview build); GA means general availability (the stable release). Prefer the GA version.
1.2 Installation
tar -xzvf TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz # extract the archive
# Add the lib directory to the environment
vim ~/.bashrc
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/TensorRT-10.0.1.6/TensorRT-10.0.1.6/lib
source ~/.bashrc
# Or copy TensorRT-10.0.1.6/lib directly into cuda/lib64
cp -r ./lib/* /usr/local/cuda/lib64/
# Install the Python package
cd TensorRT-10.0.1.6/python
pip install tensorrt-xxx-none-linux_x86_64.whl
1.3 Verification
# Verify the installation:
python
>>>import tensorrt
>>>print(tensorrt.__version__)
>>>assert tensorrt.Builder(tensorrt.Logger())
Problem: ImportError: libnvinfer.so.10: cannot open shared object file: No such file or directory
Fix: LD_LIBRARY_PATH is not configured; make sure export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/TensorRT-10.0.1.6/TensorRT-10.0.1.6/lib has been applied (source ~/.bashrc again).
- Usage
Method 1: PyTorch -> ONNX -> TensorRT
Method 2: PyTorch -> TensorRT directly
- Case 1
Official tutorial: https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html#convert-onnx-engine
Python API: https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/gettingStarted.html
3.1 PyTorch -> ONNX (see the ONNX article)
import torch
import torchvision
model = torchvision.models.resnet18(pretrained=False)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dummy_input = torch.randn(1, 3, 224, 224, device=device)
model.to(device)
model.eval()
output = model(dummy_input)
print("pytorch result:", torch.argmax(output))
import torch.onnx
torch.onnx.export(model, dummy_input, './model.onnx', input_names=["input"], output_names=["output"], do_constant_folding=True,
verbose=True, keep_initializers_as_inputs=True, opset_version=14, dynamic_axes={"input": {0: "nBatchSize"}, "output": {0: "nBatchSize"}})
import onnx
import numpy as np
import onnxruntime as ort
model_onnx_path = './model.onnx'
# Check that the exported model is well-formed
onnx_model = onnx.load(model_onnx_path)
onnx.checker.check_model(onnx_model)
# Create an ONNX Runtime inference session
ort_session = ort.InferenceSession(model_onnx_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# Prepare the input data
input_data = {
'input': dummy_input.cpu().numpy()
}
# Run inference
y_pred_onnx = ort_session.run(None, input_data)
print("onnx result:", np.argmax(y_pred_onnx[0]))
3.2 ONNX -> TensorRT
/home/TensorRT-10.0.1.6/bin/trtexec --onnx=model.onnx --saveEngine=model.trt --fp16 --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:8x3x224x224
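Alternatively, the engine can be built from Python with the TensorRT builder API instead of trtexec. The following is a minimal sketch against the TensorRT 10 Python API; the tensor name "input" and the 1-8 dynamic batch range mirror the export and trtexec settings above:
import tensorrt as trt
logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(0)  # TensorRT 10 networks are explicit-batch by default
parser = trt.OnnxParser(network, logger)
# Parse the ONNX file into the TensorRT network
with open("model.onnx", "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("failed to parse model.onnx")
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)  # same precision as the --fp16 flag
# Optimization profile covering batch sizes 1..8 for the dynamic "input" axis
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 224, 224), (1, 3, 224, 224), (8, 3, 224, 224))
config.add_optimization_profile(profile)
# Build and serialize the engine to disk
engine_bytes = builder.build_serialized_network(network, config)
with open("model.trt", "wb") as f:
    f.write(engine_bytes)
The resulting model.trt can be loaded exactly as in 3.3 below.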
3.3 Deployment and inference
import tensorrt as trt
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
import ctypes
import os
ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
import pycuda.driver as cuda
import pycuda.autoinit
import collections
import numpy as np
import time
# Load the serialized TensorRT engine and run inference
with open("model.trt", "rb") as f, \
        trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, \
        engine.create_execution_context() as context:
    # Input
    input_ids = np.random.randn(1, 3, 224, 224)
    input_ids = input_ids.astype(np.float32)
    input_shape = (1, 3, 224, 224)
    stream = cuda.Stream()
    input_nbytes = trt.volume(input_shape) * trt.float32.itemsize  # size in bytes
    # Allocate device memory for the input
    d_inputs = cuda.mem_alloc(input_nbytes)
    # Copy the input to the device
    input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids.ravel()))  # pin the host memory pages
    cuda.memcpy_htod_async(d_inputs, input_ids, stream)  # async copy of input_ids into d_inputs
    # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case).
    # Note that input shapes can be specified on a per-inference basis, but here we only use a single shape.
    tensor_name = engine.get_tensor_name(0)
    context.set_input_shape(tensor_name, input_shape)
    assert context.all_binding_shapes_specified
    # Output
    # Allocate the output buffer by querying its size from the context; it may differ for different input shapes.
    h_output = cuda.pagelocked_empty(tuple(context.get_tensor_shape("output")), dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Set the tensor addresses
    bindings = [int(d_inputs)] + [int(d_output)]
    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
    # Run inference
    context.execute_async_v3(stream_handle=stream.handle)
    stream.synchronize()  # wait for completion
    # Transfer predictions back from the GPU
    cuda.memcpy_dtoh_async(h_output, d_output, stream)  # copy from device memory to host memory
    stream.synchronize()
    print(h_output)
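As a quick sanity check, the argmax of h_output should match the "pytorch result" and "onnx result" printed earlier; a one-line addition that stays inside the with block above:
    print("tensorrt result:", np.argmax(h_output))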
- Case 2: direct conversion with torch2trt
Repo: https://github.com/NVIDIA-AI-IOT/torch2trt?tab=readme-ov-file
# Install
sudo apt-get install libprotobuf* protobuf-compiler ninja-build
git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
sudo python setup.py install
Example:
# Convert
import torch
from torch2trt import torch2trt
from torchvision.models.alexnet import alexnet
# create some regular pytorch model...
model = alexnet(pretrained=True).eval().cuda()
# create example data
x = torch.ones((1, 3, 224, 224)).cuda()
# convert to TensorRT feeding sample data as input
model_trt = torch2trt(model, [x])
# Save
torch.save(model_trt.state_dict(), 'alexnet_trt.pth')
Load and run inference
import torch
from torch2trt import TRTModule
model_trt = TRTModule()
model_trt.load_state_dict(torch.load('alexnet_trt.pth'))
x = torch.ones((1, 3, 224, 224)).cuda()
y_trt = model_trt(x)
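To verify the conversion, the outputs of the original and converted models can be compared directly; a short sketch assuming model and x from the conversion step are still in scope:
y = model(x)
y_trt = model_trt(x)
# Maximum absolute difference between the PyTorch and TensorRT outputs
print(torch.max(torch.abs(y - y_trt)))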