1. what is tensorrt
参考:TensorRT的介绍与应用
1. tensorflow、tensorrt数据格式
tensorflow | tensorrt |
---|---|
NHWC | NCHW |
N:batch-size
W:width
H:height
C:通道
2. end_to_end_tensorflow_mnist
cd /opt/developutils/deepstream/tensorrt/TensorRT-7.2.3.4/samples/python/end_to_end_tensorflow_mnist
python -m pip install -r requirements.txt
mkdir models
python model.py
convert-to-uff models/lenet5.pb
python sample.py -d /opt/developutils/deepstream/tensorrt/TensorRT-7.2.3.4/data/
3. 使用tensorrt创建自定义网络
4. 使用python接口导入Tensorflow模型
5. 在python中序列化引擎
6.how does tensorrt work?
- [ ] 生成的计划文件不能跨平台或TensorRT版本移植
- [ ] 构建阶段在图层上执行以下优化:
    - 消除未使用的输出层
    - 融合卷积、偏置和ReLU操作
    - 聚合参数足够相似且源张量相同的操作(例如,GoogleNet v5 inception模块中的1*1卷积)
    - 通过将层输出定向到正确的最终目标来合并连接层
- [ ] 如有必要,builder还会修改权重的精度
- [ ] 构建阶段还在虚拟数据上运行图层,以期从内核目录中选择最快的内核,并在适当的情况下执行权重预格式化和内存优化
7. what capabilities does tensorrt provide
- caffeparser
- uffparser
- onnxparser
参考:TensorRT的介绍与应用 看到yolov3
### 模型转换代码
resnet50 → onnx → engine → load engine
# -*- coding: UTF-8 -*-
"""
@Time: 2021/8/11 下午5:10
@Author: geekplusa
@FileName: app_onnx_resnet50.py
@Description: resnet50 → onnx → engine → load engine
@Software: PyCharm
"""
#~~~~~~~~~import-start
# NOTE(review): the original had a bare `pip install {}` line here — that is a
# shell command, not Python, and is a syntax error. Install dependencies from
# the command line before running this script.
import torch
import torchvision
from torchsummary import summary
import time
import pycuda.driver as cuda
import pycuda.autoinit  # side effect: creates a CUDA context on import
#~~~~~~~import-end
# --- PyTorch baseline + ONNX export -----------------------------------------
# Fixed seed so runs are reproducible.
torch.manual_seed(0)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Reference model; eval() disables dropout / batch-norm statistic updates.
resnet50 = torchvision.models.resnet50().to(device)
resnet50.eval()

# One 1x3x1080x1920 input frame; reused later for the TensorRT comparison.
input_data = torch.randn(1, 3, 1080, 1920, dtype=torch.float32, device=device)
# Baseline output kept as NumPy for the accuracy diff at the end of the script.
output_data_pytorch = resnet50(input_data).cpu().detach().numpy()

# Time PyTorch inference. synchronize() before/after so async GPU work is
# actually included in the wall-clock measurement.
nRound = 1
torch.cuda.synchronize()
t0 = time.time()
for i in range(nRound):
    resnet50(input_data)
torch.cuda.synchronize()
t1 = time.time()
time_pytorch = (t1 - t0) / nRound
print('PyTorch time:', time_pytorch)

# Export to ONNX with a fixed input shape.
input_names = ['input']
output_names = ['output']
torch.onnx.export(resnet50, input_data, 'resnet50.onnx',
                  input_names=input_names, output_names=output_names,
                  verbose=True, opset_version=11)

# Export again with dynamic axes 0/2/3 (batch, height, width) of 'input',
# so the resulting engine can accept variable input sizes.
torch.onnx.export(resnet50, input_data, 'resnet50.dynamic_shape.onnx',
                  dynamic_axes={"input": [0, 2, 3]},
                  input_names=input_names, output_names=output_names,
                  verbose=True, opset_version=11)
"""
Before running the rest of this script, build the TensorRT engines with trtexec:

trtexec --verbose --onnx=../samples/python/network_api_pytorch_mnist/resnet50.onnx --saveEngine=../samples/python/network_api_pytorch_mnist/resnet50.trt
trtexec --verbose --onnx=../samples/python/network_api_pytorch_mnist/resnet50.onnx --saveEngine=../samples/python/network_api_pytorch_mnist/resnet50_fp16.trt --fp16

For reference only (dynamic-shape engines):
trtexec --verbose --onnx=../samples/python/network_api_pytorch_mnist/resnet50.dynamic_shape.onnx --saveEngine=resnet50.dynamic_shape.trt --optShapes=input:1x3x1080x1920 --minShapes=input:1x3x1080x1920 --maxShapes=input:1x3x1080x1920
trtexec --verbose --onnx=../samples/python/network_api_pytorch_mnist/resnet50.dynamic_shape.onnx --saveEngine=resnet50.dynamic_shape_fp16.trt --optShapes=input:1x3x1080x1920 --minShapes=input:1x3x1080x1920 --maxShapes=input:1x3x1080x1920 --fp16
"""
from trt_lite2 import TrtLite  # project-local TensorRT helper wrapper
import numpy as np
import os
class PyTorchTensorHolder(pycuda.driver.PointerHolderBase):
    """Adapter exposing a PyTorch tensor's CUDA device pointer to PyCUDA.

    BUG FIX: the original defined ``init`` instead of ``__init__``, so the
    constructor was never invoked and ``self.tensor`` was never assigned.
    """

    def __init__(self, tensor):
        super().__init__()
        self.tensor = tensor  # keep a reference so the storage stays alive

    def get_pointer(self):
        # Raw device address of the tensor's underlying storage.
        return self.tensor.data_ptr()
# Benchmark each pre-built engine (FP32 and FP16) against the PyTorch baseline.
for engine_file_path in ['resnet50.trt', 'resnet50_fp16.trt']:
    if not os.path.exists(engine_file_path):
        # Engines are produced by trtexec (see the commands above).
        print('Engine file', engine_file_path,
              "doesn't exist. Please run trtexec and re-run this script.")
        exit(1)
    print('====', engine_file_path, '===')

    trt = TrtLite(engine_file_path=engine_file_path)
    trt.print_info()

    # Binding 0 is the input; pin its shape to one 1080x1920 frame.
    # presumably io_info[1][2] is the output binding's shape — TODO confirm
    # against trt_lite2.
    i2shape = {0: (1, 3, 1080, 1920)}
    io_info = trt.get_io_info(i2shape)
    d_buffers = trt.allocate_io_buffers(i2shape, True)

    # Copy the same input used for the PyTorch baseline into the input binding,
    # run once, and pull the output back for the accuracy comparison.
    # (The original pre-allocated a throwaway np.zeros for output_data_trt and
    # immediately overwrote it — dead code, removed.)
    d_buffers[0].copy_(input_data.reshape(d_buffers[0].size()))
    trt.execute([t.data_ptr() for t in d_buffers], i2shape)
    output_data_trt = d_buffers[1].cpu().numpy()

    # Time TensorRT inference with the same synchronize-bracketed pattern used
    # for the PyTorch baseline above.
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(nRound):
        trt.execute([t.data_ptr() for t in d_buffers], i2shape)
    torch.cuda.synchronize()
    time_trt = (time.time() - t0) / nRound

    print('TensorRT time:', time_trt)
    print('Speedup:', time_pytorch / time_trt)
    print('Average diff percentage:',
          np.mean(np.abs(output_data_pytorch - output_data_trt) / np.abs(output_data_pytorch)))