I: TensorRT
TensorRT can be thought of as an optimizing compiler targeting NVIDIA GPUs, together with a set of accompanying development tools.
- Optimizes models automatically
  - finds the parts of the model that can be processed in parallel
  - searches for the best scheduling and parallelization strategy for the GPU architecture it is deployed on
- Accepts models from multiple frameworks
  - ONNX
- Python/C++ APIs
  - make it easy to call TensorRT from your own program to run inference
1: Common deployment pipeline
PyTorch -> ONNX -> TensorRT
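For the ONNX -> TensorRT step, the quickest route is the trtexec command-line tool that ships with TensorRT. A minimal invocation (file names here are placeholders):

    trtexec --onnx=model.onnx --saveEngine=model.engine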
2: Advantages of TensorRT
- The overall hardware design and compilation technology are mature
  - comparatively few bugs
- Quantization support is fairly complete and well-rounded
- The SDK is extensive
  - plenty of documentation to consult
  - the official resources are quite rich
  - the bundled samples are well suited for learning
- The community is large
  - when problems come up, there is somewhere to ask
3: The TensorRT workflow
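Roughly: parse the ONNX model, let the builder optimize it for the current GPU, serialize the resulting engine to disk, then deserialize it at deployment time for inference. A minimal sketch, assuming the TensorRT 8.x Python API (file names are placeholders, and details shift between TensorRT versions):

    import tensorrt as trt

    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    # ONNX models require an explicit-batch network definition
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open("model.onnx", "rb") as f:
        assert parser.parse(f.read()), "failed to parse the ONNX model"

    # Build and serialize the engine optimized for the current GPU
    config = builder.create_builder_config()
    engine_bytes = builder.build_serialized_network(network, config)
    with open("model.engine", "wb") as f:
        f.write(engine_bytes)

    # At deployment time: deserialize the engine and run inference with it
    runtime = trt.Runtime(logger)
    with open("model.engine", "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())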
4: Some limitations of TensorRT
5: TensorRT optimization strategies
1: Layer fusion
- Vertical layer fusion
- Horizontal layer fusion
Layer fusion reduces kernel-launch overhead and memory operations, which improves efficiency. In addition, some computations, once optimized through fusion, can be merged further with other computations.
1: Vertical layer fusion
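The classic vertical-fusion example is folding BatchNorm into the preceding convolution. In inference mode BN computes gamma * (z - mean) / sqrt(var + eps) + beta on the conv output z = Wx + b, so the pair collapses into a single conv with W' = W * gamma / sqrt(var + eps) and b' = beta + (b - mean) * gamma / sqrt(var + eps). The sketch below works through the algebra in PyTorch; it illustrates the math only, not TensorRT's actual implementation:

    import torch
    import torch.nn as nn

    def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
        fused = nn.Conv2d(conv.in_channels, conv.out_channels, conv.kernel_size,
                          conv.stride, conv.padding, bias=True)
        with torch.no_grad():
            scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)      # gamma / sqrt(var + eps)
            fused.weight.copy_(conv.weight * scale.reshape(-1, 1, 1, 1)) # per-output-channel scaling
            b = conv.bias if conv.bias is not None else torch.zeros(conv.out_channels)
            fused.bias.copy_(bn.bias + (b - bn.running_mean) * scale)
        return fused

    conv = nn.Conv2d(3, 16, 3, bias=True).eval()
    bn = nn.BatchNorm2d(16).eval()
    with torch.no_grad():  # give BN non-trivial running statistics so the check is meaningful
        bn.running_mean.uniform_(-1, 1)
        bn.running_var.uniform_(0.5, 1.5)
    x = torch.rand(1, 3, 5, 5)
    print(torch.allclose(bn(conv(x)), fuse_conv_bn(conv, bn)(x), atol=1e-5))  # expect True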
2: Horizontal layer fusion
2: Quantization
Quantization is:
- one of the most important strategies for compressing a model
- the conversion of single-precision (FP32) trained weights to half precision (FP16) or integer types (INT8, INT4); a minimal sketch of the INT8 idea follows this list
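As a toy illustration of symmetric per-tensor INT8 quantization (just the core arithmetic; TensorRT's calibration machinery is far more involved):

    import numpy as np

    # Map the FP32 range [-amax, amax] linearly onto the INT8 range [-127, 127]
    x = np.random.randn(8).astype(np.float32)
    scale = np.abs(x).max() / 127.0
    x_int8 = np.clip(np.round(x / scale), -127, 127).astype(np.int8)  # quantize
    x_deq = x_int8.astype(np.float32) * scale                         # dequantize
    print("max abs error:", np.abs(x - x_deq).max())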
II: ONNX
ONNX is a neural-network interchange format; models are serialized into binary form with Protobuf, which stores the data according to user-defined message structures.
1: The structure of ONNX
# The organizational structure inside an onnx file
# - ModelProto (describes the whole model)
# --- GraphProto (describes the whole network)
# ------ NodeProto      (describes each compute node, e.g. conv, linear)
# ------ TensorProto    (describes tensor data, mainly the weights)
# ------ ValueInfoProto (describes the input/output information)
# ------ AttributeProto (describes the attributes of each node)
The onnx protos themselves are defined in onnx.in.proto under https://github.com/onnx/onnx/tree/main/onnx.
2: Generating ONNX and using the netron tool
import torch
import torchvision
import onnxsim
import onnx
import argparse

# Use a model that already ships with torchvision
def get_model(type, dir):
    if type == "resnet":
        model = torchvision.models.resnet50()
        file = dir + "resnet50.onnx"
    elif type == "vgg":
        model = torchvision.models.vgg11()
        file = dir + "vgg11.onnx"
    elif type == "mobilenet":
        model = torchvision.models.mobilenet_v3_small()
        file = dir + "mobilenetV3.onnx"
    elif type == "efficientnet":
        model = torchvision.models.efficientnet_b0()
        file = dir + "efficientnetb0.onnx"
    elif type == "efficientnetv2":
        model = torchvision.models.efficientnet_v2_s()
        file = dir + "efficientnetV2.onnx"
    elif type == "regnet":
        model = torchvision.models.regnet_x_1_6gf()
        file = dir + "regnet1.6gf.onnx"
    else:
        raise ValueError("unknown model type: " + type)
    return model, file

def export_norm_onnx(model, file, input):
    model.cuda()
    torch.onnx.export(
        model, args=(input,), f=file,
        input_names=["input0"], output_names=["output0"],
        opset_version=15)
    print("Finished onnx export!")

    # Check the exported model, then simplify the graph with onnx-simplifier
    model_onnx = onnx.load(file)
    onnx.checker.check_model(model_onnx)
    model_onnx, check = onnxsim.simplify(model=model_onnx)
    assert check, "onnxsim simplify check failed"
    onnx.save(model_onnx, file)

def main(args):
    type = args.type
    dir = args.dir
    input = torch.rand(1, 3, 224, 224, device='cuda')
    model, file = get_model(type, dir)
    export_norm_onnx(model, file, input)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--type", type=str, default="resnet")
    parser.add_argument("-d", "--dir", type=str, default="../models/")
    opt = parser.parse_args()
    main(opt)
import torch
import torch.nn as nn
import torch.onnx

# Export a custom model
class Model(torch.nn.Module):
    def __init__(self, in_features, out_features, weights, bias=False):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)
        with torch.no_grad():
            self.linear.weight.copy_(weights)

    def forward(self, x):
        x = self.linear(x)
        return x

def infer():
    in_features = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
    weights = torch.tensor([
        [1, 2, 3, 4],
        [2, 3, 4, 5],
        [3, 4, 5, 6]
    ], dtype=torch.float32)

    model = Model(4, 3, weights)
    x = model(in_features)
    print("result of [1, 2, 3, 4] is ", x.data)

def export_onnx():
    input = torch.zeros(1, 1, 1, 4)
    weights = torch.tensor([
        [1, 2, 3, 4],
        [2, 3, 4, 5],
        [3, 4, 5, 6]
    ], dtype=torch.float32)

    model = Model(4, 3, weights)
    model.eval()  # switch to eval mode so the weights are frozen for export

    # torch.onnx.export takes many parameters; it also supports dynamic shapes
    torch.onnx.export(
        model=model,
        args=(input,),
        f="../models/example_dynamic_shape.onnx",
        input_names=["input0"],
        output_names=["output0"],
        dynamic_axes={
            'input0': {0: 'batch'},
            'output0': {0: 'batch'}
        },
        opset_version=12)
    print("Finished onnx export")

if __name__ == "____main__" or __name__ == "__main__":
    infer()
    export_onnx()
After exporting the onnx model, you can inspect it with netron ***.onnx.
3: Creating ONNX with onnx.helper (fairly low-level and rarely used directly; onnx_graphsurgeon is now the common choice)
The onnx.helper builder functions used below, from the innermost objects to the outermost:
onnx.helper.make_tensor
onnx.helper.make_tensor_value_info
onnx.helper.make_attribute
onnx.helper.make_node
onnx.helper.make_graph
onnx.helper.make_model
import onnx
from onnx import helper
from onnx import TensorProto

def create_onnx():
    # Create the ValueInfoProtos for the graph inputs and outputs
    a = helper.make_tensor_value_info('a', TensorProto.FLOAT, [10, 10])
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 10])
    b = helper.make_tensor_value_info('b', TensorProto.FLOAT, [10, 10])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [10, 10])

    # Create the NodeProtos (inputs and outputs must be lists of names)
    mul = helper.make_node('Mul', ['a', 'x'], ['c'], "multiply")
    add = helper.make_node('Add', ['c', 'b'], ['y'], "add")

    # Build the GraphProto: y = a * x + b
    graph = helper.make_graph([mul, add], 'sample-linear', [a, x, b], [y])

    # Build the ModelProto
    model = helper.make_model(graph)

    # Check the model for errors
    onnx.checker.check_model(model)
    print(model)

    # Save the model
    onnx.save(model, "../models/sample-linear.onnx")
    return model

if __name__ == "__main__":
    model = create_onnx()
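The saved graph computes y = a * x + b; opening sample-linear.onnx in netron shows the Mul and Add nodes chained between the three inputs and the output.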
4: Printing onnx information in a structured way
- Printing directly
import onnx

def main():
    model = onnx.load("../models/sample-convnet.onnx")
    onnx.checker.check_model(model)

    graph        = model.graph
    initializers = graph.initializer
    nodes        = graph.node
    inputs       = graph.input
    outputs      = graph.output

    print("\n**************parse input/output*****************")
    for input in inputs:
        input_shape = []
        for d in input.type.tensor_type.shape.dim:
            if d.dim_value == 0:  # dynamic dimension
                input_shape.append(None)
            else:
                input_shape.append(d.dim_value)
        print("Input info: \
                \n\tname: {} \
                \n\tdata Type: {} \
                \n\tshape: {}".format(input.name, input.type.tensor_type.elem_type, input_shape))

    for output in outputs:
        output_shape = []
        for d in output.type.tensor_type.shape.dim:
            if d.dim_value == 0:
                output_shape.append(None)
            else:
                output_shape.append(d.dim_value)
        print("Output info: \
                \n\tname: {} \
                \n\tdata Type: {} \
                \n\tshape: {}".format(output.name, output.type.tensor_type.elem_type, output_shape))

    print("\n**************parse node************************")
    for node in nodes:
        print("node info: \
                \n\tname: {} \
                \n\top_type: {} \
                \n\tinputs: {} \
                \n\toutputs: {}".format(node.name, node.op_type, node.input, node.output))

    print("\n**************parse initializer*****************")
    for initializer in initializers:
        print("initializer info: \
                \n\tname: {} \
                \n\tdata_type: {} \
                \n\tshape: {}".format(initializer.name, initializer.data_type, initializer.dims))

if __name__ == "__main__":
    main()
- Printing via a separate parser.py
parser.py:
import onnx
import numpy as np

# Note: the weights are stored as raw bytes, so to read them we have to
# reinterpret the buffer with the right dtype (float32 here)
def read_weight(initializer: onnx.TensorProto):
    shape = initializer.dims
    data = np.frombuffer(initializer.raw_data, dtype=np.float32).reshape(shape)
    print("\n**************parse weight data******************")
    print("initializer info: \
            \n\tname: {} \
            \n\tdata: \n{}".format(initializer.name, data))

def parse_onnx(model: onnx.ModelProto):
    graph        = model.graph
    initializers = graph.initializer
    nodes        = graph.node
    inputs       = graph.input
    outputs      = graph.output

    print("\n**************parse input/output*****************")
    for input in inputs:
        input_shape = []
        for d in input.type.tensor_type.shape.dim:
            if d.dim_value == 0:  # dynamic dimension
                input_shape.append(None)
            else:
                input_shape.append(d.dim_value)
        print("Input info: \
                \n\tname: {} \
                \n\tdata Type: {} \
                \n\tshape: {}".format(input.name, input.type.tensor_type.elem_type, input_shape))

    for output in outputs:
        output_shape = []
        for d in output.type.tensor_type.shape.dim:
            if d.dim_value == 0:
                output_shape.append(None)
            else:
                output_shape.append(d.dim_value)
        print("Output info: \
                \n\tname: {} \
                \n\tdata Type: {} \
                \n\tshape: {}".format(output.name, output.type.tensor_type.elem_type, output_shape))

    print("\n**************parse node************************")
    for node in nodes:
        print("node info: \
                \n\tname: {} \
                \n\top_type: {} \
                \n\tinputs: {} \
                \n\toutputs: {}".format(node.name, node.op_type, node.input, node.output))

    print("\n**************parse initializer*****************")
    for initializer in initializers:
        print("initializer info: \
                \n\tname: {} \
                \n\tdata_type: {} \
                \n\tshape: {}".format(initializer.name, initializer.data_type, initializer.dims))
Using parser.py:
import torch
import torch.nn as nn
import torch.onnx
import onnx
from parser import parse_onnx
from parser import read_weight

# A small Conv-BN-ReLU (CBR) block
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
        self.bn1   = nn.BatchNorm2d(num_features=16)
        self.act1  = nn.LeakyReLU()

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.act1(x)
        return x

def export_norm_onnx():
    input = torch.rand(1, 3, 5, 5)
    model = Model()
    model.eval()

    file = "../models/sample-cbr.onnx"
    torch.onnx.export(
        model=model,
        args=(input,),
        f=file,
        input_names=["input0"],
        output_names=["output0"],
        opset_version=15)
    print("Finished normal onnx export")

def main():
    export_norm_onnx()
    model = onnx.load_model("../models/sample-cbr.onnx")
    parse_onnx(model)

    # Dump the raw weight data of every initializer
    initializers = model.graph.initializer
    for item in initializers:
        read_weight(item)

if __name__ == "__main__":
    main()
5: Registering onnx operators
When an onnx export fails, the things to consider are:
- Changing the opset version
  - check whether the unsupported operator is supported in a newer opset
  - if you do not plan to build your own plugin, also check whether the operator is supported in onnx-tensorrt
  - documentation link
- Replacing the operator with a combination of other PyTorch operators
  - rewrite the computation using operators that onnx can recognize
- Registering the onnx operator with PyTorch
  - onnx may well support the operator; it may simply not be registered on the PyTorch side
- Modifying the onnx directly and creating a plugin
  - use onnx-graphsurgeon
  - generally done when accelerating particular operators
e.g. the asinh operator
asinh has been supported in onnx since opset version 9, yet the export cannot find it. The reason is that the operator has not been registered: no symbolic function maps PyTorch's aten::asinh to the onnx Asinh op.
import torch
import torch.onnx
import onnxruntime
from torch.onnx import register_custom_op_symbolic

# Create a symbolic function for asinh, which is what gets registered.
# The symbolic function calls g.op internally to add an Asinh operator to the
# onnx computation graph (g is the graph).
# Since we already know that Asinh is implemented in onnx, all we have to do
# is pass that op's name to g.op.
# The symbolic function's parameters must line up with PyTorch's asinh signature:
# def asinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
def asinh_register(g, input, *, out=None):
    return g.op("Asinh", input)

# Bind the symbolic function to PyTorch's asinh operator; this is what
# "registering an operator" means. asinh is implemented in a C++ namespace
# called aten. What is aten? aten ("a Tensor Library") is a C++ library that
# implements tensor operations.
register_custom_op_symbolic('aten::asinh', asinh_register, 12)  # register the symbolic function

# Two things that are easy to mix up here:
# 1. the first argument of register_custom_op_symbolic is the PyTorch operator name: aten::asinh
# 2. the first argument of g.op is the onnx operator name: Asinh

class Model(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        x = torch.asinh(x)
        return x

def validate_onnx():
    input = torch.rand(1, 5)

    # PyTorch inference
    model = Model()
    x = model(input)
    print("result is: ", x)

    # onnxruntime inference
    sess = onnxruntime.InferenceSession("../models/sample_asinh.onnx")
    x = sess.run(None, {'input0': input.numpy()})
    print("result is: ", x)

def export_onnx():
    input = torch.rand(1, 5)
    model = Model()
    model.eval()

    file = "../models/sample_asinh.onnx"
    torch.onnx.export(
        model=model, args=(input,), f=file,
        input_names=["input0"], output_names=["output0"],
        opset_version=12)  # must be 12, matching the version passed to register_custom_op_symbolic
    print("finish export onnx")

if __name__ == "__main__":
    export_onnx()
    validate_onnx()
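If the registration succeeded, the two results printed by validate_onnx (PyTorch and onnxruntime) should agree up to floating-point error.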
6: Creating ONNX with onnx_graphsurgeon (gs)
- onnx_graphsurgeon:
1: A tool for creating and modifying onnx; it can be installed from TensorRT/tools
2: Makes adding and modifying onnx nodes more convenient
3: Makes modifying subgraphs more convenient
4: Makes replacing operators more convenient
5: Under the hood it generally uses onnx.helper
- The IR in onnx_graphsurgeon (gs) has the following three structures:
Tensor: comes in two types
1: Variable: values that are not known until inference time
2: Constant: values that are known before inference, without running the model
Node: much like NodeProto in onnx, except that node attributes, which used to be stored as AttributeProto, are kept in a plain dict in gs
Graph: much like GraphProto in onnx
- Creating onnx with raw gs
import onnx
import onnx_graphsurgeon as gs
import numpy as np

def main() -> None:
    input = gs.Variable(
        name="input0",
        dtype=np.float32,
        shape=(1, 3, 224, 224))

    # Constants must be float32 to match the input dtype
    weight = gs.Constant(
        name="conv1.weight",
        values=np.random.randn(5, 3, 3, 3).astype(np.float32))

    bias = gs.Constant(
        name="conv1.bias",
        values=np.random.randn(5).astype(np.float32))

    output = gs.Variable(
        name="output0",
        dtype=np.float32,
        shape=(1, 5, 224, 224))

    node = gs.Node(
        op="Conv",
        inputs=[input, weight, bias],
        outputs=[output],
        attrs={"pads": [1, 1, 1, 1]})

    # These inputs/outputs are the inputs and outputs of the whole network
    graph = gs.Graph(
        nodes=[node],
        inputs=[input],
        outputs=[output])

    model = gs.export_onnx(graph)
    onnx.save(model, "../models/sample-conv.onnx")

if __name__ == "__main__":
    main()
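Opening sample-conv.onnx in netron should show a single Conv node carrying the pads attribute, with conv1.weight and conv1.bias stored as initializers.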
- Creating onnx through registered custom functions
import onnx_graphsurgeon as gs
import numpy as np
import onnx

##################### Functions registered on the graph ########################
@gs.Graph.register()  # register the function
def add(self, a, b):
    return self.layer(op="Add", inputs=[a, b], outputs=["add_out_gs"])

@gs.Graph.register()
def mul(self, a, b):
    return self.layer(op="Mul", inputs=[a, b], outputs=["mul_out_gs"])

@gs.Graph.register()
def gemm(self, a, b, trans_a=False, trans_b=False):
    attrs = {"transA": int(trans_a), "transB": int(trans_b)}  # attributes are kept in a dict
    return self.layer(op="Gemm", inputs=[a, b], outputs=["gemm_out_gs"], attrs=attrs)

@gs.Graph.register()
def relu(self, a):
    return self.layer(op="Relu", inputs=[a], outputs=["act_out_gs"])

##################### Build the network using the registered functions ########################
#  input (64, 64)
#    |
#   gemm (constant tensor A(64, 32))
#    |
#   add  (constant tensor B(64, 32))
#    |
#   relu
#    |
#   mul  (constant tensor C(64, 32))
#    |
#   add  (constant tensor D(64, 32))

# Initialize the graph's opset
graph = gs.Graph(opset=12)

# Initialize the constant tensors the network needs (float32 to match the input)
consA = gs.Constant(name="consA", values=np.random.randn(64, 32).astype(np.float32))
consB = gs.Constant(name="consB", values=np.random.randn(64, 32).astype(np.float32))
consC = gs.Constant(name="consC", values=np.random.randn(64, 32).astype(np.float32))
consD = gs.Constant(name="consD", values=np.random.randn(64, 32).astype(np.float32))
input0 = gs.Variable(name="input0", dtype=np.float32, shape=(64, 64))

# Design the network architecture
gemm0 = graph.gemm(input0, consA)  # (64, 64) x (64, 32) -> (64, 32); transB would break the shapes here
relu0 = graph.relu(*graph.add(*gemm0, consB))
mul0 = graph.mul(*relu0, consC)
output0 = graph.add(*mul0, consD)

# Set the inputs/outputs of the whole graph
graph.inputs = [input0]
graph.outputs = output0
for out in graph.outputs:
    out.dtype = np.float32

# Save the model
onnx.save(gs.export_onnx(graph), "../models/sample-complicated-graph.onnx")