First, note that ONNX is used to optimize the inference process, not the training process; in this walkthrough that means running the computation on the CPU.
1. Export and save the model structure
import os

import torch

# model_path, length, config, opset_version and export_model_path are assumed
# to be defined earlier in the surrounding script.
model = torch.load(model_path).to(torch.device("cuda"))
model.eval()
# Build a dummy input of the target sequence length to trace the graph.
if length == 32:
    data = [[[2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 20, 3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]],
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]],
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]]]
else:
    data = [[[2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20,
3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130,
5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16,
2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16,
36, 130, 5605, 458]],
[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ]],
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]
inputs = {
    'input_ids': torch.tensor(data[0]).to(config.device),
    'input_masks': torch.tensor(data[1]).to(config.device),
    'segment_ids': torch.tensor(data[2]).to(config.device)
}
# "if True or ..." forces a re-export on every run; drop the "True or" to reuse an existing file.
if True or not os.path.exists(export_model_path):
    with torch.no_grad():
        symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
        torch.onnx.export(model,                            # model being run
                          args=tuple(inputs.values()),      # model input (or a tuple for multiple inputs)
                          f=export_model_path,              # where to save the model (can be a file or file-like object)
                          opset_version=opset_version,      # the ONNX version to export the model to
                          do_constant_folding=True,         # whether to execute constant folding for optimization
                          input_names=['input_ids',         # the model's input names
                                       'input_masks',
                                       'segment_ids'],
                          output_names=['predict'],         # the model's output names
                          dynamic_axes={'input_ids': symbolic_names,  # variable-length axes
                                        'input_masks': symbolic_names,
                                        'segment_ids': symbolic_names,
                                        'predict': symbolic_names})
    print("Model exported at ", export_model_path)
2. Optimize the graph with the onnx module and save the optimized model
import onnx
import onnxoptimizer
print('&&&load export_model_path = '+str(export_model_path))
original_model = onnx.load(export_model_path)
print('The model before optimization:\n\n{}'.format(onnx.helper.printable_graph(original_model.graph)))
# A full list of supported optimization passes can be found using get_available_passes()
all_passes = onnxoptimizer.get_available_passes()
print("Available optimization passes:")
for p in all_passes:
    print('\t{}'.format(p))
print()
r"""
Available optimization passes:
eliminate_deadend
eliminate_duplicate_initializer
eliminate_identity
eliminate_if_with_const_cond
eliminate_nop_cast
eliminate_nop_dropout
eliminate_nop_flatten
eliminate_nop_monotone_argmax
eliminate_nop_pad
eliminate_nop_transpose
eliminate_unused_initializer
extract_constant_to_initializer
fuse_add_bias_into_conv
fuse_bn_into_conv
fuse_consecutive_concats
fuse_consecutive_log_softmax
fuse_consecutive_reduce_unsqueeze
fuse_consecutive_squeezes
fuse_consecutive_transposes
fuse_matmul_add_bias_into_gemm
fuse_pad_into_conv
fuse_transpose_into_gemm
lift_lexical_references
nop
split_init
split_predict
"""
#from transformers.convert_graph_to_onnx import convert
#convert(framework="pt",model=model_path,output=
passes = ['fuse_add_bias_into_conv']
optimized_model = onnxoptimizer.optimize(original_model, passes)
print('The model after optimization:\n\n{}'.format(onnx.helper.printable_graph(optimized_model.graph)))
# save new model
onnx.save(optimized_model, optimized_model_path)
torch.cuda.empty_cache()
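onnxoptimizer applies generic graph passes; for transformer models, onnxruntime also ships an offline optimizer that fuses BERT-specific subgraphs (attention, LayerNorm, GELU). Below is a minimal sketch, assuming a standard 12-head, 768-hidden BERT and a recent onnxruntime build; the module location and arguments may differ between versions, and the output path is a hypothetical example.

from onnxruntime.transformers import optimizer as ort_optimizer

# Fuse BERT-specific subgraphs offline; num_heads/hidden_size must match the exported model.
ort_model = ort_optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)
ort_model.save_model_to_file("onnx/model.ort_opt.onnx")  # hypothetical output path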
3. Run prediction with the optimized model
(load the ONNX model on the CPU and run inference)
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
from transformers import BertTokenizerFast

def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"
    # Few properties that might have an impact on performances (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
tokenizer = BertTokenizerFast.from_pretrained("/home/data/pretrain_models/bert-base-chinese-pytorch")  # use the vocabulary of the PyTorch model
cpu_model = create_model_for_provider("onnx/bert-base-chinese.opt.onnx", "CPUExecutionProvider")  # use the optimized ONNX model
# Inputs are provided through numpy array
model_inputs = tokenizer("大家好, 我是卖切糕的小男孩, 毕业于华中科技大学", return_tensors="pt")
inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}
# Run the model (None = get all the outputs)
sequence, pooled = cpu_model.run(None, inputs_onnx)
# Print information about outputs
print(f"Sequence output: {sequence.shape}, Pooled output: {pooled.shape}")