First, note that ONNX is used to optimize the inference process, not the training process; in this walkthrough that means running the computation on the CPU.
1. Export and save the model structure
import os

import torch

# model_path, length, config, opset_version and export_model_path are assumed
# to be defined earlier in the surrounding script.
model = torch.load(model_path).to(torch.device("cuda"))
model.eval()
# Build a dummy input of the target sequence length to trace the graph.
if length == 32:
    data = [[[2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 20, 3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]],
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]],
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]]]
else:
    data = [[[2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20,
3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130,
5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16,
2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16, 36, 130, 5605, 458, 2, 16, 2874, 20, 3, 16,
36, 130, 5605, 458]],
[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ]],
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]
inputs = {
    'input_ids': torch.tensor(data[0]).to(config.device),
    'input_masks': torch.tensor(data[1]).to(config.device),
    'segment_ids': torch.tensor(data[2]).to(config.device)
}
# "if True or ..." forces a re-export on every run; drop the "True or" to reuse an existing file.
if True or not os.path.exists(export_model_path):
    with torch.no_grad():
        symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
        torch.onnx.export(model,                            # model being run
                          args=tuple(inputs.values()),      # model input (or a tuple for multiple inputs)
                          f=export_model_path,              # where to save the model (can be a file or file-like object)
                          opset_version=opset_version,      # the ONNX version to export the model to
                          do_constant_folding=True,         # whether to execute constant folding for optimization
                          input_names=['input_ids',         # the model's input names
                                       'input_masks',
                                       'segment_ids'],
                          output_names=['predict'],         # the model's output names
                          dynamic_axes={'input_ids': symbolic_names,  # variable-length axes
                                        'input_masks': symbolic_names,
                                        'segment_ids': symbolic_names,
                                        'predict': symbolic_names})
    print("Model exported at ", export_model_path)
2. Optimize the graph with the onnx module and save the optimized model
import onnx
import onnxoptimizer
print('&&&load export_model_path = '+str(export_model_path))
original_model = onnx.load(export_model_path)
print('The model before optimization:\n\n{}'.format(onnx.helper.printable_graph(original_model.graph)))
# A full list of supported optimization passes can be found using get_available_passes()
all_passes = onnxoptimizer.get_available_passes()
print("Available optimization passes:")
for p in all_passes:
    print('\t{}'.format(p))
print()
r"""
Available optimization passes:
eliminate_deadend
eliminate_duplicate_initializer
eliminate_identity
eliminate_if_with_const_cond
eliminate_nop_cast
eliminate_nop_dropout
eliminate_nop_flatten
eliminate_nop_monotone_argmax
eliminate_nop_pad
eliminate_nop_transpose
eliminate_unused_initializer
extract_constant_to_initializer
fuse_add_bias_into_conv
fuse_bn_into_conv
fuse_consecutive_concats
fuse_consecutive_log_softmax
fuse_consecutive_reduce_unsqueeze
fuse_consecutive_squeezes
fuse_consecutive_transposes
fuse_matmul_add_bias_into_gemm
fuse_pad_into_conv
fuse_transpose_into_gemm
lift_lexical_references
nop
split_init
split_predict
"""
#from transformers.convert_graph_to_onnx import convert
#convert(framework="pt",model=model_path,output=
passes = ['fuse_add_bias_into_conv']
optimized_model = onnxoptimizer.optimize(original_model, passes)
print('The model after optimization:\n\n{}'.format(onnx.helper.printable_graph(optimized_model.graph)))
# save new model
onnx.save(optimized_model, optimized_model_path)
torch.cuda.empty_cache()
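onnxoptimizer applies generic graph passes; for transformer models, onnxruntime also ships an offline optimizer that fuses BERT-specific subgraphs (attention, LayerNorm, GELU). Below is a minimal sketch, assuming a standard 12-head, 768-hidden BERT and a recent onnxruntime build; the module location and arguments may differ between versions, and the output path is a hypothetical example.

from onnxruntime.transformers import optimizer as ort_optimizer

# Fuse BERT-specific subgraphs offline; num_heads/hidden_size must match the exported model.
ort_model = ort_optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)
ort_model.save_model_to_file("onnx/model.ort_opt.onnx")  # hypothetical output path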
3. Run prediction with the optimized model
(load the ONNX model on the CPU and run inference)
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
from transformers import BertTokenizerFast

def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"
    # Few properties that might have an impact on performances (provided by MS)
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    # Load the model as a graph and prepare the CPU backend
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
tokenizer = BertTokenizerFast.from_pretrained("/home/data/pretrain_models/bert-base-chinese-pytorch")  # use the vocabulary of the PyTorch model
cpu_model = create_model_for_provider("onnx/bert-base-chinese.opt.onnx", "CPUExecutionProvider")  # use the optimized ONNX model
# Inputs are provided through numpy array
model_inputs = tokenizer("大家好, 我是卖切糕的小男孩, 毕业于华中科技大学", return_tensors="pt")
inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}
# Run the model (None = get all the outputs)
sequence, pooled = cpu_model.run(None, inputs_onnx)
# Print information about outputs
print(f"Sequence output: {sequence.shape}, Pooled output: {pooled.shape}")