#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
onnxruntime-gpu Version: 1.13.1
transformers Version: 4.21.3
torch Version: 1.13.0a0+08820cb
'''
import torch
from transformers import AutoModel
model_path = "/home/zhonglongshen/nlp/model/bert-base-chinese"
MODEL_ONNX_PATH = "raw_bert_dynamic.onnx"
OPERATOR_EXPORT_TYPE = torch.onnx.OperatorExportTypes.ONNX  # public alias of torch._C._onnx.OperatorExportTypes.ONNX
model = AutoModel.from_pretrained(model_path)
model.eval()
def make_train_dummy_input(seq_len):
    # The tuple order must match BertModel.forward's positional arguments:
    # (input_ids, attention_mask, token_type_ids)
    org_input_ids = torch.LongTensor([[i for i in range(seq_len)]])
    org_input_mask = torch.LongTensor(
        [[0 for i in range(int(seq_len / 2))] + [1 for i in range(seq_len - int(seq_len / 2))]])
    org_token_type_ids = torch.LongTensor([[1 for i in range(seq_len)]])
    return (org_input_ids, org_input_mask, org_token_type_ids)
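# For reference (an illustration, not in the original post), the dummy tensors
# returned for seq_len=8 look like:
#   input_ids      -> tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
#   attention_mask -> tensor([[0, 0, 0, 0, 1, 1, 1, 1]])
#   token_type_ids -> tensor([[1, 1, 1, 1, 1, 1, 1, 1]])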
def pytorch_2_onnx():
    seq_len = 16
    org_dummy_input = make_train_dummy_input(seq_len)
    # torch.onnx.export returns None; the model is written to MODEL_ONNX_PATH
    torch.onnx.export(model,
                      org_dummy_input,
                      MODEL_ONNX_PATH,
                      verbose=True,
                      operator_export_type=OPERATOR_EXPORT_TYPE,
                      opset_version=12,
                      input_names=['input_ids', 'attention_mask', 'token_type_ids'],  # Mind the order! It must match forward()'s positional arguments, or the results will not match expectations
                      output_names=['last_hidden_state', 'pooler_output'],  # Mind the order here too, or the wrong output name may be used at inference time
                      do_constant_folding=True,
                      dynamic_axes={"input_ids": {0: "batch_size", 1: "length"},
                                    "token_type_ids": {0: "batch_size", 1: "length"},
                                    "attention_mask": {0: "batch_size", 1: "length"},
                                    "pooler_output": {0: "batch_size"},
                                    "last_hidden_state": {0: "batch_size"}}
                      )
    print("Export of {} complete!".format(MODEL_ONNX_PATH))
def pytorch_2_torchscript():
    from transformers import BertModel, BertTokenizer, BertConfig
    device = "cuda"
    is_use_gpu = False
    model_path = "/share/model_zoo/bert-base-chinese/"
    test_text = ["我是卖铁观音的小男孩,毕业于华中科技大学"]
    tokenizer = BertTokenizer.from_pretrained(model_path)
    inputs = tokenizer(test_text, max_length=32, return_tensors="pt")
    # torchscript=True makes the model return plain tuples, which tracing requires
    config = BertConfig.from_pretrained(model_path, cache_dir=None, torchscript=True, use_cache=False)
    model = BertModel.from_pretrained(model_path, config=config, cache_dir=None)
    if is_use_gpu:
        model = model.to(device)
        inputs = inputs.to(device)
    model.eval()
    print("inputs=", inputs)
    temp = (inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'])
    traced_model = torch.jit.trace(model, temp)
    if is_use_gpu:
        torch.jit.save(traced_model, "traced_bert_gpu.pt")
    else:
        torch.jit.save(traced_model, "traced_bert_cpu.pt")
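# Loading the traced model back is symmetric (a minimal sketch, not part of the
# original post; the file name assumes the CPU branch above was taken). Because
# the config set torchscript=True, the ScriptModule returns a plain tuple
# (last_hidden_state, pooler_output) instead of a ModelOutput.
def load_torchscript():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("/share/model_zoo/bert-base-chinese/")
    loaded = torch.jit.load("traced_bert_cpu.pt")
    loaded.eval()
    inputs = tokenizer(["我是卖铁观音的小男孩"], return_tensors="pt")
    with torch.no_grad():
        last_hidden_state, pooler_output = loaded(inputs['input_ids'],
                                                  inputs['attention_mask'],
                                                  inputs['token_type_ids'])
    print("pooler_output shape:", pooler_output.shape)  # (1, 768) for bert-base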
def check_onnx():
    import time
    from tqdm import tqdm
    from os import environ
    from psutil import cpu_count
    # Constants for the performance optimization available in onnxruntime.
    # They must be set before onnxruntime is imported.
    environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
    environ["OMP_WAIT_POLICY"] = 'ACTIVE'
    from transformers import AutoTokenizer
    from onnxruntime import InferenceSession
    import onnxruntime as ort
    print("onnxruntime device=", ort.get_device())
    is_use_gpu = True
    max_len = 64
    run_max_times = 1000
    is_input_length_padding = False  # whether to actually pad every input to max_len
    test_text_batch_size = 1
    # Original PyTorch model directory; here it is only needed for the vocabulary
    tokenizer = AutoTokenizer.from_pretrained("/home/zhonglongshen/nlp/model/bert-base-chinese")
    test_text = ["我是卖铁观音的小男孩"] * test_text_batch_size
    model_file = "raw_bert_dynamic.onnx"
    if is_use_gpu:
        session = InferenceSession(model_file, providers=['CUDAExecutionProvider'])
        print("Use onnxruntime-GPU")
    else:
        session = InferenceSession(model_file, providers=['CPUExecutionProvider'])
        print("Use onnxruntime-CPU")
    # ONNX Runtime expects NumPy arrays as input
    if is_input_length_padding:
        inputs = tokenizer(test_text, padding='max_length', truncation=True, max_length=max_len, return_tensors="np")
    else:
        inputs = tokenizer(test_text, padding=True, truncation=True, max_length=max_len, return_tensors="np")
    # Warmup: the first runs include one-off initialization (CUDA kernels, memory
    # arenas), so run a few inferences before timing
    for i in range(10):
        session.run(output_names=["pooler_output"], input_feed=dict(inputs))
    onnx_cost_time = []
    for i in tqdm(range(run_max_times)):
        start_time = time.time()
        outputs = session.run(output_names=["pooler_output"], input_feed=dict(inputs))
        end_time = time.time()
        onnx_cost_time.append(end_time - start_time)
        # print(outputs[0][0].tolist())  # printing inside the timed region would skew the benchmark
    inference_device = "GPU" if is_use_gpu else "CPU"
    print("onnx-{} infer time: {:.4f}".format(inference_device, sum(onnx_cost_time) / len(onnx_cost_time)))
check_onnx()
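# Because the export declared batch_size and length as dynamic axes, a single
# session accepts inputs of any shape. A minimal sketch (not in the original
# post; the texts are illustrative) feeding two different batch sizes and
# sequence lengths through the same raw_bert_dynamic.onnx file:
def check_dynamic_axes():
    from transformers import AutoTokenizer
    from onnxruntime import InferenceSession
    tokenizer = AutoTokenizer.from_pretrained("/home/zhonglongshen/nlp/model/bert-base-chinese")
    session = InferenceSession("raw_bert_dynamic.onnx", providers=['CPUExecutionProvider'])
    for texts in (["短句"], ["我是卖铁观音的小男孩", "毕业于华中科技大学"]):
        inputs = tokenizer(texts, padding=True, return_tensors="np")
        outputs = session.run(["last_hidden_state", "pooler_output"], dict(inputs))
        # last_hidden_state is (batch_size, length, hidden); pooler_output is (batch_size, hidden)
        print(outputs[0].shape, outputs[1].shape)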