1. Model acceleration principles
The original training frameworks (PyTorch, TensorFlow) are relatively heavyweight and do not make full use of the GPU's compute power at inference time, so faster inference frameworks are used instead, e.g. ONNX Runtime (open-sourced by Microsoft) and TensorRT (from NVIDIA);
2. Environment preparation:
Machine: Alibaba Cloud machine #5; all directories mentioned below refer to paths on that machine;
GPU: must be a V100 (32 GB); it is not confirmed whether the 16 GB variant is compatible;
CUDA: must be 10.2; different CUDA versions behave differently, and CUDA 10.2 additionally needs two official patches applied;
onnx: clone the latest from GitHub; already done, at /root/myname/onnx
onnxruntime: clone the latest from GitHub; already done, at /root/myname/onnxruntime
TensorRT (trt): must be TensorRT-7.2.3.4. Note that TRT consists of two parts: the closed-source operator library provided on the NVIDIA website, shipped mainly as .so libraries, and an open-source GitHub repo containing the various plugins, whose implementations call into the underlying operators. Both are already downloaded: operator library directory: /trt/TensorRT-7.2.3.4 ; plugin repo: /root/myname/TensorRT (the Python interface does not need the plugins);
Other libraries (PyTorch, TensorFlow, etc.) just need to be compatible with the CUDA version;
3. Environment installation:
Python 3: the system's native Python 3.6.9 is used; keep this in mind, as it is easily shadowed by conda's python3;
CUDA 10.2 and the GPU driver are already installed on Alibaba Cloud machine #5; when going to production, ops must match these versions exactly, otherwise unknown problems become very hard to track down;
onnx: installed from source. Go into the onnx directory /root/lixianku/onnx and run python3 setup.py install; no errors means the install succeeded, otherwise search the related issues on the onnx GitHub repo. After installing, verify that onnx imports correctly, and be sure to change out of the onnx source directory first; otherwise the import fails because the directory name is identical to the package name, so Python picks up files from the directory instead of the installed package (see the verification sketch at the end of this list);
onnxruntime: installed from source. Go into the onnxruntime directory /root/lixianku/onnxruntime and run python3 setup.py install;
tensorflow-onnx installation:
tensorrt: first make the TensorRT operator library visible to the system by editing ~/.bashrc and adding its lib path to LD_LIBRARY_PATH; then go into /trt/TensorRT-7.2.3.4/python, which contains TRT wheel files for the different Python versions, and install the one matching your Python version, e.g. for Python 3.6: pip install tensorrt-7.2.3.4-cp36-none-linux_x86_64.whl;
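To check that the whole stack is installed correctly, a minimal verification like the following can be run (a sketch only; run it from outside the onnx source directory, and it assumes the wheels above are installed and that LD_LIBRARY_PATH includes the TensorRT lib directory, typically /trt/TensorRT-7.2.3.4/lib):
import onnx
import onnxruntime as ort
import tensorrt as trt
# all three imports must succeed; the version strings confirm the expected builds
print("onnx:", onnx.__version__)
print("onnxruntime:", ort.__version__, "providers:", ort.get_available_providers())
print("tensorrt:", trt.__version__)  # expect 7.2.3.4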
4. Model conversion
1) ONNX model export
PyTorch to ONNX: code at /root/myname/BERT-NER-Pytorch/trt.py (the export function is shown below);
TensorFlow models are first converted to PyTorch and then to ONNX; a hedged sketch of the TF-to-PyTorch step follows.
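The TF-to-PyTorch step is not included in the repo; for BERT-style checkpoints one common route is loading the TF weights through HuggingFace transformers (a hedged sketch; the paths and model class below are placeholders, not the project's actual code):
from transformers import BertForTokenClassification
# from_tf=True loads the weights from a TensorFlow checkpoint in the given directory (placeholder path)
model = BertForTokenClassification.from_pretrained("path/to/tf_bert_checkpoint", from_tf=True)
model.save_pretrained("path/to/pytorch_bert")  # the resulting PyTorch model can then be exported to ONNX as below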
def convert_onnx():
model=NER()
model.load_state_dict(torch.load("outputs/faq_output/bert/checkpoint-592/bert.pt"))
device = "cpu"
model.to(device)
input_ids=torch.ones([1,512],dtype=torch.long)
# index=torch.ones([1,512],dtype=torch.int32)
model(input_ids)
onnx_save_name="outputs/faq_output/bert/checkpoint-592/bert.onnx"
torch_onnx_out = torch.onnx.export(model, (input_ids), onnx_save_name,
export_params=True,
verbose=True,
input_names=['input'],
output_names=["output"],
opset_version=12,
keep_initializers_as_inputs=True,
do_constant_folding=True,dynamic_axes={
'input':{1:'text_length'},
'output':{1:"text_length"}},
operator_export_type = torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH)
# onnx_save_name=os.path.join(onnx_path,'checkpoint_best.onnx')
# torch_onnx_out = torch.onnx.export(model, (input_ids,index), onnx_save_name,
# export_params=True,
# verbose=True,
# input_names=['input','index'],
# output_names=["output"],
# opset_version=12,
# keep_initializers_as_inputs=True,
# do_constant_folding=True,
# operator_export_type = torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH)
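Before running full inference, a quick structural check of the exported file can catch export problems early (a small sketch; the path matches the export call above):
import onnx
onnx_model = onnx.load("outputs/faq_output/bert/checkpoint-592/bert.onnx")
onnx.checker.check_model(onnx_model)  # raises if the exported graph is malformed
print([inp.name for inp in onnx_model.graph.input])  # expect ['input']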
2) ONNX model testing: once the conversion succeeds, test the model and check that its accuracy matches the PyTorch model:
Key points: 1) build the graph by creating an InferenceSession; 2) feed the input data and call onnx_session.run to run inference on the graph;
def test_onnx():
pred_output_dir="predict2"
test_file='test_metric_data/test.char.bmes.dev'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = FaqProcessor()
label_list = processor.get_labels()
num_labels = len(label_list)
onnx_session = ort.InferenceSession("outputs/faq_output/bert/checkpoint-592/bert.onnx")
crf=CRF(num_labels, batch_first=True)
crf.load_state_dict(torch.load("outputs/faq_output/bert/checkpoint-592/crf.pt"))
id2label = {i: label for i, label in enumerate(label_list)}
markup = 'bios'
crf.to(device)
if not os.path.exists(pred_output_dir):
os.makedirs(pred_output_dir)
test_dataset, all_input_text = load_and_cache_examples()
# Note that DistributedSampler samples randomly
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
results = []
output_predict_file = os.path.join(pred_output_dir, "test_prediction.json")
pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
bt=0
ct=0
for step, batch in enumerate(test_dataloader):
batch = tuple(t.to(device) for t in batch)
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None}
input_feed={'input':inputs['input_ids'].cpu().numpy()}
t1=time.time()
outputs=onnx_session.run(['output'], input_feed=input_feed)
t2=time.time()
with torch.no_grad():
tags = crf.decode(torch.from_numpy(outputs[0]).to("cuda"), inputs['attention_mask'])
t3=time.time()
tags = tags.squeeze(0).cpu().numpy().tolist()
bt+=(t2-t1)
ct+=(t3-t2)
preds = tags[0][1:-1] # [CLS]XXXX[SEP]
label_entities = get_entities(preds, id2label, markup)
json_d = {}
json_d['id'] = step
json_d['text'] = all_input_text[step]
json_d['tag_seq'] = " ".join([id2label[x] for x in preds])
json_d['entities'] = label_entities
results.append(json_d)
pbar(step)
print("\n")
with open(output_predict_file, "w") as writer:
for record in results:
writer.write(json.dumps(record, ensure_ascii=False) + '\n')
compute_test_metric(test_file, output_predict_file)
print("bt:{}".format(bt/3563))
print("ct:{}".format(ct/3563))
3) TensorRT model conversion, i.e. converting the ONNX model into a TRT engine; code at /root/myname/intent_classify_model/src/trt.py
The code in this file follows the C++-style TRT workflow: compute the buffer sizes, allocate memory, copy data from CPU to GPU, run the computation on the GPU, and copy the results back to the CPU (this part is implemented by allocate_buffers2 / do_inference_v2, used in the test code of step 4);
def convert_trt():
# name="checkpoint_best_mean"
logger=trt.Logger(trt.Logger.VERBOSE)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(logger) as builder,builder.create_network(EXPLICIT_BATCH) as network,trt.OnnxParser(network,logger) as parser:
builder.max_batch_size = 4
config = builder.create_builder_config()
config.max_workspace_size = GiB(2)
# config.set_flag(trt.BuilderFlag.FP16)
with open("/root/lixianku/BERT-NER-Pytorch/outputs/faq_output/bert/checkpoint-592/bert.onnx") as model:
print('Beginning ONNX file parsing')
if not parser.parse(model.read()):
for error in range(parser.num_errors):
print (parser.get_error(error))
return None
profile = builder.create_optimization_profile()
# input = network.get_input(0)
# print(input.shape)
profile.set_shape("input" ,(1, 1), (1, 32), (1, 512))
config.add_optimization_profile(profile)
#config.flags = 1 << int(trt.BuilderFlag.FP16)
# config.max_workspace_size = 1 << 30
# for i in range(network.num_layers):
# layer=network.get_layer(i)
# print(layer)
engine = builder.build_engine(network, config)
# print("Completed creating a dynamic Engine")
print("engine:", engine)
with open("/root/lixianku/BERT-NER-Pytorch/outputs/faq_output/bert/checkpoint-592/bert.trt", "wb",encoding='utf-8') as f:
f.write(engine.serialize())
4) TensorRT model testing: check that accuracy matches the PyTorch and ONNX models, and measure the speed-up;
once the results are aligned, the whole conversion-and-acceleration pipeline is done;
def test_trt(model_path, test_path):
filename=test_path
tokenizer = BertTokenizer.from_pretrained("roberta_chinese_wwm_ext_large")
count=0
engine = load_engine(model_path, verbose=True)
print(engine.get_binding_shape(0),engine.get_binding_dtype(0))
print(engine.get_binding_shape(1),engine.get_binding_dtype(1))
true=[]
pred=[]
with engine.create_execution_context() as context:
context.active_optimization_profile = 0
s1=time.time()
for line in open(filename,'r'):
print(line)
text=line.split(",")[0].split(":")[1].replace("\"","").replace(" ","")
label=line.split(",")[1].split(":")[1][:-2].replace("\"","").replace(" ","")
if "\\u" in text:
text=text.encode('utf-8').decode('unicode_escape')
if "\\u" in label:
label=label.encode('utf-8').decode('unicode_escape')
query = text[:510] #truncate
inputs = tokenizer.encode_plus(query)
input_ids = np.expand_dims(np.array(inputs['input_ids']),0).astype(np.int32)
st=time.time()
print("query length:{}".format(input_ids.shape[1]))
print(context.set_binding_shape(0, input_ids.shape))
inputs, outputs, bindings, stream=allocate_buffers2(engine, context)
inputs[0].host = input_ids
trt_outputs =do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
print("time cost:{}".format(time.time()-st))
id=np.argmax(trt_outputs[0])
pred.append(id)
true.append(l2i[label])
if id==l2i[label]:
count+=1
print(classification_report(true,pred,target_names=kl))
# print("#######################################")
print("#######ACC:{}########".format(count/10892))
print("#######AVE TIME:{}########".format((time.time()-s1)/10892))
5) The overall flow is written at the bottom of /root/myname/intent_classify_model/src/trt.py;
run the specific conversion and testing steps from there as needed;
5. Deployment images:
ONNX image: onnx:v1
TRT image: trt:v14
The full trt.py script referenced above follows for reference:
import re
import sys
import os
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8')
import pycuda.driver as cuda
import pycuda.autoinit
import time
import tensorrt as trt
from torch._C import dtype
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
sys.path.append(os.getcwd())
import numpy as np
#import pandas as pd
from sklearn import metrics
import torch
from torch.utils.data import DataLoader
import random
import torch.nn as nn
# from transformers.modeling_bert import *
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score as accuracy_score
from src.utils import init_dir, OutputDictionary
from src.model2 import SimpleRobertaConfig
from src.model2 import SimpleRoberta
from src.dataset import QueryDataset
from src.model2 import model_name_to_cls_dict, model_name_to_config_cls_dict
from sklearn.metrics import classification_report
import json
import onnxruntime as ort
#import onnx_graphsurgeon as gs
from transformers import BertTokenizer
import onnx
kl=[
"外语",
"中文字词成语",
"功能",
"闲聊",
"诗歌",
"百科",
"知识点",
"素材",
"计算",
]
l2i={k:i for i,k in enumerate(kl)}
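# Pairs a page-locked host buffer with its matching device buffer, so each engine binding's data can be staged for async host<->device copies.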
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
def load_json_file(file_path):
return json.load(open(file_path, "r", encoding="utf-8"))
def set_output_dicitionary(model,output_vocab_path):
output_dictionary = model.output_dictionary
if not output_dictionary:
output_dictionary = OutputDictionary(output_vocab_path)
return output_dictionary
def test_torch(model_path, test_path):
model_config_path = "model_save/model.config"
output_vocab_path = "data/DL/20210727/out_vocab.txt"
device = "cuda"
model = get_model(model_config_path, model_path, device)
model.to('cuda')
filename=test_path
label_list={
"功能":0,"计算":0,"中文字词成语":0,"闲聊":0,"外语":0,"知识点":0,"诗歌":0,"素材":0,"百科":0
}
pred_list={
"功能":0,"计算":0,"中文字词成语":0,"闲聊":0,"外语":0,"知识点":0,"诗歌":0,"素材":0,"百科":0
}
pred=[]
true=[]
s1=time.time()
for line in open(filename,'r'):
text=line.split(",")[0].split(":")[1].replace("\"","").replace(" ","")
if "\\u" in text:
text=text.encode('utf-8').decode('unicode_escape')
print(text)
label=line.split(",")[1].split(":")[1][:-2].replace("\"","").replace(" ","")
if "\\u" in label:
label=label.encode('utf-8').decode('unicode_escape')
query = text[:510] #truncate
inputs = model.tokenizer.encode_plus(query)
input_ids = torch.tensor([inputs['input_ids']], dtype=torch.long, device=device)
print("query length:{}".format(input_ids.size(1)))
attention_mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long, device=device)
with torch.no_grad():
logits = model(input_ids)
probs = F.softmax(logits, dim=-1)
index_tensor = torch.argsort(logits, descending=True, dim=-1)
probs_list = probs.tolist()[0]
index_list = index_tensor.tolist()[0]
ret_list = []
for _ind in index_list:
ret_list.append(dict(cls=set_output_dicitionary(model,output_vocab_path).id2label(_ind), prob=probs_list[_ind]))
label_list[label]+=1
pred.append(l2i[ret_list[0]['cls']])
true.append(l2i[label])
if ret_list[0]['cls']==label:
pred_list[ret_list[0]['cls']]+=1
print("+"*30)
print(label_list)
print(pred_list)
num=0
for k,v in pred_list.items():
num+=v
print("######ACC#####:{}".format(num/10892))
print(classification_report(true,pred,target_names=kl))
print("#######AVE TIME:{}########".format((time.time()-s1)/10892))
def get_model(config_path, model_path, device):
_config = load_json_file(config_path)
model_type = _config["model_type"]
ModelConfigCls = model_name_to_config_cls_dict.get(model_type)
ModelCls = model_name_to_cls_dict.get(model_type)
model_config = ModelConfigCls.load_from_json_file(config_path)
model = ModelCls(model_config)
model = nn.DataParallel(model)
load_model_from_checkpoint(model, model_path, device)
model = model.module
return model
def load_model_from_checkpoint(model, checkpoint_path, device):
if device == "cpu":
print("here")
checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
else:
checkpoint = torch.load(checkpoint_path)
# model_state=model.state_dict()
# for k,v in checkpoint['model_state_dict'].items():
# model_state[k.replace('module.','')]
model.load_state_dict(checkpoint['model_state_dict'])
START_EPOCH = checkpoint['epoch']
eval_loss = checkpoint['eval_loss']
best_loss = checkpoint["best_loss"]
START_STEP = checkpoint["step"]
print("Loaded exits model ...")
print("epoch {}, eval_loss {}, best_loss {}, step {}".format(START_EPOCH,
eval_loss,
best_loss,
START_STEP))
return START_EPOCH, best_loss, START_STEP
def remove_initializer_from_input(model_path):
model = onnx.load(model_path)
inputs = model.graph.input
name_to_input = {}
for input in inputs:
name_to_input[input.name] = input
for initializer in model.graph.initializer:
if initializer.name in name_to_input:
inputs.remove(name_to_input[initializer.name])
onnx.save(model, model_path)
def convert_onnx(model_name):
onnx_path="onnx1118"
if not os.path.exists(onnx_path):
os.makedirs(onnx_path)
model_path = "../model_save/" + model_name + ".pt"
model_config_path = "../model_save/model.config"
device = "cpu"
model = get_model(model_config_path, model_path, device)
model.to(device)
input_ids=torch.ones([1,512],dtype=torch.long)
# index=torch.ones([1,512],dtype=torch.int32)
# model(input_ids,index)
onnx_save_name=os.path.join(onnx_path, model_name + '.onnx')
torch_onnx_out = torch.onnx.export(model, (input_ids), onnx_save_name,
export_params=True,
verbose=True,
input_names=['input'],
output_names=["output"],
opset_version=12,
keep_initializers_as_inputs=True,
do_constant_folding=True,dynamic_axes={
'input':{1:'text_length'},
'output':{1:"text_length"}},
operator_export_type = torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH)
remove_initializer_from_input(onnx_save_name)
# onnx_save_name=os.path.join(onnx_path,'checkpoint_best.onnx')
# torch_onnx_out = torch.onnx.export(model, (input_ids,index), onnx_save_name,
# export_params=True,
# verbose=True,
# input_names=['input','index'],
# output_names=["output"],
# opset_version=12,
# keep_initializers_as_inputs=True,
# do_constant_folding=True,
# operator_export_type = torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH)
def test_onnx(model_path, test_path):
onnx_session = ort.InferenceSession(model_path)
filename= test_path
tokenizer = BertTokenizer.from_pretrained("../roberta_chinese_wwm_ext_large")
count=0
s1=time.time()
pred=[]
true=[]
for line in open(filename,'r'):
# print(line)
text=line.split(",")[0].split(":")[1].replace("\"","").replace(" ","")
label=line.split(",")[1].split(":")[1][:-2].replace("\"","").replace(" ","")
if "\\u" in text:
text=text.encode('utf-8').decode('unicode_escape')
if "\\u" in label:
label=label.encode('utf-8').decode('unicode_escape')
query = text[:510] #truncate
inputs = tokenizer.encode_plus(query)
input_ids = np.expand_dims(np.array(inputs['input_ids']),0)
print("query length:{}".format(input_ids.shape[1]))
input_feed={'input':input_ids}
st=time.time()
output=onnx_session.run(['output'], input_feed=input_feed)
print("time const:{}".format(time.time()-st))
id=np.argmax(output)
pred.append(id)
true.append(l2i[label])
if label==kl[id]:
count+=1
print(text)
print("#######################################")
print("#######ACC:{}########".format(count/10892))
print("#######AVE TIME:{}########".format((time.time()-s1)/10892))
print(classification_report(true,pred,target_names=kl))
def GiB(val):
return val * 1 << 30
def convertTrt(name):
logger=trt.Logger(trt.Logger.VERBOSE)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(logger) as builder,builder.create_network(EXPLICIT_BATCH) as network,trt.OnnxParser(network,logger) as parser:
print('TRT engine will be written to {}...'.format("onnx/{}.trt".format(name)))
builder.max_batch_size = 4
config = builder.create_builder_config()
config.max_workspace_size = GiB(2)
# config.set_flag(trt.BuilderFlag.FP16)
print('Loading ONNX file from path {}...'.format("onnx/{}.onnx".format(name)))
with open("onnx/{}.onnx".format(name),'rb') as model:
print('Beginning ONNX file parsing')
if not parser.parse(model.read()):
for error in range(parser.num_errors):
print (parser.get_error(error))
return None
profile = builder.create_optimization_profile()
# input = network.get_input(0)
# print(input.shape)
profile.set_shape("input" ,(1, 1), (1, 32), (1, 512))
config.add_optimization_profile(profile)
#config.flags = 1 << int(trt.BuilderFlag.FP16)
# config.max_workspace_size = 1 << 30
# for i in range(network.num_layers):
# layer=network.get_layer(i)
# print(layer)
engine = builder.build_engine(network, config)
# print("Completed creating a dynamic Engine")
print("engine:", engine)
with open("onnx/{}.trt".format(name), "wb") as f:
f.write(engine.serialize())
def main():
model_path = "model_save/checkpoint_best.pt"
model_config_path = "model_save/model.config"
output_vocab_path = "data/DL/20210727/out_vocab.txt"
device = "cuda"
_inference = InferenceWrapper(model_path, model_config_path, output_vocab_path, device)
while True:
text = input("Input:\n")
ret = _inference.predict_single_query(text)
print(ret)
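# Allocates one HostDeviceMem pair per engine binding, sized from the engine's static binding shapes; compare allocate_buffers2 further below, which sizes buffers from the execution context's (dynamic) shapes.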
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
binding_name=[]
stream = cuda.Stream()
for binding in engine:
print(engine.get_binding_shape(binding))
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
print(size, dtype, binding)
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
binding_name.append(binding)
return inputs, outputs, bindings, stream
def load_engine(trt_file_path, verbose=False):
"""Build a TensorRT engine from a TRT file."""
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger()
print('Loading TRT file from path {}...'.format(trt_file_path))
with open(trt_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())
return engine
def do_inference_v2(context, bindings, inputs, outputs, stream):
# Transfer input data to the GPU.
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference.
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs.
return [out.host for out in outputs]
def test_trt(model_path, test_path):
filename=test_path
tokenizer = BertTokenizer.from_pretrained("roberta_chinese_wwm_ext_large")
count=0
engine = load_engine(model_path, verbose=True)
print(engine.get_binding_shape(0),engine.get_binding_dtype(0))
print(engine.get_binding_shape(1),engine.get_binding_dtype(1))
true=[]
pred=[]
with engine.create_execution_context() as context:
context.active_optimization_profile = 0
s1=time.time()
for line in open(filename,'r'):
print(line)
text=line.split(",")[0].split(":")[1].replace("\"","").replace(" ","")
label=line.split(",")[1].split(":")[1][:-2].replace("\"","").replace(" ","")
if "\\u" in text:
text=text.encode('utf-8').decode('unicode_escape')
if "\\u" in label:
label=label.encode('utf-8').decode('unicode_escape')
query = text[:510] #truncate
inputs = tokenizer.encode_plus(query)
input_ids = np.expand_dims(np.array(inputs['input_ids']),0).astype(np.int32)
st=time.time()
print("query length:{}".format(input_ids.shape[1]))
print(context.set_binding_shape(0, input_ids.shape))
inputs, outputs, bindings, stream=allocate_buffers2(engine, context)
inputs[0].host = input_ids
trt_outputs =do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
print("time cost:{}".format(time.time()-st))
id=np.argmax(trt_outputs[0])
pred.append(id)
true.append(l2i[label])
if id==l2i[label]:
count+=1
print(classification_report(true,pred,target_names=kl))
# print("#######################################")
print("#######ACC:{}########".format(count/10892))
print("#######AVE TIME:{}########".format((time.time()-s1)/10892))
def get_engine(engine_path):
TRT_LOGGER = trt.Logger()
# If a serialized engine exists, use it instead of building an engine.
print("Reading engine from file {}".format(engine_path))
with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read())
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers2(engine, context):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for i, binding in enumerate(engine):
size = trt.volume(context.get_binding_shape(i))
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream):
# Transfer input data to the GPU.
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference.
context.execute_async(bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs.
return [out.host for out in outputs]
'''
def revise_onnx():
ex_list=['207']
graph = gs.import_onnx(onnx.load("onnx/checkpoint_best_dy.onnx"))
output_list=[]
node_list=[]
for node in graph.nodes:
node_list.append(node)
if node.outputs[0].name in ex_list:
output_list+=node.outputs
output_list[-1].dtype=np.int32
break
graph.nodes=node_list
graph.outputs=output_list
graph.cleanup
for node in graph.nodes:
print("+++++++++++++++++++++++++++++")
print(node)
print("+++++++++++++++++++++++++++++")
onnx.save(gs.export_onnx(graph),'onnx/checkpoint_best_dy_revise.onnx')
'''
if __name__ == "__main__":
#test_path = "data/DL/test_new.json"
# test set
test_path = "../test_new.json"
# original model checkpoint
model_path = "model_save/checkpoint_best_mean_v1.2.pt"
# new model name
model_name = "checkpoint_best_mean_v1.2"
#test_torch(model_path, test_path)
# convert_onnx(model_name)
test_onnx("onnx1118/" + model_name+ ".onnx", test_path)
#revise_onnx()
# convertTrt(model_name)
# test_trt("onnx/" + model_name + ".trt", test_path)