As the title says, the code below covers the inference part and handles the GPU-vs-CPU choice automatically: all you need to change is the provider. If the installed onnxruntime is the GPU build, the GPU is used; otherwise inference falls back to the CPU. For the CPU path we enable multithreading, which speeds up inference.
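Before running the full script, you can confirm which onnxruntime build is installed. A quick sanity check using onnxruntime's get_device() and get_available_providers() (the example output in the comments is illustrative):

import onnxruntime as ort

# "GPU" for the onnxruntime-gpu build, "CPU" for the CPU-only build
print(ort.get_device())
# lists the providers this build can actually use, e.g.
# ['CUDAExecutionProvider', 'CPUExecutionProvider'] on a GPU build
print(ort.get_available_providers())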
import cv2
import os
import glob
import copy
import subprocess
import numpy as np
from tqdm import tqdm
import onnxruntime as ort
import onnx
import sys
import shutil
from collections import OrderedDict
def normalize(img, scale=None, mean=None, std=None):
    # Scale to [0, 1], then standardize with ImageNet mean/std by default
    if isinstance(scale, str):
        scale = eval(scale)
    scale = np.float32(scale if scale is not None else 1.0 / 255.0)
    mean = mean if mean is not None else [0.485, 0.456, 0.406]
    std = std if std is not None else [0.229, 0.224, 0.225]
    shape = (1, 1, 3)
    mean = np.array(mean).reshape(shape).astype('float32')
    std = np.array(std).reshape(shape).astype('float32')
    assert isinstance(img, np.ndarray), "invalid input 'img' in normalize"
    img = (img.astype('float32') * scale - mean) / std
    return img
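As a quick sanity check, normalize turns a uint8 HWC image into a standardized float32 array (a minimal sketch using a random image):

dummy = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
out = normalize(dummy)
print(out.dtype, out.shape)  # float32 (224, 224, 3)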
model = onnx.load("model.onnx")
# Model inference
ori_output = copy.deepcopy(model.graph.output)
ori_output_names = [o.name for o in ori_output]
# Expose every node's outputs so each layer's activations can be fetched
for node in model.graph.node:
    for output in node.output:
        if output not in ori_output_names:
            model.graph.output.extend([onnx.ValueInfoProto(name=output)])
# Configure the session: pick the execution provider based on the installed build
if ort.get_device() == "CPU":
    config = ort.SessionOptions()
    # Count physical cores (Linux-only; os.cpu_count() is a portable alternative)
    ret, val = subprocess.getstatusoutput("cat /proc/cpuinfo | grep 'core id' | sort | uniq | wc -l")
    if ret == 0:
        cpu_num_thread = int(val)
    else:
        cpu_num_thread = 4
    config.intra_op_num_threads = cpu_num_thread
    config.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    providers = ["CPUExecutionProvider"]
    ort_session = ort.InferenceSession(model.SerializeToString(), providers=providers, sess_options=config)
elif ort.get_device() == "GPU":
    providers = ["CUDAExecutionProvider"]
    ort_session = ort.InferenceSession(model.SerializeToString(), providers=providers)
image_list = ["test.jpg"]
# for root,dir,files in os.walk('need_test/crop/'):
# if len(files):
# for ff in files:
# n = os.path.join(root,ff)
# image_list.append(n)
for img_path in tqdm(image_list):
    img = cv2.imread(img_path)
    if img is None:
        continue
    img = img[:, :, ::-1]  # BGR -> RGB
    img = cv2.resize(img, (224, 224))
    img = normalize(img)
    img = img.transpose((2, 0, 1))  # HWC -> CHW
    image = np.expand_dims(img, axis=0)  # add batch dimension
    ort_inputs = {ort_session.get_inputs()[0].name: image}
    # Fetch every node's output; ort_session.get_outputs()[0].name is the original model's single output
    outputs = [x.name for x in ort_session.get_outputs()]
    ort_outs = ort_session.run(output_names=outputs, input_feed=ort_inputs)
    # Build a dict so each layer's output can be looked up by name
    ort_outs = OrderedDict(zip(outputs, ort_outs))
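With ort_outs in hand, any intermediate tensor can be inspected by name. A minimal sketch that lists every captured layer and its shape (run inside or after the loop above):

for name, out in ort_outs.items():
    # each entry maps a node output name to its numpy activation
    print(name, out.shape)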
There are several application scenarios:
- 1. Run inference on different hardware and get the results.
- 2. Multithreaded inference on the CPU.
- 3. Get the output of every layer in the network. A typical use case: we often convert a model with onnx-sim or onnx optimizer, and the per-layer outputs let us compare the differences before and after conversion (see the sketch below).
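For scenario 3, a minimal sketch of such a comparison, assuming the script above was run once against the original model and once against the converted one, producing two per-layer OrderedDicts; the variable names ort_outs_before and ort_outs_after are hypothetical:

import numpy as np

# ort_outs_before / ort_outs_after: OrderedDicts of {output name: activation},
# built exactly like ort_outs in the script above, one per model version
for name, before in ort_outs_before.items():
    if name not in ort_outs_after:
        continue  # the node may have been fused or removed by onnx-sim / the optimizer
    diff = np.abs(before - ort_outs_after[name]).max()
    print(f"{name}: max abs diff = {diff:.6f}")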