导出onnx文件
使用mmpretrain导出mobilenet-v2的onnx模型:
import torch
from mmpretrain import get_model
# Build MobileNet-V2 by its mmpretrain model name and load the local checkpoint on CPU.
model = get_model(
    'mobilenet-v2_8xb32_in1k',
    pretrained='mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth',
    device='cpu',
)
# Dummy 1x3x224x224 tensor used to trace the network during export
# (named `dummy_input` so the builtin `input` is not shadowed).
dummy_input = torch.zeros(1, 3, 224, 224)
# Run one forward pass first as a quick check that the model accepts this input.
out = model(dummy_input)
# Save as "mobilenet_v2.onnx": this is the file name that the onnxruntime
# script and the trtexec command in this document load.
torch.onnx.export(model, dummy_input, "mobilenet_v2.onnx", opset_version=11)
安装有mmdeploy的话可以通过如下方法导出:
from mmdeploy.apis import torch2onnx
from mmdeploy.backend.sdk.export_info import export2SDK
# Paths of the deploy config, the model config and the trained weights.
deploy_config = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_static.py'
model_config = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
checkpoint = './checkpoints/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth'

# Sample image, output directory, output file name and target device.
sample_image = 'goldfish.jpg'
output_dir = './work_dir/onnx/mobilenet_v2'
onnx_file = './end2end.onnx'
target_device = 'cpu'

# Step 1: convert the model to ONNX.
torch2onnx(
    sample_image,
    output_dir,
    onnx_file,
    deploy_config,
    model_config,
    checkpoint,
    target_device,
)

# Step 2: extract the pipeline info for SDK use (dump-info).
export2SDK(
    deploy_config,
    model_config,
    output_dir,
    pth=checkpoint,
    device=target_device,
)
onnxruntime推理
通过onnxruntime进行推理:
import cv2
import numpy as np
import onnxruntime
# Per-channel normalization constants, in RGB order (the channel order after
# the BGR->RGB flip performed in `to_normalized_nchw`).
MEAN_RGB = np.array([123.675, 116.28, 103.53], dtype=np.float32)
STD_RGB = np.array([58.395, 57.12, 57.375], dtype=np.float32)


def resize_and_center_crop(img, short_side=256, out_size=224):
    """Resize the short side to `short_side`, center-crop a square, resize to `out_size`.

    Args:
        img: HWC image array as returned by `cv2.imread`.
        short_side: Target length of the shorter image side after the first resize.
        out_size: Side length of the final square image.

    Returns:
        An `out_size` x `out_size` image with the same channel layout as `img`.
    """
    height, width = img.shape[:2]
    # cv2.resize takes the target size as (width, height).
    if height < width:
        img = cv2.resize(img, (int(short_side * width / height), short_side))
    else:
        img = cv2.resize(img, (short_side, int(short_side * height / width)))

    height, width = img.shape[:2]
    crop_size = min(height, width)
    left = (width - crop_size) // 2
    top = (height - crop_size) // 2
    square = img[top:top + crop_size, left:left + crop_size]
    # NOTE(review): this crops the whole short side and then shrinks it to
    # `out_size`; confirm this matches the preprocessing the model was trained with.
    return cv2.resize(square, (out_size, out_size))


def to_normalized_nchw(img_bgr):
    """Convert an HWC BGR image into a normalized float32 tensor of shape (1, 3, H, W).

    Args:
        img_bgr: HWC image array with channels in BGR order.

    Returns:
        A C-contiguous float32 array with a leading batch axis, channels in RGB
        order, each channel normalized as `(x - mean) / std`.
    """
    # BGR -> RGB, then HWC -> CHW.
    chw = img_bgr[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)
    chw = (chw - MEAN_RGB[:, None, None]) / STD_RGB[:, None, None]
    return np.ascontiguousarray(np.expand_dims(chw, axis=0), dtype=np.float32)


if __name__ == '__main__':
    img = cv2.imread('goldfish.jpg')
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError("cannot read image 'goldfish.jpg'")

    input_tensor = to_normalized_nchw(resize_and_center_crop(img))

    onnx_session = onnxruntime.InferenceSession(
        "mobilenet_v2.onnx", providers=['CPUExecutionProvider'])
    # Feed the same tensor to every declared model input.
    input_feed = {node.name: input_tensor for node in onnx_session.get_inputs()}
    # Passing None as the output list requests all model outputs.
    pred = onnx_session.run(None, input_feed)
    # The class index is the argmax over the first output.
    print(np.argmax(pred[0]))
使用mmdeploy推理:
from mmdeploy.apis import inference_model
# Model config and deploy config used to build the onnxruntime inference pipeline.
model_cfg = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_static.py'
# Test image, exported ONNX backend file and the device to run on.
img = 'goldfish.jpg'
backend_files = ['work_dir/onnx/mobilenet_v2/end2end.onnx']
device = 'cpu'
# Run inference with the exported backend model and print the result.
result = inference_model(model_cfg, deploy_cfg, backend_files, img, device)
print(result)
或者
import cv2
from mmdeploy_runtime import Classifier
# Read the test image, classify it with the SDK model stored in the ONNX
# work directory, and print each (label_id, score) pair of the result.
image = cv2.imread('goldfish.jpg')
sdk_classifier = Classifier(model_path='work_dir/onnx/mobilenet_v2', device_name='cpu')
for label_id, score in sdk_classifier(image):
    print(label_id, score)
导出engine文件
这里通过trtexec转换onnx文件，笔者使用的版本是TensorRT-8.2.1.8。
# Convert the ONNX file with trtexec and save the result as mobilenet_v2.engine.
./trtexec.exe --onnx=mobilenet_v2.onnx --saveEngine=mobilenet_v2.engine
tensorrt推理
import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit #负责数据初始化,内存管理,销毁等
import pycuda.driver as cuda #GPU CPU之间的数据传输
# Per-channel normalization constants, in RGB order (the channel order after
# the BGR->RGB flip performed in `to_normalized_nchw`).
MEAN_RGB = np.array([123.675, 116.28, 103.53], dtype=np.float32)
STD_RGB = np.array([58.395, 57.12, 57.375], dtype=np.float32)


def resize_and_center_crop(img, short_side=256, out_size=224):
    """Resize the short side to `short_side`, center-crop a square, resize to `out_size`.

    Args:
        img: HWC image array as returned by `cv2.imread`.
        short_side: Target length of the shorter image side after the first resize.
        out_size: Side length of the final square image.

    Returns:
        An `out_size` x `out_size` image with the same channel layout as `img`.
    """
    height, width = img.shape[:2]
    # cv2.resize takes the target size as (width, height).
    if height < width:
        img = cv2.resize(img, (int(short_side * width / height), short_side))
    else:
        img = cv2.resize(img, (short_side, int(short_side * height / width)))

    height, width = img.shape[:2]
    crop_size = min(height, width)
    left = (width - crop_size) // 2
    top = (height - crop_size) // 2
    square = img[top:top + crop_size, left:left + crop_size]
    # NOTE(review): this crops the whole short side and then shrinks it to
    # `out_size`; confirm this matches the preprocessing the model was trained with.
    return cv2.resize(square, (out_size, out_size))


def to_normalized_nchw(img_bgr):
    """Convert an HWC BGR image into a normalized float32 tensor of shape (1, 3, H, W).

    Args:
        img_bgr: HWC image array with channels in BGR order.

    Returns:
        A C-contiguous float32 array with a leading batch axis, channels in RGB
        order, each channel normalized as `(x - mean) / std`.
    """
    # BGR -> RGB, then HWC -> CHW.
    chw = img_bgr[:, :, ::-1].transpose(2, 0, 1).astype(np.float32)
    chw = (chw - MEAN_RGB[:, None, None]) / STD_RGB[:, None, None]
    return np.ascontiguousarray(np.expand_dims(chw, axis=0), dtype=np.float32)


if __name__ == '__main__':
    # Logger that reports warnings and above.
    logger = trt.Logger(trt.Logger.WARNING)

    # Deserialize the engine. The runtime is kept in a plain variable so it
    # stays referenced for as long as the engine is used below.
    runtime = trt.Runtime(logger)
    with open("mobilenet_v2.engine", "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    if engine is None:
        raise RuntimeError("failed to deserialize 'mobilenet_v2.engine'")

    # A single execution context serves both the shape queries and the inference call.
    context = engine.create_execution_context()

    # Allocate page-locked host buffers and matching device buffers.
    # Assumes binding 0 is the input and binding 1 is the output -- TODO confirm
    # against the engine's binding order.
    h_input = cuda.pagelocked_empty(
        trt.volume(context.get_binding_shape(0)), dtype=np.float32)
    h_output = cuda.pagelocked_empty(
        trt.volume(context.get_binding_shape(1)), dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # CUDA stream on which the copies and the inference are enqueued.
    stream = cuda.Stream()

    img = cv2.imread('goldfish.jpg')
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError("cannot read image 'goldfish.jpg'")

    input_tensor = to_normalized_nchw(resize_and_center_crop(img))
    # Flatten into the page-locked host buffer.
    np.copyto(h_input, input_tensor.ravel())

    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference.
    context.execute_async_v2(
        bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Wait until all work enqueued on the stream has finished.
    stream.synchronize()

    # h_output now holds the model output; the class index is its argmax.
    pred = np.argmax(h_output)
    print(pred)
使用mmdeploy推理:
from mmdeploy.apis import inference_model
# Test image, serialized TensorRT engine and the device to run on.
test_image = 'goldfish.jpg'
engine_files = ['work_dir/trt/mobilenet_v2/end2end.engine']
run_device = 'cuda'

# Model config and the TensorRT deploy config.
model_config = 'mmpretrain/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py'
deploy_config = 'mmdeploy/configs/mmpretrain/classification_tensorrt_static-224x224.py'

# Run inference with the engine and print the result.
print(inference_model(model_config, deploy_config, engine_files, test_image, run_device))
或者
import cv2
from mmdeploy_runtime import Classifier
img = cv2.imread('goldfish.jpg')
# cv2.imread signals failure by returning None rather than raising.
if img is None:
    raise FileNotFoundError("cannot read image 'goldfish.jpg'")
# Use the TensorRT work directory and the GPU, matching the
# 'work_dir/trt/mobilenet_v2/end2end.engine' backend file and the 'cuda'
# device used by the inference_model example in this TensorRT section.
classifier = Classifier(model_path='work_dir/trt/mobilenet_v2', device_name='cuda')
result = classifier(img)
# Each result entry is a (label_id, score) pair.
for label_id, score in result:
    print(label_id, score)