Deploying a YOLOv8 model on the Jetson Orin Nano
This post mainly documents Python (TensorRT) inference with YOLOv8 on the Orin Nano.
Model conversion
Convert the .pt model to an ONNX model:
yolo export model=yolov8n.pt format=onnx opset=12
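If the ultralytics package is installed, the same export can also be done from Python (a minimal sketch; imgsz=640 is an assumption that matches the 640-pixel input size used later in this post):

from ultralytics import YOLO

# Export yolov8n.pt to ONNX with a static 640x640 input, opset 12 (same as the CLI call above)
model = YOLO("yolov8n.pt")
model.export(format="onnx", opset=12, imgsz=640)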
Convert the ONNX model to a TensorRT engine:
trtexec --onnx=yolov8n.onnx --saveEngine=yolov8n.engine --fp16
For more trtexec command-line options, see "How to use trtexec proficiently".
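For example, adding --verbose prints the full parser and builder log, which is handy when a conversion fails (sketch; only standard trtexec flags are used here):

trtexec --onnx=yolov8n.onnx --saveEngine=yolov8n.engine --fp16 --verbose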
Model inference (Python)
First, the engine loading code, engine.py:
#!/usr/bin/env python3
# coding:utf-8
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))  # * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference. Engines built from ONNX are explicit-batch, so use execute_async_v2
    # (batch_size is kept in the signature for compatibility but is not needed here).
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

# Load a serialized TensorRT engine file
def load_engine(trt_path):
    # Deserialize the engine
    with open(trt_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
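With these helpers in place, a minimal standalone sanity check of the engine might look like the following (a sketch, assuming the engine was built from the static 1x3x640x640 export above and that yolov8n.engine is in the current directory):

import numpy as np
from engine import load_engine, allocate_buffers, do_inference

trt_engine = load_engine("yolov8n.engine")
context = trt_engine.create_execution_context()
inputs, outputs, bindings, stream = allocate_buffers(trt_engine)

# Run one pass on a dummy 1x3x640x640 float32 input
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
np.copyto(inputs[0].host, dummy.ravel())
preds = do_inference(context, bindings=bindings, inputs=inputs,
                     outputs=outputs, stream=stream)
print([p.shape for p in preds])  # one flat host buffer per output binding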
Next comes the YOLOv8 inference code, infer.py:
#!/usr/bin/env python3
# coding:utf-8
import torch, cv2
import numpy as np
from engine import *
import time
from utils import *
from copy import deepcopy
import pycuda.driver as cuda
class Detection():
    def __init__(self) -> None:
        super(Detection, self).__init__()
        cuda.init()
        device = cuda.Device(0)
        self.ctx = device.make_context()  # own CUDA context, avoids errors when used from multiple processes
        self.weights = '/home/jetson/workspace/yolov8n.engine'
        self.trt_engine = None
        if not self.trt_engine:
            print("Loading cached TensorRT engine from {}".format(self.weights))
            self.trt_engine = load_engine(self.weights)
        self.inputs, self.outputs, self.bindings, self.stream = \
            allocate_buffers(self.trt_engine)
        self.context = self.trt_engine.create_execution_context()

    def preprocess(self, im0):
        # stride = int(32)  # model stride
        img = letterbox(im0, 640, auto=False)[0]
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW (3x640x640)
        img = np.ascontiguousarray(img)
        img = torch.from_numpy(img)  # .to(device)
        img = img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)
        return img

    def infer(self, im):
        np.copyto(self.inputs[0].host, im.ravel())
        self.ctx.push()
        pred = do_inference(self.context, bindings=self.bindings, inputs