许多应用场景需要实时处理能力。TensorRT 工具可以将模型优化为 TensorRT 引擎(.engine)格式,这种格式针对 NVIDIA 的硬件进行了优化,可以在 Jetson 上实现更快的推理速度。
import cv2
class HumanDetector_V8(object):
    """Detector that runs a serialized TensorRT engine on a CUDA device.

    The engine file is deserialized once in ``__init__``. One device tensor
    is allocated per engine binding and its address is recorded in
    ``binding_addrs`` so the whole address list can be handed to the
    execution context. Calling the instance runs preprocessing, inference,
    non-maximum suppression and rescaling of the boxes to the input image.
    """

    def __init__(self, model_path='human.engine'):
        """Load the engine and allocate a device buffer for every binding.

        Args:
            model_path: Path to the serialized TensorRT engine file.

        Raises:
            Exception: If no CUDA device is available.
        """
        # NOTE(review): these four attributes are stored but predict() passes
        # literal NMS settings instead of reading them.
        self.iou_thres = 0.65
        self.classes = None
        self.agnostic_nms = False
        self.max_det = 300

        imgsz = [640, 640]
        self.stride = 32
        self.imgsz = check_img_sizes(imgsz, s=self.stride)

        # Only error-level TensorRT messages are logged.
        logger = trt.Logger(trt.Logger.ERROR)
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
        with open(model_path, 'rb') as f, trt.Runtime(logger) as runtime:
            model = runtime.deserialize_cuda_engine(f.read())

        self.bindings = OrderedDict()

        # The binding buffers below live on the GPU, so fail early without one.
        if not torch.cuda.is_available():
            raise Exception("no cuda device found! ")
        self.device = torch.device('cuda')

        # Allocate one device tensor per binding and remember its address.
        for index in range(model.num_bindings):
            name = model.get_binding_name(index)
            dtype = trt.nptype(model.get_binding_dtype(index))
            shape = tuple(model.get_binding_shape(index))
            data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(self.device)
            self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()))
        self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items())
        self.context = model.create_execution_context()

    def predict(self, img0, conf_thres):
        """Run the engine on one image and return the filtered detections.

        Args:
            img0: Input image array; its ``shape`` is used to rescale boxes.
            conf_thres: Confidence threshold forwarded to
                ``non_max_suppression``.

        Returns:
            NumPy array of detections for the image, with the first four
            columns rescaled to ``img0``'s shape.
        """
        img = self.image_preprocess(img0)

        # Point the input binding at the freshly preprocessed tensor, then run
        # the engine so the output binding buffer is filled for this image.
        self.binding_addrs['images'] = int(img.data_ptr())
        self.context.execute_v2(list(self.binding_addrs.values()))

        # Copy the raw output to the CPU; clone() detaches it from the reused
        # binding buffer.
        # NOTE(review): raw output is presumably (1, 4 + nc, 8400), i.e.
        # (1, 20, 8400) for nc=16 — the original comment said 21; confirm
        # against the engine.
        pred = self.bindings['output0'].data
        pred = pred.to(torch.device('cpu')).clone().detach()

        pred = non_max_suppression(
            pred,
            conf_thres=conf_thres,
            iou_thres=0.45,
            classes=None,
            agnostic=False,
            multi_label=False,
            labels=(),
            max_det=300,
            nc=16,  # number of classes (optional)
        )

        # Single-image batch: take the first result and rescale its boxes from
        # the network input size back to the original image size.
        pred = pred[0].detach().numpy()
        pred[:, :4] = scale_coordss(img.shape[2:], pred[:, :4], img0.shape)
        return pred

    def image_preprocess(self, img0):
        """Convert an image array into the tensor fed to the engine.

        Args:
            img0: Image array with channels on the last axis.

        Returns:
            Half-precision tensor on ``self.device`` with a leading batch
            axis, scaled by 1/255.
        """
        img = letterbox(img0, self.imgsz, stride=self.stride, auto=False)[0]
        # HWC -> CHW, then reverse the channel axis (BGR -> RGB when the image
        # comes from cv2.imread).
        img = img.transpose((2, 0, 1))[::-1]
        img = np.ascontiguousarray(img)
        img = torch.from_numpy(img).to(self.device)
        # NOTE(review): assumes the engine's input binding is fp16 — confirm.
        img = img.half()
        img /= 255
        if len(img.shape) == 3:
            img = img[None]  # add the batch dimension
        return img

    def __call__(self, img0, conf_thres=0.5):
        """Detect on ``img0``; shorthand for ``predict(img0, conf_thres)``."""
        boxes = self.predict(img0, conf_thres)
        return boxes
if __name__ == '__main__':
    # Demo: load the engine, read one image from disk and run detection on it.
    detector = HumanDetector_V8(model_path='human.engine')
    image_path = 'path/to/image.jpg'
    img0 = cv2.imread(image_path)
    # cv2.imread signals a missing or unreadable file by returning None
    # instead of raising, so fail here with a clear message.
    if img0 is None:
        raise FileNotFoundError(f'could not read image: {image_path}')
    boxes = detector(img0)
创建一个 TensorRT 日志记录器实例,只记录错误级别的日志。
logger = trt.Logger(trt.Logger.ERROR)
定义一个 namedtuple 类型 Binding,用于保存每个绑定的名称(name)、数据类型(dtype)、形状(shape)、数据张量(data)和指针(ptr)。
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
使用 with 语句打开模型文件,并使用 TensorRT 运行时反序列化模型文件,得到一个 CUDA 引擎。(序列化和反序列化可以这样理解:举个例子,torch.save 就是序列化,torch.load 就是反序列化。)
# Read the serialized engine file and deserialize it into a TensorRT engine.
with open(model_path, 'rb') as f, trt.Runtime(logger) as runtime:
    model = runtime.deserialize_cuda_engine(f.read())
遍历模型的所有绑定,为每个绑定创建一个 PyTorch 张量并将其移到 GPU 上;然后创建一个 Binding 对象并将它添加到 bindings 字典中。最后把每个绑定的名称和指针地址收集到 binding_addrs 中。
# Allocate one device tensor per engine binding and record its address.
for index in range(model.num_bindings):
    name = model.get_binding_name(index)
    dtype = trt.nptype(model.get_binding_dtype(index))
    shape = tuple(model.get_binding_shape(index))
    data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(self.device)
    self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()))
# Map each binding name to its device pointer.
self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items())
创建一个执行上下文,用于执行 TensorRT 引擎。(例如,在 PyTorch 中调用 model.forward 方法,可以类比于在 TensorRT 中创建执行上下文并调用 execute_v2 方法。)
self.context = model.create_execution_context()
调用 image_preprocess 方法对输入图像进行预处理。
def predict(self, img0, conf_thres):
img = self.image_preprocess(img0)
更新绑定地址中的输入图像数据,并执行 TensorRT 引擎以获取输出。
# Point the 'images' binding at the preprocessed tensor, then run the engine
# so the output binding buffer is filled.
self.binding_addrs['images'] = int(img.data_ptr())
self.context.execute_v2(list(self.binding_addrs.values()))
从输出绑定中获取预测结果,并将其从 GPU 复制到 CPU。
# Take the output binding's tensor and copy it from the GPU to the CPU.
pred = self.bindings['output0'].data
pred = pred.to(torch.device('cpu')).clone().detach()
# pred [1, 20, 8400]
# Forward the caller's confidence threshold instead of a fixed 0.5.
pred = non_max_suppression(pred, conf_thres=conf_thres, iou_thres=0.45, classes=None, agnostic=False, multi_label=False, labels=(), max_det=300, nc=16)
后续就是将预测结果转换为 NumPy 数组,并将边界框坐标从网络输出大小缩放到原始图像大小。最后返回处理后的预测结果。