This article walks through the Python inference pipeline for YOLOv5-seg with OpenVINO. First, the .pt weights produced by YOLOv5-seg training must be exported to an ONNX model, and that ONNX model is then converted into the .xml (IR) file that OpenVINO uses for inference. The official repositories already provide examples for these conversion steps, so this article focuses on the OpenVINO inference flow itself.
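For completeness, here is a minimal conversion sketch. It assumes the YOLOv5 repository's export.py for the PT-to-ONNX step and the Model Optimizer Python API from openvino-dev (available since OpenVINO 2022.3) for the ONNX-to-IR step; the weight and file names are placeholders rather than the exact ones used later in this article.
# PT -> ONNX: run from the YOLOv5(-seg) source tree
#   python export.py --weights yolov5s-seg.pt --include onnx
# ONNX -> OpenVINO IR (.xml/.bin):
from openvino.tools import mo
from openvino.runtime import serialize

ov_model = mo.convert_model("yolov5s-seg.onnx")  # read and convert the ONNX model
serialize(ov_model, "yolov5-seg.xml")            # writes yolov5-seg.xml and yolov5-seg.bin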
Step 1: create the OpenVINO Core object and compile the converted model to obtain the network net; from net we can query its inputs and outputs. The YOLOv5-seg input shape is [1,3,640,640]. The network has two outputs: output0 is the detection output, with shape [1,25200,38]; output1 is the mask-related output, with shape [1,32,160,160].
# Step1: Create OpenVINO Runtime Core
core = Core()
# Compile the model for the target device (here GPU.0, an Intel Arc A770M dGPU)
net = core.compile_model("yolov5-seg.xml", "GPU.0")
# output0>>{ConstOutput:(1,25200,38)}  output1>>{ConstOutput:(1,32,160,160)}
output0, output1 = net.outputs[0], net.outputs[1]
# input shape [1,3,640,640]
b, c, input_h, input_w = net.inputs[0].shape  # batch, channels, height, width
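If you want to confirm these shapes on your own model, the compiled model's ports can be printed directly (any_name and shape are standard properties of OpenVINO input/output ports):
# sanity check: list the model's input/output names and shapes
for port in net.inputs:
    print("input :", port.any_name, port.shape)
for port in net.outputs:
    print("output:", port.any_name, port.shape)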
Step 2: since we feed our own image to the network, the image must first be converted into the shape the network expects. We therefore preprocess the input image so that its shape matches the network input [1,3,640,640]. Input images come in arbitrary sizes, so we first bring them to (640,640) with letterbox, the resize-and-pad helper commonly used with YOLOv5. The input image here is (3672,5496); after letterbox it becomes (640,640,3). Next, transpose turns HWC into CHW and the channel order is flipped from BGR to RGB, giving blob1; np.ascontiguousarray makes the data contiguous, giving blob2; dividing by 255 normalizes the values to [0,1], giving blob3; finally a batch dimension is added, giving (1,3,640,640). The only purpose of this step is to adapt the input to what the network requires.
frame = cv2.imread("E:\\yolov5-seg-master\\test_image\\2113029.jpg")
fh, fw, fc = frame.shape
im, r, (dw, dh) = letterbox(frame, new_shape=(input_h, input_w), auto=False)  # Resize to new shape by letterbox
blob1 = im.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
blob2 = np.ascontiguousarray(blob1)
blob3 = np.float32(blob2) / 255.0 # 0 - 255 to 0.0 - 1.0
blob = blob3[None]
Step 3: with the input ready, send it through the network. We do not need to care how the inference happens internally; only the result matters. YOLOv5-seg produces two outputs. The first, pred, has shape [1,25200,38], where 38 = 4 + 1 + 1 + 32: 4 box coordinates, 1 object confidence, 1 class score (this model has a single class), and 32 mask coefficients. The boxes in pred are filtered with non_max_suppression, which turns the [1,25200,38] prediction into [n,38]; here one object survives, so the shape is [1,38]. The mask coefficients are pred[:,6:], a matrix of shape [1,32], while the network's second output, the proto matrix, has shape [1,32,160,160]. We multiply the two: squeeze away the first dimension of proto and merge the last two dimensions to get [32,25600], multiply to get [1,25600], apply sigmoid, and finally reshape the result into the [1,160,160] mask matrix.
outputs = net([blob])
#[1,25200,38] [1,32,160,160]
pred, proto = outputs[output0], outputs[output1]
preds = torch.tensor(pred)
#[1,25200,38]>>>>>[1,38]
pred = non_max_suppression(preds, nm=32)[0].numpy() #(n,38) tensor per image [xyxy, conf, cls, masks]
# (n,38) tensor per image [xyxy, conf, cls, masks]
bboxes, confs, class_ids, masks= pred[:,:4], pred[:,4], pred[:,5], pred[:,6:]
proto = np.squeeze(proto)            # drop the batch dim: (1,32,160,160) >>> (32,160,160)
proto = np.reshape(proto, (32, -1))  # (32,160,160) >>> (32,25600)
obj_masks = np.matmul(masks, proto)  # matrix multiply: (n,32) x (32,25600) >>> (n,25600)
obj_masks = np.reshape(sigmoid(obj_masks), (-1, 160, 160))  # (n,25600) >>> (n,160,160); sigmoid() is defined in the full listing below
Step 4: step 3 gave us the exact box information and the mask matrices, but not yet which part of each 160x160 mask actually belongs to the object. The box tells us: since the proto mask is 160x160 while the network input is 640x640, the box coordinates are scaled by 0.25 (= 160/640) and used to crop the mask ROI (masks_roi in the code), which is exactly the mask we want. All that remains is to map the boxes and the masks back onto the original image. rescale_coords converts the predicted box coordinates back into real coordinates on the original image. With the mask ROI and the rescaled box we can place the mask: the leftmost image below shows the mask region on the original image; equivalently, we can create a blank image of the same size as the original and paste the mask at the mapped location, which completes the mask extraction.
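For reference, these are the corresponding lines from the complete listing below (Step 4); the 0.25 factor is simply 160/640, the ratio between the proto mask size and the network input size:
masks_roi = []
for obj_mask, bbox in zip(obj_masks, bboxes):
    # crop the object's region out of the 160x160 mask using the box scaled by 0.25
    mx1 = max(0, np.int32((bbox[0] * 0.25)))
    my1 = max(0, np.int32((bbox[1] * 0.25)))
    mx2 = max(0, np.int32((bbox[2] * 0.25)))
    my2 = max(0, np.int32((bbox[3] * 0.25)))
    masks_roi.append(obj_mask[my1:my2, mx1:mx2])
# map the boxes from 640x640 letterbox space back to the original image
bboxes = rescale_coords(r[0], (dh, dw), bboxes).astype(int)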
Step 5: finally, blend the extracted mask with the original image to obtain the model's prediction result. The core idea is to locate the exact mask using the box information and the mask matrix, map the box back to the original image, and then use the mask together with the mapped box to obtain the mask region on the original image. This completes the Python inference pipeline for YOLOv5-seg on OpenVINO. The helper functions used above and the complete code are given below.
import numpy as np
import cv2, yaml, torch, torchvision
from openvino.runtime import Core
import matplotlib.pyplot as plt
import time
def xywh2xyxy(x):
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y
def box_iou(box1, box2, eps=1e-7):
    # IoU of every box in box1 against every box in box2, boxes in (x1, y1, x2, y2) format
    (a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2)
    inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2)
    return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps)
def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False,
                        multi_label=False, labels=(), max_det=300, nm=0):  # nm: number of masks
    if isinstance(prediction, (list, tuple)):  # YOLOv5 model in validation mode, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output
    device = prediction.device
    mps = 'mps' in device.type  # Apple MPS
    if mps:
        prediction = prediction.cpu()
    bs = prediction.shape[0]  # batch size
    nc = prediction.shape[2] - nm - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
    max_wh = 7680  # (pixels) maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 0.5 + 0.05 * bs  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS
    t = time.time()
    mi = 5 + nc  # mask start index
    output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
    for xi, x in enumerate(prediction):
        x = x[xc[xi]]  # confidence
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + nm + 5), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)
        if not x.shape[0]:
            continue
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
        box = xywh2xyxy(x[:, :4])
        mask = x[:, mi:]
        if multi_label:
            i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            conf, j = x[:, 5:mi].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence
        else:
            x = x[x[:, 4].argsort(descending=True)]  # sort by confidence
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy
        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)
    return output
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better val mAP)
        r = min(r, 1.0)
    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
    dw /= 2  # divide padding into 2 sides
    dh /= 2
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im, ratio, (dw, dh)
with open('./data/custom.yaml', 'r', encoding='utf-8') as f:
    result = yaml.load(f.read(), Loader=yaml.FullLoader)
class_list = result['names']
# color palette
colors = [(200, 150, 0), (0, 200, 0), (0, 200, 150), (200, 0, 0)]
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))
def rescale_coords(ratio, pad, coords):
    # Rescale coords (xyxy) from letterbox space back to the original image, given ratio r and padding (dh, dw) from letterbox
    coords[:, [1, 3]] -= pad[0]  # H padding
    coords[:, [0, 2]] -= pad[1]  # W padding
    coords[:, :4] /= ratio
    return coords
# Step1: Create OpenVINO Runtime Core and compile the model (here on GPU.0, an Intel Arc A770M dGPU)
core = Core()
net = core.compile_model("yolov5-seg.xml", "GPU.0")
# output0>>{ConstOutput:(1,25200,38)}  output1>>{ConstOutput:(1,32,160,160)}
output0, output1 = net.outputs[0], net.outputs[1]
b, c, input_h, input_w = net.inputs[0].shape  # [1,3,640,640]: batch, channels, height, width
# Step2: Preprocess the image before inference
frame = cv2.imread("E:\\yolov5-seg-master\\test_image\\2113029.jpg")
fh, fw, fc = frame.shape
im, r, (dw, dh) = letterbox(frame, new_shape=(input_h, input_w), auto=False)  # Resize to new shape by letterbox
blob1 = im.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
blob2 = np.ascontiguousarray(blob1)
blob3 = np.float32(blob2) / 255.0 # 0 - 255 to 0.0 - 1.0
blob = blob3[None] # expand for batch dim
# Step3: Do the inference
outputs = net([blob])
#[1,25200,38] [1,32,160,160]
pred, proto = outputs[output0], outputs[output1]
# Step3 (cont.): Postprocess the inference result with NMS and recover the mask matrices
preds = torch.tensor(pred)
#[1,25200,38]>>>>>[1,38]
pred = non_max_suppression(preds, nm=32)[0].numpy() #(n,38) tensor per image [xyxy, conf, cls, masks]
# (n,38) tensor per image [xyxy, conf, cls, masks]
bboxes, confs, class_ids, masks= pred[:,:4], pred[:,4], pred[:,5], pred[:,6:]
# Extract the mask of the detected object
proto = np.squeeze(proto)            # drop the batch dim: (1,32,160,160) >>> (32,160,160)
proto = np.reshape(proto, (32, -1))  # (32,160,160) >>> (32,25600)
obj_masks = np.matmul(masks, proto)  # matrix multiply: (n,32) x (32,25600) >>> (n,25600)
obj_masks = np.reshape(sigmoid(obj_masks), (-1, 160, 160))  # (n,25600) >>> (n,160,160)
# Step4: Crop the mask ROI of each detection and map the boxes back to the original image
masks_roi = []
for obj_mask, bbox in zip(obj_masks, bboxes):
    # boxes are in 640x640 letterbox space, proto masks are 160x160, hence the 0.25 scale
    mx1 = max(0, np.int32((bbox[0] * 0.25)))
    my1 = max(0, np.int32((bbox[1] * 0.25)))
    mx2 = max(0, np.int32((bbox[2] * 0.25)))
    my2 = max(0, np.int32((bbox[3] * 0.25)))
    masks_roi.append(obj_mask[my1:my2, mx1:mx2])
# masks_roi now holds the cropped mask region of each detected object
bboxes = rescale_coords(r[0], (dh, dw), bboxes).astype(int)
color_mask = np.zeros((fh, fw, 3), dtype=np.uint8)
black_mask = np.zeros((fh, fw), dtype=np.float32)
mv = cv2.split(color_mask)
for bbox, conf, class_id, mask_roi in zip(bboxes, confs, class_ids, masks_roi):
    x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
    # Draw the rescaled box on the original image
    color = colors[int(class_id) % len(colors)]
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
    cv2.rectangle(frame, (x1, y1 - 20), (x2, y1), (0, 0, 255), -1)  # label background
    # Draw mask of the detected objects
    result_mask = cv2.resize(mask_roi, (bbox[2] - bbox[0], bbox[3] - bbox[1]))
    result_mask[result_mask > 0.5] = 1.0
    result_mask[result_mask <= 0.5] = 0.0
    rh, rw = result_mask.shape
    if (y1 + rh) >= fh:
        rh = fh - y1
    if (x1 + rw) >= fw:
        rw = fw - x1
    black_mask[y1:y1 + rh, x1:x1 + rw] = result_mask[0:rh, 0:rw]
    mv[2][black_mask == 1], mv[1][black_mask == 1], mv[0][black_mask == 1] = \
        [np.random.randint(0, 256), np.random.randint(0, 256), np.random.randint(0, 256)]
# Step5: Blend the color mask with the original image
color_mask = cv2.merge(mv)
dst = cv2.addWeighted(frame, 0.5, color_mask, 0.5, 0)
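The listing computes the blended image dst but never displays or saves it. A minimal way to look at the result (matplotlib is already imported; the output file name below is just a placeholder) could be:
cv2.imwrite("seg_result.jpg", dst)                 # save the blended result
plt.imshow(cv2.cvtColor(dst, cv2.COLOR_BGR2RGB))   # matplotlib expects RGB, OpenCV images are BGR
plt.axis("off")
plt.show()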