问题背景
项目中需要快速实现人体手部分割的算法效果,调研了下没有非常匹配的数据集,且大规模数据下载、训练、优化也比较耗时,故想到使用可以实现万物分割的大模型。
mediapipe虽然没有手部分割的功能,但可以实现较为精确的手部关键点检测,刚好可以将该关键点作为大模型的提示,从而实现想要的分割效果。
大模型使用较为轻量化的mobilesam,工程地址如下
https://github.com/ChaoningZhang/MobileSAM
具体实现
使用 mobile_sam.pt 模型(脚本默认从 ./weights/mobile_sam.pt 加载权重),将以下脚本放入工程目录下,直接执行即可调用电脑摄像头,实现手部的实时分割展示。
import torch
import cv2
import copy
import numpy as np
import mediapipe as mp
import os
import psutil
from mobile_sam import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
# Pick the inference device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Open capture device 0; use_camera controls whether frames are mirrored later.
use_camera = True
cap = cv2.VideoCapture(0)

# MediaPipe hand-landmark detection settings.
use_static_image_mode = False
min_detection_confidence = 0.7
min_tracking_confidence = 0.5

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    max_num_hands=2,
    static_image_mode=use_static_image_mode,
    min_tracking_confidence=min_tracking_confidence,
    min_detection_confidence=min_detection_confidence,
)
# Compute the axis-aligned bounding box of one hand.
def calc_bounding_rect(image, landmarks):
    """Return the pixel bounding box ``[x1, y1, x2, y2]`` of a hand.

    Args:
        image: Array whose ``shape[0]`` / ``shape[1]`` are the frame height
            and width.
        landmarks: Object exposing ``.landmark``, an iterable of points whose
            ``x`` / ``y`` attributes are multiplied by the frame size here.

    Returns:
        ``np.ndarray`` ``[x_min, y_min, x_max + 1, y_max + 1]``, or four zeros
        when there are no landmarks.
    """
    image_width, image_height = image.shape[1], image.shape[0]

    # Clamp to [0, size - 1] on both sides so a hand that is partly outside
    # the frame cannot produce negative pixel coordinates.
    points = np.array(
        [
            [
                max(0, min(int(lm.x * image_width), image_width - 1)),
                max(0, min(int(lm.y * image_height), image_height - 1)),
            ]
            for lm in landmarks.landmark
        ],
        dtype=int,
    ).reshape(-1, 2)

    if points.shape[0] == 0:
        return np.array([0, 0, 0, 0])

    x_min, y_min = points.min(axis=0)
    x_max, y_max = points.max(axis=0)
    # The +1 keeps the inclusive-pixel extent that cv2.boundingRect reports
    # for integer points (w = x_max - x_min + 1), so x + w == x_max + 1.
    return np.array([x_min, y_min, x_max + 1, y_max + 1])
# Convert hand landmarks to pixel coordinates.
def calc_landmark_list(image, landmarks):
    """Convert a hand's landmarks to pixel coordinates.

    Args:
        image: Array whose ``shape[0]`` / ``shape[1]`` are the frame height
            and width.
        landmarks: Object exposing ``.landmark``, an iterable of points with
            ``x``, ``y`` and ``visibility`` attributes; ``x`` / ``y`` are
            multiplied by the frame size here.

    Returns:
        A tuple ``(points, joints)``: ``points`` is an ``(N, 2)`` integer
        ``np.ndarray`` of ``[x, y]`` pixels, and ``joints`` is a list of
        ``[x, y, visibility]`` entries in the same order.
    """
    image_width, image_height = image.shape[1], image.shape[0]

    landmark_point = []
    joint = []
    for lm in landmarks.landmark:
        # Clamp on both sides: the original only bounded the upper edge, so
        # off-frame landmarks could yield negative pixel coordinates.
        px = max(0, min(int(lm.x * image_width), image_width - 1))
        py = max(0, min(int(lm.y * image_height), image_height - 1))
        landmark_point.append([px, py])
        joint.append([px, py, lm.visibility])

    # reshape keeps the (N, 2) layout even when no landmark was supplied.
    return np.array(landmark_point, dtype=int).reshape(-1, 2), joint
# Build the MobileSAM predictor once. The original rebuilt it on every frame,
# re-reading the checkpoint and moving the model to the device each time.
model_type = "vit_t"
sam_checkpoint = "./weights/mobile_sam.pt"
mobile_sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
mobile_sam.to(device=device)
mobile_sam.eval()
predictor = SamPredictor(mobile_sam)

# Working resolution of every frame (width, height).
frame_width, frame_height = 640, 480

# Resizable window, created once instead of on every frame.
cv2.namedWindow("Image_Mask", cv2.WINDOW_FREERATIO)

try:
    while True:
        success, img = cap.read()
        if not success:
            # No frame available (camera missing/unplugged or stream ended).
            break

        if use_camera:
            img = cv2.flip(img, 1)  # mirror the camera view
        img = cv2.resize(img, (frame_width, frame_height))
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Hand detection with MediaPipe.
        results = hands.process(imgRGB)

        # White-on-black mask canvas; uint8 so it stacks with the frame.
        label_rgb = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)

        if results.multi_hand_landmarks:
            # Run the SAM image encoder only when there is a hand to segment.
            predictor.set_image(imgRGB)
            for hand_landmarks in results.multi_hand_landmarks:
                # Bounding box and key points of this hand.
                brect = calc_bounding_rect(img, hand_landmarks)
                landmark_list, _ = calc_landmark_list(img, hand_landmarks)

                # Every key point is a foreground (label 1) prompt.
                point_labels = np.ones(len(landmark_list), dtype=int)
                masks, _, _ = predictor.predict(
                    point_coords=landmark_list,
                    point_labels=point_labels,
                    box=brect,
                    multimask_output=False,
                )
                label_rgb[masks[0].astype(bool)] = 255

        # Show the frame and the mask side by side.
        cv2.imshow("Image_Mask", np.hstack([img, label_rgb]))

        # ESC or 'q' leaves the loop so the camera is released cleanly.
        if cv2.waitKey(1) & 0xFF in (27, ord("q")):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()
分割效果还不错,单手、双手、手中握有物体等场景下效果如下
test_video