Contents
cannot import name 'initialize_config_module' from 'hydra'
cannot import name '_C' from 'sam2' (fix)
torch must be below version 2.5, otherwise you get the error:
cuDNN Frontend error: [cudnn_frontend] Error: No execution plans support the graph.
Grounded-SAM-2
https://github.com/IDEA-Research/Grounded-SAM-2
https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth
https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
cannot import name 'initialize_config_module' from 'hydra'
This usually means the wrong package is installed (or hydra-core is too old); install hydra-core:
pip install hydra-core
protobuf must be below 3.20 (newer versions raise the "Descriptors cannot not be created directly" TypeError with older generated code):
pip install protobuf==3.19.0
SAM2 Windows installation
SAM2 Installation and Runtime Troubleshooting (sam2 deployment) - CSDN blog
cannot import name '_C' from 'sam2' (fix)
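This error usually means the SAM-2 CUDA extension (sam2/_C) was never compiled. Rebuilding it from the repo root normally fixes it (standard commands from the SAM-2 repo; pick whichever matches your install style):
pip install -e .
python setup.py build_ext --inplace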
Inference code (tested, works):
import glob
import os
import cv2
import json
import torch
import numpy as np
import supervision as sv
import pycocotools.mask as mask_util
from pathlib import Path
from torchvision.ops import box_convert
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from grounding_dino.groundingdino.util.inference import load_model, load_image, predict
TEXT_PROMPT = "knive"
SAM2_CHECKPOINT = "./checkpoints/sam2.1_hiera_large.pt"
# SAM2_CHECKPOINT = r"E:\project\sam2\sam2-main\checkpoints\sam2.1_hiera_large.pt"
SAM2_MODEL_CONFIG = "configs/sam2.1/sam2.1_hiera_l.yaml"
GROUNDING_DINO_CONFIG = "grounding_dino/groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT = "gdino_checkpoints/groundingdino_swint_ogc.pth"
BOX_THRESHOLD = 0.2
TEXT_THRESHOLD = 0.2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
OUTPUT_DIR = 'dao'
DUMP_JSON_RESULTS = True
os.makedirs(OUTPUT_DIR, exist_ok=True)
# build SAM2 image predictor
sam2_checkpoint = SAM2_CHECKPOINT
model_cfg = SAM2_MODEL_CONFIG
sam2_model = build_sam2(model_cfg, sam2_checkpoint, device=DEVICE)
sam2_predictor = SAM2ImagePredictor(sam2_model)
# build grounding dino model
grounding_model = load_model(
    model_config_path=GROUNDING_DINO_CONFIG,
    model_checkpoint_path=GROUNDING_DINO_CHECKPOINT,
    device=DEVICE
)
# setup the input image and text prompt for SAM 2 and Grounding DINO
# VERY important: text queries need to be lowercased + end with a dot
text = TEXT_PROMPT
img_dir = r'E:\data\qijun\dao_daoge/'
imgs = glob.glob(os.path.join(img_dir, '*.jpg'))
for img_path in imgs:
    img_name = os.path.basename(img_path)
    image_source, image = load_image(img_path)
    sam2_predictor.set_image(image_source)
    boxes, confidences, labels = predict(
        model=grounding_model,
        image=image,
        caption=text,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
    )
    # process the box prompt for SAM 2: scale normalized cxcywh boxes to pixel xyxy
    h, w, _ = image_source.shape
    boxes = boxes * torch.Tensor([w, h, w, h])
    input_boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    # FIXME: figure out how this influences the G-DINO model
    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    masks, scores, logits = sam2_predictor.predict(
        point_coords=None,
        point_labels=None,
        box=input_boxes,
        multimask_output=False,
    )
    if masks.ndim == 4:
        masks = masks.squeeze(1)
    confidences = confidences.numpy().tolist()
    class_names = labels
    class_ids = np.array(list(range(len(class_names))))
    labels = [
        f"{class_name} {confidence:.2f}"
        for class_name, confidence
        in zip(class_names, confidences)
    ]
    """
    Visualize the image with supervision's annotation API
    """
    # cv2.imdecode + np.fromfile handles non-ASCII paths that cv2.imread cannot
    img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), 1)
    # img = cv2.imread(img_path)
    detections = sv.Detections(
        xyxy=input_boxes,  # (n, 4)
        mask=masks.astype(bool),  # (n, h, w)
        class_id=class_ids
    )
    box_annotator = sv.BoxAnnotator()
    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)
    label_annotator = sv.LabelAnnotator()
    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
    # note: cv2.imwrite also fails on non-ASCII paths; cv2.imencode(...)[1].tofile(...) is the symmetric workaround
    cv2.imwrite(os.path.join(OUTPUT_DIR, img_name), annotated_frame)
    mask_annotator = sv.MaskAnnotator()
    try:
        annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
        cv2.imwrite(os.path.join(OUTPUT_DIR, img_name[:-4] + "_mask.jpg"), annotated_frame)
    except Exception as e:
        print(e)

    def single_mask_to_rle(mask):
        rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
        rle["counts"] = rle["counts"].decode("utf-8")
        return rle

    if DUMP_JSON_RESULTS:
        # convert masks into COCO RLE format
        mask_rles = [single_mask_to_rle(mask) for mask in masks]
        input_boxes = input_boxes.tolist()
        scores = scores.tolist()
        # save the results in a standard format
        results = {
            "image_path": img_path,
            "annotations": [
                {
                    "class_name": class_name,
                    "bbox": box,
                    "segmentation": mask_rle,
                    "score": score,
                }
                for class_name, box, mask_rle, score in zip(class_names, input_boxes, mask_rles, scores)
            ],
            "box_format": "xyxy",
            "img_width": w,
            "img_height": h,
        }
        with open(os.path.join(OUTPUT_DIR, img_name[:-4] + "_results.json"), "w") as f:
            json.dump(results, f, indent=4)
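To sanity-check the dumped JSON, the RLE masks can be decoded back to binary arrays with pycocotools. A minimal sketch, assuming the JSON layout written by the loop above (the file name here is a hypothetical example output):

import json
import pycocotools.mask as mask_util

with open("dao/example_results.json") as f:  # hypothetical output file from the loop above
    results = json.load(f)
for ann in results["annotations"]:
    mask = mask_util.decode(ann["segmentation"])  # (h, w) uint8 array of 0/1
    print(ann["class_name"], ann["score"], "mask pixels:", int(mask.sum()))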
Linux Docker installation:
SAM2: Environment Setup & Code Debugging - 要养家的程序猿 - cnblogs
docker pull pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel
This failed with: Error response from daemon: Get "https://registry-1.docker.io/v2/": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)
The timeout means Docker Hub could not be reached from this machine; a registry mirror or proxy is needed.
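A common workaround, assuming you have a reachable mirror (the URL below is a placeholder, not a recommendation), is to configure a registry mirror in /etc/docker/daemon.json and restart the Docker daemon:

{
    "registry-mirrors": ["https://<your-mirror-host>"]
}

sudo systemctl restart docker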
SAM2 image segmentation (original notebook version)
import os
# if using Apple MPS, fall back to CPU for unsupported ops
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
#%%
# select the device for computation
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"using device: {device}")
if device.type == "cuda":
    # use bfloat16 for the entire notebook
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif device.type == "mps":
    print(
        "\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA and might "
        "give numerically different outputs and sometimes degraded performance on MPS. "
        "See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion."
    )
np.random.seed(3)
def show_mask(mask, ax, random_color=False, borders=True):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30/255, 144/255, 255/255, 0.6])
    h, w = mask.shape[-2:]
    mask = mask.astype(np.uint8)
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    if borders:
        import cv2
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # Try to smooth contours
        contours = [cv2.approxPolyDP(contour, epsilon=0.01, closed=True) for contour in contours]
        mask_image = cv2.drawContours(mask_image, contours, -1, (1, 1, 1, 0.5), thickness=2)
    ax.imshow(mask_image)

def show_points(coords, labels, ax, marker_size=375):
    pos_points = coords[labels==1]
    neg_points = coords[labels==0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)

def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))

def show_masks(image, masks, scores, point_coords=None, box_coords=None, input_labels=None, borders=True):
    for i, (mask, score) in enumerate(zip(masks, scores)):
        plt.figure(figsize=(10, 10))
        plt.imshow(image)
        show_mask(mask, plt.gca(), borders=borders)
        if point_coords is not None:
            assert input_labels is not None
            show_points(point_coords, input_labels, plt.gca())
        if box_coords is not None:
            # boxes
            show_box(box_coords, plt.gca())
        if len(scores) > 1:
            plt.title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18)
        plt.axis('off')
        plt.show()
image = Image.open(r"E:\project\sam2\sam2-main\notebooks\images\truck.jpg")
image = np.array(image.convert("RGB"))
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
sam2_checkpoint =r"E:\project\sam2\sam2-main\checkpoints\sam2.1_hiera_large.pt"
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
sam2_model = build_sam2(model_cfg, sam2_checkpoint, device=device)
predictor = SAM2ImagePredictor(sam2_model)
predictor.set_image(image)
input_point = np.array([[500, 375]])
input_label = np.array([1])
if 0:
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    show_points(input_point, input_label, plt.gca())
    plt.axis('on')
    plt.show()
print(predictor._features["image_embed"].shape, predictor._features["image_embed"][-1].shape)
masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    multimask_output=True,
)
sorted_ind = np.argsort(scores)[::-1]
masks = masks[sorted_ind]
scores = scores[sorted_ind]
logits = logits[sorted_ind]
show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label, borders=True)
input_point = np.array([[500, 375], [1125, 625]])
input_label = np.array([1, 1])
mask_input = logits[np.argmax(scores), :, :] # Choose the model's best mask
#%%
masks, scores, _ = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    mask_input=mask_input[None, :, :],
    multimask_output=False,
)
show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label)
SAM2 image segmentation (with SAM1 timing comparison)
https://zhuanlan.zhihu.com/p/714031640
import torch
import numpy as np
import cv2
from PIL import Image
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
import time
import hydra
New_SAM = True
# use bfloat16 for the entire notebook
if New_SAM:
    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
# image = Image.open('/home/taohu/Projects/Data/RGB/thumbnail_Picture1.png')
# image = np.array(image.convert("RGB"))
image = cv2.imread('/home/taohu/Projects/Data/RGB/thumbnail_Picture1.png')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
if New_SAM:
    method = "SAM2"
else:
    method = "SAM1"
start_time1 = time.time()
if New_SAM:
    sam2_checkpoint = "models/sam2_hiera_large.pt"
    model_cfg = "sam2_hiera_l.yaml"
    sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cuda")
    predictor = SAM2ImagePredictor(sam2_model)
    predictor.set_image(image)
else:
    model_type = "vit_h"
    sam_checkpoint = "models/sam_vit_h_4b8939.pth"
    sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
    sam.to("cuda")
    predictor = SamPredictor(sam)
    predictor.set_image(image)
end_time1 = time.time()
load_time = end_time1 - start_time1
print(f"Loading time ({method}): {load_time} seconds")
input_box = np.array([58, 107, 213, 281])
input_point = np.array([[104, 163]])
input_label = np.array([1])
start_time2 = time.time()
masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    box=input_box,
    multimask_output=False,
)
end_time2 = time.time()
execution_time = end_time2 - start_time2
print(f"Execution time ({method}): {execution_time} seconds")
mask_array = np.array(masks[0])
if New_SAM:
    mask_array = mask_array.astype(np.uint8) * 255  # SAM2 returns 0/1-valued masks
    mask_image = Image.fromarray(mask_array)
    mask_image.save("sam2-bw.jpg")
else:
    mask_array = mask_array.astype(np.uint8) * 255  # SAM1 returns boolean masks; PIL cannot save bool arrays directly
    mask_image = Image.fromarray(mask_array)
    mask_image.save("sam1-bw.jpg")
Box test OK, taking the highest-scoring mask:
import os
import cv2
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0"
import numpy as np
import torch
from PIL import Image
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
print(f"using device: {device}")
if device.type == "cuda":
torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
# turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
if torch.cuda.get_device_properties(0).major >= 8:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
def show_mask(mask, ax, random_color=False, borders=True):
    color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    h, w = mask.shape[-2:]
    mask = mask.astype(np.uint8)
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    if borders:
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # Try to smooth contours
        contours = [cv2.approxPolyDP(contour, epsilon=0.01, closed=True) for contour in contours]
        mask_image = cv2.drawContours(mask_image, contours, -1, (1, 1, 1, 0.5), thickness=2)
    ax.imshow(mask_image)
if __name__ == '__main__':
    np.random.seed(3)
    sam2_checkpoint = r"E:\project\Grounded-SAM-2-main\checkpoints\sam2.1_hiera_large.pt"
    model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
    sam2_model = build_sam2(model_cfg, sam2_checkpoint, device=device)
    view_debug = 1
    image = Image.open(r"E:\project\sam2\sam2-main\notebooks\images\truck.jpg")
    image = np.array(image.convert("RGB"))
    image_o = image
    predictor = SAM2ImagePredictor(sam2_model)
    predictor.set_image(image)
    input_label = np.array([1])
    rate = 0.1
    print(predictor._features["image_embed"].shape, predictor._features["image_embed"][-1].shape)
    if 1:
        input_box = np.array([102, 235, 1736, 869], dtype=np.int32)
        masks, scores, logits = predictor.predict(
            point_coords=None,
            point_labels=input_label,  # ignored when point_coords is None
            multimask_output=True,
            box=input_box,
        )
        sorted_ind = np.argsort(scores)[::-1]
        masks = masks[sorted_ind]
        scores = scores[sorted_ind]
        logits = logits[sorted_ind]
        random_color = False
        mask = masks[0]  # highest-scoring mask
        if random_color:
            color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
        else:
            color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
        h, w = mask.shape[-2:]
        mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
        x0, y0 = input_box[0], input_box[1]
        w, h = input_box[2] - input_box[0], input_box[3] - input_box[1]
        mask_gray = (mask * 255).astype(np.uint8)
        contours, _ = cv2.findContours(mask_gray, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # find the largest contour (assumed to be the white region)
        max_area = 0
        max_cnt = None
        for cnt in contours:
            area = cv2.contourArea(cnt)
            if area > max_area:
                max_area = area
                max_cnt = cnt
        x, y, w, h = cv2.boundingRect(max_cnt)  # bounding rectangle of the largest contour
        if view_debug:
            cv2.rectangle(image_o, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.imshow("img", image)
            cv2.imshow("mask_gray", mask_gray)
            cv2.imshow("one", mask_image)
            cv2.waitKey(0)
        if rate > 0:
            input_box = np.array([x - int(w * rate), y - int(h * rate), x + int(w * (1 + rate)), y + int(h * (1 + rate))])
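The snippet ends here; presumably the expanded input_box is meant to be fed back for a second, refined prediction. A minimal sketch of that follow-up, which is my assumption and not part of the original post:

# hypothetical second pass with the expanded box
input_box = np.clip(input_box, 0, [image.shape[1], image.shape[0], image.shape[1], image.shape[0]])  # keep the box inside the image
masks, scores, logits = predictor.predict(
    point_coords=None,
    point_labels=None,
    box=input_box,
    multimask_output=False,
)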