SAM2 Installation and Usage: Pitfall Notes

Table of Contents

torch must be below version 2.5, or you will hit this error:

Grounded-SAM-2

cannot import name 'initialize_config_module' from 'hydra'

SAM2 Windows installation

Fix for: cannot import name '_C' from 'sam2'

Inference code (verified working):

Linux Docker installation:

SAM2 image segmentation (original notebook version)

SAM2 image segmentation (SAM2 vs SAM1 speed comparison)

Box test (verified working), keeping the highest-scoring mask:


torch must be below version 2.5, or you will hit this error:

cuDNN Frontend error: [cudnn_frontend] Error: No execution plans support the graph.

Reference: "cuDNN Frontend error: [cudnn_frontend] Error: No execution plans support the graph, with the cudnn frontend json" (CSDN blog)
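To stay below 2.5, pin torch explicitly; the exact 2.4.x pair below is an assumption, match it to your CUDA toolkit:

pip install "torch==2.4.1" "torchvision==0.19.1"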

Grounded-SAM-2

https://github.com/IDEA-Research/Grounded-SAM-2

https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth

https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

cannot import name 'initialize_config_module' from 'hydra'

pip install hydra-core
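If the import error persists after installing, a likely cause (an assumption based on a common package conflict) is the unrelated `hydra` package from PyPI shadowing `hydra-core`; remove it first:

pip uninstall hydra
pip install --upgrade hydra-core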

protobuf must be below 3.20:

pip install protobuf==3.19.0

SAM2 Windows installation

Reference: "SAM2 installation and runtime troubleshooting" (CSDN blog)

Fix for: cannot import name '_C' from 'sam2'

http://t.csdnimg.cn/CEbKa
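The missing `_C` module is SAM2's compiled CUDA extension; the commonly cited fix is to build it in place from the repo root (this assumes the official sam2 checkout):

python setup.py build_ext --inplace

Note that the extension is only used by some post-processing options, so recent SAM2 versions can also run without it.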

Inference code (verified working):

import glob
import os
import cv2
import json
import torch
import numpy as np
import supervision as sv
import pycocotools.mask as mask_util
from pathlib import Path
from torchvision.ops import box_convert
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from grounding_dino.groundingdino.util.inference import load_model, load_image, predict


TEXT_PROMPT = "knife."  # lowercase, ending with a dot, as Grounding DINO expects

SAM2_CHECKPOINT = "./checkpoints/sam2.1_hiera_large.pt"
# SAM2_CHECKPOINT = r"E:\project\sam2\sam2-main\checkpoints\sam2.1_hiera_large.pt"
SAM2_MODEL_CONFIG = "configs/sam2.1/sam2.1_hiera_l.yaml"
GROUNDING_DINO_CONFIG = "grounding_dino/groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT = "gdino_checkpoints/groundingdino_swint_ogc.pth"
BOX_THRESHOLD = 0.2
TEXT_THRESHOLD = 0.2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
OUTPUT_DIR = 'dao'
DUMP_JSON_RESULTS = True


os.makedirs(OUTPUT_DIR, exist_ok=True)


# build SAM2 image predictor
sam2_checkpoint = SAM2_CHECKPOINT
model_cfg = SAM2_MODEL_CONFIG
sam2_model = build_sam2(model_cfg, sam2_checkpoint, device=DEVICE)
sam2_predictor = SAM2ImagePredictor(sam2_model)

# build grounding dino model
grounding_model = load_model(
    model_config_path=GROUNDING_DINO_CONFIG, 
    model_checkpoint_path=GROUNDING_DINO_CHECKPOINT,
    device=DEVICE
)


# setup the input image and text prompt for SAM 2 and Grounding DINO
# VERY important: text queries need to be lowercased + end with a dot
text = TEXT_PROMPT

img_dir = r'E:\data\qijun\dao_daoge/'

imgs = glob.glob(os.path.join(img_dir, '*.jpg'))

for img_path in imgs:
    img_name = os.path.basename(img_path)
    image_source, image = load_image(img_path)

    sam2_predictor.set_image(image_source)

    boxes, confidences, labels = predict(
        model=grounding_model,
        image=image,
        caption=text,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
    )

    # process the box prompt for SAM 2
    h, w, _ = image_source.shape
    boxes = boxes * torch.Tensor([w, h, w, h])
    input_boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()

    # FIXME: figure how does this influence the G-DINO model
    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()

    if torch.cuda.get_device_properties(0).major >= 8:
        # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    masks, scores, logits = sam2_predictor.predict(
        point_coords=None,
        point_labels=None,
        box=input_boxes,
        multimask_output=False,
    )

    if masks.ndim == 4:
        masks = masks.squeeze(1)

    confidences = confidences.numpy().tolist()
    class_names = labels

    class_ids = np.array(list(range(len(class_names))))

    labels = [
        f"{class_name} {confidence:.2f}"
        for class_name, confidence
        in zip(class_names, confidences)
    ]

    """
    Visualize image with supervision useful API
    """

    # imdecode + np.fromfile handles non-ASCII (e.g. Chinese) paths on Windows, unlike cv2.imread
    img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), 1)
    detections = sv.Detections(
        xyxy=input_boxes,  # (n, 4)
        mask=masks.astype(bool),  # (n, h, w)
        class_id=class_ids
    )

    box_annotator = sv.BoxAnnotator()
    annotated_frame = box_annotator.annotate(scene=img.copy(), detections=detections)

    label_annotator = sv.LabelAnnotator()
    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
    cv2.imwrite(os.path.join(OUTPUT_DIR, img_name), annotated_frame)

    mask_annotator = sv.MaskAnnotator()
    try:
        annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
        cv2.imwrite(os.path.join(OUTPUT_DIR, img_name[:-4] + "_mask.jpg"), annotated_frame)
    except Exception as e:
        print(e)

    def single_mask_to_rle(mask):
        rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
        rle["counts"] = rle["counts"].decode("utf-8")
        return rle

    if DUMP_JSON_RESULTS:
        # convert mask into rle format
        mask_rles = [single_mask_to_rle(mask) for mask in masks]

        input_boxes = input_boxes.tolist()
        scores = scores.tolist()
        # save the results in standard format
        results = {
            "image_path": img_path,
            "annotations" : [
                {
                    "class_name": class_name,
                    "bbox": box,
                    "segmentation": mask_rle,
                    "score": score,
                }
                for class_name, box, mask_rle, score in zip(class_names, input_boxes, mask_rles, scores)
            ],
            "box_format": "xyxy",
            "img_width": w,
            "img_height": h,
        }

        with open(os.path.join(OUTPUT_DIR, img_name[:-4] + "_results.json"), "w") as f:
            json.dump(results, f, indent=4)
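To load the dumped RLE annotations back into binary masks, pycocotools can decode them; the counts field must be re-encoded to bytes first. A short sketch (the file name is a placeholder for one of the *_results.json files written above):

import json
import pycocotools.mask as mask_util

with open("dao/example_results.json") as f:  # hypothetical: any *_results.json from the loop above
    results = json.load(f)

for ann in results["annotations"]:
    rle = ann["segmentation"]
    rle["counts"] = rle["counts"].encode("utf-8")  # decode() expects bytes, JSON stored a str
    mask = mask_util.decode(rle)  # (h, w) uint8 array of 0/1
    print(ann["class_name"], ann["score"], int(mask.sum()), "foreground pixels")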

Linux Docker installation:

Reference: "SAM2: environment setup & code debugging" - 要养家的程序猿 - cnblogs

docker pull pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel

Error response from daemon: Get "https://registry-1.docker.io/v2/": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)

This timeout means the Docker daemon cannot reach Docker Hub; configuring a registry mirror or an HTTP proxy for the daemon usually resolves it, as sketched below.
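A common workaround (assuming the default daemon config path; the mirror URL is a placeholder) is to add a registry mirror to /etc/docker/daemon.json and then restart Docker with `sudo systemctl restart docker`:

{
    "registry-mirrors": ["https://your-mirror.example.com"]
}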

SAM2 image segmentation (original notebook version)


import os
# if using Apple MPS, fall back to CPU for unsupported ops
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
#%%
# select the device for computation
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"using device: {device}")

if device.type == "cuda":
    # use bfloat16 for the entire notebook
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif device.type == "mps":
    print(
        "\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA and might "
        "give numerically different outputs and sometimes degraded performance on MPS. "
        "See e.g. https://github.com/pytorch/pytorch/issues/84936 for a discussion."
    )

np.random.seed(3)

def show_mask(mask, ax, random_color=False, borders = True):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30/255, 144/255, 255/255, 0.6])
    h, w = mask.shape[-2:]
    mask = mask.astype(np.uint8)
    mask_image =  mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    if borders:
        import cv2
        contours, _ = cv2.findContours(mask,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # Try to smooth contours
        contours = [cv2.approxPolyDP(contour, epsilon=0.01, closed=True) for contour in contours]
        mask_image = cv2.drawContours(mask_image, contours, -1, (1, 1, 1, 0.5), thickness=2)
    ax.imshow(mask_image)

def show_points(coords, labels, ax, marker_size=375):
    pos_points = coords[labels==1]
    neg_points = coords[labels==0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)

def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))

def show_masks(image, masks, scores, point_coords=None, box_coords=None, input_labels=None, borders=True):
    for i, (mask, score) in enumerate(zip(masks, scores)):
        plt.figure(figsize=(10, 10))
        plt.imshow(image)
        show_mask(mask, plt.gca(), borders=borders)
        if point_coords is not None:
            assert input_labels is not None
            show_points(point_coords, input_labels, plt.gca())
        if box_coords is not None:
            # boxes
            show_box(box_coords, plt.gca())
        if len(scores) > 1:
            plt.title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18)
        plt.axis('off')
        plt.show()

image = Image.open(r"E:\project\sam2\sam2-main\notebooks\images\truck.jpg")
image = np.array(image.convert("RGB"))


from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

sam2_checkpoint =r"E:\project\sam2\sam2-main\checkpoints\sam2.1_hiera_large.pt"
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"

sam2_model = build_sam2(model_cfg, sam2_checkpoint, device=device)

predictor = SAM2ImagePredictor(sam2_model)

predictor.set_image(image)

input_point = np.array([[500, 375]])
input_label = np.array([1])

if 0:  # set to 1 to visualize the input point before predicting
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    show_points(input_point, input_label, plt.gca())
    plt.axis('on')
    plt.show()
print(predictor._features["image_embed"].shape, predictor._features["image_embed"][-1].shape)

masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    multimask_output=True,
)
sorted_ind = np.argsort(scores)[::-1]
masks = masks[sorted_ind]
scores = scores[sorted_ind]
logits = logits[sorted_ind]


show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label, borders=True)

input_point = np.array([[500, 375], [1125, 625]])
input_label = np.array([1, 1])

mask_input = logits[np.argmax(scores), :, :]  # Choose the model's best mask
#%%
masks, scores, _ = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    mask_input=mask_input[None, :, :],
    multimask_output=False,
)

show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label)

SAM2 image segmentation (SAM2 vs SAM1 speed comparison)

https://zhuanlan.zhihu.com/p/714031640

import torch
import numpy as np
import cv2
from PIL import Image
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor

import time

New_SAM = True

# use bfloat16 for the entire notebook
if New_SAM:
    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()

# image = Image.open('/home/taohu/Projects/Data/RGB/thumbnail_Picture1.png')
# image = np.array(image.convert("RGB"))

image = cv2.imread('/home/taohu/Projects/Data/RGB/thumbnail_Picture1.png')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


if New_SAM:
    method = "SAM2"
else:
    method = "SAM1"

start_time1 = time.time()

if New_SAM:
    sam2_checkpoint = "models/sam2_hiera_large.pt"
    model_cfg = "sam2_hiera_l.yaml"

    sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cuda")

    predictor = SAM2ImagePredictor(sam2_model)
    predictor.set_image(image)
else:
    model_type = "vit_h"
    sam_checkpoint = "models/sam_vit_h_4b8939.pth"
    sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
    sam.to("cuda")
    predictor = SamPredictor(sam)
    predictor.set_image(image)

end_time1 = time.time()
load_time = end_time1 - start_time1
print(f"Loading time ({method}): {load_time} seconds")

input_box = np.array([58, 107, 213, 281])
input_point = np.array([[104, 163]])
input_label = np.array([1])

start_time2 = time.time()

masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    box=input_box,
    multimask_output=False,
)

end_time2 = time.time()
execution_time = end_time2 - start_time2
print(f"Execution time ({method}): {execution_time} seconds")

mask_array = np.array(masks[0])

if New_SAM:
    mask_array = mask_array.astype(np.uint8) * 255  # SAM2 returns 0/1 mask values, so scale to 0-255
    mask_image = Image.fromarray(mask_array)
    mask_image.save("sam2-bw.jpg")
else:
    mask_image = Image.fromarray(mask_array)
    mask_image.save("sam1-bw.jpg")

Box test (verified working), keeping the highest-scoring mask:


import os
import cv2
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0"
import numpy as np
import torch
from PIL import Image
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
print(f"using device: {device}")

if device.type == "cuda":
    torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
    # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

def show_mask(mask, ax, random_color=False, borders = True):
    color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    h, w = mask.shape[-2:]
    mask = mask.astype(np.uint8)
    mask_image =  mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    if borders:
        import cv2
        contours, _ = cv2.findContours(mask,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # Try to smooth contours
        contours = [cv2.approxPolyDP(contour, epsilon=0.01, closed=True) for contour in contours]
        mask_image = cv2.drawContours(mask_image, contours, -1, (1, 1, 1, 0.5), thickness=2)
    ax.imshow(mask_image)


if __name__ == '__main__':
    np.random.seed(3)
    sam2_checkpoint = r"E:\project\Grounded-SAM-2-main\checkpoints\sam2.1_hiera_large.pt"
    model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"

    sam2_model = build_sam2(model_cfg, sam2_checkpoint, device=device)
    view_debug = 1
    image = Image.open(r"E:\project\sam2\sam2-main\notebooks\images\truck.jpg")
    image = np.array(image.convert("RGB"))
    image_o = image  # note: an alias, not a copy; drawing on image_o also modifies image
    predictor = SAM2ImagePredictor(sam2_model)
    predictor.set_image(image)

    input_label = np.array([1])
    rate = 0.1  # fraction by which to expand the fitted box on each side
    print(predictor._features["image_embed"].shape, predictor._features["image_embed"][-1].shape)
    if 1:
        input_box = np.array([102, 235, 1736, 869], dtype=np.int32)
        masks, scores, logits = predictor.predict(
            point_coords=None,
            point_labels=None,  # point labels are ignored when point_coords is None
            box=input_box,
            multimask_output=True,
        )
        sorted_ind = np.argsort(scores)[::-1]
        masks = masks[sorted_ind]
        scores = scores[sorted_ind]
        logits = logits[sorted_ind]

        random_color = False
        mask = masks[0]  # highest-scoring mask after the sort above
        if random_color:
            color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
        else:
            color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
        h, w = mask.shape[-2:]
        mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)

        mask_gray = (mask * 255).astype(np.uint8)

        contours, _ = cv2.findContours(mask_gray, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # find the largest contour, assumed to be the foreground region
        max_area = 0
        max_cnt = None
        for cnt in contours:
            area = cv2.contourArea(cnt)
            if area > max_area:
                max_area = area
                max_cnt = cnt

        x, y, w, h = cv2.boundingRect(max_cnt)  # tight bounding rectangle around the largest contour
        if view_debug:
            cv2.rectangle(image_o, (x, y), (x + w, y + h), (0, 255, 0), 2)
            # image is RGB (loaded via PIL), so cv2.imshow displays the channels swapped
            cv2.imshow("img", image)
            cv2.imshow("mask_gray", mask_gray)
            cv2.imshow("one", mask_image)
            cv2.waitKey(0)
        if rate > 0:
            # expand the fitted box by `rate` on each side (e.g. for a refinement pass)
            input_box = np.array([x - int(w * rate), y - int(h * rate), x + int(w * (1 + rate)), y + int(h * (1 + rate))])
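The expanded input_box is not used further in this snippet; presumably it feeds a second, refined prediction pass. A minimal sketch of that step (my assumption, not part of the original code):

# hypothetical refinement pass with the expanded box
masks, scores, logits = predictor.predict(
    point_coords=None,
    point_labels=None,
    box=input_box,
    multimask_output=False,  # a single mask is enough once the box is tight
)
mask = masks[0]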

SAM2 deployment hardware requirements

For deploying SAM2 (Segment Anything Model 2 or a comparably heavy vision model), the concrete hardware needs depend on the application and the performance you expect. Models of this class generally demand significant compute.

Processor (CPU/GPU)

- CPU: running on a many-core, high-performance CPU alone is possible in principle but not recommended, since inference can be very slow.
- GPU: an NVIDIA GPU is recommended, especially one with CUDA and Tensor Core support, such as a Tesla V100, an A100, or a consumer RTX-series card; this speeds up inference substantially.

Memory (RAM/VRAM)

- RAM: at least 32 GB of system memory, to handle larger inputs while keeping the system responsive.
- VRAM: with GPU acceleration, at least 8 GB; for more demanding scenarios, 16 GB or more is advisable.

Storage

The model weights alone can occupy several GB, and a working directory is needed for caching intermediate results and other temporary files, so reserve enough SSD space for fast reads and writes.

Network bandwidth

For cloud or distributed-training deployments, a stable high-speed connection is also essential for uploading and downloading the required data.

```bash
# example commands to check hardware on a Linux server
lscpu        # CPU details
nvidia-smi   # installed NVIDIA driver version and GPU status
free -h      # physical memory
df -H /      # free space on the root partition
```
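A quick way to check the GPU side of these requirements from Python (a small sketch, nothing SAM2-specific):

import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}")
    print(f"VRAM: {props.total_memory / 1024**3:.1f} GB")
    print(f"Compute capability: {props.major}.{props.minor}")  # major >= 8 means Ampere or newer (TF32-capable)
else:
    print("No CUDA device; SAM2 will fall back to CPU and run slowly.")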