弱监督定位——类激活图大显身手

摘要:

        如何不使用目标检测也能把目标定位出来?

        在计算机视觉领域,弱监督定位是一项重要任务,它旨在仅使用图像级别的标签来确定图像中物体的位置。其中,类激活图(Class Activation Map,CAM)是一种常用的方法,它能够可视化模型的注意力区域,帮助我们理解模型在图像分类中的决策依据。而Grad-CAM利用梯度信息实现定位,且更加通用,适用于各种网络架构。

        Grad-CAM可以灵活地嵌入到各种网络结构中,并通过可视化很好地将模型感兴趣的区域定位出来;再结合OpenCV对图像或视频的处理,就可以实现类似于目标检测的效果。

详细步骤

首先,我们来简单地实现一下Grad-CAM,这在我往期的文章里讲过:Grad-CAM——模型所关注的

代码:

class GradCAM:
    """Grad-CAM heat-map generator hooked onto ``model.features``.

    The constructor switches the model to eval mode and registers a forward
    hook and a backward hook on ``model.features``. The hooks stash that
    module's output and the gradient arriving at that output, which
    ``calculate_cam`` combines into a class activation map.
    """

    def __init__(self, model):
        """Put ``model`` in eval mode and attach the hooks.

        Args:
            model: a module that exposes a ``features`` sub-module
                (the layer whose activations are visualised).
        """
        self.model = model
        self.model.eval()
        self.features = None
        self.gradients = None

        target_layer = self.model.features
        target_layer.register_forward_hook(self.save_features_hook)
        # ``register_backward_hook`` is deprecated; use the full variant when
        # this PyTorch build offers it and fall back otherwise.
        # NOTE(review): full backward hooks reject in-place modification of
        # the hooked module's output by later layers - confirm for your model.
        if hasattr(target_layer, "register_full_backward_hook"):
            target_layer.register_full_backward_hook(self.save_gradients_hook)
        else:
            target_layer.register_backward_hook(self.save_gradients_hook)

    def save_features_hook(self, module, input, output):
        """Forward hook: remember the hooked module's output."""
        # Only the values are needed later, so drop the autograd graph.
        self.features = output.detach()

    def save_gradients_hook(self, module, grad_input, grad_output):
        """Backward hook: remember the gradient w.r.t. the module's output."""
        self.gradients = grad_output[0]

    def calculate_cam(self, input_tensor, target_class):
        """Compute the Grad-CAM map of ``target_class`` for ``input_tensor``.

        Args:
            input_tensor: model input; its trailing two dimensions give the
                size the map is resized to (assumes an (N, C, H, W) batch -
                TODO confirm against caller).
            target_class: index into the second dimension of the model
                output; only the first sample of the batch is back-propagated.

        Returns:
            A NumPy array scaled to [0, 1] with singleton dimensions squeezed
            out. A map with no variation (e.g. all zeros after the ReLU) is
            returned as all zeros instead of NaN.
        """
        output = self.model(input_tensor)
        self.model.zero_grad()
        one_hot = torch.zeros_like(output)
        one_hot[0][target_class] = 1
        # A single backward pass is enough, so the graph need not be retained.
        output.backward(gradient=one_hot)

        # Channel weights = spatial average of the gradients.
        pooled_gradients = F.adaptive_avg_pool2d(self.gradients, 1)
        cam = torch.mul(self.features, pooled_gradients).sum(dim=1, keepdim=True)
        cam = F.relu(cam)
        cam = F.interpolate(cam, input_tensor.size()[2:], mode="bilinear", align_corners=False)
        cam = cam.squeeze().detach().cpu().numpy()

        # Min-max normalisation, guarded against a zero range (0 / 0 -> NaN).
        cam = cam - cam.min()
        peak = cam.max()
        if peak > 0:
            cam = cam / peak

        return cam

 载入模型,结合指定类别和输入张量,就可以得到热力图:

# Load the trained model (the checkpoint stores the whole pickled model object).
# NOTE(review): torch.load unpickles arbitrary objects - only load checkpoints
# you trust. Recent PyTorch releases default to weights_only=True, which
# rejects full-model checkpoints; pass weights_only=False there if required.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.load('fruit_model.pth', map_location=device)
model = model.to(device)
grad_cam = GradCAM(model)

# Load and preprocess the image
image_path = 'test.jpg'
image = Image.open(image_path).convert('RGB')
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(image).unsqueeze(0).to(device)
target_class = 0  # class index used when the model was trained

# Calculate Grad-CAM, rescale it to 8-bit and stretch it to the image size
cam = grad_cam.calculate_cam(input_tensor, target_class)
cam = np.uint8(255 * cam)
cam = cv2.resize(cam, (image.size[0], image.size[1]))

# applyColorMap returns BGR channel order, while PIL and matplotlib expect
# RGB; without the conversion hot regions are rendered blue instead of red.
heatmap = cv2.applyColorMap(cam, cv2.COLORMAP_JET)
heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
heatmap_pil = Image.fromarray(heatmap)
image_pil = Image.fromarray(np.uint8(image))
result = Image.blend(image_pil, heatmap_pil, alpha=0.4)

plt.imshow(result)
plt.axis('off')
plt.show()

 

然后,我们给Grad-CAM添加画框的功能:设定一个阈值对热力图做二值化,再取各轮廓的外接矩形,就可以画框:

    def generate_bounding_box(self, cam, threshold):
        """Return bounding boxes of the regions where ``cam`` exceeds ``threshold``.

        Args:
            cam: single-channel activation map as a NumPy array (the callers
                in this file pass a uint8 map scaled to 0-255).
            threshold: cut-off passed to ``cv2.threshold``; pixels above it
                are treated as foreground.

        Returns:
            A list of ``(x, y, w, h)`` tuples, one per external contour.
        """
        _, binary_map = cv2.threshold(cam, threshold, 255, cv2.THRESH_BINARY)
        # findContours with RETR_EXTERNAL needs an 8-bit single-channel image,
        # so force the mask dtype in case a float map was passed in.
        binary_map = binary_map.astype("uint8")

        # OpenCV 3.x returns (image, contours, hierarchy) and 4.x returns
        # (contours, hierarchy); index -2 is the contour list in both.
        contours = cv2.findContours(binary_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

        return [tuple(cv2.boundingRect(contour)) for contour in contours]

 显示部分改成这样:

# Grad-CAM for the chosen class, rescaled to 8-bit and stretched to the
# original image size (PIL reports size as (width, height)).
cam = grad_cam.calculate_cam(input_tensor, target_class)
cam = (255 * cam).astype(np.uint8)
img_w, img_h = image.size
cam = cv2.resize(cam, (img_w, img_h))

# Threshold the map into boxes; tune the threshold to grow or shrink them.
threshold = 140
bounding_boxes = grad_cam.generate_bounding_box(cam, threshold)

# Outline every box in green on a NumPy copy of the image.
image_with_boxes = np.array(image)
box_color = (0, 255, 0)
for left, top, box_w, box_h in bounding_boxes:
    top_left = (left, top)
    bottom_right = (left + box_w, top + box_h)
    cv2.rectangle(image_with_boxes, top_left, bottom_right, box_color, 2)

# Show the annotated image without axes.
plt.imshow(image_with_boxes)
plt.axis('off')
plt.show()

根据你所设定的阈值,这个框的范围会改变。 

最后,我们来实现类似于目标检测的效果;其实就是对视频的每一帧都做一次上述检测:

import torch
import torch.nn.functional as F
from PIL import Image,ImageDraw
from torchvision import transforms
import numpy as np
import cv2
import matplotlib.pyplot as plt
class GradCAM:
    """Grad-CAM heat-map and bounding-box generator hooked onto ``model.features``.

    The constructor switches the model to eval mode and registers a forward
    hook and a backward hook on ``model.features``. The hooks stash that
    module's output and the gradient arriving at that output, which
    ``calculate_cam`` combines into a class activation map.
    """

    def __init__(self, model):
        """Put ``model`` in eval mode and attach the hooks.

        Args:
            model: a module that exposes a ``features`` sub-module
                (the layer whose activations are visualised).
        """
        self.model = model
        self.model.eval()
        self.features = None
        self.gradients = None

        target_layer = self.model.features
        target_layer.register_forward_hook(self.save_features_hook)
        # ``register_backward_hook`` is deprecated; use the full variant when
        # this PyTorch build offers it and fall back otherwise.
        # NOTE(review): full backward hooks reject in-place modification of
        # the hooked module's output by later layers - confirm for your model.
        if hasattr(target_layer, "register_full_backward_hook"):
            target_layer.register_full_backward_hook(self.save_gradients_hook)
        else:
            target_layer.register_backward_hook(self.save_gradients_hook)

    def save_features_hook(self, module, input, output):
        """Forward hook: remember the hooked module's output."""
        # Only the values are needed later, so drop the autograd graph.
        self.features = output.detach()

    def save_gradients_hook(self, module, grad_input, grad_output):
        """Backward hook: remember the gradient w.r.t. the module's output."""
        self.gradients = grad_output[0]

    def calculate_cam(self, input_tensor, target_class):
        """Compute the Grad-CAM map of ``target_class`` for ``input_tensor``.

        Args:
            input_tensor: model input; its trailing two dimensions give the
                size the map is resized to (assumes an (N, C, H, W) batch -
                TODO confirm against caller).
            target_class: index into the second dimension of the model
                output; only the first sample of the batch is back-propagated.

        Returns:
            A NumPy array scaled to [0, 1] with singleton dimensions squeezed
            out. A map with no variation (e.g. all zeros after the ReLU) is
            returned as all zeros instead of NaN.
        """
        output = self.model(input_tensor)
        self.model.zero_grad()
        one_hot = torch.zeros_like(output)
        one_hot[0][target_class] = 1
        # A single backward pass is enough, so the graph need not be retained.
        output.backward(gradient=one_hot)

        # Channel weights = spatial average of the gradients.
        pooled_gradients = F.adaptive_avg_pool2d(self.gradients, 1)
        cam = torch.mul(self.features, pooled_gradients).sum(dim=1, keepdim=True)
        cam = F.relu(cam)
        cam = F.interpolate(cam, input_tensor.size()[2:], mode="bilinear", align_corners=False)
        cam = cam.squeeze().detach().cpu().numpy()

        # Min-max normalisation, guarded against a zero range (0 / 0 -> NaN).
        cam = cam - cam.min()
        peak = cam.max()
        if peak > 0:
            cam = cam / peak

        return cam

    def generate_bounding_box(self, cam, threshold):
        """Return bounding boxes of the regions where ``cam`` exceeds ``threshold``.

        Args:
            cam: single-channel activation map as a NumPy array (the script
                below passes a uint8 map scaled to 0-255).
            threshold: cut-off passed to ``cv2.threshold``; pixels above it
                are treated as foreground.

        Returns:
            A list of ``(x, y, w, h)`` tuples, one per external contour.
        """
        _, binary_map = cv2.threshold(cam, threshold, 255, cv2.THRESH_BINARY)
        # findContours with RETR_EXTERNAL needs an 8-bit single-channel image,
        # so force the mask dtype in case a float map was passed in.
        binary_map = binary_map.astype("uint8")

        # OpenCV 3.x returns (image, contours, hierarchy) and 4.x returns
        # (contours, hierarchy); index -2 is the contour list in both.
        contours = cv2.findContours(binary_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

        return [tuple(cv2.boundingRect(contour)) for contour in contours]

# Load the trained model (the checkpoint stores the whole pickled model object).
# NOTE(review): torch.load unpickles arbitrary objects - only load checkpoints
# you trust. Recent PyTorch releases default to weights_only=True, which
# rejects full-model checkpoints; pass weights_only=False there if required.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.load('fruit_model.pth', map_location=device)
model = model.to(device)
grad_cam = GradCAM(model)
Labels = {'mihoutao':0, 'ningmeng':1, 'shiliu':2}
# Index -> name lookup, built once instead of inverting Labels for every box.
class_names = {index: name for name, index in Labels.items()}

# The preprocessing pipeline is identical for every frame, so build it once.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Create the video capture object
video_path = 'fruits_video.mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly instead of silently writing an empty output file.
    raise IOError('Cannot open video: ' + video_path)

# Read the frame rate and frame size of the input video
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Create the video writer object
output_path = 'output1_video.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

threshold = 120

try:
    # Process the video frame by frame
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV delivers BGR frames; the preprocessing expects an RGB PIL image.
        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        input_tensor = preprocess(frame_pil).unsqueeze(0).to(device)

        # Predict the class first; Grad-CAM is then computed for that class.
        with torch.no_grad():
            output = model(input_tensor)
            _, predicted_class = torch.max(output, 1)
        target_class = predicted_class.item()

        # Compute Grad-CAM, rescale to 8-bit and stretch to the frame size
        cam = grad_cam.calculate_cam(input_tensor, target_class)
        cam = np.uint8(255 * cam)
        cam = cv2.resize(cam, (frame_pil.size[0], frame_pil.size[1]))

        bounding_boxes = grad_cam.generate_bounding_box(cam, threshold)

        # Every box in a frame carries the same predicted label; fall back to
        # the raw index if it is missing from Labels.
        class_name = class_names.get(target_class, str(target_class))

        # Draw the boxes and labels on the frame
        for (x, y, w, h) in bounding_boxes:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            # Keep the label inside the frame when the box touches the top edge.
            label_y = max(y - 10, 25)
            cv2.putText(frame, class_name, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Write the annotated frame to the output video
        out.write(frame)
finally:
    # Release resources even if processing a frame raised.
    cap.release()
    out.release()
    cv2.destroyAllWindows()

来看结果:

output1_video

        使用Grad-CAM进行目标定位,它可以逐帧处理视频并在每一帧上绘制边界框以定位目标对象。如果优化代码结构,减少预处理量和计算量,其实还可以实现实时定位的,但实时性能取决于计算机的处理能力和视频的帧率。大家可以动手试试!

以上即为全部内容。

 

 

  • 7
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值