摘要:
如何不使用目标检测也能把目标定位出来?
在计算机视觉领域,弱监督定位是一项重要任务,它旨在仅使用图像级别的标签来确定图像中物体的位置。其中,类激活图(Class Activation Map,CAM)是一种常用的方法,它能够可视化模型的注意力区域,帮助我们理解模型在图像分类中的决策依据。而Grad-CAM采用梯度信息实现定位,并且适用于任意网络架构。
Grad-CAM可以灵活地嵌入到网络结构中,并通过可视化很好地将模型感兴趣的区域定位出来;再结合OpenCV对图像或视频的处理,就可以实现类似于目标检测的效果。
详细步骤
首先,我们来简单地实现一下Grad-CAM,这个在我往期的文章里有讲到:Grad-CAM——模型所关注的
代码:
class GradCAM:
    """Compute Grad-CAM heat maps for a model that exposes a ``features`` sub-module.

    Hooks are attached to ``model.features``: the forward hook stores its
    output activations, the backward hook stores the gradient flowing into
    that output. ``calculate_cam`` combines the two into a heat map.
    """

    def __init__(self, model):
        """Put ``model`` in eval mode and attach the hooks to ``model.features``."""
        self.model = model
        self.model.eval()
        self.features = None
        self.gradients = None
        self.model.features.register_forward_hook(self.save_features_hook)
        # register_backward_hook is deprecated; the "full" variant is its
        # documented replacement and reports grad_output for the whole module.
        self.model.features.register_full_backward_hook(self.save_gradients_hook)

    def save_features_hook(self, module, input, output):
        """Forward hook: remember the activations produced by ``model.features``."""
        self.features = output

    def save_gradients_hook(self, module, grad_input, grad_output):
        """Backward hook: remember the gradient w.r.t. the ``model.features`` output."""
        self.gradients = grad_output[0]

    def calculate_cam(self, input_tensor, target_class):
        """Return the Grad-CAM map for ``target_class``, scaled to [0, 1].

        Args:
            input_tensor: model input; only the first sample of the batch is
                used to build the target gradient.
            target_class: index of the output unit to explain.

        Returns:
            A NumPy array with the spatial size of ``input_tensor`` (2-D for a
            single-image batch). A flat map, with no spatial variation, is
            returned as all zeros instead of NaN.
        """
        # The backward pass needs a graph even if the caller is inside no_grad().
        with torch.enable_grad():
            output = self.model(input_tensor)
            self.model.zero_grad()
            one_hot = torch.zeros_like(output)
            one_hot[0][target_class] = 1
            # The graph is not reused afterwards, so it does not need retaining.
            output.backward(gradient=one_hot)

        # Channel weights are the spatially averaged gradients.
        pooled_gradients = F.adaptive_avg_pool2d(self.gradients, 1)
        cam = torch.mul(self.features.detach(), pooled_gradients).sum(dim=1, keepdim=True)
        cam = F.relu(cam)
        cam = F.interpolate(cam, input_tensor.size()[2:], mode="bilinear", align_corners=False)
        cam = cam.squeeze().detach().cpu().numpy()

        # Min-max normalization; guard against a zero range (0 / 0 -> NaN).
        cam_min = cam.min()
        span = cam.max() - cam_min
        cam = cam - cam_min
        if span > 0:
            cam = cam / span
        return cam
载入模型,结合指定类别和输入张量,就可以得到热力图:
# Load the trained model and wrap it with Grad-CAM.
# NOTE(review): torch.load unpickles arbitrary objects - only load checkpoints
# you trust. On PyTorch >= 2.6 a fully pickled model additionally needs
# weights_only=False.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.load('fruit_model.pth', map_location=device)
model = model.to(device)
grad_cam = GradCAM(model)

# Load and preprocess the image
image_path = 'test.jpg'
image = Image.open(image_path).convert('RGB')
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(image).unsqueeze(0).to(device)
target_class = 0  # class index used when the model was trained

# Calculate Grad-CAM and scale it to an 8-bit map at the original image size
cam = grad_cam.calculate_cam(input_tensor, target_class)
cam = np.uint8(255 * cam)
cam = cv2.resize(cam, image.size)

# applyColorMap returns BGR; convert to RGB before handing it to PIL/matplotlib,
# otherwise the hot (red) regions are rendered blue.
heatmap = cv2.applyColorMap(cam, cv2.COLORMAP_JET)
heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
heatmap_pil = Image.fromarray(heatmap)
image_pil = image  # already an RGB PIL image
result = Image.blend(image_pil, heatmap_pil, alpha=0.4)

plt.imshow(result)
plt.axis('off')
plt.show()
然后,我们给Grad-CAM添加画框的功能,可以设定阈值,通过图像二值化,就可以画框:
def generate_bounding_box(self, cam, threshold):
    """Return one ``(x, y, w, h)`` box per connected region of ``cam`` above ``threshold``.

    Args:
        cam: single-channel heat map; callers in this file pass a uint8 map
            scaled to 0-255.
        threshold: pixels strictly greater than this value count as foreground.

    Returns:
        A list of ``(x, y, w, h)`` tuples, one per external contour; empty
        when nothing exceeds the threshold.
    """
    _, binary_map = cv2.threshold(cam, threshold, 255, cv2.THRESH_BINARY)
    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
    # (contours, hierarchy); the contours are always second from the end.
    contours = cv2.findContours(binary_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    return [tuple(cv2.boundingRect(contour)) for contour in contours]
显示部分改成这样:
# Compute the Grad-CAM map as 8-bit values at the original image resolution
cam = grad_cam.calculate_cam(input_tensor, target_class)
cam = cv2.resize(np.uint8(255 * cam), image.size)

# Threshold the map and collect one box per hot region
threshold = 140  # tune this value to control which regions get a box
bounding_boxes = grad_cam.generate_bounding_box(cam, threshold)

# Draw every box in green on a NumPy copy of the image
image_with_boxes = np.array(image)
for (x, y, w, h) in bounding_boxes:
    top_left = (x, y)
    bottom_right = (x + w, y + h)
    cv2.rectangle(image_with_boxes, top_left, bottom_right, (0, 255, 0), 2)

# Show the annotated image without axes
plt.imshow(image_with_boxes)
plt.axis('off')
plt.show()
根据你所设定的阈值,这个框的范围会改变。
最后,我们来看看如何实现类似于目标检测的效果;其实就是对视频的每一帧分别进行检测:
import torch
import torch.nn.functional as F
from PIL import Image,ImageDraw
from torchvision import transforms
import numpy as np
import cv2
import matplotlib.pyplot as plt
class GradCAM:
    """Grad-CAM heat maps and bounding boxes for a model with a ``features`` sub-module.

    Hooks are attached to ``model.features``: the forward hook stores its
    output activations, the backward hook stores the gradient flowing into
    that output. ``calculate_cam`` combines the two into a heat map and
    ``generate_bounding_box`` turns a thresholded map into boxes.
    """

    def __init__(self, model):
        """Put ``model`` in eval mode and attach the hooks to ``model.features``."""
        self.model = model
        self.model.eval()
        self.features = None
        self.gradients = None
        self.model.features.register_forward_hook(self.save_features_hook)
        # register_backward_hook is deprecated; the "full" variant is its
        # documented replacement and reports grad_output for the whole module.
        self.model.features.register_full_backward_hook(self.save_gradients_hook)

    def save_features_hook(self, module, input, output):
        """Forward hook: remember the activations produced by ``model.features``."""
        self.features = output

    def save_gradients_hook(self, module, grad_input, grad_output):
        """Backward hook: remember the gradient w.r.t. the ``model.features`` output."""
        self.gradients = grad_output[0]

    def calculate_cam(self, input_tensor, target_class):
        """Return the Grad-CAM map for ``target_class``, scaled to [0, 1].

        Args:
            input_tensor: model input; only the first sample of the batch is
                used to build the target gradient.
            target_class: index of the output unit to explain.

        Returns:
            A NumPy array with the spatial size of ``input_tensor`` (2-D for a
            single-image batch). A flat map, with no spatial variation, is
            returned as all zeros instead of NaN.
        """
        # The backward pass needs a graph even if the caller is inside no_grad().
        with torch.enable_grad():
            output = self.model(input_tensor)
            self.model.zero_grad()
            one_hot = torch.zeros_like(output)
            one_hot[0][target_class] = 1
            # The graph is not reused afterwards, so it does not need retaining.
            output.backward(gradient=one_hot)

        # Channel weights are the spatially averaged gradients.
        pooled_gradients = F.adaptive_avg_pool2d(self.gradients, 1)
        cam = torch.mul(self.features.detach(), pooled_gradients).sum(dim=1, keepdim=True)
        cam = F.relu(cam)
        cam = F.interpolate(cam, input_tensor.size()[2:], mode="bilinear", align_corners=False)
        cam = cam.squeeze().detach().cpu().numpy()

        # Min-max normalization; guard against a zero range (0 / 0 -> NaN).
        cam_min = cam.min()
        span = cam.max() - cam_min
        cam = cam - cam_min
        if span > 0:
            cam = cam / span
        return cam

    def generate_bounding_box(self, cam, threshold):
        """Return one ``(x, y, w, h)`` box per connected region of ``cam`` above ``threshold``.

        Args:
            cam: single-channel heat map; the caller in this file passes a
                uint8 map scaled to 0-255.
            threshold: pixels strictly greater than this value count as
                foreground.

        Returns:
            A list of ``(x, y, w, h)`` tuples, one per external contour; empty
            when nothing exceeds the threshold.
        """
        _, binary_map = cv2.threshold(cam, threshold, 255, cv2.THRESH_BINARY)
        # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
        # (contours, hierarchy); the contours are always second from the end.
        contours = cv2.findContours(binary_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
        return [tuple(cv2.boundingRect(contour)) for contour in contours]
# Load the trained model and wrap it with Grad-CAM.
# NOTE(review): torch.load unpickles arbitrary objects - only load checkpoints
# you trust. On PyTorch >= 2.6 a fully pickled model additionally needs
# weights_only=False.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.load('fruit_model.pth', map_location=device)
model = model.to(device)
grad_cam = GradCAM(model)
Labels = {'mihoutao': 0, 'ningmeng': 1, 'shiliu': 2}
# Invert the mapping once instead of rebuilding two lists on every frame.
index_to_name = {index: name for name, index in Labels.items()}

# The preprocessing pipeline is identical for every frame, so build it once.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
threshold = 120  # heat-map cut-off (0-255) used to extract the boxes

# Open the input video
video_path = 'fruits_video.mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError('Cannot open video: ' + video_path)

# Read the frame rate and frame size so the output matches the input
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Create the video writer
output_path = 'output1_video.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

try:
    # Process the video frame by frame
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV delivers BGR frames; the model pipeline expects an RGB PIL image
        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        input_tensor = preprocess(frame_pil).unsqueeze(0).to(device)

        # Use the predicted class as the Grad-CAM target
        with torch.no_grad():
            output = model(input_tensor)
        _, predicted_class = torch.max(output, 1)
        target_class = predicted_class.item()
        class_name = index_to_name.get(target_class, str(target_class))

        # Compute Grad-CAM and scale it to an 8-bit map at frame resolution
        cam = grad_cam.calculate_cam(input_tensor, target_class)
        cam = np.uint8(255 * cam)
        cam = cv2.resize(cam, frame_pil.size)
        bounding_boxes = grad_cam.generate_bounding_box(cam, threshold)

        # Draw each box with its label; the label is drawn inside the loop so a
        # frame without boxes does not reference undefined coordinates.
        for (x, y, w, h) in bounding_boxes:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            # Keep the text baseline inside the frame for boxes touching the top edge.
            label_origin = (x, max(y - 10, 20))
            cv2.putText(frame, class_name, label_origin,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Append the annotated frame to the output video
        out.write(frame)
finally:
    # Release the capture and writer even if processing fails midway
    cap.release()
    out.release()
    cv2.destroyAllWindows()
来看结果:
output1_video
使用Grad-CAM进行目标定位,它可以逐帧处理视频并在每一帧上绘制边界框以定位目标对象。如果优化代码结构,减少预处理量和计算量,其实还可以实现实时定位的,但实时性能取决于计算机的处理能力和视频的帧率。大家可以动手试试!
以上即为全部内容。