目标检测-SSD代码详解

%matplotlib inline
import torch
import torchvision
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

#对每个锚框进行类别预测
def cls_predictor(num_inputs, num_anchors, num_classes):
    """3x3 conv head: per pixel, predicts (num_classes + 1) scores
    (the +1 is background) for each of the num_anchors anchors.
    padding=1 preserves the spatial size of the feature map."""
    out_channels = num_anchors * (num_classes + 1)
    return nn.Conv2d(num_inputs, out_channels, kernel_size=3, padding=1)
def forward(x, block):
    """Debug helper: apply `block` to `x` and return the result."""
    result = block(x)
    return result

# Shape check for the class heads: 5 anchors x (10 classes + 1 background)
# = 55 channels, and 3 x 11 = 33 channels; padding=1 preserves H and W.
Y1 = forward(torch.zeros((2, 8, 20, 20)), cls_predictor(8, 5, 10))
Y2 = forward(torch.zeros((2, 16, 10, 10)), cls_predictor(16, 3, 10))
Y1.shape, Y2.shape

down_sample_blk:先经过两个 kernel size 为 3×3、padding=1 的卷积层,每个卷积层后接 BN 层和 ReLU 层,之后是 2×2、strides=2 的最大池化层。这样,特征图的高和宽就减半了,通道数是超参数。

def down_sample_blk(in_channels, out_channels):
    """Two (conv3x3 -> BatchNorm -> ReLU) stages followed by a 2x2 max-pool.

    Maps in_channels -> out_channels and halves the spatial resolution.
    """
    layers = []
    channels = in_channels
    for _ in range(2):
        layers += [
            nn.Conv2d(channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]
        channels = out_channels  # second conv keeps the new channel count
    layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)

基本网络块

def base_net():
    """Backbone: three down-sampling blocks taking channels 3->16->32->64,
    so a 256x256 input comes out as a 32x32 feature map."""
    channel_plan = [3, 16, 32, 64]
    stages = [down_sample_blk(c_in, c_out)
              for c_in, c_out in zip(channel_plan[:-1], channel_plan[1:])]
    return nn.Sequential(*stages)

# Sanity check: three halving blocks turn 256x256 into (2, 64, 32, 32).
forward(torch.zeros((2, 3, 256, 256)), base_net()).shape
def get_blk(i):
    """Return stage `i` of the 5-stage SSD feature pyramid.

    Stage 0 is the base network, stage 1 widens 64 -> 128 channels,
    stage 4 collapses the map to 1x1 via global average pooling, and
    the remaining stages keep 128 channels while halving resolution.
    """
    if i == 0:
        return base_net()
    if i == 1:
        return down_sample_blk(64, 128)
    if i == 4:
        return nn.AdaptiveAvgPool2d((1, 1))
    return down_sample_blk(128, 128)
def blk_forward(X, blk, size, ratio, cls_predictor, bbox_predictor):
    """Run one pyramid stage.

    Returns (feature map, anchors generated on it, class predictions,
    box-offset predictions).
    """
    feats = blk(X)
    stage_anchors = d2l.multibox_prior(feats, sizes=size, ratios=ratio)
    class_out = cls_predictor(feats)
    box_out = bbox_predictor(feats)
    return (feats, stage_anchors, class_out, box_out)
# Anchor scales per pyramid stage, growing from small objects (fine maps)
# to large ones (coarse maps).
sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
         [0.88, 0.961]]
# Same aspect ratios at every stage.
ratios = [[1, 2, 0.5]] * 5
# Anchors per pixel: 2 + 3 - 1 = 4 (d2l convention pairs extra sizes
# with the first ratio only).
num_anchors = len(sizes[0]) + len(ratios[0]) - 1
def flatten_pred(pred):
    """Move channels last, then flatten to (batch, H*W*C) so predictions
    from different scales can be concatenated along dim 1."""
    channels_last = pred.permute(0, 2, 3, 1)
    return channels_last.flatten(start_dim=1)

def concat_preds(preds):
    """Flatten each scale's prediction map and join them along dim 1."""
    flattened = []
    for p in preds:
        flattened.append(flatten_pred(p))
    return torch.cat(flattened, dim=1)

小型的SSD网络模型

class TinySSD(nn.Module):
    """Small single-shot detector with five feature-map stages.

    Stage i owns a feature block (`blk_i`), a class head (`cls_i`) and a
    box-offset head (`bbox_i`). `forward` returns the concatenated
    anchors, class predictions and offset predictions of all stages.
    """

    def __init__(self, num_classes, **kwargs):
        super(TinySSD, self).__init__(**kwargs)
        self.num_classes = num_classes  # number of foreground classes
        # Channels feeding each stage's prediction heads.
        stage_channels = [64, 128, 128, 128, 128]
        for i, channels in enumerate(stage_channels):
            # Equivalent to self.blk_i = get_blk(i), etc.; setattr on an
            # nn.Module registers each sub-module by name.
            setattr(self, f'blk_{i}', get_blk(i))
            setattr(self, f'cls_{i}',
                    cls_predictor(channels, num_anchors, num_classes))
            setattr(self, f'bbox_{i}',
                    bbox_predictor(channels, num_anchors))

    def forward(self, X):
        anchors, cls_preds, bbox_preds = [], [], []
        for i in range(5):
            # getattr(self, f'blk_{i}') accesses self.blk_i.
            X, stage_anchors, stage_cls, stage_bbox = blk_forward(
                X, getattr(self, f'blk_{i}'), sizes[i], ratios[i],
                getattr(self, f'cls_{i}'), getattr(self, f'bbox_{i}'))
            anchors.append(stage_anchors)
            cls_preds.append(stage_cls)
            bbox_preds.append(stage_bbox)
        # Anchors are identical for every sample, hence batch dim 1.
        all_anchors = torch.cat(anchors, dim=1)
        # Reshape to (batch, total_anchors, num_classes + 1); the +1 is
        # the background class.
        all_cls = concat_preds(cls_preds)
        all_cls = all_cls.reshape(all_cls.shape[0], -1,
                                  self.num_classes + 1)
        all_bbox = concat_preds(bbox_preds)
        return all_anchors, all_cls, all_bbox

一个图像一共生成 32×32×4 + 16×16×4 + 8×8×4 + 4×4×4 + 1×1×4 = 5444 个锚框。

# Smoke test with a dummy batch of 32 RGB 256x256 images.
net = TinySSD(num_classes=1)
# Fix: the original used torch.tensor((32, 3, 256, 256)), which creates a
# 1-D tensor holding those four integers, not a (32, 3, 256, 256) batch,
# and would crash the network. torch.zeros builds the intended dummy batch.
X = torch.zeros((32, 3, 256, 256))
anchors, cls_preds, bbox_preds = net(X)
print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output bbox preds:', bbox_preds.shape)
output anchors: torch.Size([1, 5444, 4])
output class preds: torch.Size([32, 5444, 2])
output bbox preds: torch.Size([32, 21776])

读取数据集

# Load the bananas detection dataset (d2l helper); validation iterator unused.
batch_size=32
train_iter,_ = d2l.load_data_bananas(batch_size)

# Model, device and SGD optimizer; weight decay regularizes the weights.
device,net = d2l.try_gpu(),TinySSD(num_classes=1)
trainer = torch.optim.SGD(net.parameters(),lr=0.2,weight_decay=5e-4)

损失函数和评价函数

# reduction='none' keeps per-element losses (no built-in sum/mean) so we
# can mask background anchors and average per example ourselves.
cls_loss = nn.CrossEntropyLoss(reduction='none')
bbox_loss = nn.L1Loss(reduction='none')

# 损失函数分两部分,一个是类别损失(用交叉熵),一个是边界框损失(用了L1损失,只算了非背景的正例损失)
def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
    """Per-example SSD loss: class cross-entropy plus masked L1 box loss.

    Args:
        cls_preds: predicted class scores, (batch, num_anchors, num_classes+1).
        cls_labels: labeled anchor classes, (batch, num_anchors).
        bbox_preds: predicted box offsets, (batch, num_anchors*4).
        bbox_labels: labeled box offsets, (batch, num_anchors*4).
        bbox_masks: 0/1 mask that zeroes background anchors, same shape
            as bbox_preds, so only positive anchors contribute box loss.

    Returns:
        1-D tensor of length batch with each example's total loss.
    """
    batch_size, num_classes = cls_preds.shape[0], cls_preds.shape[2]
    # Fix: the original called `.rehshape(...)` (typo), which raises
    # AttributeError at runtime; the intent is `.reshape(...)`.
    cls = cls_loss(cls_preds.reshape(-1, num_classes),
                   cls_labels.reshape(-1)).reshape(batch_size, -1).mean(dim=1)
    # Mask both operands so background anchors add zero L1 loss.
    bbox = bbox_loss(bbox_preds * bbox_masks,
                     bbox_labels * bbox_masks).mean(dim=1)
    return cls + bbox
def cls_eval(cls_preds, cls_labels):
    """Count anchors whose predicted class (argmax over the last dim,
    where the class scores live) matches the label."""
    predicted = cls_preds.argmax(dim=-1).type(cls_labels.dtype)
    return float((predicted == cls_labels).sum())

def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    """Summed absolute offset error over non-background anchors only."""
    masked_err = (bbox_labels - bbox_preds) * bbox_masks
    return float(masked_err.abs().sum())

# Training setup: 20 epochs, a wall-clock timer, and a live plot of
# class error and bbox MAE per epoch.
num_epochs, timer = 20, d2l.Timer()
animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                        legend=['class error', 'bbox mae'])
net = net.to(device)

网络模型图

在这里插入图片描述

for epoch in range(num_epochs):
	# Accumulates: correct class predictions, number of class labels,
	# summed bbox absolute error, number of bbox labels.
	metric = d2l.Accumulator(4)
	net.train() # switch to training mode
	for features,target in train_iter: # batches from the dataset's __getitem__
		timer.start() # time this batch
		trainer.zero_grad() # clear accumulated gradients
		X,Y = features.to(device),target.to(device) # move batch to the compute device
		anchors,cls_preds,bbox_preds = net(X)# multiscale anchors plus class/offset predictions
		bbox_labels,bbox_masks,cls_labels = d2l.multibox_target(anchors,Y)# label each anchor with class and offsets
		l = calc_loss(cls_preds,cls_labels,bbox_preds,bbox_labels,bbox_masks)# loss from predicted vs labeled classes/offsets
		l.mean().backward() # backpropagate
		trainer.step() # update parameters
		metric.add(cls_eval(cls_preds,cls_labels),cls_labels.numel(),bbox_eval(bbox_preds,bbox_labels,bbox_masks),bbox_labels.numel())
	cls_err,bbox_mae = 1-metric[0]/metric[1],metric[2]/metric[3] # epoch-level class error and bbox MAE
	animator.add(epoch+1,(cls_err,bbox_mae)) # plot progress

print(f'class err {cls_err:.2e}, bbox mae {bbox_mae:.2e}')
print(f'{len(train_iter.dataset) / timer.stop():.1f} examples/sec on '
      f'{str(device)}')

预测

# Load a test image; read_image returns (3, H, W), unsqueeze adds the batch
# dim — per the original note the result is [1, 3, 256, 256].
X = torchvision.io.read_image('../../pytorch/img/banana.jpg').unsqueeze(0).float()
# (H, W, 3) integer image for matplotlib display.
img = X.squeeze(0).permute(1,2,0).long()

def predict(X):
    """Run the trained net on one image batch and return its detections.

    Returns rows [class_idx, confidence, x1, y1, x2, y2] (coordinates
    relative to the image) for the first image, with suppressed /
    background rows (class index -1) removed.
    """
    net.eval()
    anchors, cls_preds, bbox_preds = net(X.to(device))
    # Softmax over classes, permuted to (batch, num_classes+1, num_anchors)
    # as d2l.multibox_detection expects.
    cls_probs = F.softmax(cls_preds, dim=2).permute(0, 2, 1)
    # Fix 1: pass the softmax probabilities (cls_probs) — the original
    # passed raw cls_preds and left cls_probs unused.
    output = d2l.multibox_detection(cls_probs, bbox_preds, anchors)
    # Fix 2: the original comprehension was missing its closing bracket
    # (a SyntaxError). Keep only anchors not marked as background (-1).
    idx = [i for i, row in enumerate(output[0]) if row[0] != -1]
    return output[0, idx]
output = predict(X) # detection rows, e.g. shape [k, 6]; original note is garbled — presumably anchors [1,5444,4], cls [1,2,5444], bbox [1,21776]; verify

去除掉置信度低于阈值的锚框,得到最终预测结果

def display(img, output, threshold):
    """Show `img` and overlay every detection scoring at least `threshold`.

    `output` rows are [class_idx, score, x1, y1, x2, y2] with relative
    coordinates; they are scaled to pixels before drawing.
    """
    d2l.set_figsize((5, 5))
    fig = d2l.plt.imshow(img)
    height, width = img.shape[0:2]
    for det in output:
        confidence = float(det[1])
        if confidence >= threshold:
            scale = torch.tensor((width, height, width, height),
                                 device=det.device)
            d2l.show_bboxes(fig.axes, [det[2:6] * scale],
                            '%.2f' % confidence, 'w')

# Draw only detections with confidence >= 0.9.
display(img, output.cpu(), threshold=0.9)
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值