%matplotlib inline
import torch
import torchvision
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
def cls_predictor(num_inputs, num_anchors, num_classes):
    """Build the convolution that predicts a class for every anchor box.

    Args:
        num_inputs: Number of channels of the incoming feature map.
        num_anchors: Number of anchor boxes generated per pixel.
        num_classes: Number of object classes; one extra output per anchor
            is reserved for the background class.

    Returns:
        A 3x3 ``nn.Conv2d`` with padding 1 (so height and width are kept)
        whose output has ``num_anchors * (num_classes + 1)`` channels.
    """
    return nn.Conv2d(num_inputs, num_anchors * (num_classes + 1),
                     kernel_size=3, padding=1)


def bbox_predictor(num_inputs, num_anchors):
    """Build the convolution that predicts box offsets for every anchor box.

    Args:
        num_inputs: Number of channels of the incoming feature map.
        num_anchors: Number of anchor boxes generated per pixel.

    Returns:
        A 3x3 ``nn.Conv2d`` with padding 1 whose output has
        ``num_anchors * 4`` channels, i.e. four offsets per anchor.
    """
    return nn.Conv2d(num_inputs, num_anchors * 4, kernel_size=3, padding=1)
def forward(x, block):
    """Apply ``block`` to ``x`` and return whatever the block returns.

    Args:
        x: Input passed unchanged to ``block``.
        block: Any callable, e.g. an ``nn.Module``.
    """
    return block(x)
# Sanity check: run the class predictor on two zero feature maps that differ
# in channel count, spatial size and number of anchors per pixel.
Y1 = forward(torch.zeros((2, 8, 20, 20)), cls_predictor(8, 5, 10))
Y2 = forward(torch.zeros((2, 16, 10, 10)), cls_predictor(16, 3, 10))
# Output channels are num_anchors * (num_classes + 1): 5 * 11 = 55 and
# 3 * 11 = 33; the spatial size is kept by the padded 3x3 convolution.
Y1.shape, Y2.shape
down_sample_blk:先依次经过两个 kernel size 为 3×3、padding=1 的卷积层,每个卷积层后接 BN 层和 ReLU 层,之后是 2×2、stride=2 的最大池化层。这样,特征图的高和宽就减半了,输出通道数是超参数。
def down_sample_blk(in_channels, out_channels):
    """Build a block that halves the height and width of a feature map.

    The block is two rounds of (3x3 convolution with padding 1, batch
    normalisation, ReLU) followed by a 2x2 max pooling layer. Only the
    pooling layer changes the spatial size.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by both convolutions.

    Returns:
        An ``nn.Sequential`` holding the seven layers in order.
    """
    layers = []
    for _ in range(2):
        layers.extend([
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ])
        # The second convolution consumes the output of the first one.
        in_channels = out_channels
    layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)
基本网络块
def base_net():
    """Build the base feature extractor from three down-sampling blocks.

    The channel counts go 3 -> 16 -> 32 -> 64, and every block halves the
    height and width, so the spatial size shrinks by a factor of 8 overall.

    Returns:
        An ``nn.Sequential`` of three ``down_sample_blk`` stages.
    """
    num_filters = [3, 16, 32, 64]
    # Pair each channel count with the next one: (3, 16), (16, 32), (32, 64).
    stages = [down_sample_blk(c_in, c_out)
              for c_in, c_out in zip(num_filters, num_filters[1:])]
    return nn.Sequential(*stages)
# Smoke test: three halvings of a 256x256 input with 64 final channels
# should give a (2, 64, 32, 32) output.
forward(torch.zeros((2, 3, 256, 256)), base_net()).shape
def get_blk(i):
    """Return the feature block for stage ``i`` of the detector.

    Args:
        i: Stage index. 0 gives the base network, 1 a down-sampling block
            from 64 to 128 channels, 4 a global average pooling layer that
            shrinks the map to 1x1, and any other value a down-sampling
            block from 128 to 128 channels.

    Returns:
        The ``nn.Module`` for the requested stage.
    """
    if i == 0:
        return base_net()
    if i == 1:
        return down_sample_blk(64, 128)
    if i == 4:
        return nn.AdaptiveAvgPool2d((1, 1))
    return down_sample_blk(128, 128)
def blk_forward(X, blk, size, ratio, cls_predictor, bbox_predictor):
    """Run one detector stage and its two prediction heads.

    Args:
        X: Input feature map (or image batch for the first stage).
        blk: Feature block applied to ``X``.
        size: Anchor sizes passed to ``d2l.multibox_prior`` for this stage.
        ratio: Anchor aspect ratios passed to ``d2l.multibox_prior``.
        cls_predictor: Callable producing class predictions from the
            stage's feature map.
        bbox_predictor: Callable producing box-offset predictions from the
            stage's feature map.

    Returns:
        A tuple ``(Y, anchors, cls_preds, bbox_preds)`` where ``Y`` is the
        feature map produced by ``blk`` and the other three entries are all
        computed from ``Y``.
    """
    Y = blk(X)
    anchors = d2l.multibox_prior(Y, sizes=size, ratios=ratio)
    cls_preds = cls_predictor(Y)
    bbox_preds = bbox_predictor(Y)
    return (Y, anchors, cls_preds, bbox_preds)
# Anchor sizes per stage: one pair for each of the five blocks.
sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
[0.88, 0.961]]
# The same three aspect ratios are reused at every stage.
ratios = [[1, 2, 0.5]] * 5
# Anchors per pixel: 2 + 3 - 1 = 4 (presumably this matches how
# d2l.multibox_prior combines sizes and ratios -- confirm against d2l).
num_anchors = len(sizes[0]) + len(ratios[0]) - 1
def flatten_pred(pred):
    """Flatten one stage's prediction map into a 2-D tensor.

    The channel axis is moved to the end before flattening, so the values
    belonging to one pixel stay next to each other in the result.

    Args:
        pred: 4-D tensor indexed as (batch, channel, height, width).

    Returns:
        Tensor of shape (batch, height * width * channel).
    """
    channels_last = pred.permute(0, 2, 3, 1)
    return torch.flatten(channels_last, start_dim=1)
def concat_preds(preds):
    """Flatten every stage's predictions and join them along dimension 1.

    Args:
        preds: Iterable of 4-D prediction tensors sharing the same batch size.

    Returns:
        2-D tensor of shape (batch, total number of predicted values).
    """
    flattened = [flatten_pred(p) for p in preds]
    return torch.cat(flattened, dim=1)
小型的SSD网络模型
class TinySSD(nn.Module):
    """Small single-shot multibox detector made of five stages.

    Each stage owns a feature block (``blk_i``), a class prediction head
    (``cls_i``) and a box-offset prediction head (``bbox_i``). The stages
    are chained: the feature map produced by one stage feeds the next.

    Args:
        num_classes: Number of object classes to detect; the background
            class is added on top of this.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, num_classes, **kwargs):
        super(TinySSD, self).__init__(**kwargs)
        self.num_classes = num_classes  # number of classes to detect
        # Number of channels of the feature map produced by each block.
        idx_to_in_channels = [64, 128, 128, 128, 128]
        for i in range(5):
            # Equivalent to the assignment `self.blk_i = get_blk(i)`.
            setattr(self, f'blk_{i}', get_blk(i))
            # Class head: input channels, anchors per pixel, class count.
            setattr(self, f'cls_{i}',
                    cls_predictor(idx_to_in_channels[i], num_anchors,
                                  num_classes))
            # Box head: input channels, anchors per pixel.
            setattr(self, f'bbox_{i}',
                    bbox_predictor(idx_to_in_channels[i], num_anchors))

    def forward(self, X):
        """Run all five stages and gather their anchors and predictions.

        Args:
            X: Batch of input images.

        Returns:
            A tuple ``(anchors, cls_preds, bbox_preds)``. For a
            (32, 3, 256, 256) input with ``num_classes=1`` the author
            records the shapes [1, 5444, 4], [32, 5444, 2] and [32, 21776].
        """
        # One slot per stage.
        anchors, cls_preds, bbox_preds = [None] * 5, [None] * 5, [None] * 5
        for i in range(5):
            # `getattr(self, f'blk_{i}')` accesses `self.blk_i`. X is
            # overwritten with the stage output so the next stage consumes it.
            X, anchors[i], cls_preds[i], bbox_preds[i] = blk_forward(
                X, getattr(self, f'blk_{i}'), sizes[i], ratios[i],
                getattr(self, f'cls_{i}'), getattr(self, f'bbox_{i}'))
        # Join the anchors of all stages along the anchor dimension.
        anchors = torch.cat(anchors, dim=1)
        # Each per-stage map is (batch, num_anchors * (num_classes + 1), h, w),
        # e.g. [32, 8, 32, 32] for the first stage; after concat_preds the
        # result is 2-D: (batch, all class scores of all anchors).
        cls_preds = concat_preds(cls_preds)
        # Reshape to (batch, number of anchors, num_classes + 1); the extra
        # class is the background.
        cls_preds = cls_preds.reshape(cls_preds.shape[0], -1,
                                      self.num_classes + 1)
        bbox_preds = concat_preds(bbox_preds)
        return anchors, cls_preds, bbox_preds
一个图像一共生成 (32×32 + 16×16 + 8×8 + 4×4 + 1×1) × 4 = 5444 个锚框(每个像素位置生成 4 个锚框)。
# Shape check: push a dummy batch through a single-class detector.
net = TinySSD(num_classes=1)
# A zero batch of 32 three-channel 256x256 images. Note that
# `torch.tensor((32, 3, 256, 256))` would instead build a 1-D tensor holding
# those four numbers, which the convolutional network cannot consume.
X = torch.zeros((32, 3, 256, 256))
anchors, cls_preds, bbox_preds = net(X)
print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output bbox preds:', bbox_preds.shape)
output anchors: torch.Size([1, 5444, 4])
output class preds: torch.Size([32, 5444, 2])
output bbox preds: torch.Size([32, 21776])
读取数据集
# Mini-batch size handed to the data loader.
batch_size=32
# Only the first returned iterator is kept; the second value is discarded.
train_iter,_ = d2l.load_data_bananas(batch_size)
# Pick the compute device and build a fresh single-class detector.
device,net = d2l.try_gpu(),TinySSD(num_classes=1)
# Plain SGD with learning rate 0.2 and weight decay 5e-4.
trainer = torch.optim.SGD(net.parameters(),lr=0.2,weight_decay=5e-4)
损失函数和评价函数
# reduction='none' keeps one loss value per element, so calc_loss can
# average per sample itself instead of getting a single scalar.
cls_loss = nn.CrossEntropyLoss(reduction='none')
bbox_loss = nn.L1Loss(reduction='none')


def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
    """Compute the per-sample detection loss.

    The loss has two parts: a cross-entropy class loss and an L1 box-offset
    loss. Both predictions and labels of the box loss are multiplied by
    ``bbox_masks``, so entries whose mask is 0 contribute nothing
    (presumably the background anchors -- confirm against
    ``d2l.multibox_target``).

    Args:
        cls_preds: Predicted class scores, (batch, anchors, classes + 1),
            e.g. [32, 5444, 2].
        cls_labels: Class label assigned to each anchor, (batch, anchors).
        bbox_preds: Predicted offsets, (batch, anchors * 4), e.g. [32, 21776].
        bbox_labels: Offsets assigned to each anchor, same shape as
            ``bbox_preds``.
        bbox_masks: Mask with the same shape as ``bbox_preds``.

    Returns:
        1-D tensor with one loss value per sample in the batch.
    """
    batch_size, num_classes = cls_preds.shape[0], cls_preds.shape[2]
    # The cross-entropy loss wants (N, classes) scores and (N,) labels, so
    # flatten batch and anchors together, then restore the batch dimension
    # to average over the anchors of each sample.
    cls = cls_loss(cls_preds.reshape(-1, num_classes),
                   cls_labels.reshape(-1)).reshape(batch_size, -1).mean(dim=1)
    bbox = bbox_loss(bbox_preds * bbox_masks,
                     bbox_labels * bbox_masks).mean(dim=1)
    return cls + bbox
def cls_eval(cls_preds, cls_labels):
    """Count how many anchors received the correct class prediction.

    Args:
        cls_preds: Class scores with the classes on the last dimension.
        cls_labels: Class label per anchor; its shape must match
            ``cls_preds`` without the last dimension.

    Returns:
        The number of correctly classified anchors, as a float.
    """
    # The class scores sit on the last axis, so argmax is taken over it.
    predicted = cls_preds.argmax(dim=-1).type(cls_labels.dtype)
    return float((predicted == cls_labels).sum())
def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    """Sum the absolute offset errors over the unmasked entries.

    Args:
        bbox_preds: Predicted box offsets.
        bbox_labels: Target box offsets, same shape as ``bbox_preds``.
        bbox_masks: Multiplicative mask, same shape; entries with mask 0
            are excluded from the sum.

    Returns:
        The summed absolute error, as a float.
    """
    masked_error = (bbox_labels - bbox_preds) * bbox_masks
    return float(torch.abs(masked_error).sum())
# Train for 20 epochs; the timer is started per batch in the training loop.
num_epochs, timer = 20, d2l.Timer()
# Live plot of the two per-epoch metrics.
animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
legend=['class error', 'bbox mae'])
# Move the model to the selected device before training.
net = net.to(device)
网络模型图
for epoch in range(num_epochs):
    # Accumulates four sums: correct class predictions, number of class
    # labels, summed absolute box error, number of box labels.
    metric = d2l.Accumulator(4)
    net.train()  # switch to training mode
    for features, target in train_iter:
        timer.start()
        trainer.zero_grad()  # clear gradients from the previous step
        # Move the batch to the compute device.
        X, Y = features.to(device), target.to(device)
        # Generate multi-scale anchors and predict a class and an offset
        # for each of them.
        anchors, cls_preds, bbox_preds = net(X)
        # Assign a class and an offset target to every anchor.
        bbox_labels, bbox_masks, cls_labels = d2l.multibox_target(anchors, Y)
        # Loss from the predicted and assigned classes and offsets.
        l = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels,
                      bbox_masks)
        l.mean().backward()  # back-propagate the batch-mean loss
        trainer.step()  # update the parameters
        metric.add(cls_eval(cls_preds, cls_labels), cls_labels.numel(),
                   bbox_eval(bbox_preds, bbox_labels, bbox_masks),
                   bbox_labels.numel())
    # Per-epoch class error rate and mean absolute box error.
    cls_err, bbox_mae = 1 - metric[0] / metric[1], metric[2] / metric[3]
    animator.add(epoch + 1, (cls_err, bbox_mae))  # update the plot
print(f'class err {cls_err:.2e}, bbox mae {bbox_mae:.2e}')
print(f'{len(train_iter.dataset) / timer.stop():.1f} examples/sec on '
      f'{str(device)}')
预测
# Read the test image, add a leading batch dimension and convert to float.
# The author notes the result is [1, 3, 256, 256]; that depends on the file.
X = torchvision.io.read_image('../../pytorch/img/banana.jpg').unsqueeze(0).float()
# Channels-last integer copy of the same image, used for plotting.
img = X.squeeze(0).permute(1,2,0).long()
def predict(X):
    """Detect objects in ``X`` and return the non-background detections.

    Args:
        X: Batch of images; only the detections of the first image are
            returned.

    Returns:
        The rows of the first image's detection output whose first column
        is not -1. According to the author's note each row holds the class
        index, the confidence and the predicted box coordinates.
    """
    net.eval()
    anchors, cls_preds, bbox_preds = net(X.to(device))
    # Turn the class scores into probabilities and move the class axis in
    # front of the anchor axis: (batch, classes + 1, anchors).
    cls_probs = F.softmax(cls_preds, dim=2).permute(0, 2, 1)
    # The detection step consumes the probabilities, not the raw scores.
    output = d2l.multibox_detection(cls_probs, bbox_preds, anchors)
    # Indices of the anchors that are not marked as background (-1).
    idx = [i for i, row in enumerate(output[0]) if row[0] != -1]
    return output[0, idx]
# Detections for the test image; the author notes [128, 6] as a possible shape.
output = predict(X)
multibox_detection 的三个输入的形状:cls_probs 为 [1, 2, 5444],bbox_preds 为 [1, 21776],anchors 为 [1, 5444, 4]。
去除掉置信度低于阈值的锚框,得到最终预测结果
def display(img, output, threshold):
    """Show ``img`` and draw every detection that reaches ``threshold``.

    Args:
        img: Image indexed as (height, width, ...), as expected by
            ``imshow``.
        output: Iterable of detection rows; element 1 of a row is read as
            the confidence and elements 2 to 5 as box coordinates, which
            are scaled by the image width and height before drawing.
        threshold: Rows with a confidence below this value are skipped.
    """
    d2l.set_figsize((5, 5))
    fig = d2l.plt.imshow(img)
    # The image size is the same for every row, so look it up once.
    h, w = img.shape[0:2]
    for row in output:
        score = float(row[1])
        if score < threshold:
            continue
        # Scale the box coordinates by (w, h, w, h) to get pixel units.
        bbox = [row[2:6] * torch.tensor((w, h, w, h), device=row.device)]
        d2l.show_bboxes(fig.axes, bbox, '%.2f' % score, 'w')
# Draw the detections with confidence of at least 0.9, using a CPU copy of the output.
display(img, output.cpu(), threshold=0.9)