前言
整理下以前学习的AugFPN代码理解思路。
论文链接:https://arxiv.org/abs/1912.05384
代码链接:https://github.com/Gus-Guo/AugFPN
Aug-FPN的网络结构部分比较容易理解,这里主要看看一致性监督的代码。代码是利用的mmdetection平台写的。我主要讲讲自己学习中感觉需要理解的代码,可能自己也有理解不到位的地方,对新手不太友好。总之,欢迎相互交流!
1、AugFPN:一致性监督
把retinanet_r50_augfpn_1x作为例子,这里主要理解下他的一致性监督的创新点的代码:
训练过程:
需要注意的是:这里没有在配置文件里配置self.auxiliary_bbox_roi_extractor和self.auxiliary_bbox_head,而是在检测器的初始化函数里直接build。
# Fragment from the detector's __init__: when consistent supervision is
# enabled, the auxiliary RoI extractor and auxiliary bbox head are built
# directly here instead of being declared in the config file.
if self.use_consistent_supervision:
    bbox_roi_extractor=dict(
        type='AuxAllLevelRoIExtractor',
        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
        out_channels=256,
        featmap_strides=[8,16,32]) # only apply to feature map belonging to FPN
    self.auxiliary_bbox_roi_extractor = builder.build_roi_extractor(
        bbox_roi_extractor)
    bbox_head=dict(
        type='AuxiliarySharedFCBBoxHead',
        num_fcs=2,
        in_channels=256,
        fc_out_channels=1024,
        roi_feat_size=7,
        num_classes=81,  # presumably 80 COCO classes + background — TODO confirm
        target_means=[0., 0., 0., 0.],
        target_stds=[0.1, 0.1, 0.2, 0.2],
        reg_class_agnostic=False)
    self.auxiliary_bbox_head = builder.build_head(bbox_head)
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None):
    """Training forward pass of the AugFPN detector.

    Besides the usual single-stage (RetinaNet) loss, when
    ``self.use_consistent_supervision`` is on, an auxiliary R-CNN-style
    branch runs on the pre-fusion FPN features (``y``) so every pyramid
    level is supervised consistently.

    Args:
        img (Tensor): Batched input images.
        img_metas (list[dict]): Per-image meta info (img_shape,
            scale_factor, ...).
        gt_bboxes (list[Tensor]): Ground-truth boxes, one [N, 4] tensor
            per image.
        gt_labels (list[Tensor]): Ground-truth class labels per image.
        gt_bboxes_ignore (list[Tensor], optional): Boxes to ignore
            during assignment.

    Returns:
        dict: Loss dictionary (main detector losses, plus the per-level
        auxiliary losses when consistent supervision is enabled).
    """
    if self.use_consistent_supervision: # whether to use consistent supervision
        # ``y`` are the pre-fusion (1x1-conv) FPN features used by the
        # auxiliary branch; clone the GT so the auxiliary branch cannot
        # mutate the tensors consumed by the main loss.
        x, y = self.extract_feat(img)
        gt_bboxes_auxiliary = [gt.clone() for gt in gt_bboxes] # copy of GT boxes: [N, 4]
        gt_labels_auxiliary = [label.clone() for label in gt_labels]
    else:
        x = self.extract_feat(img)
    outs = self.bbox_head(x) # predicted cls scores and box regressions
    # predictions + (gt_bboxes, gt_labels, img_metas, self.train_cfg)
    loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg)
    # the original RetinaNet loss
    losses = self.bbox_head.loss(
        *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
    if self.use_consistent_supervision:
        proposal_cfg = self.train_cfg.auxiliary.proposal # proposal settings read from the config file
        # Effectively post-processing: after NMS, 1000 boxes are kept per image.
        proposal_inputs = outs + (img_metas, proposal_cfg)
        # Decode the top-k scored predictions back into boxes and use them
        # as proposals ([1000, 4] per image).
        proposal_list = self.bbox_head.get_bboxes_auxiliary(*proposal_inputs)
        # Build the pos/neg assigner (MaxIoUAssigner) and sampler (RandomSampler).
        bbox_assigner = build_assigner(self.train_cfg.auxiliary.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.auxiliary.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None: # no ignored boxes supplied
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            # Assign each of the 1000 proposals to the GT with max IoU.
            assign_result = bbox_assigner.assign(
                proposal_list[i], gt_bboxes_auxiliary[i], gt_bboxes_ignore[i],
                gt_labels_auxiliary[i])
            # Sample positives/negatives (optionally adding GT as proposals).
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes_auxiliary[i],
                gt_labels_auxiliary[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # ``y``: the three feature maps after the 1x1 convs in the FPN.  Per
        # the paper, supervising them consistently *before* fusion narrows
        # the semantic gap between features of different scales.
        bbox_feats_raw = self.auxiliary_bbox_roi_extractor(
            y[:self.auxiliary_bbox_roi_extractor.num_inputs], rois) # RoI features
        cls_score_auxiliary, bbox_pred_auxiliary = self.auxiliary_bbox_head(bbox_feats_raw) # FC head predictions
        # Returns (labels, label_weights, bbox_targets, bbox_weights).
        bbox_targets = self.auxiliary_bbox_head.get_target(
            sampling_results, gt_bboxes, gt_labels, self.train_cfg.auxiliary.rcnn)
        loss_bbox_auxiliary = self.auxiliary_bbox_head.loss(
            cls_score_auxiliary, bbox_pred_auxiliary,
            *bbox_targets, alpha=0.25, num_level=3)
        losses.update(loss_bbox_auxiliary) # merge the auxiliary loss dict
    return losses
2.get_bboxes_auxiliary
这里get_bboxes_auxiliary和原来的retinanet的代码一样,就是通过预测结果生成预测的框,这里和rpn有点像(个人见解:因为过程都一样,只不过这更像是反馈。它是利用一阶段的输出作为proposal,但是它没有去利用它调节框,而仅仅是为了监督不同尺寸的特征层上的目标,使他们保持一致性)
def get_bboxes_auxiliary(self, cls_scores, bbox_preds, img_metas, cfg,
                         rescale=False):
    """Decode per-level head outputs into per-image proposal boxes.

    Same flow as the regular RetinaNet ``get_bboxes``: build one anchor
    grid per pyramid level, then hand each image's detached score/delta
    maps to ``get_bboxes_single_auxiliary``.

    Args:
        cls_scores (list[Tensor]): Per-level classification maps.
        bbox_preds (list[Tensor]): Per-level regression maps.
        img_metas (list[dict]): Per-image meta info.
        cfg: Proposal-generation config.
        rescale (bool): Whether to rescale boxes to the original image.

    Returns:
        list: One proposal result per image.
    """
    assert len(cls_scores) == len(bbox_preds)
    # One anchor grid per pyramid level, sized after its score map.
    mlvl_anchors = [
        gen.grid_anchors(score.size()[-2:], stride)
        for gen, score, stride in zip(self.anchor_generators, cls_scores,
                                      self.anchor_strides)
    ]
    result_list = []
    for img_id, meta in enumerate(img_metas):
        # Detach so proposal generation does not backprop into the head.
        per_img_scores = [level[img_id].detach() for level in cls_scores]
        per_img_deltas = [level[img_id].detach() for level in bbox_preds]
        proposals = self.get_bboxes_single_auxiliary(
            per_img_scores, per_img_deltas, mlvl_anchors,
            meta['img_shape'], meta['scale_factor'], cfg, rescale)
        result_list.append(proposals)
    return result_list
(注:原文此处为博客编辑器遗留的“在这里插入代码片”占位符,并无实际代码。)
3.sampling_result
就是将gt框作为正样本,拼接到正样本的box里,相应的索引和label及IOU值也随着改变。
eg:proposal[1000,4],gt_box:[3,4],进行cat成:[1003,4]
def sample(self,
           assign_result,
           bboxes,
           gt_bboxes,
           gt_labels=None,
           **kwargs):
    """Sample positive and negative bboxes.

    Given assignment results and the ground truth, optionally prepends
    the GT boxes to the candidate list, then draws positives and
    negatives with the configured sub-samplers.

    Args:
        assign_result (:obj:`AssignResult`): Bbox assigning results.
        bboxes (Tensor): Boxes to be sampled from.
        gt_bboxes (Tensor): Ground truth bboxes.
        gt_labels (Tensor, optional): Class labels of ground truth bboxes.

    Returns:
        :obj:`SamplingResult`: Sampling result.
    """
    candidates = bboxes[:, :4]
    # 0 marks an ordinary proposal, 1 marks an injected GT box.
    gt_flags = candidates.new_zeros((candidates.shape[0], ), dtype=torch.uint8)
    if self.add_gt_as_proposals:
        # Prepend the GT boxes to the candidates and extend the
        # assignment's labels/indices accordingly.
        candidates = torch.cat([gt_bboxes, candidates], dim=0)
        assign_result.add_gt_(gt_labels)
        gt_flags = torch.cat(
            [candidates.new_ones(gt_bboxes.shape[0], dtype=torch.uint8),
             gt_flags])
    # Positive quota from the configured fraction.
    expected_pos = int(self.num * self.pos_fraction)
    pos_inds = self.pos_sampler._sample_pos(
        assign_result, expected_pos, bboxes=candidates, **kwargs)
    # Sampled indices occasionally contain duplicates (possibly a
    # PyTorch bug), so deduplicate before counting.
    pos_inds = pos_inds.unique()
    expected_neg = self.num - pos_inds.numel()
    if self.neg_pos_ub >= 0:
        # Cap negatives at neg_pos_ub times the (at least one) positives.
        neg_cap = int(self.neg_pos_ub * max(1, pos_inds.numel()))
        expected_neg = min(expected_neg, neg_cap)
    # Randomly draw the negatives (e.g. a 1:2 pos/neg ratio).
    neg_inds = self.neg_sampler._sample_neg(
        assign_result, expected_neg, bboxes=candidates, **kwargs).unique()
    return SamplingResult(pos_inds, neg_inds, candidates, gt_bboxes,
                          assign_result, gt_flags)
先生成一个gt索引,可以看出gt框的索引是从1开始的(0应该是作为背景/负样本的标记)
eg.如果有三个gt框,那么就生成1,2,3的值作为框的索引,然后cat到gt_inds上。
同时,所求的IOU也要拼接上GT与自身的IOU值,即都为1
最后,把相应的label一起拼接上
def add_gt_(self, gt_labels):
    """Prepend the ground-truth boxes to this assignment result.

    GT rows get proposal indices 1..num_gt (index 0 is reserved: the
    assigner uses 0 for negatives/background), an overlap of exactly
    1.0 (a box fully overlaps itself), and their own class labels.
    """
    num_gt = len(gt_labels)
    # 1-based indices for the GT rows, prepended to gt_inds.
    gt_index_ids = torch.arange(
        1, num_gt + 1, dtype=torch.long, device=gt_labels.device)
    self.gt_inds = torch.cat([gt_index_ids, self.gt_inds])
    # IoU of a GT box with itself is 1.
    self.max_overlaps = torch.cat(
        [self.max_overlaps.new_ones(self.num_gts), self.max_overlaps])
    if self.labels is not None:
        self.labels = torch.cat([gt_labels, self.labels])
4.bbox2roi
这里和原来的代码不同的是,没有使用bbox_mapping,将roi按面积大小分层,应该是为了符合每层都要有相同目标的ROI去监督。
def bbox2roi(bbox_list):
    """Convert a list of bboxes to roi format.

    Args:
        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
            of images.

    Returns:
        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
    """
    def _with_batch_ind(batch_ind, boxes):
        # One image's boxes -> [n, 5] rois, batch index in column 0.
        if boxes.size(0) == 0:
            return boxes.new_zeros((0, 5))
        ind_col = boxes.new_full((boxes.size(0), 1), batch_ind)
        return torch.cat([ind_col, boxes[:, :4]], dim=-1)

    return torch.cat(
        [_with_batch_ind(i, b) for i, b in enumerate(bbox_list)], 0)
5.target
def bbox_target_single(pos_bboxes,
                       neg_bboxes,
                       pos_gt_bboxes,
                       pos_gt_labels,
                       cfg,
                       reg_classes=1,
                       target_means=[.0, .0, .0, .0],
                       target_stds=[1.0, 1.0, 1.0, 1.0]):
    """Build classification/regression targets for one image's samples.

    Positives occupy the first rows of every returned tensor, negatives
    the rest (label 0 = background).

    Returns:
        tuple: (labels, label_weights, bbox_targets, bbox_weights).
    """
    num_pos = pos_bboxes.size(0)
    num_neg = neg_bboxes.size(0)
    total = num_pos + num_neg
    # Everything starts out as background with zero weight.
    labels = pos_bboxes.new_zeros(total, dtype=torch.long)
    label_weights = pos_bboxes.new_zeros(total)
    bbox_targets = pos_bboxes.new_zeros(total, 4)
    bbox_weights = pos_bboxes.new_zeros(total, 4)
    if num_pos > 0:
        labels[:num_pos] = pos_gt_labels
        # Non-positive cfg.pos_weight means "use the default weight 1".
        label_weights[:num_pos] = (
            cfg.pos_weight if cfg.pos_weight > 0 else 1.0)
        # Encode positives as mean/std-normalized deltas w.r.t. their GT.
        bbox_targets[:num_pos, :] = bbox2delta(
            pos_bboxes, pos_gt_bboxes, target_means, target_stds)
        bbox_weights[:num_pos, :] = 1
    if num_neg > 0:
        label_weights[-num_neg:] = 1.0
    if reg_classes > 1:
        # Class-specific regression: [N, 4] -> [N, 4 * reg_classes].
        bbox_targets, bbox_weights = expand_target(
            bbox_targets, bbox_weights, labels, reg_classes)
    return labels, label_weights, bbox_targets, bbox_weights
6.loss
这里计算每层的loss,而不是将每层loss合并起来
def loss(self,
         cls_score,
         bbox_pred,
         labels,
         label_weights,
         bbox_targets,
         bbox_weights,
         alpha=0.25,
         num_level=4,
         reduce=True):
    """Per-level auxiliary loss (one CE + one smooth-L1 entry per level).

    RoIs of the ``num_level`` pyramid levels are interleaved along dim 0,
    so level ``i`` is selected with the stride-``num_level`` slice
    ``[i::num_level]``; each level's loss is scaled by ``alpha`` and
    reported separately instead of being merged.
    """
    losses = dict()
    for level in range(num_level):
        # Rows belonging to this pyramid level (interleaved layout).
        level_scores = cls_score[level::num_level, :]
        level_deltas = bbox_pred[level::num_level, :]
        losses['loss_cls_level%d' % level] = alpha * weighted_cross_entropy(
            level_scores, labels, label_weights, reduce=reduce)
        losses['loss_reg_level%d' % level] = alpha * weighted_smoothl1(
            level_deltas, bbox_targets, bbox_weights,
            avg_factor=bbox_targets.size(0))
    return losses
总结
这些仅是个人见解,如有疑问,或者不同的见解,欢迎相互交流,共同学习!