代码:https://github.com/magicshuang/yolov5_distillation
part1:源码赏析
源码:
... ...
# Student forward pass: in addition to the detection outputs and losses, the
# call returns the student's feature map (stu_feature) and a list of
# per-image masks (mask_batch).
rois, cls_prob, bbox_pred, \
rpn_loss_cls, rpn_loss_box, \
RCNN_loss_cls, RCNN_loss_bbox, \
rois_label, stu_feature, mask_batch = fasterRCNN(im_data, im_info, gt_boxes, num_boxes)
# Teacher forward pass on the same inputs. In this excerpt only its feature
# map (sup_feature) is used afterwards; its last output is discarded.
rois_, cls_prob_, bbox_pred_, \
rpn_loss_cls_, rpn_loss_box_, \
RCNN_loss_cls_, RCNN_loss_bbox_, \
rois_label_, sup_feature, _ = fasterRCNN_sup_vgg16(im_data, im_info, gt_boxes, num_boxes)
# Binarize every mask (any positive entry becomes 1.0) and give it a leading
# axis of size 1.
# NOTE(review): presumably each mask is (H, W), so this yields (1, H, W) - confirm.
mask_list = []
for mask in mask_batch:
mask = (mask > 0).float().unsqueeze(0)
mask_list.append(mask)
# Stack the per-image masks along a new leading (batch) axis.
mask_batch = torch.stack(mask_list, dim=0)
# Normalizer: twice the number of positions selected by the mask.
# NOTE(review): if no position is selected this is 0 and the division below
# produces NaN/inf - confirm the mask can never be empty.
norms = mask_batch.sum() * 2
# Run the student feature through the model's adaptation module before it is
# compared with the teacher feature.
# NOTE(review): presumably this matches the teacher's feature shape - confirm.
stu_feature_adap = fasterRCNN.stu_feature_adap(stu_feature)
# Imitation loss: squared teacher/student difference, kept only where the mask
# is 1, summed over all elements and divided by the normalizer.
sup_loss = (torch.pow(sup_feature - stu_feature_adap, 2) * mask_batch).sum() / norms
# Scale by the imitation-loss weight taken from the command-line arguments.
sup_loss = sup_loss * args.imitation_loss_weigth
... ...
feature是啥?
# feed image data to base model to obtain base feature map
base_feat = self.RCNN_base(im_data)
mask_batch就是那个Mask,看看作者咋算的?往上找
... ...
# Forward pass of the anchor-target layer (excerpt; part of the body is elided
# in this listing). Besides the anchor bookkeeping it builds, for every image,
# an (H, W) mask counting the anchors at each feature-map cell that overlap a
# ground-truth box well enough; the list of masks is appended to the outputs.
def forward(self, input):
# Algorithm:
#
# for each (H, W) location i
# generate 9 anchor boxes centered on cell i
# apply predicted bbox deltas at cell i to each of the 9 anchors
# filter out-of-image anchors
# `input` is a sequence: RPN class scores, ground-truth boxes, image info and
# the number of boxes.
rpn_cls_score = input[0]
gt_boxes = input[1]
im_info = input[2]
num_boxes = input[3]
# map of shape (..., H, W)
height, width = rpn_cls_score.size(2), rpn_cls_score.size(3)
batch_size = gt_boxes.size(0)
feat_height, feat_width = rpn_cls_score.size(2), rpn_cls_score.size(3)
# Offset of every feature-map cell: the cell index multiplied by the feature
# stride, laid out as one (x, y, x, y) row per cell so it can be added to
# corner-format boxes.
shift_x = np.arange(0, feat_width) * self._feat_stride
shift_y = np.arange(0, feat_height) * self._feat_stride
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = torch.from_numpy(np.vstack((shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel())).transpose())
shifts = shifts.contiguous().type_as(rpn_cls_score).float()
# A anchors per cell, K cells in total.
A = self._num_anchors
K = shifts.size(0)
self._anchors = self._anchors.type_as(gt_boxes) # move to specific gpu.
# Broadcast the A base anchors over the K cell offsets, then flatten to K*A rows.
all_anchors = self._anchors.view(1, A, 4) + shifts.view(K, 1, 4)
all_anchors = all_anchors.view(K * A, 4)
total_anchors = int(K * A)
# Keep anchors that lie inside the image, allowing `_allowed_border` of slack.
# Only im_info[0] (the first image of the batch) is consulted for the bounds.
# NOTE(review): `long` is a Python 2 builtin; under Python 3 it must be
# defined elsewhere or replaced - confirm the target interpreter.
keep = ((all_anchors[:, 0] >= -self._allowed_border) &
(all_anchors[:, 1] >= -self._allowed_border) &
(all_anchors[:, 2] < long(im_info[0][1]) + self._allowed_border) &
(all_anchors[:, 3] < long(im_info[0][0]) + self._allowed_border))
inds_inside = torch.nonzero(keep).view(-1)
# keep only inside anchors
anchors = all_anchors[inds_inside, :]
# label: 1 is positive, 0 is negative, -1 is dont care
labels = gt_boxes.new(batch_size, inds_inside.size(0)).fill_(-1)
bbox_inside_weights = gt_boxes.new(batch_size, inds_inside.size(0)).zero_()
bbox_outside_weights = gt_boxes.new(batch_size, inds_inside.size(0)).zero_()
overlaps = bbox_overlaps_batch(anchors, gt_boxes)
# Overlap of every anchor (including the out-of-image ones) with every
# ground-truth box, reshaped so it can be indexed by image, cell row, cell
# column, anchor and ground-truth box.
IOU_map = bbox_overlaps_batch(all_anchors, gt_boxes).view(
batch_size, height, width, A, gt_boxes.shape[1])
mask_batch = []
for i in range(batch_size):
# Best overlap reached by any anchor, separately for each ground-truth box.
max_iou, _ = torch.max(IOU_map[i].view(height* width* A,
gt_boxes.shape[1]), dim = 0)
# The mask is allocated directly on the CUDA device.
mask_per_im = torch.zeros([height, width], dtype=torch.int64).cuda()
for k in range(gt_boxes.shape[1]):
# A ground-truth row that sums to zero is treated as padding; the loop stops
# at the first such row.
if torch.sum(gt_boxes[i][k]) == 0.:
break
# Threshold for this box: half of its best overlap.
max_iou_per_gt = max_iou[k]*0.5
# For each cell, count the anchors whose overlap with box k exceeds the
# threshold, and accumulate the counts over all boxes of the image.
mask_per_gt = torch.sum(IOU_map[i][:,:,:,k]>max_iou_per_gt,
dim = 2)
mask_per_im +=mask_per_gt
mask_batch.append(mask_per_im)
... ...
# add for mask
outputs.append(mask_batch)
return outputs
再看那个Mask咋用的?
就像公式那样:先用适配层把student的feature调整到teacher的feature尺寸,然后两者直接相减、平方,再乘上Mask求和并归一化,实现起来比较简单
part2:照猫画虎
作为一个裁缝,首先知道哪些布料是能直接用的,哪些需要自己纺织
对于yolo v5来说,这个feature是啥?
与Faster R-CNN不同的是,yolov5有3个尺度的输出,我把它们定义为3个尺度的feature输出:
pred = model(imgs) # forward
feature_1,feature_2,feature_3 = pred
- feature_1 size:(6,3,32,32,7)
- feature_2 size:(6,3,16,16,7)
- feature_3 size:(6,3,8,8,7)
当改gt_boxes时我脑袋拧成了麻花,怎么也想不通:每张图的标签box数量是不一样的,为什么论文的代码能把它们放进一个矩阵里?仔细看了dataloader才发现,原来还能这么玩:
# Zero-filled buffer with a fixed number of rows (max_num_box) and the same
# number of columns as gt_boxes, so every image yields a label tensor of the
# same shape.
gt_boxes_padding = torch.FloatTensor(self.max_num_box, gt_boxes.size(1)).zero_()
# If any box is selected by `keep`, copy at most max_num_box of them into the
# top rows of the buffer; the remaining rows stay zero.
if keep.numel() != 0:
gt_boxes = gt_boxes[keep]
num_boxes = min(gt_boxes.size(0), self.max_num_box)
gt_boxes_padding[:num_boxes,:] = gt_boxes[:num_boxes]
# Otherwise the buffer is left entirely zero and the box count is 0.
else:
num_boxes = 0
# permute trim_data to adapt to downstream processing
padding_data = padding_data.permute(2, 0, 1).contiguous()
im_info = im_info.view(3)
很明显,作者设置了个最大数量,每个图最多也就这些box,没有也占这么大地方,当然,多了也放不下。所以能拼成一个矩阵
那咱就接着改吧,他的输入为x1y1x2y2,我的是xywh
def make_gt_boxes(gt_boxes, max_num_box, batch, imgsize):
    """Pack per-image labels into one fixed-size tensor with corner-format boxes.

    Args:
        gt_boxes: 2-D tensor with one row per label. Column 0 is the index of
            the image inside the batch; columns 2-5 are read as normalized
            ``x_center, y_center, width, height``. Other columns are copied
            through unchanged.
        max_num_box: number of rows reserved for every image. Images with
            fewer labels are zero-padded, images with more are truncated.
        batch: number of images in the batch.
        imgsize: indexable pair; ``imgsize[1]`` scales the x coordinates and
            ``imgsize[0]`` scales the y coordinates.

    Returns:
        A new tensor of shape ``(batch, max_num_box, gt_boxes.size(1))`` on the
        same device and with the same dtype as ``gt_boxes``. Columns 2-5 hold
        ``x1, y1, x2, y2`` in pixels; padding rows are all zero. The input
        tensor is not modified.
    """
    # Allocate on the input's device/dtype so padded and truncated images can
    # live in the same tensor.
    padded = gt_boxes.new_zeros((batch, max_num_box, gt_boxes.size(1)))
    for i in range(batch):
        # Select only the labels belonging to image i.
        boxes = gt_boxes[gt_boxes[:, 0] == i]
        num_boxes = min(boxes.size(0), max_num_box)
        padded[i, :num_boxes] = boxes[:num_boxes]

    # Centre format -> corner format. Write into a copy so that every corner is
    # computed from the original centre/size values rather than from columns
    # that have already been overwritten.
    x_c, y_c = padded[:, :, 2], padded[:, :, 3]
    w, h = padded[:, :, 4], padded[:, :, 5]
    converted = padded.clone()
    converted[:, :, 2] = (x_c - 0.5 * w) * imgsize[1]
    converted[:, :, 3] = (y_c - 0.5 * h) * imgsize[0]
    converted[:, :, 4] = (x_c + 0.5 * w) * imgsize[1]
    converted[:, :, 5] = (y_c + 0.5 * h) * imgsize[0]
    return converted
他的标签第1-4维是box,yolov5的box在第3-6维(即下标2:6),所以只要改这一句话就行了
gt_boxes = gt_boxes[:,:,2:].contiguous()
再来看看anchors,yolov5的anchor是可以设置的9个box,每3个对应一个尺度
例如:
anchors:
- [3,6, 5,11, 7,14] # P3/8
- [9,18, 14,23, 20,32] # P4/16
- [29,46, 43,68, 76,120] # P5/32
那么每个feature对应3个,就可以将anchor部分这么改
def generate_anchors(base_size, anchors):
    """Build corner-format anchor boxes for a single detection scale.

    Every entry of ``anchors`` is read as a ``(width, height)`` pair, both
    multiplied by ``base_size``. For example, the three pairs of one row of
    the YOLOv5 anchor config such as ``[3,6, 5,11, 7,14]  # P3/8``
    (assumes the caller has already grouped the flat row into pairs).

    Returns:
        ``np.ndarray`` with one ``[x1, y1, x2, y2]`` row per anchor, i.e.
        shape ``[3, 4]`` for three anchors. Each box is centred on the centre
        of the reference box ``[0, 0, base_size - 1, base_size - 1]``.
    """
    reference_box = np.array([1, 1, base_size, base_size]) - 1
    _ref_w, _ref_h, cx, cy = _whctrs(reference_box)

    def _corners(pair):
        # Half extents of the scaled anchor around the reference centre.
        half_w = 0.5 * base_size * pair[0]
        half_h = 0.5 * base_size * pair[1]
        return [cx - half_w, cy - half_h, cx + half_w, cy + half_h]

    return np.array([_corners(pair) for pair in anchors])
这里的base_size就是feature map上一个格子对应原图多少个像素,也就是feat_stride
anchors = torch.from_numpy(generate_anchors(feat_stride,anchors))