Faster R-CNN代码分析
build_proposals层:
该层是最难理解的,Faster R-CNN的核心逻辑几乎全部都在这个函数里面。
def build_proposals(self, is_training, rpn_cls_prob, rpn_bbox_pred, rpn_cls_score):
    """Build the RoI tensor consumed by the detection head.

    Args:
        is_training: when truthy, the training path is built (proposal
            layer -> anchor target layer -> proposal target layer);
            otherwise only a proposal layer chosen by
            ``cfg.FLAGS.test_mode`` is built.
        rpn_cls_prob: RPN objectness output (per the original author's
            notes).
        rpn_bbox_pred: RPN box deltas, i.e. tx, ty, tw, th (per the
            original author's notes).
        rpn_cls_score: RPN classification scores; only forwarded to the
            anchor target layer.

    Returns:
        The ``rois`` produced by the last layer that was built.

    Raises:
        NotImplementedError: at inference time when
            ``cfg.FLAGS.test_mode`` is neither ``'nms'`` nor ``'top'``.
    """
    if not is_training:
        # Inference: choose the proposal layer implementation by test mode.
        test_mode = cfg.FLAGS.test_mode
        if test_mode == 'nms':
            make_proposals = self._proposal_layer
        elif test_mode == 'top':
            make_proposals = self._proposal_top_layer
        else:
            raise NotImplementedError
        rois, _ = make_proposals(rpn_cls_prob, rpn_bbox_pred, "rois")
        return rois

    # Training: candidate boxes and their scores from the RPN outputs.
    # The original author notes that roughly 2000 boxes survive NMS here.
    proposals, proposal_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
    print(proposals.shape)

    # Anchor labelling for the RPN loss. It does not consume the proposals
    # above; the two branches are only ordered through the control
    # dependency below.
    rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")

    # Try to have a deterministic order for the computing graph, for
    # reproducibility.
    with tf.control_dependencies([rpn_labels]):
        rois, _ = self._proposal_target_layer(proposals, proposal_scores, "rpn_rois")
    return rois
可以看到该函数主要调用了三个子函数,分别是proposal_layer、anchor_target_layer和proposal_target_layer。下面分别讲一下这三个layer各自做了哪些事情。这里一定要注意一个细节:proposal_target_layer的传入参数是proposal_layer的返回值,也就是说它们两个是有联系的;而anchor_target_layer没有用到proposal_layer的返回值,也就是说这两者之间没有直接联系。
proposal_layer:
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
    """Convert RPN outputs into a score-ranked, NMS-filtered set of proposals.

    A simplified version compared to fast/er RCNN; for details please see
    the technical report.

    Args:
        rpn_cls_prob: 4-D array; the channels from index ``num_anchors``
            onward on the last axis are used as the ranking scores (the
            original author describes them as foreground scores).
        rpn_bbox_pred: predicted box deltas; flattened to ``(-1, 4)``.
        im_info: indexable whose first element carries the image size in
            its first two entries (passed to ``clip_boxes``).
        cfg_key: ``"TRAIN"`` selects the training NMS settings, anything
            else the test settings. May be ``str`` or ``bytes``.
        _feat_stride: not used by this function.
        anchors: anchor boxes handed to ``bbox_transform_inv`` together
            with the flattened deltas.
        num_anchors: number of anchors per location.

    Returns:
        ``(blob, scores)``: ``blob`` has a leading all-zero batch-index
        column followed by the four box coordinates, as float32;
        ``scores`` holds the matching score column.
    """
    # isinstance also accepts bytes subclasses (e.g. numpy.bytes_), which an
    # exact type comparison would miss and then silently fall through to the
    # test-time settings below.
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('utf-8')

    # Values in the trailing comments are the ones noted by the original
    # author; the effective values come from cfg.FLAGS.
    if cfg_key == "TRAIN":
        pre_nms_topN = cfg.FLAGS.rpn_train_pre_nms_top_n  # 12000
        post_nms_topN = cfg.FLAGS.rpn_train_post_nms_top_n  # 2000
        nms_thresh = cfg.FLAGS.rpn_train_nms_thresh  # 0.7
    else:
        pre_nms_topN = cfg.FLAGS.rpn_test_pre_nms_top_n  # 6000
        post_nms_topN = cfg.FLAGS.rpn_test_post_nms_top_n  # 300
        nms_thresh = cfg.FLAGS.rpn_test_nms_thresh  # 0.7

    im_info = im_info[0]

    # Get the scores and bounding boxes, one row per anchor.
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
    scores = scores.reshape((-1, 1))

    # Apply the predicted deltas to the anchors to get box coordinates.
    proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
    # Boxes are clipped against the image size, not filtered: the original
    # author stresses that the number of boxes is unchanged by this step.
    proposals = clip_boxes(proposals, im_info[:2])

    # Pick the top region proposals: `order` holds row indices sorted by
    # descending score, truncated to pre_nms_topN when that limit is
    # positive.
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # Non-maximal suppression on (box, score) rows.
    keep = nms(np.hstack((proposals, scores)), nms_thresh)

    # Pick the top region proposals after NMS.
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Only a single image is supported as input, so the batch index column
    # is all zeros.
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))

    return blob, scores
anchor_target_layer:
def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors):
    """Label anchors for RPN training and build their regression targets.

    Same as the anchor target layer in original Fast/er RCNN.

    Anchors lying fully inside the image are labelled 1 (positive),
    0 (negative) or -1 (don't care) from their overlap with the
    ground-truth boxes, then subsampled so that the number of labelled
    anchors is bounded by ``cfg.FLAGS.rpn_batchsize``. Results are mapped
    back onto the full anchor set and reshaped to the feature-map layout.

    Args:
        rpn_cls_score: array whose axes 1 and 2 give the feature-map
            height and width.
        gt_boxes: ground-truth boxes, one row per box. All columns are
            forwarded to ``bbox_overlaps`` and ``_compute_targets``.
        im_info: indexable whose first element holds the image height at
            index 0 and width at index 1 (per the inline comments on the
            bounds check).
        _feat_stride: not used by this function.
        all_anchors: anchor boxes as rows of (x1, y1, x2, y2) -- presumed
            from how columns 0-3 are compared against the image bounds.
        num_anchors: number of anchors per feature-map location.

    Returns:
        ``(rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights)`` with shapes
        ``(1, 1, A * height, width)`` for the labels and
        ``(1, height, width, A * 4)`` for the other three.
    """
    A = num_anchors
    total_anchors = all_anchors.shape[0]
    im_info = im_info[0]

    # Allow boxes to sit over the edge by a small amount; 0 means anchors
    # must lie within the image.
    _allowed_border = 0

    # Map of shape (..., H, W).
    height, width = rpn_cls_score.shape[1:3]

    # Only keep anchors inside the image. Unlike clipping, this discards
    # the anchors that cross the border.
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_info[0] + _allowed_border)  # height
    )[0]
    anchors = all_anchors[inds_inside, :]

    # Label: 1 is positive, 0 is negative, -1 is don't care.
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)

    # Overlaps between the anchors and the gt boxes, indexed (anchor, gt).
    # np.float64 replaces the np.float alias, which was removed from NumPy.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))

    # For every anchor: the gt box it overlaps most, and that overlap value.
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]

    # For every gt box: its best overlap, then every anchor that attains
    # that value (ties included).
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.FLAGS.rpn_clobber_positives:
        # Assign bg labels first so that positive labels can clobber them.
        labels[max_overlaps < cfg.FLAGS.rpn_negative_overlap] = 0

    # fg label: for each gt, anchor with highest overlap.
    labels[gt_argmax_overlaps] = 1

    # fg label: above threshold IoU.
    labels[max_overlaps >= cfg.FLAGS.rpn_positive_overlap] = 1

    if cfg.FLAGS.rpn_clobber_positives:
        # Assign bg labels last so that negative labels can clobber positives.
        labels[max_overlaps < cfg.FLAGS.rpn_negative_overlap] = 0

    # Subsample positive labels if we have too many: the surplus is picked
    # at random without replacement and reset to "don't care".
    num_fg = int(cfg.FLAGS.rpn_fg_fraction * cfg.FLAGS.rpn_batchsize)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # Subsample negative labels if we have too many: negatives fill the
    # part of the batch that the remaining positives do not occupy.
    num_bg = cfg.FLAGS.rpn_batchsize - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    # Regression targets of every inside anchor w.r.t. its best-matching gt.
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # Only the positive ones have regression targets.
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(cfg.FLAGS2["bbox_inside_weights"])

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.FLAGS.rpn_positive_weight < 0:
        # Uniform weighting of examples (given non-uniform sampling).
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.FLAGS.rpn_positive_weight > 0) &
                (cfg.FLAGS.rpn_positive_weight < 1))
        positive_weights = (cfg.FLAGS.rpn_positive_weight /
                            np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.FLAGS.rpn_positive_weight) /
                            np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # Map up to the original set of anchors: entries for anchors that were
    # filtered out above get the fill value (-1 for labels, 0 otherwise).
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # Labels: (1, H, W, A) -> (1, A, H, W) -> (1, 1, A * H, W).
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    rpn_labels = labels.reshape((1, 1, A * height, width))

    # Targets and weights: (1, H, W, A * 4).
    rpn_bbox_targets = bbox_targets.reshape((1, height, width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4))

    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
注意这里的函数跟前面proposal_layer没有任何关系,这里要做的是为每一个anchor打上RPN标签,即属于前景还是背景,同时计算RPN的边框回归目标及其权重。RPN网络训练时每张图最多选取128个正样本,其余名额用负样本补足,参与训练的anchor总数不超过256个,这些都是用来训练RPN网络的。这一步是独立的!在测试阶段是没有这一步的!
proposal_target_layer:
def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
    """Assign object detection proposals to ground-truth targets.

    Produces proposal classification labels and bounding-box regression
    targets by sampling from the RPN proposals (optionally extended with
    the ground-truth boxes themselves).

    Args:
        rpn_rois: proposal RoIs as rows of (0, x1, y1, x2, y2) coming from
            the RPN proposal layer, or any other source.
        rpn_scores: one score row per entry of ``rpn_rois``.
        gt_boxes: ground-truth boxes; every column except the last is
            treated as box coordinates when they are appended as RoIs.
        _num_classes: number of classes; fixes the width
            (``_num_classes * 4``) of the returned targets and weights.

    Returns:
        ``(rois, roi_scores, labels, bbox_targets, bbox_inside_weights,
        bbox_outside_weights)``.
    """
    candidates = rpn_rois
    candidate_scores = rpn_scores

    # Include ground-truth boxes in the set of candidate rois.
    if cfg.FLAGS.proposal_use_gt:
        gt_pad = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
        gt_as_rois = np.hstack((gt_pad, gt_boxes[:, :-1]))
        candidates = np.vstack((candidates, gt_as_rois))
        # The appended gt rows get a score of zero (the original author was
        # unsure this is a wise choice, but the scores are not used).
        candidate_scores = np.vstack((candidate_scores, gt_pad))

    # Per-image sampling budget: total RoIs and the foreground share of it
    # (256 and 64 according to the original author's notes).
    images_per_batch = 1
    rois_per_image = cfg.FLAGS.batch_size / images_per_batch
    fg_rois_per_image = np.round(cfg.FLAGS.proposal_fg_fraction * rois_per_image)

    # Sample rois with classification labels and bounding box regression
    # targets.
    sampled = _sample_rois(
        candidates, candidate_scores, gt_boxes,
        fg_rois_per_image, rois_per_image, _num_classes)
    labels, rois, roi_scores, bbox_targets, bbox_inside_weights = sampled

    target_width = _num_classes * 4
    rois = rois.reshape(-1, 5)
    roi_scores = roi_scores.reshape(-1)
    labels = labels.reshape(-1, 1)
    bbox_targets = bbox_targets.reshape(-1, target_width)
    bbox_inside_weights = bbox_inside_weights.reshape(-1, target_width)

    # Outside weights are 1.0 wherever the inside weight is positive,
    # 0.0 elsewhere.
    has_inside_weight = bbox_inside_weights > 0
    bbox_outside_weights = np.array(has_inside_weight).astype(np.float32)

    return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
这里还有一个很重要的函数:_sample_rois
def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Args:
        all_rois: candidate RoIs; columns 1-4 are the box coordinates.
        all_scores: one score row per entry of ``all_rois``.
        gt_boxes: ground-truth boxes; columns 0-3 are the box coordinates
            and column 4 is used as the class label.
        fg_rois_per_image: upper bound on the number of foreground RoIs.
        rois_per_image: total number of RoIs to sample.
        num_classes: forwarded to ``_get_bbox_regression_labels``.

    Returns:
        ``(labels, rois, roi_scores, bbox_targets, bbox_inside_weights)``
        for the sampled RoIs, foreground entries first.

    Raises:
        ValueError: when no RoI qualifies as foreground or background.
    """
    # overlaps: (rois x gt_boxes).
    # np.float64 replaces the np.float alias, which was removed from NumPy.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))

    # For every RoI: index of the gt box it overlaps most, that overlap
    # value, and the class label of that gt box.
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap.
    fg_inds = np.where(max_overlaps >= cfg.FLAGS.roi_fg_threshold)[0]
    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI).
    bg_inds = np.where((max_overlaps < cfg.FLAGS.roi_bg_threshold_high) &
                       (max_overlaps >= cfg.FLAGS.roi_bg_threshold_low))[0]

    # Small modification to the original version where we ensure a fixed
    # number of regions are sampled: whichever pool is too small is sampled
    # with replacement to fill the batch.
    if fg_inds.size > 0 and bg_inds.size > 0:
        # Guard against the case when an image has fewer than
        # fg_rois_per_image foreground RoIs.
        fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
        fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
        bg_rois_per_image = rois_per_image - fg_rois_per_image
        to_replace = bg_inds.size < bg_rois_per_image
        bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
    elif fg_inds.size > 0:
        to_replace = fg_inds.size < rois_per_image
        fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = rois_per_image
    elif bg_inds.size > 0:
        to_replace = bg_inds.size < rois_per_image
        bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = 0
    else:
        raise ValueError(
            "no RoI qualifies as foreground or background; cannot sample")

    # The indices that we're selecting (both fg and bg), fg first.
    keep_inds = np.append(fg_inds, bg_inds)

    # Select sampled values from various arrays.
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0; this relies on the
    # foreground indices coming first in keep_inds.
    labels[int(fg_rois_per_image):] = 0
    rois = all_rois[keep_inds]
    roi_scores = all_scores[keep_inds]

    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
整个函数其实就做了几件事:为proposal_layer选出来的那些ROIs打上真实类别标签,并从中采样最多1/4的正样本(64个),其余为负样本(正样本取满时为256-64=192个),用来训练Fast R-CNN。同时也计算了Fast R-CNN的边框回归目标等。注意两个256的区别!anchor_target_layer中的256是最多128个正样本,其余为负样本,标签为前景/背景标签,这是用来训练RPN网络的;proposal_target_layer中的256是最多64个正样本,其余为负样本,标签为真实类别的标签,这是用来训练Fast R-CNN网络的!看懂这部分源码,就可以理解Faster R-CNN的整个训练过程。
Faster R-CNN = RPN + Fast R-CNN。RPN和Fast R-CNN是两个独立的网络,类似于GAN网络那样,生成器和判别器也是两个独立的网络。那么Faster R-CNN整个训练过程应该如下所示。
- 在预训练的model上,训练RPN网络
- 利用训练好的RPN网络搜集proposals
- 第一次训练Fast-RCNN网络
- 第二次训练RPN网络
- 利用第二次训练好的RPN网络再次搜集proposals
- 第二次训练Fast-RCNN网络