前言
这一部分我们学习的是RPN、proposal和ROI的过程,即对经过vgg16网络后的特征的处理。先上整体的流程图:
先让我们看一下proposal前面的过程。
_region_proposal()
可以看到RPN网络实际分为2条线,上面一条通过softmax分类anchors获得positive和negative分类,下面一条用于计算对于anchors的bounding box regression偏移量,以获得精确的proposal。而最后的Proposal层则负责综合positive anchors和对应bounding box regression偏移量获取proposals,同时剔除太小和超出边界的proposals。其实整个网络到了Proposal Layer这里,就完成了相当于目标定位的功能。
def _region_proposal(self, net_conv, is_training, initializer):
    """Build the RPN head on top of ``net_conv`` and return the selected RoIs.

    A 3x3 convolution feeds two 1x1 branches: one emits
    ``self._num_anchors * 2`` classification scores and the other
    ``self._num_anchors * 4`` box regression values. The probabilities and
    box predictions are then handed to a proposal layer to produce RoIs.

    Args:
        net_conv: feature map fed to the first RPN convolution (the backbone
            output -- presumably VGG16 features, confirm with the caller).
        is_training: selects the training or the test proposal path and is
            also passed as ``trainable`` to the convolutions.
        initializer: weights initializer used by all three convolutions.

    Returns:
        The ``rois`` tensor, which is also stored in ``self._predictions``.

    Raises:
        NotImplementedError: at test time, if ``cfg.TEST.MODE`` is neither
            ``'nms'`` nor ``'top'``.
    """
    # Shared 3x3 convolution used by both RPN branches.
    rpn = slim.conv2d(net_conv, cfg.RPN_CHANNELS, [3, 3], trainable=is_training,
                      weights_initializer=initializer, scope="rpn_conv/3x3")
    self._act_summaries.append(rpn)

    # Classification branch: two scores per anchor, no activation.
    rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training,
                                weights_initializer=initializer,
                                padding='VALID', activation_fn=None, scope='rpn_cls_score')
    # change it so that the score has 2 as its channel size
    rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
    rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
    rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_score_reshape, [-1, 2]), axis=1, name="rpn_cls_pred")
    # Restore the num_anchors * 2 channel layout after the softmax.
    rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")

    # Regression branch: four values per anchor, no activation.
    rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training,
                                weights_initializer=initializer,
                                padding='VALID', activation_fn=None, scope='rpn_bbox_pred')

    if is_training:
        rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
        # Try to have a deterministic order for the computing graph, for reproducibility
        with tf.control_dependencies([rpn_labels]):
            rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
    else:
        if cfg.TEST.MODE == 'nms':
            rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        elif cfg.TEST.MODE == 'top':
            rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        else:
            raise NotImplementedError("unsupported cfg.TEST.MODE: %r" % (cfg.TEST.MODE,))

    # Expose the intermediate tensors to the rest of the network.
    self._predictions["rpn_cls_score"] = rpn_cls_score
    self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
    self._predictions["rpn_cls_prob"] = rpn_cls_prob
    self._predictions["rpn_cls_pred"] = rpn_cls_pred
    self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
    self._predictions["rois"] = rois

    return rois
这里可以看到首先经过一个3×3的卷积层,这里是融合像素点周围八个点的信息,可以使特征更鲁棒;然后经过1×1卷积层,这是二分类那一支:先把通道数为2×9的得分reshape成通道数为2的形式(按Caffe的记法即由[1, 2x9, H, W]变为[1, 2, 9xH, W]),经过softmax分类,再reshape回[1, 2x9, H, W],这就完成了二分类那一支;然后是坐标预测的1×1卷积。这里有几个结果要说明一下:
self._predictions["rpn_cls_score"] = rpn_cls_score # raw (pre-softmax) classification scores from the 1x1 conv, num_anchors * 2 channels
self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape # the same scores reshaped so that the channel size is 2
self._predictions["rpn_cls_prob"] = rpn_cls_prob # softmax probabilities reshaped back to num_anchors * 2 channels
self._predictions["rpn_cls_pred"] = rpn_cls_pred # argmax over the 2 scores of each anchor: a 1-D vector of class indices
self._predictions["rpn_bbox_pred"] = rpn_bbox_pred # box regression values from the 1x1 conv, num_anchors * 4 channels
self._predictions["rois"] = rois # selected proposals; proposal_layer emits rows of [batch index (always 0), box coordinates] -- NOTE(review): the training path goes through _proposal_target_layer, presumably same layout, confirm
proposal_layer.py
# --------------------------------------------------------
# Faster R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
from model.config import cfg
from model.bbox_transform import bbox_transform_inv, clip_boxes, bbox_transform_inv_tf, clip_boxes_tf
from model.nms_wrapper import nms
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
    """Turn RPN outputs into scored region proposals (NumPy version).

    A simplified version compared to fast/er RCNN.
    For details please see the technical report.

    Args:
        rpn_cls_prob: 4-D array of RPN class probabilities; the channels from
            index ``num_anchors`` onward are used as the proposal scores
            (presumably the foreground probabilities -- confirm with caller).
        rpn_bbox_pred: array of box regression values, flattened to rows of 4.
        im_info: image information; its first two entries are passed to
            ``clip_boxes`` (presumably height and width -- confirm).
        cfg_key: name of the config section to read, as ``str`` or UTF-8
            encoded ``bytes``.
        _feat_stride: not used by this function; kept for the interface.
        anchors: reference boxes combined with the regression values by
            ``bbox_transform_inv``.
        num_anchors: channel offset at which the scores start in
            ``rpn_cls_prob``.

    Returns:
        A ``(blob, scores)`` pair. ``blob`` is float32 with an all-zero
        batch-index column followed by the kept proposals; ``scores`` is the
        matching single-column array of scores.
    """
    # The key may arrive as bytes (e.g. when passed through a graph op).
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('utf-8')
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

    # Get the scores and bounding boxes
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
    scores = scores.reshape((-1, 1))
    proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
    proposals = clip_boxes(proposals, im_info[:2])

    # Pick the top region proposals (indices sorted by descending score)
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # Non-maximal suppression
    keep = nms(np.hstack((proposals, scores)), nms_thresh)

    # Pick the top region proposals after NMS
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Only support single image as input
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))

    return blob, scores
def proposal_layer_tf(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
    """Turn RPN outputs into scored region proposals using TensorFlow ops.

    Unlike the NumPy ``proposal_layer``, no pre-NMS top-N truncation is
    applied here: all boxes go straight into
    ``tf.image.non_max_suppression``, which keeps at most
    ``RPN_POST_NMS_TOP_N`` of them.

    Args:
        rpn_cls_prob: 4-D tensor of RPN class probabilities; the channels
            from index ``num_anchors`` onward are used as the proposal scores
            (presumably the foreground probabilities -- confirm with caller).
        rpn_bbox_pred: tensor of box regression values, reshaped to rows of 4.
        im_info: image information; its first two entries are passed to
            ``clip_boxes_tf`` (presumably height and width -- confirm).
        cfg_key: name of the config section to read, as ``str`` or UTF-8
            encoded ``bytes``.
        _feat_stride: not used by this function; kept for the interface.
        anchors: reference boxes combined with the regression values by
            ``bbox_transform_inv_tf``.
        num_anchors: channel offset at which the scores start in
            ``rpn_cls_prob``.

    Returns:
        A ``(blob, scores)`` pair. ``blob`` is float32 with an all-zero
        batch-index column followed by the kept boxes; ``scores`` is the
        matching single-column tensor of scores.
    """
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('utf-8')
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

    # Get the scores and bounding boxes
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    scores = tf.reshape(scores, shape=(-1,))
    rpn_bbox_pred = tf.reshape(rpn_bbox_pred, shape=(-1, 4))

    proposals = bbox_transform_inv_tf(anchors, rpn_bbox_pred)
    proposals = clip_boxes_tf(proposals, im_info[:2])

    # Non-maximal suppression
    indices = tf.image.non_max_suppression(proposals, scores, max_output_size=post_nms_topN,
                                           iou_threshold=nms_thresh)

    boxes = tf.gather(proposals, indices)
    # tf.cast replaces the deprecated tf.to_float; the result is the same.
    boxes = tf.cast(boxes, tf.float32)
    scores = tf.gather(scores, indices)
    scores = tf.reshape(scores, shape=(-1, 1))

    # Only support single image as input
    batch_inds = tf.zeros((tf.shape(indices)[0], 1), dtype=tf.float32)
    blob = tf.concat([batch_inds, boxes], 1)

    return blob, scores
这里只对后面的proposal_layer_tf进行注解:首先取出分数图,形式为[1,?,?,9],然后把分数reshape成一维向量,把坐标预测reshape为4列;已知anchors和偏移量求预测框的坐标(anchors为[w×h×9, 4],rpn_bbox_pred为[?, 4]),并把坐标限制在原始图内;然后通过nms操作最多保留post_nms_topN个proposals(原版默认配置下训练时为2000个、测试时为300个),得到它们的索引,根据索引取出对应的框和分数,并将分数化为1列;最后构造一列全0的batch索引(只支持单张图片输入),与框的坐标拼接后返回结果。
bbox_transform_inv_tf()
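对于窗口一般使用四维向量 $(x, y, w, h)$ 表示,分别表示窗口的中心点坐标和宽高。对于图 11,红色的框A代表原始的positive Anchors,绿色的框G代表目标的GT,我们的目标是寻找一种关系,使得输入原始的anchor A经过映射得到一个跟真实窗口G更接近的回归窗口G’,即:给定 $A=(A_x, A_y, A_w, A_h)$ 和 $G=(G_x, G_y, G_w, G_h)$,寻找一种变换 $F$,使得 $F(A_x, A_y, A_w, A_h) = (G'_x, G'_y, G'_w, G'_h)$,其中 $(G'_x, G'_y, G'_w, G'_h) \approx (G_x, G_y, G_w, G_h)$。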
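那么经过何种变换F才能从图11中的anchor A变为G’呢? 比较简单的思路就是(其中 $A_x, A_y$ 为anchor的中心坐标,$A_w, A_h$ 为其宽高,$d_x(A), d_y(A), d_w(A), d_h(A)$ 为预测的偏移量):
先做平移:$G'_x = A_w \cdot d_x(A) + A_x$,$G'_y = A_h \cdot d_y(A) + A_y$
再做缩放:$G'_w = A_w \cdot \exp(d_w(A))$,$G'_h = A_h \cdot \exp(d_h(A))$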
def bbox_transform_inv_tf(boxes, deltas):
    """Apply regression deltas to reference boxes and return corner boxes.

    Each reference box is converted to centre/size form, its centre is
    shifted by ``delta * size``, its size is scaled by ``exp(delta)``, and
    the result is converted back to corner form.

    Args:
        boxes: reference boxes indexed as columns 0..3 (two corner points);
            cast to the dtype of ``deltas`` before use.
        deltas: regression values with columns (dx, dy, dw, dh).

    Returns:
        A tensor whose four columns are the predicted box corners, stacked
        along axis 1.
    """
    boxes = tf.cast(boxes, deltas.dtype)

    # Reference boxes in centre/size form (the +1.0 treats corners as inclusive).
    left, top = boxes[:, 0], boxes[:, 1]
    right, bottom = boxes[:, 2], boxes[:, 3]
    box_w = (right - left) + 1.0
    box_h = (bottom - top) + 1.0
    center_x = left + box_w * 0.5
    center_y = top + box_h * 0.5

    # Translate the centre, then scale the size.
    new_center_x = deltas[:, 0] * box_w + center_x
    new_center_y = deltas[:, 1] * box_h + center_y
    half_w = tf.exp(deltas[:, 2]) * box_w * 0.5
    half_h = tf.exp(deltas[:, 3]) * box_h * 0.5

    # Back to corner form.
    corners = [
        new_center_x - half_w,
        new_center_y - half_h,
        new_center_x + half_w,
        new_center_y + half_h,
    ]
    return tf.stack(corners, axis=1)
这里和公式完全对应就不一一解释。这块的最终结果后续用来作为坐标的修正。
clip_boxes_tf()
这块是用来限制anchors在图片中,这里不做详细解释,只写两个函数的注解:
tf.minimum(boxes[:, 0], im_info[1] - 1) 保证预测框的坐标不超出图片的范围(这里是x坐标不超过图片宽度减1)
tf.maximum(x, 0) 保证预测框的坐标大于等于0
最后
今天的学习先写到这里。这里主要是RPN操作的一部分,后面还会有后续内容。在这里向各位前辈致以诚挚的敬意。