TensorFlow版Faster RCNN结构分析(代码角度)



  相较YOLO系列的目标识别模型,Faster RCNN损失函数比较简单,但是模型的结构比较复杂。
  1. 经过一个卷积神经网络对图片的特征进行提取.
  2. 把提取出来的特征输入到RPN网络,生成候选区域。
  3. 把候选区域对应的特征池化到相同大小后输入到RCNN网络,得到最终的分类结果与边框回归结果。
  从整体角度来看Faster RCNN模型是三个网络构成的一个整体,输入图片直接得到了目标的分类与位置,是实现了端到端的检测算法,但是从局部来看,三个部分均可单独运行,类似俄罗斯套娃,三个拼一起是俄罗斯套娃,如果每个单独取出来它就是不倒翁。下面,依次分析三个部分。


  第一部分就是对图片进行特征提取,也就是将图片输入到卷积神经网络。下面以Vgg16为例子,具体参考源码\lib\nets\vgg16.py文件中的 _image_to_head函数。

def _image_to_head(self, is_training, reuse=True):
    """Run the input image through the VGG16 convolutional trunk (conv1-conv5).

    Args:
        is_training: passed as ``trainable`` to the conv3, conv4 and conv5
            stacks; conv1 and conv2 are always built with ``trainable=False``.
        reuse: forwarded to ``tf.variable_scope``.
            NOTE(review): a default of True requires the variables to exist
            already on the first call -- confirm callers pass ``reuse``
            explicitly.

    Returns:
        The conv5 feature map, which is also stored in
        ``self._layers['head']`` for the later stages.
    """
    with tf.variable_scope(self._scope, self._scope, reuse=reuse):
        # conv1: two 3x3 convolutions with 64 output channels, frozen.
        net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3],
                          trainable=False, scope='conv1')
        # 2x2 max pooling (presumably slim's default stride of 2 -- confirm).
        net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1')
        # conv2: two 3x3 convolutions with 128 output channels, frozen.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3],
                          trainable=False, scope='conv2')
        net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2')
        # conv3-conv5 are trainable only when is_training is set.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3],
                          trainable=is_training, scope='conv3')
        net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3')
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3],
                          trainable=is_training, scope='conv4')
        net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4')
        # No pooling after conv5: only four pooling layers are applied.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3],
                          trainable=is_training, scope='conv5')

    self._layers['head'] = net
    return net

  经过这个函数,将图片的特征提取出来,存储到 self._layers[“head”] 的这个集合中以供后续使用。


  说到RPN网络就必须要说到anchor。Faster RCNN的本质就是在原始800*600的图片上生成许许多多大小不同的框,这些框也就是anchor。为特征图上的每一个点设置9个anchor。图片经过Vgg16网络下采样 16 倍,特征图大小约为 50*38,那么生成的 anchor 大约为 50*38*9 = 17100 个。下面看一下 anchor 的生成(具体参见 \lib\layer_utils\generate_anchors.py):

def generate_anchor(base_size=16, ratios=[0.5, 1, 2], scales=2 ** np.arange(3, 6)):
    """Generate the reference anchors for one feature-map cell.

    A square base box of side ``base_size`` anchored at (0, 0) is first
    reshaped to every aspect ratio in ``ratios`` and each result is then
    enlarged by every factor in ``scales``.

    Args:
        base_size: side length of the base box in pixels.
        ratios: height/width ratios; a scalar or any 1-D sequence.
        scales: multipliers applied to width and height; scalar or 1-D.

    Returns:
        Array of shape (len(ratios) * len(scales), 4) holding
        (x1, y1, x2, y2) rows, grouped by ratio and then by scale.
    """
    # Accept scalars and plain lists as well as arrays.
    ratios = np.atleast_1d(np.asarray(ratios, dtype=np.float64))
    scales = np.atleast_1d(np.asarray(scales))
    # Shift from 1-based to 0-based coordinates: the box is (0, 0, size-1, size-1).
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    return np.vstack([_scale_enum(row, scales) for row in ratio_anchors])


def _whctrs(anchor):
    """Return (width, height, x_center, y_center) of an (x1, y1, x2, y2) box."""
    # Coordinates are inclusive pixel indices, hence the +1.
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    """Build (x1, y1, x2, y2) rows from 1-D widths/heights around one center."""
    # Turn the 1-D vectors into columns so hstack yields one row per box.
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    return np.hstack([x_ctr - 0.5 * (ws - 1),
                      y_ctr - 0.5 * (hs - 1),
                      x_ctr + 0.5 * (ws - 1),
                      y_ctr + 0.5 * (hs - 1)])


def _ratio_enum(anchor, ratios):
    """Enumerate boxes with the aspect ratios ``ratios`` around ``anchor``.

    The area is kept approximately constant; widths and heights are rounded.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    ratio_sizes = size / ratios
    ws = np.round(np.sqrt(ratio_sizes))
    hs = np.round(ws * ratios)
    return _mkanchors(ws, hs, x_ctr, y_ctr)


def _scale_enum(anchor, scales):
    """Enumerate boxes obtained by scaling ``anchor`` by each of ``scales``."""
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    return _mkanchors(ws, hs, x_ctr, y_ctr)


# generate_anchors_pre_tf in this file calls the plural name.
generate_anchors = generate_anchor

  值得注意,基础框为以(0,0)为端点的 16 * 16 为大小的方框,举个简单的例子base_size = 4, scales = 2, ratios = 1,即如下图所示:
  我们在此处生成了论文中要求的9个 anchors,下面以特征图上的像素点为 anchors 的中心点,将 anchors 映射回原始图片。思考一下,如果能够得到特征图上的像素点对应到原始图片上的坐标 (x', y', x', y'),那么我们用这个坐标加上 anchors 的坐标就能得到相对于原始图片的 anchor(参见\lib\layer_utils\snippets.py)。

def generate_anchors_pre_tf(height, width, feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
  """Tile the reference anchors over every cell of a height x width feature map.

  Args:
    height: number of feature-map rows.
    width: number of feature-map columns.
    feat_stride: multiplier mapping a feature-map index to an image coordinate.
    anchor_scales: scales forwarded to ``generate_anchors``.
    anchor_ratios: aspect ratios forwarded to ``generate_anchors``.

  Returns:
    A tuple ``(anchors, length)``: a float32 tensor of shape (length, 4) and
    ``length = height * width * A``, where A is the number of reference anchors.
  """
  # Image-space offsets of each feature-map column and row.
  shift_x = tf.range(width) * feat_stride
  shift_y = tf.range(height) * feat_stride
  # Both grids have shape (len(shift_y), len(shift_x)): x varies along the
  # columns, y along the rows.
  shift_x, shift_y = tf.meshgrid(shift_x, shift_y)
  sx = tf.reshape(shift_x, shape=(-1,))
  sy = tf.reshape(shift_y, shape=(-1,))
  # One (x, y, x, y) offset row per cell, so it can be added to (x1, y1, x2, y2).
  shifts = tf.transpose(tf.stack([sx, sy, sx, sy]))
  K = tf.multiply(width, height)  # number of feature-map cells
  # Shape (K, 1, 4) so it broadcasts against the (1, A, 4) anchors below, e.g.
  # [[[0, 0, 0, 0]], [[16, 0, 16, 0]], [[32, 0, 32, 0]], ...] for stride 16.
  shifts = tf.transpose(tf.reshape(shifts, shape=[1, K, 4]), perm=(1, 0, 2))
  # (A, 4) reference anchors; A is 9 with the default 3 ratios x 3 scales.
  anchors = generate_anchors(ratios=np.array(anchor_ratios), scales=np.array(anchor_scales))
  A = anchors.shape[0]
  anchor_constant = tf.constant(anchors.reshape((1, A, 4)), dtype=tf.int32)
  length = K * A  # total number of anchors
  # (1, A, 4) + (K, 1, 4) -> (K, A, 4), flattened to (K * A, 4).
  anchors_tf = tf.reshape(tf.add(anchor_constant, shifts), shape=(length, 4))
  return tf.cast(anchors_tf, dtype=tf.float32), length



def _region_proposal(self, net_conv, is_training, initializer):
    """Build the RPN head on the shared feature map and produce the RoIs.

    Args:
        net_conv: feature map from the backbone.
        is_training: selects the training or the test RoI path and is passed
            as ``trainable`` to the RPN convolutions.
        initializer: weights initializer for the RPN convolutions.

    Returns:
        The ``rois`` tensor, which is also stored in ``self._predictions``
        together with the intermediate RPN outputs.

    Raises:
        NotImplementedError: at test time when ``cfg.TEST.MODE`` is neither
            'nms' nor 'top'.
    """
    # '*' is not a legal character in a TensorFlow scope name, hence "3x3".
    rpn = slim.conv2d(net_conv, cfg.RPN_CHANNELS, [3, 3], trainable=is_training,
                      weights_initializer=initializer, scope="rpn_conv/3x3")
    # Two scores per anchor at every location.
    rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training,
                                weights_initializer=initializer, padding='VALID',
                                activation_fn=None, scope="rpn_cls_score")
    # Regroup so the channel axis has size 2, apply softmax, then restore
    # the 2 * num_anchors channel layout.
    rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, "rpn_cls_score_reshape")
    rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
    rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_prob_reshape, [-1, 2]), axis=-1,
                             name="rpn_cls_pred_reshape")
    rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")

    # Four box-regression outputs per anchor.
    rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training,
                                weights_initializer=initializer, padding='VALID',
                                activation_fn=None, scope="rpn_bbox_pred")
    if is_training:
        rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
        # Force the anchor targets to be computed before the proposal targets
        # so the graph has a deterministic order.
        with tf.control_dependencies([rpn_labels]):
            rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
    else:
        if cfg.TEST.MODE == 'nms':
            rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        elif cfg.TEST.MODE == 'top':
            rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        else:
            raise NotImplementedError

    self._predictions["rpn_cls_score"] = rpn_cls_score
    self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
    self._predictions["rpn_cls_prob"] = rpn_cls_prob
    self._predictions["rpn_cls_pred"] = rpn_cls_pred
    self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
    self._predictions["rois"] = rois

    return rois


def _reshape_layer(self, bottom, num_dim, name):
    """Regroup the channel axis of ``bottom`` so that it has ``num_dim`` channels.

    The tensor is moved to channel-first order, reshaped to
    (1, num_dim, -1, W) where W is the dynamic size of axis 2 of ``bottom``,
    and then moved back to channel-last order.

    Args:
        bottom: 4-D tensor in channel-last layout.
        num_dim: channel count of the result.
        name: variable scope the ops are created in.

    Returns:
        The regrouped tensor, channel-last.
    """
    dyn_shape = tf.shape(bottom)
    with tf.variable_scope(name):
        # Channel-first (caffe) layout.
        channel_first = tf.transpose(bottom, [0, 3, 1, 2])
        # Force num_dim channels, keep the width, let the middle axis absorb the rest.
        target_shape = tf.concat(axis=0, values=[[1, num_dim, -1], [dyn_shape[2]]])
        regrouped = tf.reshape(channel_first, target_shape)
        # Back to channel-last layout.
        return tf.transpose(regrouped, [0, 2, 3, 1])

  def _softmax_layer(self, bottom, name):
    """Apply softmax over the last axis of ``bottom``.

    When ``name`` starts with 'rpn_cls_prob_reshape' the input is flattened
    to 2-D before the softmax and reshaped back afterwards; otherwise the
    softmax is applied to ``bottom`` directly.
    """
    if not name.startswith('rpn_cls_prob_reshape'):
      return tf.nn.softmax(bottom, name=name)
    orig_shape = tf.shape(bottom)
    flattened = tf.reshape(bottom, [-1, orig_shape[-1]])
    probs = tf.nn.softmax(flattened, name=name)
    return tf.reshape(probs, orig_shape)

  为了方便理解,利用numpy包逐步实现其中的每个操作。以下面的例子为准,可以发现reshape层的目的是使inputs[0,0,0,:] 的前一半表示背景,后一半表示前景。对rpn_cls_score而言,每个像素点的anchor数是9,通过reshape层,可以使rpn_cls_score[0,0,0,:] 的前9个元素表示背景,后9个元素表示前景(后面的proposal_layer中正是取后num_anchors个通道作为前景得分)。而softmax_layer中先reshape成二维再做softmax的操作似乎并没有什么用处(下面的实验也显示它与直接做softmax的结果相同),这个地方依然存有疑问。

import numpy as np
>>> inputs = np.random.randint(-10,10,(1,2,2,6))
>>> to_caffe = np.transpose(inputs, [0,3,1,2])
>>> to_reshape = to_caffe.reshape((1,2,-1,2))
>>> to_tf = to_reshape.transpose([0,2,3,1])
>>> inputs
array([[[[ -4,   5,   8,  -7,   5,   9],
         [ -1,   6,  -9,  -9,   3,   2]],

        [[-10,  -1,   5,  -1,  -2,   1],
         [  8,   0,   2, -10, -10,  -3]]]])
>>> to_tf
array([[[[ -4,  -7],
         [ -1,  -9]],

        [[-10,  -1],
         [  8, -10]],

        [[  5,   5],
         [  6,   3]],

        [[ -1,  -2],
         [  0, -10]],

        [[  8,   9],
         [ -9,   2]],

        [[  5,   1],
         [  2,  -3]]]])
>>> to_tf_shape = to_tf.shape
>>> bottom_reshape = np.reshape(to_tf,[-1,to_tf_shape[-1]])
>>> reshape_score = tf.nn.softmax(bottom_reshape.astype(np.float32))
>>> reshape_score = sess.run(tf.nn.softmax(bottom_reshape.astype(np.float32)))
>>> output = reshape_score.reshape(to_tf_shape)#经过softmax层得到的输出
>>> output2 = sess.run(tf.nn.softmax(to_tf.astype(np.float32)))#直接softmax得到的输出
>>> output == output2#判断二者输出是否相等
array([[[[ True,  True],
         [ True,  True]],

        [[ True,  True],
         [ True,  True]],

        [[ True,  True],
         [ True,  True]],

        [[ True,  True],
         [ True,  True]],

        [[ True,  True],
         [ True,  True]],

        [[ True,  True],
         [ True,  True]]]])
#类似rpn_cls_prob 的操作
>>> to_caffe1 = np.transpose(to_tf, [0,3,1,2])
>>> to_reshape1 = np.reshape(to_caffe1, [1,6,2,2])
>>> to_tf1 = np.transpose(to_reshape1, [0,2,3,1])
>>> inputs
array([[[[ -4,   5,   8,  -7,   5,   9],
         [ -1,   6,  -9,  -9,   3,   2]],

        [[-10,  -1,   5,  -1,  -2,   1],
         [  8,   0,   2, -10, -10,  -3]]]])
>>> to_tf1
array([[[[ -4,   5,   8,  -7,   5,   9],
         [ -1,   6,  -9,  -9,   3,   2]],

        [[-10,  -1,   5,  -1,  -2,   1],
         [  8,   0,   2, -10, -10,  -3]]]])




def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors):
    """Assign RPN training labels and box-regression targets to the anchors.

    Args:
        rpn_cls_score: RPN score map; only its axes 1 and 2 (height, width)
            are read.
        gt_boxes: ground-truth array with 5 columns, the first four being
            the box coordinates.
        im_info: image info; index 0 is used as the height bound and index 1
            as the width bound.
        _feat_stride: unused here.
        all_anchors: (N, 4) anchors in image coordinates.
        num_anchors: number of anchors per feature-map location (A).

    Returns:
        ``(rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights)`` with shapes (1, 1, A*H, W) for the labels
        and (1, H, W, A*4) for the other three. Labels are 1 (foreground),
        0 (background) or -1 (ignored).

    NOTE(review): the cfg option names RPN_CLOBBER_POSITIVES, RPN_FG_FRACTION
    and RPN_BBOX_INSIDE_WEIGHTS follow the upstream tf-faster-rcnn config;
    confirm against the project's config module.
    NOTE(review): this file also defines a three-argument ``_compute_targets``
    further down; the two must live in separate modules or the later one
    shadows the helper below.
    """
    A = num_anchors
    total_anchors = all_anchors.shape[0]
    # Anchors may extend past the image edge by at most this many pixels.
    _allowed_border = 0
    height, width = rpn_cls_score.shape[1:3]

    # Keep only the anchors that lie inside the image. Coordinates are
    # inclusive pixel indices, so x2 == width is already outside.
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &
        (all_anchors[:, 3] < im_info[0] + _allowed_border)
    )[0]
    anchors = all_anchors[inds_inside, :]

    # Start with every anchor ignored (-1).
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)

    # IoU between every inside anchor (rows) and every gt box (columns).
    # float64 matches the other bbox_overlaps call in this file
    # (presumably what the helper expects -- confirm).
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))
    # Best gt box for each anchor and the corresponding IoU.
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    # Best IoU for each gt box, then every anchor that reaches it (ties included).
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # Background first, so that positives can overwrite it.
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
    # Foreground: the best anchor(s) of every gt box ...
    labels[gt_argmax_overlaps] = 1
    # ... and every anchor above the positive IoU threshold.
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # Background last, so that it can overwrite positives.
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # Subsample the positives if there are too many.
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = np.random.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # Subsample the negatives so the batch holds RPN_BATCHSIZE anchors.
    num_bg = int(cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1))
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = np.random.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    # Regression targets from each anchor to its best-matching gt box.
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # Only positive anchors contribute to the regression loss.
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
        # Uniform weighting over all sampled anchors.
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
        positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # Scatter the results back onto the full anchor set.
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # (1, H, W, A) -> (1, A, H, W) -> (1, 1, A*H, W)
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    rpn_labels = labels.reshape((1, 1, A * height, width))
    rpn_bbox_targets = bbox_targets.reshape((1, height, width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4))
    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights


def _unmap(data, count, inds, fill=0):
    """Scatter ``data`` into an array of ``count`` rows at ``inds``; other rows get ``fill``.

    The result is float32 and has shape ``(count,) + data.shape[1:]``,
    e.g. data of shape (5, 2) with count 9 gives shape (9, 2).
    """
    ret = np.full((count,) + data.shape[1:], fill, dtype=np.float32)
    ret[inds] = data
    return ret


def _compute_targets(ex_rois, gt_rois):
    """Compute the box-regression targets from ``ex_rois`` to ``gt_rois``.

    ``ex_rois`` must be (N, 4) and ``gt_rois`` (N, 5); only the first four
    columns of ``gt_rois`` are passed to ``bbox_transform``.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5
    return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)



  图片经过特征提取网络下采样16倍,特征图上的每一个像素点对应着9个anchor,总的anchor数目大概有17100个anchor,相应的rpn层输出的预测框大概也有17100个,首先以 rpn 层的输出的 rpn_cls_prob 为置信度,对 anchors 进行初步筛选。

def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
    """Turn the RPN outputs into scored proposals (NumPy version).

    The foreground scores rank the anchors, the predicted deltas are applied
    to the anchors, the boxes are clipped to the image, and NMS keeps the
    final set.

    Args:
        rpn_cls_prob: RPN class probabilities; the last ``num_anchors``
            channels are taken as the foreground scores.
        rpn_bbox_pred: RPN box deltas, reshaped here to (-1, 4).
        im_info: image info; the first two entries are passed to ``clip_boxes``.
        cfg_key: config section name, as ``str`` or ``bytes``.
        _feat_stride: unused here.
        anchors: anchors matching the rows of ``rpn_bbox_pred``.
        num_anchors: anchors per feature-map location.

    Returns:
        ``(blob, scores)``: ``blob`` is (n, 5) with a zero batch index in
        column 0 followed by the box, ``scores`` is (n, 1).

    NOTE(review): the option is read as RPN_PRE_NMS_TOP_N, following the
    upstream tf-faster-rcnn config -- confirm against the project's config.
    """
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('utf-8')
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

    # Foreground scores and box deltas, one row per anchor.
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
    scores = scores.reshape((-1, 1))
    proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
    proposals = clip_boxes(proposals, im_info[:2])

    # Keep the highest-scoring proposals before NMS.
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # Non-maximum suppression, then keep at most post_nms_topN boxes.
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Prepend a zero batch index: a single image per batch.
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
    return blob, scores

def proposal_layer_tf(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
    """Turn the RPN outputs into scored proposals using TensorFlow ops.

    Args:
        rpn_cls_prob: RPN class probabilities; the last ``num_anchors``
            channels are taken as the foreground scores.
        rpn_bbox_pred: RPN box deltas, reshaped here to (-1, 4).
        im_info: image info; the first two entries are passed to
            ``clip_boxes_tf``.
        cfg_key: config section name, as ``str`` or ``bytes``.
        _feat_stride: unused here.
        anchors: anchors matching the rows of ``rpn_bbox_pred``.
        num_anchors: anchors per feature-map location.

    Returns:
        ``(blob, scores)``: ``blob`` is (n, 5) with a zero batch index in
        column 0 followed by the box, ``scores`` is (n, 1).

    NOTE(review): ``bbox_transform_inv_tf`` and the RPN_PRE_NMS_TOP_N option
    follow the upstream tf-faster-rcnn names -- confirm they exist in this
    project.
    """
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode("utf-8")
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

    # Foreground scores and box deltas, one row per anchor.
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    scores = tf.reshape(scores, (-1,))
    rpn_bbox_pred = tf.reshape(rpn_bbox_pred, (-1, 4))
    proposals = bbox_transform_inv_tf(anchors, rpn_bbox_pred)
    proposals = clip_boxes_tf(proposals, im_info[:2])

    if pre_nms_topN > 0:
        # Tensors have no argsort()/ravel(); use top_k to keep the best
        # pre_nms_topN proposals, capped at the number available.
        k = tf.minimum(pre_nms_topN, tf.shape(scores)[0])
        scores, order = tf.nn.top_k(scores, k=k)
        proposals = tf.gather(proposals, order)

    # Non-maximum suppression, keeping at most post_nms_topN boxes.
    indices = tf.image.non_max_suppression(proposals, scores,
                                           max_output_size=post_nms_topN,
                                           iou_threshold=nms_thresh)
    boxes = tf.gather(proposals, indices)
    boxes = tf.to_float(boxes)
    scores = tf.gather(scores, indices)
    scores = tf.reshape(scores, (-1, 1))

    # Prepend a zero batch index: a single image per batch.
    batch_inds = tf.zeros((tf.shape(indices)[0], 1), dtype=tf.float32)
    blob = tf.concat([batch_inds, boxes], 1)
    return blob, scores


def py_cpu_nms(dets, thresh):
    """Pure Python NMS baseline.

    Args:
        dets: (N, 5) array whose rows are (x1, y1, x2, y2, score).
        thresh: IoU threshold; a box is dropped when its IoU with an already
            kept, higher-scoring box exceeds this value.

    Returns:
        List of row indices into ``dets`` that survive, in descending score
        order. Empty when ``dets`` has no rows.
    """
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    # Coordinates are inclusive pixel indices, hence the +1.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        # The highest-scoring remaining box is always kept.
        i = order[0]
        keep.append(i)
        # Intersection of box i with every other remaining box.
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        # IoU; ovr has len(order) - 1 entries because box i itself is excluded.
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= thresh)[0]
        # ovr is offset by one relative to order, so shift the indices back.
        order = order[inds + 1]
    return keep




def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
    """Sample RoIs from the RPN proposals and build their training targets.

    Args:
        rpn_rois: proposals with 5 columns (batch index followed by the box).
        rpn_scores: score column matching ``rpn_rois``.
        gt_boxes: ground-truth boxes; the last column is dropped when the
            boxes are appended as extra RoIs.
        _num_classes: number of classes; the targets have 4 columns per class.

    Returns:
        ``(rois, roi_scores, labels, bbox_targets, bbox_inside_weights,
        bbox_outside_weights)``.
    """
    candidate_rois, candidate_scores = rpn_rois, rpn_scores
    if cfg.TRAIN.USE_GT:
        # Append the ground-truth boxes as candidates, with a zero batch
        # index and a zero score.
        pad = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
        gt_as_rois = np.hstack((pad, gt_boxes[:, :-1]))
        candidate_rois = np.vstack((candidate_rois, gt_as_rois))
        candidate_scores = np.vstack((candidate_scores, pad))

    images_in_batch = 1
    per_image = cfg.TRAIN.BATCH_SIZE / images_in_batch
    fg_per_image = np.round(per_image * cfg.TRAIN.FG_FRACTION)

    sampled = _sample_rois(candidate_rois, candidate_scores, gt_boxes,
                           fg_per_image, per_image, _num_classes)
    labels, rois, roi_scores, bbox_targets, bbox_inside_weights = sampled

    target_width = _num_classes * 4
    bbox_inside_weights = bbox_inside_weights.reshape((-1, target_width))
    # Outside weights are 1 wherever the inside weight is positive.
    bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)
    return (rois.reshape((-1, 5)),
            roi_scores.reshape((-1,)),
            labels.reshape((-1, 1)),
            bbox_targets.reshape((-1, target_width)),
            bbox_inside_weights,
            bbox_outside_weights)
def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """Draw a random sample of foreground and background RoIs with their targets.

    Args:
        all_rois: candidate RoIs with 5 columns (batch index followed by the box).
        all_scores: scores matching ``all_rois``.
        gt_boxes: ground-truth boxes; columns 0-3 are the box, column 4 the class.
        fg_rois_per_image: maximum number of foreground RoIs to draw.
        rois_per_image: total number of RoIs to draw.
        num_classes: number of classes.

    Returns:
        ``(labels, rois, roi_scores, bbox_targets, bbox_inside_weights)``.

    Raises:
        ValueError: if no RoI qualifies as foreground or as background.
    """
    # IoU matrix of shape (len(all_rois), len(gt_boxes)).
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    gt_assignment = overlaps.argmax(axis=-1)
    max_overlaps = overlaps.max(axis=-1)
    # Each RoI inherits the class of its best-matching gt box.
    labels = gt_boxes[gt_assignment, 4]

    # Foreground: IoU at or above FG_THRESH.
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Background: IoU within [BG_THRESH_LO, BG_THRESH_HI).
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]

    # Always return a fixed number of RoIs; sample with replacement when a
    # pool is smaller than the number requested from it.
    if fg_inds.size > 0 and bg_inds.size > 0:
        fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
        fg_inds = np.random.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
        bg_rois_per_image = rois_per_image - fg_rois_per_image
        to_replace = bg_inds.size < bg_rois_per_image
        bg_inds = np.random.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
    elif fg_inds.size > 0:
        to_replace = fg_inds.size < rois_per_image
        fg_inds = np.random.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = rois_per_image
    elif bg_inds.size > 0:
        to_replace = bg_inds.size < rois_per_image
        bg_inds = np.random.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = 0
    else:
        # Fail loudly instead of continuing with an empty sample.
        raise ValueError("no foreground or background RoIs available to sample")

    # Foreground indices come first, so everything after them is background.
    keep_inds = np.append(fg_inds, bg_inds)
    labels = labels[keep_inds]
    labels[int(fg_rois_per_image):] = 0
    rois = all_rois[keep_inds]
    roi_scores = all_scores[keep_inds]

    bbox_target_data = _compute_targets(rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
    bbox_targets, bbox_inside_weights = _get_bbox_regression_labels(bbox_target_data, num_classes)
    return labels, rois, roi_scores, bbox_targets, bbox_inside_weights

def _compute_targets(ex_rois, gt_rois, labels):
    """Compute the box-regression targets of the sampled RoIs.

    Args:
        ex_rois: (N, 4) RoI boxes.
        gt_rois: (N, 4) matched ground-truth boxes.
        labels: (N,) class labels, prepended as the first output column.

    Returns:
        (N, 5) float32 array: the label followed by the four targets.

    NOTE(review): the BBOX_NORMALIZE_TARGETS_PRECOMPUTED and
    BBOX_NORMALIZE_STDS option names follow the upstream tf-faster-rcnn
    config -- confirm against the project's config module.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4
    targets = bbox_transform(ex_rois, gt_rois)
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize the targets with precomputed mean and std.
        targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
                   / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
    return np.hstack((labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Expand compact (class, tx, ty, tw, th) rows into per-class target slots.

    Args:
        bbox_target_data: (N, 5) array; column 0 is the class, columns 1-4
            the regression targets.
        num_classes: number of classes.

    Returns:
        ``(bbox_targets, bbox_inside_weights)``, both float32 of shape
        (N, 4 * num_classes). Only the four columns belonging to a row's own
        class are non-zero, and only for rows whose class is greater than 0.
    """
    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    # Class 0 rows are skipped and keep all-zero targets and weights.
    inds = np.where(clss > 0)[0]
    for ind in inds:
        cls = clss[ind]
        start = int(4 * cls)
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
    return bbox_targets, bbox_inside_weights


  通过 RPN 网络与 proposal_target_layer,我们得到了很多候选区域。下面我们将候选区域池化到相同大小,将其输入到RCNN层计算最后的回归损失和分类损失。切记一点,此处的roi并非是RPN层输出的17100个anchor了,而是经过 proposal_target_layer 层输出的 128 个区域。

def _crop_pool_layer(self, bottom, rois, name):
    """RoI pooling implemented with crop_and_resize followed by max pooling.

    Args:
        bottom: feature map to crop from.
        rois: (n, 5) tensor; column 0 is the batch index, columns 1-4 are
            x1, y1, x2, y2 in image coordinates.
        name: variable scope the ops are created in.

    Returns:
        The pooled crops, one per RoI.
    """
    with tf.variable_scope(name):
        # Column 0 of rois, squeezed to shape (n,).
        batch_inds = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1]), [1])
        # Normalize the box coordinates by the image extent that the feature
        # map covers; use the dynamic shape since the static one may be unknown.
        bottom_shape = tf.shape(bottom)
        # NOTE(review): index 0 is used for both axes because upstream
        # tf-faster-rcnn stores a single-element stride list -- confirm.
        height = (tf.to_float(bottom_shape[1]) - 1.) * np.float32(self._feat_stride[0])
        width = (tf.to_float(bottom_shape[2]) - 1.) * np.float32(self._feat_stride[0])
        x1 = tf.slice(rois, [0, 1], [-1, 1], name="x1") / width
        y1 = tf.slice(rois, [0, 2], [-1, 1], name="y1") / height
        x2 = tf.slice(rois, [0, 3], [-1, 1], name="x2") / width
        y2 = tf.slice(rois, [0, 4], [-1, 1], name="y2") / height
        # crop_and_resize expects boxes as [y1, x1, y2, x2]; no gradient
        # flows into the box coordinates.
        boxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=-1))

        # Crop at twice the pooling size, then halve with a 2x2 max pool.
        pre_pool_size = cfg.POOLING_SIZE * 2
        crops = tf.image.crop_and_resize(bottom, boxes, tf.to_int32(batch_inds),
                                         [pre_pool_size, pre_pool_size], name="crops")
        return slim.max_pool2d(crops, [2, 2], padding="SAME")

# Flatten the RoI-pooled features and pass them through two fully connected layers.
def head_to_tail(self, pool5, is_training, reuse=None):
    """Map the pooled RoI features to the fc7 representation.

    Args:
        pool5: pooled RoI features.
        is_training: when true, dropout (keep_prob 0.5) follows fc6 and fc7.
        reuse: forwarded to ``tf.variable_scope``.

    Returns:
        The fc7 activations (4096 units per RoI).
    """
    with tf.variable_scope(self._scope, self._scope, reuse=reuse):
        pool5_flatten = slim.flatten(pool5, scope="flatten")
        fc6 = slim.fully_connected(pool5_flatten, 4096, scope="fc6")
        if is_training:
            fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope="dropout6")
        fc7 = slim.fully_connected(fc6, 4096, scope="fc7")
        if is_training:
            # Own scope for the second dropout rather than reusing "fc7".
            fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope="dropout7")
        return fc7

def _region_classification(self, fc7, is_training, initializer, initializer_bbox):
    """Build the final per-RoI classification and box-regression heads.

    Args:
        fc7: per-RoI feature vector.
        is_training: passed as ``trainable`` to both output layers.
        initializer: weights initializer of the classification layer.
        initializer_bbox: weights initializer of the box-regression layer.

    Returns:
        ``(cls_prob, bbox_pred)``; both, together with ``cls_score`` and
        ``cls_pred``, are also stored in ``self._predictions``.
    """
    # activation_fn=None: these are raw logits / box deltas, so slim's
    # default activation must not be applied.
    cls_score = slim.fully_connected(fc7, self._num_classes,
                                     weights_initializer=initializer,
                                     trainable=is_training,
                                     activation_fn=None, scope="cls_score")
    cls_prob = self._softmax_layer(cls_score, "cls_prob")
    cls_pred = tf.argmax(cls_prob, axis=1, name="cls_pred")
    # Four regression outputs per class.
    bbox_pred = slim.fully_connected(fc7, self._num_classes * 4,
                                     weights_initializer=initializer_bbox,
                                     trainable=is_training,
                                     activation_fn=None, scope="bbox_pred")

    self._predictions["cls_score"] = cls_score
    self._predictions["cls_pred"] = cls_pred
    self._predictions["cls_prob"] = cls_prob
    self._predictions["bbox_pred"] = bbox_pred

    return cls_prob, bbox_pred


  对于分类损失,我们采用交叉熵计算,对于回归损失采用Smooth L1 Loss。下面构造smooth L1 损失函数。另外值得注意的是在Faster RCNN中,通过sigma控制下图蓝色曲线边界的横坐标,蓝色区域的边界为正负1/(sigma ** 2)。对Smooth L1 Loss的细节不再赘述。


  首先写 smooth L1 损失函数的代码。

def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=1.0, dim=[1]):
    """Smooth L1 loss between predicted and target box deltas.

    With d = inside_weight * (pred - target), each element contributes
    0.5 * sigma^2 * d^2 when |d| < 1 / sigma^2 and |d| - 0.5 / sigma^2
    otherwise.

    Args:
        bbox_pred: predicted box deltas.
        bbox_targets: target box deltas.
        bbox_inside_weights: element-wise weights applied to the difference.
        bbox_outside_weights: element-wise weights applied to the loss.
        sigma: sets the switch point 1 / sigma^2 between the two branches.
        dim: axes summed over before the mean is taken.

    Returns:
        Scalar loss tensor.
    """
    sigma_2 = sigma ** 2
    box_diff = bbox_pred - bbox_targets
    in_box_diff = bbox_inside_weights * box_diff
    abs_in_box_diff = tf.abs(in_box_diff)
    # 1 where |d| < 1 / sigma^2 (quadratic branch), 0 elsewhere (linear
    # branch); the mask itself carries no gradient.
    smoothL1_sign = tf.stop_gradient(tf.to_float(tf.less(abs_in_box_diff, 1. / sigma_2)))
    in_loss_box = tf.pow(in_box_diff, 2) * (sigma_2 / 2.) * smoothL1_sign \
        + (abs_in_box_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign)
    out_loss_box = bbox_outside_weights * in_loss_box
    # Sum over the requested axes, then average what remains.
    loss_box = tf.reduce_mean(tf.reduce_sum(out_loss_box, axis=dim))
    return loss_box

  下面计算Faster RCNN的losses。

def _add_losses(self, sigma_rpn=3.0):
    """Build the four Faster R-CNN loss terms and the total loss.

    Args:
        sigma_rpn: sigma passed to the smooth L1 loss of the RPN box branch.

    Returns:
        The sum of the four loss terms without the regularization loss.
        The individual terms and ``total_loss`` (which includes the
        regularization loss) are stored in ``self._losses``.
    """
    with tf.variable_scope("LOSS_" + self._tag) as scope:
        # RPN classification loss; anchors labelled -1 are excluded.
        rpn_cls_score = tf.reshape(self._predictions["rpn_cls_score_reshape"], [-1, 2])
        rpn_label = tf.reshape(self._anchor_targets["rpn_labels"], [-1])
        rpn_select = tf.where(tf.not_equal(rpn_label, -1))
        rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score, rpn_select), [-1, 2])
        rpn_label = tf.reshape(tf.gather(rpn_label, rpn_select), [-1])
        # Labels are class indices, not one-hot vectors.
        rpn_cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_score, labels=rpn_label))

        # RPN box-regression loss.
        rpn_bbox_pred = self._predictions["rpn_bbox_pred"]
        rpn_bbox_targets = self._anchor_targets["rpn_bbox_targets"]
        rpn_bbox_inside_weights = self._anchor_targets["rpn_bbox_inside_weights"]
        rpn_bbox_outside_weights = self._anchor_targets["rpn_bbox_outside_weights"]
        rpn_loss_box = self._smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights,
                                            rpn_bbox_outside_weights, sigma=sigma_rpn, dim=[1, 2, 3])

        # RCNN classification loss.
        cls_score = self._predictions["cls_score"]
        label = tf.reshape(self._proposal_targets["labels"], [-1])
        cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_score, labels=label))

        # RCNN box-regression loss.
        bbox_pred = self._predictions["bbox_pred"]
        bbox_targets = self._proposal_targets["bbox_targets"]
        bbox_inside_weights = self._proposal_targets['bbox_inside_weights']
        bbox_outside_weights = self._proposal_targets['bbox_outside_weights']
        loss_box = self._smooth_l1_loss(bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights)

        self._losses['cross_entropy'] = cross_entropy
        self._losses['loss_box'] = loss_box
        self._losses['rpn_cross_entropy'] = rpn_cross_entropy
        self._losses['rpn_loss_box'] = rpn_loss_box

        loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box
        regularization_loss = tf.add_n(tf.losses.get_regularization_losses(), 'regu')
        self._losses['total_loss'] = loss + regularization_loss

        return loss

  最后还有一点值得注意,我们在生成RPN层的标签的时候只与anchor 和gt_boxes有关。在生成RCNN层标签的时候,只与RPN层输出的ROI与gt_boxes有关,其实在源码中也可以看到,尽管在构造RCNN标签的时候输出了roi_scores,但是我们后续并没有用到,甚至没有保存。

# Training branch: the scores returned by _proposal_target_layer are
# discarded (bound to `_`); only the sampled rois are kept.
if is_training:
      rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
      rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
      # Try to have a deterministic order for the computing graph, for reproducibility
      with tf.control_dependencies([rpn_labels]):
        rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")

  疫情阶段,在家太无聊了,后面陆陆续续的会整理些机器学习的内容和YOLO算法后续系列。Faster RCNN的训练阶段代码就比较简单了,不再继续分析了。

