Faster R-CNN RPN Network Source Code Walkthrough (repost)

The Faster R-CNN code analyzed here is the TensorFlow version: https://github.com/kevinjliang/tf-Faster-RCNN

First, in faster_rcnn_resnet50ish.py, let's look at what the data layer outputs at training time:

# Train data
self.x['TRAIN'] = tf.placeholder(tf.float32, [1, None, None, 3])  # the image
self.im_dims['TRAIN'] = tf.placeholder(tf.int32, [None, 2])       # image dimensions [height, width]
self.gt_boxes['TRAIN'] = tf.placeholder(tf.int32, [None, 5])      # ground-truth boxes

As you can see, the network's input starts with the image, followed by the image's height and width, because the anchor coordinates generated depend on the image size. Last come the ground-truth boxes, whose second dimension holds five elements: the first four are the object's corner coordinates and the fifth is its class.
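To make the shapes concrete, here is a hypothetical feed for these placeholders (the array names are made up; the shapes follow the placeholder definitions above):

import numpy as np

# one 600x800 RGB image; the batch size is fixed at 1
image = np.zeros((1, 600, 800, 3), dtype=np.float32)
im_dims = np.array([[600, 800]], dtype=np.int32)  # [height, width]
# two ground-truth boxes, each [x1, y1, x2, y2, class]
gt_boxes = np.array([[ 48,  40, 200, 180, 1],
                     [300, 220, 420, 340, 2]], dtype=np.int32)

# feed_dict = {self.x['TRAIN']: image,
#              self.im_dims['TRAIN']: im_dims,
#              self.gt_boxes['TRAIN']: gt_boxes}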

Next, we open faster_rcnn_networks.py, where we find the rpn class. In my usual style, here is the annotated source:

# -*- coding: utf-8 -*-
"""
Created on Fri Dec 30 16:14:48 2016

@author: Kevin Liang

Faster R-CNN detection and classification networks.

Contains the Region Proposal Network (RPN), ROI proposal layer, and the RCNN.

TODO: -Split off these three networks into their own files OR add to Layers
"""

import sys
sys.path.append('../')

from Lib.TensorBase.tensorbase.base import Layers
from Lib.faster_rcnn_config import cfg
from Lib.loss_functions import rpn_cls_loss, rpn_bbox_loss, fast_rcnn_cls_loss, fast_rcnn_bbox_loss
from Lib.roi_pool import roi_pool
from Lib.rpn_softmax import rpn_softmax
from Networks.anchor_target_layer import anchor_target_layer
from Networks.proposal_layer import proposal_layer
from Networks.proposal_target_layer import proposal_target_layer

import tensorflow as tf


class rpn:
    '''
    Region Proposal Network (RPN): From the convolutional feature maps
    (TensorBase Layers object) of the last layer, generate bounding boxes
    relative to anchor boxes and give an "objectness" score to each

    In evaluation mode (eval_mode==True), gt_boxes should be None.
    '''

    def __init__(self, featureMaps, gt_boxes, im_dims, _feat_stride, eval_mode):
        self.featureMaps = featureMaps    # the shared convolutional features
        self.gt_boxes = gt_boxes          # ground truth, shape: [None, 5]; top-left corner, bottom-right corner, class
        self.im_dims = im_dims            # image dimensions, shape: [None, 2]; height and width
        self._feat_stride = _feat_stride  # downsampling factor from image to feature map
        self.anchor_scales = cfg.RPN_ANCHOR_SCALES  # anchor scales: [8, 16, 32]
        self.eval_mode = eval_mode        # training or evaluation
        self._network()                   # build the network

    def _network(self):
        # There shouldn't be any gt_boxes if in evaluation mode
        if self.eval_mode is True:  # at test time there is no ground truth
            assert self.gt_boxes is None, \
                'Evaluation mode should not have ground truth boxes (or else what are you detecting for?)'

        _num_anchors = len(self.anchor_scales)*3  # _num_anchors is 9 (3x3): each sliding position gets 9 anchors

        rpn_layers = Layers(self.featureMaps)  # wrap the shared features in a Layers object

        with tf.variable_scope('rpn'):
            # Spatial windowing: a 3x3 convolution producing 512 output channels
            for i in range(len(cfg.RPN_OUTPUT_CHANNELS)):
                rpn_layers.conv2d(filter_size=cfg.RPN_FILTER_SIZES[i], output_channels=cfg.RPN_OUTPUT_CHANNELS[i])

            features = rpn_layers.get_output()

            with tf.variable_scope('cls'):
                # Box-classification layer (objectness): a 1x1 convolution producing 18 (9x2) channels
                self.rpn_bbox_cls_layers = Layers(features)
                self.rpn_bbox_cls_layers.conv2d(filter_size=1, output_channels=_num_anchors*2, activation_fn=None)

            with tf.variable_scope('target'):  # compute each anchor's target here
                # Only calculate targets in train mode. No ground truth boxes in evaluation mode
                if self.eval_mode is False:
                    # Anchor Target Layer (anchors and deltas)
                    rpn_cls_score = self.rpn_bbox_cls_layers.get_output()
                    self.rpn_labels, self.rpn_bbox_targets, self.rpn_bbox_inside_weights, self.rpn_bbox_outside_weights = \
                        anchor_target_layer(rpn_cls_score=rpn_cls_score, gt_boxes=self.gt_boxes,
                                            im_dims=self.im_dims, _feat_stride=self._feat_stride,
                                            anchor_scales=self.anchor_scales)

            with tf.variable_scope('bbox'):
                # Bounding-Box regression layer (bounding box predictions): a 1x1 convolution producing 36 (9x4) channels
                self.rpn_bbox_pred_layers = Layers(features)
                self.rpn_bbox_pred_layers.conv2d(filter_size=1, output_channels=_num_anchors*4, activation_fn=None)

    # Get functions
    def get_rpn_cls_score(self):  # the RPN's predicted foreground/background scores for each anchor
        return self.rpn_bbox_cls_layers.get_output()

    def get_rpn_labels(self):  # the ground-truth foreground/background label of each anchor
        assert self.eval_mode is False, 'No RPN labels without ground truth boxes'
        return self.rpn_labels

    def get_rpn_bbox_pred(self):  # the RPN's predicted four box-regression offsets for each anchor
        return self.rpn_bbox_pred_layers.get_output()

    def get_rpn_bbox_targets(self):  # the ground-truth four box-regression offsets of each anchor
        assert self.eval_mode is False, 'No RPN bounding box targets without ground truth boxes'
        return self.rpn_bbox_targets

    def get_rpn_bbox_inside_weights(self):  # used in the box-regression loss; defined only over anchors inside the image boundary
        assert self.eval_mode is False, 'No RPN inside weights without ground truth boxes'
        return self.rpn_bbox_inside_weights

    def get_rpn_bbox_outside_weights(self):  # used in the box-regression loss; defined only over anchors inside the image boundary
        assert self.eval_mode is False, 'No RPN outside weights without ground truth boxes'
        return self.rpn_bbox_outside_weights

    # Loss functions
    def get_rpn_cls_loss(self):  # the RPN classification loss
        assert self.eval_mode is False, 'No RPN cls loss without ground truth boxes'
        rpn_cls_score = self.get_rpn_cls_score()
        rpn_labels = self.get_rpn_labels()
        return rpn_cls_loss(rpn_cls_score, rpn_labels)

    def get_rpn_bbox_loss(self):  # the RPN box-regression loss; note that it uses the inside and outside weights
        assert self.eval_mode is False, 'No RPN bbox loss without ground truth boxes'
        rpn_bbox_pred = self.get_rpn_bbox_pred()
        rpn_bbox_targets = self.get_rpn_bbox_targets()
        rpn_bbox_inside_weights = self.get_rpn_bbox_inside_weights()
        rpn_bbox_outside_weights = self.get_rpn_bbox_outside_weights()
        return rpn_bbox_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights)
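Before dissecting the losses, a minimal sketch of how this class might be wired up in training mode (hypothetical variable names; only the constructor signature and the getters above are assumed):

# featureMaps: shared convolutional features from the backbone (e.g. ResNet-50)
# _feat_stride: cumulative downsampling factor of the backbone, commonly 16
rpn_net = rpn(featureMaps=featureMaps,
              gt_boxes=gt_boxes_placeholder,
              im_dims=im_dims_placeholder,
              _feat_stride=16,
              eval_mode=False)

total_rpn_loss = rpn_net.get_rpn_cls_loss() + rpn_net.get_rpn_bbox_loss()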

As we can see, during training the rpn class serves two main purposes: get_rpn_cls_loss computes the RPN's classification loss, and get_rpn_bbox_loss computes the RPN's anchor box-regression loss. To compute these two losses, the hard part is obtaining the ground truth, and that is the job of the anchor_target_layer function. Let's step into that function; as usual, the source first:

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 1 16:11:17 2017

@author: Kevin Liang (modifications)

Anchor Target Layer: Creates all the anchors in the final convolutional feature
map, assigns anchors to ground truth boxes, and applies labels of "objectness"

Adapted from the official Faster R-CNN repo:
https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/anchor_target_layer.py
"""

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import sys
sys.path.append('../')

import numpy as np
import numpy.random as npr
import tensorflow as tf

from Lib.bbox_overlaps import bbox_overlaps
from Lib.bbox_transform import bbox_transform
from Lib.faster_rcnn_config import cfg
from Lib.generate_anchors import generate_anchors


# Computes each anchor's ground truth (foreground/background label, coordinate offsets)
def anchor_target_layer(rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales):
    '''
    Make Python version of _anchor_target_layer_py below Tensorflow compatible
    '''
    # Run _anchor_target_layer_py; its arguments are the RPN's predicted classification
    # scores, the ground-truth boxes, the image dimensions, the feature-map stride
    # relative to the original image, and the anchor scales
    rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = \
        tf.py_func(_anchor_target_layer_py,
                   [rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales],
                   [tf.float32, tf.float32, tf.float32, tf.float32])

    # Convert to tensors
    rpn_labels = tf.convert_to_tensor(tf.cast(rpn_labels, tf.int32), name='rpn_labels')
    rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets, name='rpn_bbox_targets')
    rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights, name='rpn_bbox_inside_weights')
    rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights, name='rpn_bbox_outside_weights')

    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights

def _anchor_target_layer_py(rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales):
    """
    Python version

    Assign anchors to ground-truth targets. Produces anchor classification
    labels and bounding-box regression targets.

    # Algorithm:
    #
    # for each (H, W) location i
    #   generate 9 anchor boxes centered on cell i
    #   apply predicted bbox deltas at cell i to each of the 9 anchors
    # filter out-of-image anchors
    # measure GT overlap
    """
    im_dims = im_dims[0]  # dimensions of the original image: [height, width]
    _anchors = generate_anchors(scales=np.array(anchor_scales))  # generate the 9 reference anchors, shape: [9, 4]
    _num_anchors = _anchors.shape[0]  # _num_anchors is 9

    # allow boxes to sit over the edge by a small amount
    _allowed_border = 0  # anchors may not cross the image border at all

    # Only minibatch of 1 supported; verify the batch size is 1
    assert rpn_cls_score.shape[0] == 1, \
        'Only single item batches are supported'

    # map of shape (..., H, W)
    height, width = rpn_cls_score.shape[1:3]  # H and W of the RPN output; the total anchor count is H*W*9

    # 1. Generate proposals from bbox deltas and shifted anchors
    # Generate the anchors on the original image
    shift_x = np.arange(0, width) * _feat_stride   # shape: [width,]
    shift_y = np.arange(0, height) * _feat_stride  # shape: [height,]
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)  # grids; shift_x and shift_y both have shape [height, width]
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()  # shape: [height*width, 4]

    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = _num_anchors     # A = 9
    K = shifts.shape[0]  # K = height*width (of the feature map)
    all_anchors = (_anchors.reshape((1, A, 4)) +
                   shifts.reshape((1, K, 4)).transpose((1, 0, 2)))  # shape: [K, A, 4], every anchor
    all_anchors = all_anchors.reshape((K * A, 4))
    total_anchors = int(K * A)  # total number of anchors

    # anchors inside the image; inds_inside indexes the anchors that stay within the image boundary
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_dims[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_dims[0] + _allowed_border)    # height
    )[0]

    # keep only inside anchors
    anchors = all_anchors[inds_inside, :]  # the valid anchors, i.e. those that do not cross the boundary

    # label: 1 is positive, 0 is negative, -1 is dont care
    labels = np.empty((len(inds_inside), ), dtype=np.float32)  # one label per inside anchor
    labels.fill(-1)  # initialize every label to -1

    # overlaps between the anchors and the gt boxes
    # overlaps (ex, gt)
    # IoU between every inside anchor and every gt box; shape: [len(anchors), len(gt_boxes)]
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float),
        np.ascontiguousarray(gt_boxes, dtype=np.float))
    argmax_overlaps = overlaps.argmax(axis=1)  # for each anchor, the index of its best-overlapping gt box; shape: [len(anchors),]
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]  # for each anchor, its largest IoU; shape: [len(anchors),]
    gt_argmax_overlaps = overlaps.argmax(axis=0)  # for each gt box, the index of its best-overlapping anchor; shape: [len(gt_boxes),]
    gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]  # for each gt box, its largest IoU over all anchors; shape: [len(gt_boxes),]
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]  # again per gt box: every anchor that attains that maximum IoU

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # If positives may not be clobbered, assign bg labels first so that positive labels can overwrite them
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
        # anchors whose largest IoU is still below the threshold (0.3) become background

    # fg label: for each gt, anchor with highest overlap
    labels[gt_argmax_overlaps] = 1  # for every gt box, the anchor(s) with the highest IoU become foreground

    # fg label: above threshold IOU
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
    # anchors whose largest IoU exceeds the threshold (0.7) become foreground

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # If positives may be clobbered, assign bg labels last so that negative labels can overwrite positives
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
        # anchors whose largest IoU is still below the threshold (0.3) become background

    # subsample positive labels if we have too many
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)  # number of foreground anchors wanted in one training batch
    fg_inds = np.where(labels == 1)[0]  # anchors currently labeled foreground
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1
        # if more foreground anchors exist than needed, randomly discard the excess

    # subsample negative labels if we have too many
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)  # number of background anchors wanted in one training batch
    bg_inds = np.where(labels == 0)[0]  # anchors currently labeled background
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1
        # if more background anchors exist than needed, randomly discard the excess

    # bbox_targets: The deltas (relative to anchors) that Faster R-CNN should
    # try to predict at each anchor
    # TODO: This "weights" business might be deprecated. Requires investigation
    # For each anchor, compute the four regression targets (tx, ty, tw, th)
    bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)  # initialize every inside anchor's targets to zero
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])
    # for each anchor, the transform onto its best-overlapping gt box

    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)  # initialize the inside weights to zero
    bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)  # assign weights at the foreground anchors

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)  # initialize the outside weights to zero
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:  # if RPN_POSITIVE_WEIGHT is negative,
        # uniform weighting of examples (given non-uniform sampling)
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples  # positives and negatives are weighted identically
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
        # if RPN_POSITIVE_WEIGHT lies strictly between 0 and 1,
        positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                            np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                            np.sum(labels == 0))
        # positives and negatives receive separate weights
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights
    # write the positive and negative weights into bbox_outside_weights

    # map up to original set of anchors
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)  # map the inside-anchor labels back to the full anchor set (out-of-boundary anchors get label -1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)  # map the inside-anchor bbox targets back (out-of-boundary anchors filled with 0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)  # likewise for the inside weights (filled with 0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)  # likewise for the outside weights (filled with 0)

    # labels
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))  # reshape the anchor labels to [1, 1, 9*height, width]
    rpn_labels = labels

    # bbox_targets
    rpn_bbox_targets = bbox_targets.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)  # reshape the regression targets to [1, 9*4, height, width]

    # bbox_inside_weights
    rpn_bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)  # reshape the inside weights to [1, 9*4, height, width]

    # bbox_outside_weights
    rpn_bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)  # reshape the outside weights to [1, 9*4, height, width]

    # return all the ground-truth values
    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights

def _unmap(data, count, inds, fill=0):
    # _unmap maps values computed on the inside-image anchors back onto the full set of generated anchors
    """ Unmap a subset of item (data) back to the original set of items (of size count) """
    if len(data.shape) == 1:
        ret = np.empty((count, ), dtype=np.float32)
        ret.fill(fill)
        ret[inds] = data
    else:
        ret = np.empty((count, ) + data.shape[1:], dtype=np.float32)
        ret.fill(fill)
        ret[inds, :] = data
    return ret

def _compute_targets(ex_rois, gt_rois):
    # _compute_targets computes the regression transform between anchors and their matched gt boxes
    """Compute bounding-box regression targets for an image."""
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)

anchor_target_layer itself mainly calls _anchor_target_layer_py and converts the outputs to tensors, so let's analyze _anchor_target_layer_py carefully. The function first generates the 9 reference anchors with generate_anchors, then replicates them at every sliding position of the shared feature map, mapped back to image coordinates; that gives all_anchors. It then discards every anchor that crosses the image boundary, leaving anchors, and all subsequent work applies to these in-image anchors only. Next, bbox_overlaps computes the IoU between every in-image anchor and every ground-truth box. Anchors whose largest IoU falls between 0.3 and 0.7 are excluded (their entries in labels are set to -1), and suitable numbers of foreground and background anchors are sampled for training. Then _compute_targets derives each anchor's four regression targets (tx, ty, tw, th), stored in bbox_targets, and bbox_inside_weights and bbox_outside_weights are computed, which play a central role when training the box regression. Finally, _unmap maps everything computed on the in-image anchors back onto the full anchor set.
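The two argmax directions above are easy to confuse, so here is a toy NumPy illustration (made-up IoU values) of the per-anchor argmax (axis=1) versus the per-gt argmax (axis=0):

import numpy as np

overlaps = np.array([[0.1, 0.6],    # 3 anchors x 2 gt boxes
                     [0.4, 0.2],
                     [0.0, 0.3]])

argmax_overlaps = overlaps.argmax(axis=1)               # best gt per anchor: [1, 0, 1]
max_overlaps = overlaps[np.arange(3), argmax_overlaps]  # [0.6, 0.4, 0.3]

gt_argmax_overlaps = overlaps.argmax(axis=0)            # best anchor per gt: [1, 0]
gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(2)]   # [0.4, 0.6]
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]  # anchors 0 and 1

# anchor 1 becomes foreground even though its best IoU (0.4) is below 0.7,
# which guarantees every gt box owns at least one positive anchor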

If the walkthrough above feels tangled on a first read, don't worry. anchor_target_layer exists to produce exactly two things. The first is a class for every anchor generated over the image: during training, a fixed quota of positive samples (foreground) and negative samples (background) is kept, and everything else is set to -1, meaning it is ignored. The second is a box-regression target for every anchor; when the box-regression loss is computed, only foreground anchors take part, and this is precisely what bbox_inside_weights and bbox_outside_weights implement: anchors that are neither foreground nor background have both weights set to 0.
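A toy illustration of that masking (made-up labels; in the standard config, cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS is (1.0, 1.0, 1.0, 1.0)):

import numpy as np

labels = np.array([1, 0, -1, 1])           # fg, bg, ignored, fg
bbox_inside_weights = np.zeros((4, 4), dtype=np.float32)
bbox_inside_weights[labels == 1, :] = 1.0  # only the foreground rows become nonzero

# rows 0 and 3 are all ones; rows 1 and 2 stay zero, so background and
# ignored anchors contribute nothing to the box-regression loss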

anchor_target_layer relies on a few important helper functions. The first is generate_anchors, which produces the 9 reference anchors, covering 3 aspect ratios and 3 areas. Annotated source:

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 1 16:11:17 2017

@author: Kevin Liang (modifications)

generate_anchors and supporting functions: generate reference windows (anchors)
for Faster R-CNN. Specifically, it creates a set of k (default of 9) relative
coordinates. These references will be added on to all positions of the final
convolutional feature maps.

Adapted from the official Faster R-CNN repo:
https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py

Note: the produced anchors have indices off by 1 of what the comments claim.
Probably due to MATLAB being 1-indexed, while Python is 0-indexed.
"""

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import numpy as np

# Verify that we compute the same anchors as Shaoqing's matlab implementation:
#
#    >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
#    >> anchors
#
#    anchors =
#
#       -83   -39   100    56
#      -175   -87   192   104
#      -359  -183   376   200
#       -55   -55    72    72
#      -119  -119   136   136
#      -247  -247   264   264
#       -35   -79    52    96
#       -79  -167    96   184
#      -167  -343   184   360

# array([[ -83.,  -39.,  100.,   56.],
#        [-175.,  -87.,  192.,  104.],
#        [-359., -183.,  376.,  200.],
#        [ -55.,  -55.,   72.,   72.],
#        [-119., -119.,  136.,  136.],
#        [-247., -247.,  264.,  264.],
#        [ -35.,  -79.,   52.,   96.],
#        [ -79., -167.,   96.,  184.],
#        [-167., -343.,  184.,  360.]])

def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6)):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, 15, 15) window.
    """
    # Note there are two representations of an anchor: top-left/bottom-right
    # corners, or center coordinates plus width and height.
    # The base anchor uses the corner representation: [0, 0, 15, 15]
    base_anchor = np.array([1, 1, base_size, base_size]) - 1  # [0, 0, 15, 15]
    ratio_anchors = _ratio_enum(base_anchor, ratios)  # shape: [3, 4]; the anchors at the different aspect ratios
    anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
                         for i in range(ratio_anchors.shape[0])])  # the nine reference anchors, shape: [9, 4]
    return anchors

def _whctrs(anchor):  # given an anchor's corner coordinates, return its center coordinates plus width and height
    """
    Return width, height, x center, and y center for an anchor (window).
    """
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr

def _mkanchors(ws, hs, x_ctr, y_ctr):  # given centers plus widths and heights, return windows as corner coordinates
    """
    Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows).
    """
    ws = ws[:, np.newaxis]  # shape: [3, 1]
    hs = hs[:, np.newaxis]  # shape: [3, 1]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors  # shape: [3, 4]; each anchor given by its top-left and bottom-right corners

def _ratio_enum(anchor, ratios):  # computes anchor coordinates for the different aspect ratios
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)  # center point and width/height of the anchor
    size = w * h  # the anchor's area
    size_ratios = size / ratios  # helper array for the widths: array([512., 256., 128.])
    ws = np.round(np.sqrt(size_ratios))  # widths at the different aspect ratios: array([23., 16., 11.])
    hs = np.round(ws * ratios)  # heights at the different aspect ratios: array([12., 16., 22.])
    # Note: elementwise, ws*hs stays close to an area of 256
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)  # the anchors at the new aspect ratios, shape: [3, 4],
    # again recorded as top-left and bottom-right corners
    return anchors

def _scale_enum(anchor, scales):  # for one aspect ratio, computes the anchors at the different area scales
    """
    Enumerate a set of anchors for each scale wrt an anchor.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)  # the anchor's center coordinates
    ws = w * scales  # shape: [3,]; the widths at the new scales
    hs = h * scales  # shape: [3,]; the heights at the new scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)  # the scaled anchors, as top-left and bottom-right corners
    return anchors

if __name__ == '__main__':
    import time
    t = time.time()
    a = generate_anchors()
    print(time.time() - t)
    print(a)
    from IPython import embed; embed()

The idea in the code above is to start from a single base anchor, derive from it three anchors of different aspect ratios but equal area, and then derive three area scales from each of those, for 9 anchors in total; the comments spell out the details.
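The arithmetic in _ratio_enum is worth running once by hand. Starting from the 16x16 base anchor (area 256):

import numpy as np

size = 16 * 16                       # 256
ratios = np.array([0.5, 1, 2])
size_ratios = size / ratios          # [512., 256., 128.]
ws = np.round(np.sqrt(size_ratios))  # [23., 16., 11.]
hs = np.round(ws * ratios)           # [12., 16., 22.]

# 23*12 = 276, 16*16 = 256, 11*22 = 242: the areas all stay close to 256,
# while the height/width ratios come out close to 0.5, 1 and 2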

The second important function is bbox_overlaps, which computes, for every anchor, the IoU with every ground-truth box. The code:

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 1 20:25:19 2017

@author: Kevin Liang (modification)

Calculates bounding box overlaps between N bounding boxes, and K query boxes
(anchors) and return a matrix of overlap proportions

Written in Cython for optimization.
"""

# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Sergey Karayev
# --------------------------------------------------------

cimport cython
import numpy as np
cimport numpy as np

DTYPE = np.float
ctypedef np.float_t DTYPE_t

def bbox_overlaps(  # overlap = the two boxes' intersection area / their combined (union) area
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
    """
    cdef unsigned int N = boxes.shape[0]
    cdef unsigned int K = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
    cdef DTYPE_t iw, ih, box_area
    cdef DTYPE_t ua
    cdef unsigned int k, n
    for k in range(K):
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    ua = float(
                        (boxes[n, 2] - boxes[n, 0] + 1) *
                        (boxes[n, 3] - boxes[n, 1] + 1) +
                        box_area - iw * ih
                    )
                    overlaps[n, k] = iw * ih / ua
    return overlaps
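If you would rather not compile the Cython extension while experimenting, an equivalent vectorized NumPy version is easy to write. This is a sketch of my own, not code from the repo, assuming the same (x1, y1, x2, y2) inclusive-pixel convention:

import numpy as np

def bbox_overlaps_np(boxes, query_boxes):
    # (N, 4) x (K, 4) -> (N, K) IoU matrix; a pure-NumPy stand-in for the Cython version
    boxes = boxes.astype(np.float64)
    query_boxes = query_boxes.astype(np.float64)
    # box areas, using the same +1 inclusive-pixel convention as the code above
    areas_n = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    areas_k = (query_boxes[:, 2] - query_boxes[:, 0] + 1) * (query_boxes[:, 3] - query_boxes[:, 1] + 1)
    # pairwise intersection widths and heights via broadcasting, clipped at zero
    iw = np.minimum(boxes[:, None, 2], query_boxes[None, :, 2]) - \
         np.maximum(boxes[:, None, 0], query_boxes[None, :, 0]) + 1
    ih = np.minimum(boxes[:, None, 3], query_boxes[None, :, 3]) - \
         np.maximum(boxes[:, None, 1], query_boxes[None, :, 1]) + 1
    inter = np.clip(iw, 0, None) * np.clip(ih, 0, None)
    union = areas_n[:, None] + areas_k[None, :] - inter
    return inter / union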

The third important piece: computing the anchors' regression targets relies on the bbox_transform function. Note that the transform is computed after converting the boxes from the corner representation to centers plus width and height. Code and comments:

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 1 21:18:58 2017

@author: Kevin Liang (modifications)

bbox_transform and its inverse operation
"""

# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

import numpy as np

def bbox_transform(ex_rois, gt_rois):
    '''
    Receives two sets of bounding boxes, denoted by two opposite corners
    (x1, y1, x2, y2), and returns the target deltas that Faster R-CNN should
    aim for.
    '''
    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
    ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights  # center coordinates and width/height of each anchor

    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
    gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
    gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights  # center coordinates and width/height of each anchor's matched ground-truth box

    targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths  # the four transform targets
    targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = np.log(gt_widths / ex_widths)
    targets_dh = np.log(gt_heights / ex_heights)

    targets = np.vstack(
        (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()  # the four deltas per anchor; shape: [num_anchors, 4]
    return targets
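The file's docstring also mentions the inverse operation. Reconstructed from the forward transform above (a sketch of mine, not copied from the repo's bbox_transform_inv), the inverse takes anchors plus predicted deltas and recovers corner-format boxes:

import numpy as np

def bbox_transform_inv_sketch(boxes, deltas):
    # recover each anchor's width, height and center, mirroring bbox_transform above
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    dx, dy, dw, dh = deltas[:, 0], deltas[:, 1], deltas[:, 2], deltas[:, 3]

    # invert each line of bbox_transform
    pred_ctr_x = dx * widths + ctr_x
    pred_ctr_y = dy * heights + ctr_y
    pred_w = np.exp(dw) * widths
    pred_h = np.exp(dh) * heights

    # back to the corner representation
    return np.stack([pred_ctr_x - 0.5 * pred_w,
                     pred_ctr_y - 0.5 * pred_h,
                     pred_ctr_x + 0.5 * pred_w,
                     pred_ctr_y + 0.5 * pred_h], axis=1)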

That completes the walkthrough of anchor_target_layer, one of the most important functions in the RPN source: it returns every anchor's class and box-regression targets, ready for the loss computation. While we are at it, here are the functions that compute the RPN losses, annotated:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 17 15:05:05 2017

@author: Kevin Liang

Loss functions
"""

from .faster_rcnn_config import cfg

import tensorflow as tf

def rpn_cls_loss(rpn_cls_score, rpn_labels):
    '''
    Calculate the Region Proposal Network classifier loss. Measures how well
    the RPN is able to propose regions by the performance of its "objectness"
    classifier.

    Standard cross-entropy loss on logits
    '''
    with tf.variable_scope('rpn_cls_loss'):
        # input shape dimensions
        shape = tf.shape(rpn_cls_score)

        # Stack all classification scores into 2D matrix
        rpn_cls_score = tf.transpose(rpn_cls_score, [0, 3, 1, 2])
        rpn_cls_score = tf.reshape(rpn_cls_score, [shape[0], 2, shape[3]//2*shape[1], shape[2]])
        rpn_cls_score = tf.transpose(rpn_cls_score, [0, 2, 3, 1])
        rpn_cls_score = tf.reshape(rpn_cls_score, [-1, 2])

        # Stack labels
        rpn_labels = tf.reshape(rpn_labels, [-1])  # flatten the labels into a vector

        # Ignore label=-1 (Neither object nor background: IoU between 0.3 and 0.7)
        # Drop the scores at positions whose label is -1, and reshape to [-1, 2]
        # so the cross-entropy loss is easy to compute
        rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score, tf.where(tf.not_equal(rpn_labels, -1))), [-1, 2])
        # Keep only the labels that are not -1, i.e. anchors clearly assigned to foreground or background
        rpn_labels = tf.reshape(tf.gather(rpn_labels, tf.where(tf.not_equal(rpn_labels, -1))), [-1])

        # Cross entropy error
        rpn_cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_score, labels=rpn_labels))

    return rpn_cross_entropy


def rpn_bbox_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_inside_weights, rpn_outside_weights):
    '''
    Calculate the Region Proposal Network bounding box loss. Measures how well
    the RPN is able to propose regions by the performance of its localization.

    lam/N_reg * sum_i(p_i^* * L_reg(t_i, t_i^*))

    lam: classification vs bbox loss balance parameter
    N_reg: Number of anchor locations (~2500)
    p_i^*: ground truth label for anchor (loss only for positive anchors)
    L_reg: smoothL1 loss
    t_i: Parameterized prediction of bounding box
    t_i^*: Parameterized ground truth of closest bounding box
    '''
    with tf.variable_scope('rpn_bbox_loss'):
        # Transposing
        rpn_bbox_targets = tf.transpose(rpn_bbox_targets, [0, 2, 3, 1])
        rpn_inside_weights = tf.transpose(rpn_inside_weights, [0, 2, 3, 1])
        rpn_outside_weights = tf.transpose(rpn_outside_weights, [0, 2, 3, 1])

        # How far off was the prediction?
        # Subtract the targets from the predicted (tx, ty, tw, th) and multiply by
        # rpn_inside_weights, so that only positive anchors contribute to the bbox loss
        diff = tf.multiply(rpn_inside_weights, rpn_bbox_pred - rpn_bbox_targets)
        # Apply the smooth-L1 function
        diff_sL1 = smoothL1(diff, 3.0)

        # Only count loss for positive anchors. Make sure it's a sum.
        # Multiply by rpn_outside_weights and sum; again only positive anchors contribute
        rpn_bbox_reg = tf.reduce_sum(tf.multiply(rpn_outside_weights, diff_sL1))

        # Constant for weighting bounding box loss with classification loss
        # Scale by the lambda balance parameter to obtain the final bbox loss
        rpn_bbox_reg = cfg.TRAIN.RPN_BBOX_LAMBDA * rpn_bbox_reg

    return rpn_bbox_reg  # the final box-regression loss

As the functions above show, rpn_cls_loss drops every position whose label is -1; in other words, it keeps only the in-image anchors whose largest IoU with a ground-truth box is above 0.7 or below 0.3. In rpn_bbox_loss, the multiplication by rpn_inside_weights at the very start means the bbox loss is computed only for foreground anchors, since rpn_inside_weights is 0 for every non-foreground anchor.
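One helper in rpn_bbox_loss is not shown above: smoothL1(diff, 3.0). A sketch of the standard Faster R-CNN smooth-L1 with a sigma parameter (matching the 3.0 passed above; the repo's own implementation may differ in detail):

import tensorflow as tf

def smoothL1(x, sigma):
    # 0.5 * (sigma*x)^2   if |x| < 1/sigma^2
    # |x| - 0.5/sigma^2   otherwise
    conditional = tf.less(tf.abs(x), 1.0 / sigma**2)
    close = 0.5 * tf.square(sigma * x)
    far = tf.abs(x) - 0.5 / sigma**2
    return tf.where(conditional, close, far)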

With that, our tour of Faster R-CNN's RPN code is nearly complete. Two parts of the RPN code strike me as particularly clever:

1) How the H×W×9 anchors are generated: first produce 9 anchors with different aspect ratios and areas, then replicate those 9 anchors at every sliding position over the image (see the sketch after this list).

2) How each anchor's class (foreground/background) and box-regression targets are computed: first compute each anchor's IoU with the ground-truth boxes and ignore anchors whose largest IoU lies between 0.3 and 0.7; below 0.3 means background, above 0.7 means foreground. The regression target is the transform (tx, ty, tw, th) from the anchor onto the ground-truth box it overlaps most.
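Point 1 in code form: a toy run of the shift-and-broadcast trick from _anchor_target_layer_py, with zero anchors standing in for the generate_anchors() output:

import numpy as np

_feat_stride = 16
height, width = 2, 3    # a tiny feature map for illustration
A = 9                   # anchors per location

shift_x, shift_y = np.meshgrid(np.arange(width) * _feat_stride,
                               np.arange(height) * _feat_stride)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                    shift_x.ravel(), shift_y.ravel())).transpose()  # (K, 4), K = 6

anchors = np.zeros((A, 4))  # stand-in for generate_anchors()
all_anchors = (anchors.reshape(1, A, 4) +
               shifts.reshape(1, -1, 4).transpose(1, 0, 2)).reshape(-1, 4)
print(all_anchors.shape)    # (54, 4) = (K*A, 4)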
