1、detectron整个的模型建立过程,以关键点为例说明
2、整体的流程如上,但是从
FPN.add_fpn_rpn_outputs(model, blob_in, dim_in, spatial_scale_in)开始我们要详细介绍一下
1)在这个函数里面会除了加上rpn的输出之外,还会顺带加上proposal的生成过程
model.GenerateProposals(
[rpn_cls_probs_fpn, rpn_bbox_pred_fpn, 'im_info'],
['rpn_rois_fpn' + slvl, 'rpn_roi_probs_fpn' + slvl],
anchors=lvl_anchors,
spatial_scale=sc
)#产生的proposal将以rpn_rois_fpn_x命名,rpn_roi_probs_fpn_x将会表示每一个proposal的分数
2)接下来是model.CollectAndDistributeFpnRpnProposals()这个函数,这个函数将上面产生的proposal分配给各个level的FPN网络并且声称一系列的标签
class CollectAndDistributeFpnRpnProposalsOp(object):
def __init__(self, train):
self._train = train
def forward(self, inputs, outputs):
"""See modeling.detector.CollectAndDistributeFpnRpnProposals for
inputs/outputs documentation.
"""
# inputs is
# [rpn_rois_fpn2, ..., rpn_rois_fpn6,
# rpn_roi_probs_fpn2, ..., rpn_roi_probs_fpn6]
# If training with Faster R-CNN, then inputs will additionally include
# + [roidb, im_info] #输入的blob是这些
rois = collect(inputs, self._train) #collect函数的作用是将之前各个level的rois都连在一起,取分数最高的前2000个,获得这些rois和对应的分数
if self._train:
# During training we reuse the data loader code. We populate roidb
# entries on the fly using the rois generated by RPN.
# im_info: [[im_height, im_width, im_scale], ...]
im_info = inputs[-1].data #获取到图像信息
im_scales = im_info[:, 2] #把图像缩放尺度拿出来
roidb = blob_utils.deserialize(inputs[-2].data) #roidb拿出来
# For historical consistency with the original Faster R-CNN
# implementation we are *not* filtering crowd proposals.
# This choice should be investigated in the future (it likely does
# not matter).
json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0) #向roidb里面加入proposal的信息,详细看下面该函数的介绍,经过了这个函数,roidb已经焕然一新了,加入proposal的信息了
# Compute training labels for the RPN proposals; also handles
# distributing the proposals over FPN levels
output_blob_names = roi_data.fast_rcnn.get_fast_rcnn_blob_names() #获得fasterrcnn所需要的blob
blobs = {k: [] for k in output_blob_names}
roi_data.fast_rcnn.add_fast_rcnn_blobs(blobs, im_scales, roidb) #往faster-rcnn的blob里面添加对应的元素
for i, k in enumerate(output_blob_names):
blob_utils.py_op_copy_blob(blobs[k], outputs[i]) #将blob里面的送到output里面去
else:
# For inference we have a special code path that avoids some data
# loader overhead
distribute(rois, None, outputs, self._train)
def collect(inputs, is_training):
cfg_key = 'TRAIN' if is_training else 'TEST'
post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
k_max = cfg.FPN.RPN_MAX_LEVEL
k_min = cfg.FPN.RPN_MIN_LEVEL
num_lvls = k_max - k_min + 1
roi_inputs = inputs[:num_lvls]
score_inputs = inputs[num_lvls:]
if is_training:
score_inputs = score_inputs[:-2]
# rois are in [[batch_idx, x0, y0, x1, y2], ...] format
# Combine predictions across all levels and retain the top scoring
rois = np.concatenate([blob.data for blob in roi_inputs])
#pdb.set_trace()
scores = np.concatenate([blob.data for blob in score_inputs]).squeeze()
inds = np.argsort(-scores)[:post_nms_topN]
rois = rois[inds, :]
return rois
def add_proposals(roidb, rois, scales, crowd_thresh):
"""Add proposal boxes (rois) to an roidb that has ground-truth annotations
but no proposals. If the proposals are not at the original image scale,
specify the scale factor that separate them in scales.
"""
box_list = []
for i in range(len(roidb)):
inv_im_scale = 1. / scales[i] #算出将图像变为原来的尺度需要多大的缩放
idx = np.where(rois[:, 0] == i)[0] #因为roi都是有编号的,所以把属于该张图片的roi拿出来
box_list.append(rois[idx, 1:] * inv_im_scale) #把roi乘以刚刚算出来的尺度变换参数把他变换到原来的图片空间上的proposal!!!!!!!!!!!!!!!!!这里是第一次设计尺度还原
_merge_proposal_boxes_into_roidb(roidb, box_list) #调用该函数把proposal加入到roidb中区
if crowd_thresh > 0:
_filter_crowd_proposals(roidb, crowd_thresh) #过滤人群
_add_class_assignments(roidb) #设置每个proposal的分类,是属于哪个分类的,以及最大重叠式多少同时做一系列的检查
def _merge_proposal_boxes_into_roidb(roidb, box_list):
"""Add proposal boxes to each roidb entry."""
assert len(box_list) == len(roidb)
for i, entry in enumerate(roidb):
boxes = box_list[i]
num_boxes = boxes.shape[0]
gt_overlaps = np.zeros(
(num_boxes, entry['gt_overlaps'].shape[1]),
dtype=entry['gt_overlaps'].dtype
)
box_to_gt_ind_map = -np.ones(
(num_boxes), dtype=entry['box_to_gt_ind_map'].dtype
)
# Note: unlike in other places, here we intentionally include all gt
# rois, even ones marked as crowd. Boxes that overlap with crowds will
# be filtered out later (see: _filter_crowd_proposals).
gt_inds = np.where(entry['gt_classes'] > 0)[0]
if len(gt_inds) > 0:
gt_boxes = entry['boxes'][gt_inds, :] #将gt框都拿出来
gt_classes = entry['gt_classes'][gt_inds] #将gt对应的类别拿出来
proposal_to_gt_overlaps = box_utils.bbox_overlaps( #计算proposal和gt之间的overlap
boxes.astype(dtype=np.float32, copy=False),
gt_boxes.astype(dtype=np.float32, copy=False)
)
# Gt box that overlaps each input box the most
# (ties are broken arbitrarily by class order)
argmaxes = proposal_to_gt_overlaps.argmax(axis=1) #算出每个proposal对应的gt是谁
# Amount of that overlap
maxes = proposal_to_gt_overlaps.max(axis=1) #把重叠最大的面积都拿出来,找出重叠不为0的那些
# Those boxes with non-zero overlap with gt boxes
I = np.where(maxes > 0)[0]
# Record max overlaps with the class of the appropriate gt box
gt_overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] #gt_overlap对应的分类的地方就要设置相应的分数,例如proposalA与gtB最大重合,就在gtB所对应的位置设置重叠
box_to_gt_ind_map[I] = gt_inds[argmaxes[I]] #每一个proposal对应的是哪个gt
entry['boxes'] = np.append( #之后向roidb的每个入口加入proposal的信息,同时保留之前的gt信息
entry['boxes'],
boxes.astype(entry['boxes'].dtype, copy=False),
axis=0
)
entry['gt_classes'] = np.append(
entry['gt_classes'],
np.zeros((num_boxes), dtype=entry['gt_classes'].dtype)
)
entry['seg_areas'] = np.append(
entry['seg_areas'],
np.zeros((num_boxes), dtype=entry['seg_areas'].dtype)
)
entry['gt_overlaps'] = np.append(
entry['gt_overlaps'].toarray(), gt_overlaps, axis=0
)
entry['gt_overlaps'] = scipy.sparse.csr_matrix(entry['gt_overlaps'])
entry['is_crowd'] = np.append(
entry['is_crowd'],
np.zeros((num_boxes), dtype=entry['is_crowd'].dtype)
)
entry['box_to_gt_ind_map'] = np.append(
entry['box_to_gt_ind_map'],
box_to_gt_ind_map.astype(
entry['box_to_gt_ind_map'].dtype, copy=False
)
)
def _filter_crowd_proposals(roidb, crowd_thresh):
"""Finds proposals that are inside crowd regions and marks them as
overlap = -1 with each ground-truth rois, which means they will be excluded
from training.
"""
for entry in roidb:
gt_overlaps = entry['gt_overlaps'].toarray()
crowd_inds = np.where(entry['is_crowd'] == 1)[0]
non_gt_inds = np.where(entry['gt_classes'] == 0)[0]
if len(crowd_inds) == 0 or len(non_gt_inds) == 0:
continue
crowd_boxes = box_utils.xyxy_to_xywh(entry['boxes'][crowd_inds, :])
non_gt_boxes = box_utils.xyxy_to_xywh(entry['boxes'][non_gt_inds, :])
iscrowd_flags = [int(True)] * len(crowd_inds)
ious = COCOmask.iou(non_gt_boxes, crowd_boxes, iscrowd_flags)
bad_inds = np.where(ious.max(axis=1) > crowd_thresh)[0]
gt_overlaps[non_gt_inds[bad_inds], :] = -1
entry['gt_overlaps'] = scipy.sparse.csr_matrix(gt_overlaps)
def _add_class_assignments(roidb):
"""Compute object category assignment for each box associated with each
roidb entry.
"""
for entry in roidb:
gt_overlaps = entry['gt_overlaps'].toarray()
# max overlap with gt over classes (columns)
max_overlaps = gt_overlaps.max(axis=1)
# gt class that had the max overlap
max_classes = gt_overlaps.argmax(axis=1)
entry['max_classes'] = max_classes
entry['max_overlaps'] = max_overlaps
# sanity checks
# if max overlap is 0, the class must be background (class 0)
zero_inds = np.where(max_overlaps == 0)[0]
assert all(max_classes[zero_inds] == 0)
# if max overlap > 0, the class must be a fg class (not class 0)
nonzero_inds = np.where(max_overlaps > 0)[0]
assert all(max_classes[nonzero_inds] != 0)
再来看add_fast_rcnn_blobs
def add_fast_rcnn_blobs(blobs, im_scales, roidb):
"""Add blobs needed for training Fast R-CNN style models."""
# Sample training RoIs from each image and append them to the blob lists
for im_i, entry in enumerate(roidb):
frcn_blobs = _sample_rois(entry, im_scales[im_i], im_i) #这个函数也是返回一个blob,这个blob里面有什么呢?看下面非代码部分的解释
for k, v in frcn_blobs.items():
blobs[k].append(v)
# Concat the training blob lists into tensors
for k, v in blobs.items():
if isinstance(v, list) and len(v) > 0:
blobs[k] = np.concatenate(v)
# Add FPN multilevel training RoIs, if configured
if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
_add_multilevel_rois(blobs)
# Perform any final work and validity checks after the collating blobs for
# all minibatch images
valid = True
if cfg.MODEL.KEYPOINTS_ON:
valid = roi_data.keypoint_rcnn.finalize_keypoint_minibatch(blobs, valid)
return valid
sample_rois这个函数最终返回的是从proposal里面按照一个batch的大小拿出来的proposal,正负样本比例参考配置文件,默认256,最终的blob返回的是
blob_dict = dict(
labels_int32=sampled_labels.astype(np.int32, copy=False),
rois=sampled_rois,
bbox_targets=bbox_targets,
bbox_inside_weights=bbox_inside_weights,
bbox_outside_weights=bbox_outside_weights
)
如果有关键点加入训练的话,还要加入关键点的一些信息,关键点有哪些信息呢?
blobs[‘keypoint_rois’] = sampled_fg_rois
blobs[‘keypoint_locations_int32’] = heats.astype(np.int32, copy=False)
blobs[‘keypoint_weights’] = weights
经过了rois之后的blob的输出是
对于关键点部分的blob加入,是在前景对象上面设置关键点的信息
def add_keypoint_rcnn_blobs(
blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
"""Add Mask R-CNN keypoint specific blobs to the given blobs dictionary."""
# Note: gt_inds must match how they're computed in
# datasets.json_dataset._merge_proposal_boxes_into_roidb
gt_inds = np.where(roidb['gt_classes'] > 0)[0] #找出gt
max_overlaps = roidb['max_overlaps'] #找出roidb里面的max_overlaps的对应
gt_keypoints = roidb['gt_keypoints'] #找出gt的keypoint信息
ind_kp = gt_inds[roidb['box_to_gt_ind_map']] #找出所有的box对应的gt的索引值,长度为N,也即boxes的数量
within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes']) #判断所有的关键点是否都在roidb的boxes范围内,此时within——box是一个Nx17的数组,其中N是所有的boxes的数量
vis_kp = gt_keypoints[ind_kp, 2, :] > 0 #vis_kp的大小为Nx17,也即每个box对应的那个gt的每个关键点的可见性
is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0 #这句代码的目的是为了判断N个roi是否都有可见的关键点,is_visible是一个N维的向量,只有那些可见关键点在roi内的数量大于0的roi才会被采纳作为下一步的roi进行使用
kp_fg_inds = np.where(
np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible) #找出那些重叠度大于一定阈值的作为关键点部分proposal的正样本
)[0]
kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size) #
if kp_fg_inds.size > kp_fg_rois_per_this_image:
kp_fg_inds = np.random.choice(
kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False
)
sampled_fg_rois = roidb['boxes'][kp_fg_inds] #找出选取的那些作为人体关键点对应的那些box
box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds] #找出选取的那些作为人体关键点对应的那些box所对应的gt
num_keypoints = gt_keypoints.shape[2] #Nx3x17
sampled_keypoints = -np.ones( #预先定义采样的关键点,通通设置为-1,-1标签的点都是不参与训练的
(len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
dtype=gt_keypoints.dtype
)
for ii in range(len(sampled_fg_rois)):
ind = box_to_gt_ind_map[ii]
if ind >= 0:
sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :] #找到采样得到的框,然后找到其对应的关键点的gt信息,然后赋给它
assert np.sum(sampled_keypoints[ii, 2, :]) > 0
heats, weights = keypoint_utils.keypoints_to_heatmap_labels( #接下来制作heatmap标签,这是最关键的一步,给出sampled-roi和sampled-keypoints,来制作对应的heat和weight
sampled_keypoints, sampled_fg_rois
)
shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1) #N*17
heats = heats.reshape(shape) #reshape成Nx17
weights = weights.reshape(shape) #reshape成NX17
sampled_fg_rois *= im_scale #将rois的尺度刻画到缩放后的图像的尺度
repeated_batch_idx = batch_idx * blob_utils.ones(
(sampled_fg_rois.shape[0], 1)
)
sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois)) #然后将选择出来的roi加上第几张图片这个信息
blobs['keypoint_rois'] = sampled_fg_rois #将blob里面加上对应的信息
blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
blobs['keypoint_weights'] = weights
接下来看一下keypoint是怎么转换为最终的label的
最终的map是要转为56x56的,所以会计算将原来的roi变成56x56,x和y方向各需要放大缩小多少倍,记住是要计算roi的怎么缩放到这个的倍数
1)首先计算缩放倍数,把关键点映射到最终的56x56的图上
2)来看一下关键点是不是合理的,也即是否映射到56x56的map图上
3)要看关键点合不合理,除了要做上述的判断还要看一下关键点是否是可见
4)然后将关键点映射到56x56map图上的2维位置变成一维的
5)最后返回生成的label和weight,都是NX17的大小
6)返回后将label和weight拉成一维向量
看一下关键点roi的选择原则:
1)首先roi内必须有可见关键点
2)然后roi的max_overlap必须达到一定的阈值
只有这些roi才有资格入选keypoint的roi,在关键点标签制作的时候,那么有一点就是那么那些在roi之外的关键点怎么办?
在刚刚的keypoint装换为label的时候会将roi之外的可见关键点设置为invalid,也即valid是false的
valid_loc = np.logical_and(
np.logical_and(x >= 0, y >= 0),
np.logical_and(
x < cfg.KRCNN.HEATMAP_SIZE, y < cfg.KRCNN.HEATMAP_SIZE))
就是这段话,会判断一个位置是不是一个合理的位置,要看gt_keypoints 经过映射之后是否还在界内,通过这些也可以发现关键点部分的训练是单独训练关键点的,就是用RPN提出来的proposal来训练关键点。详细内容参考keypoint.py文件和keypoint_rcnn.py
上面流程图中把distribute在肢解就变成了如下形式
最后的add_multilevel_roi_blobs的这个函数将roi分配给对应level的blob,因为roi分配的时候不同的level对应的索引是不同的,为了方便之后的恢复,blob里面有一个key就是做这个工作的
def add_multilevel_roi_blobs(
blobs, blob_prefix, rois, target_lvls, lvl_min, lvl_max
):
"""Add RoI blobs for multiple FPN levels to the blobs dict.
blobs: a dict mapping from blob name to numpy ndarray
blob_prefix: name prefix to use for the FPN blobs
rois: the source rois as a 2D numpy array of shape (N, 5) where each row is
an roi and the columns encode (batch_idx, x1, y1, x2, y2)
target_lvls: numpy array of shape (N, ) indicating which FPN level each roi
in rois should be assigned to
lvl_min: the finest (highest resolution) FPN level (e.g., 2)
lvl_max: the coarest (lowest resolution) FPN level (e.g., 6)
"""
rois_idx_order = np.empty((0, ))
rois_stacked = np.zeros((0, 5), dtype=np.float32) # for assert
for lvl in range(lvl_min, lvl_max + 1):
idx_lvl = np.where(target_lvls == lvl)[0]
blobs[blob_prefix + '_fpn' + str(lvl)] = rois[idx_lvl, :]
rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
rois_stacked = np.vstack(
[rois_stacked, blobs[blob_prefix + '_fpn' + str(lvl)]]
)
pdb.set_trace()
rois_idx_restore = np.argsort(rois_idx_order).astype(np.int32, copy=False)
blobs[blob_prefix + '_idx_restore_int32'] = rois_idx_restore
# Sanity check that restore order is correct
assert (rois_stacked[rois_idx_restore] == rois).all()
_idx_restore_int32这个参数就是存储对应顺序的一个参数,当然这只是个后缀,在分配roi的时候一共有4个level有别于rpn的5个level,其中边长224代表的是第四个level,112代表的是第三个level,56是第二个,448是第四个
做完这些,有一步是
if cfg.MODEL.KEYPOINTS_ON:
valid = roi_data.keypoint_rcnn.finalize_keypoint_minibatch(blobs, valid)
return valid
来判断这一个batch是不是合格,判断标准是valid的关键点的数量是多少,是不是大于20个,如果这一个batch的参与训练的关键点的数量没有达到阈值,那么就是不合格的,valid返回false,否则返回true,除此之外,还会加入一个关键点的norm参数作为一个blob,blobs[‘keypoint_loss_normalizer’] = np.array(norm, dtype=np.float32)
3、加入fast-rcnn的head
来看一下RoIFeatureTransform函数
这个函数实现的是将各个level的roi进行roipooling,detectron采用的是roialign,由于roi之前存在了blob里面,按照不同的level进行了存储,不同型号的roi对应的level自然也是不一样的,rois_fpn_2,rois_fpn_3,rois_fpn_4,rois_fpn_5,对于上述的4中roi分别用fpn_res2_2_sum,fpn_res3_3_sum,fpn_res4_5_sum,fpn_res5_2_sum这几层的特征进行roi-pooling对于每一个level的roi做完pooling之后,将所有的pooling特征concat在一起,形成所有的roi的特征,也即
xform_shuffled, _ = self.net.Concat(
bl_out_list, [blob_out + '_shuffled', '_concat_' + blob_out],
axis=0
)
之前存储的时候保存了一个blob[‘roi_idx_restore_int32’]这样的一个blob,目的是为了恢复没有进行roi的level分配之前的roi顺序,因为之前制作标签的时候都是没有分配的顺序,所以为了之后的损失是和对应的标签是对应的,所以最后要将顺序进行还原,用这个restore_int32就可以实现
xform_out = self.net.BatchPermutation(
[xform_shuffled, restore_bl], blob_out
)
总体代码如下
def RoIFeatureTransform(
self,
blobs_in,
blob_out,
blob_rois='rois',
method='RoIPoolF',
resolution=7,
spatial_scale=1. / 16.,
sampling_ratio=0
):
"""Add the specified RoI pooling method. The sampling_ratio argument
is supported for some, but not all, RoI transform methods.
RoIFeatureTransform abstracts away:
- Use of FPN or not
- Specifics of the transform method
"""
assert method in {'RoIPoolF', 'RoIAlign'}, \
'Unknown pooling method: {}'.format(method)
has_argmax = (method == 'RoIPoolF')
if isinstance(blobs_in, list):
# FPN case: add RoIFeatureTransform to each FPN level
k_max = cfg.FPN.ROI_MAX_LEVEL # coarsest level of pyramid
k_min = cfg.FPN.ROI_MIN_LEVEL # finest level of pyramid
assert len(blobs_in) == k_max - k_min + 1
bl_out_list = []
for lvl in range(k_min, k_max + 1):
bl_in = blobs_in[k_max - lvl] # blobs_in is in reversed order
sc = spatial_scale[k_max - lvl] # in reversed order
bl_rois = blob_rois + '_fpn' + str(lvl)
bl_out = blob_out + '_fpn' + str(lvl)
bl_out_list.append(bl_out)
bl_argmax = ['_argmax_' + bl_out] if has_argmax else []
self.net.__getattr__(method)(
[bl_in, bl_rois], [bl_out] + bl_argmax,
pooled_w=resolution,
pooled_h=resolution,
spatial_scale=sc,
sampling_ratio=sampling_ratio
)
# The pooled features from all levels are concatenated along the
# batch dimension into a single 4D tensor.
xform_shuffled, _ = self.net.Concat(
bl_out_list, [blob_out + '_shuffled', '_concat_' + blob_out],
axis=0
)
# Unshuffle to match rois from dataloader
restore_bl = blob_rois + '_idx_restore_int32'
xform_out = self.net.BatchPermutation(
[xform_shuffled, restore_bl], blob_out
)
else:
# Single feature level
bl_argmax = ['_argmax_' + blob_out] if has_argmax else []
# sampling_ratio is ignored for RoIPoolF
xform_out = self.net.__getattr__(method)(
[blobs_in, blob_rois], [blob_out] + bl_argmax,
pooled_w=resolution,
pooled_h=resolution,
spatial_scale=spatial_scale,
sampling_ratio=sampling_ratio
)
# Only return the first blob (the transformed features)
return xform_out
4、加入kcnn的head
顺序类似上面,先搭网络,再加输出和损失