# lib/roi_data_layer/roidb.py
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
"""Transform a roidb into a trainable roidb by adding a bunch of metadata."""
import numpy as np
import PIL
import PIL.Image

from fast_rcnn.config import cfg
from fast_rcnn.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps
def prepare_roidb(imdb):
    """Enrich the imdb's roidb with derived quantities useful for training.

    For every image entry this records the image path, width and height,
    and precomputes, for each ROI, the maximum overlap taken over the
    columns (classes) of ``gt_overlaps`` together with the column index
    (class) that attains it.

    Args:
        imdb: dataset object exposing ``num_images``, ``image_index``,
            ``image_path_at(i)`` and ``roidb`` (an indexable collection of
            dicts, each holding a ``'gt_overlaps'`` matrix that supports
            ``toarray()``).

    Returns:
        None. The entries of ``imdb.roidb`` are updated in place with the
        keys ``'image'``, ``'width'``, ``'height'``, ``'max_classes'`` and
        ``'max_overlaps'``.

    Raises:
        AssertionError: if a ROI whose max overlap is 0 is not assigned
            class 0, or a ROI with positive max overlap is assigned class 0.
    """
    # Read every image size as (width, height) before touching the roidb.
    # Each file is opened through a context manager so its handle is
    # released deterministically instead of waiting for garbage collection.
    sizes = []
    for i in range(imdb.num_images):
        with open(imdb.image_path_at(i), 'rb') as image_file:
            sizes.append(PIL.Image.open(image_file).size)

    # NOTE(review): per the original annotations, in the "Stage 2 Fast R-CNN"
    # phase the roidb presumably comes from rpn_roidb() and then holds RPN
    # proposals in addition to the ground-truth boxes -- confirm with caller.
    roidb = imdb.roidb
    for i in range(len(imdb.image_index)):
        entry = roidb[i]
        # Attach the image path and its dimensions.
        entry['image'] = imdb.image_path_at(i)
        entry['width'] = sizes[i][0]
        entry['height'] = sizes[i][1]

        # gt_overlaps must be a dense array for max/argmax.
        gt_overlaps = entry['gt_overlaps'].toarray()
        # Max overlap with gt over classes (columns): one value per row/ROI.
        max_overlaps = gt_overlaps.max(axis=1)
        # Class (column index) that had the max overlap.
        max_classes = gt_overlaps.argmax(axis=1)
        entry['max_classes'] = max_classes
        entry['max_overlaps'] = max_overlaps

        # Sanity checks.
        # Max overlap of 0 => class should be zero (background).
        zero_inds = np.where(max_overlaps == 0)[0]
        assert (max_classes[zero_inds] == 0).all()
        # Max overlap > 0 => class should not be zero (must be a fg class).
        nonzero_inds = np.where(max_overlaps > 0)[0]
        assert (max_classes[nonzero_inds] != 0).all()
def add_bbox_regression_targets(roidb):
    """Add the information needed to train bounding-box regressors.

    Stores a ``'bbox_targets'`` array in every roidb entry (computed by
    ``_compute_targets``), derives per-class means and standard deviations
    of the targets (either from the config or empirically), prints them,
    and optionally normalizes the stored targets in place.

    Args:
        roidb: non-empty indexable collection of dicts that already carry
            ``'boxes'``, ``'gt_overlaps'``, ``'max_overlaps'`` and
            ``'max_classes'`` (the last two are added by ``prepare_roidb``).

    Returns:
        Tuple ``(means, stds)``, each flattened to 1-D from an array of
        shape ``(num_classes, 4)``. These values are needed later to
        un-normalize and un-center predictions.

    Raises:
        AssertionError: if ``roidb`` is empty or lacks ``'max_classes'``.
    """
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

    # Infer the number of classes from the number of columns in gt_overlaps.
    num_classes = roidb[0]['gt_overlaps'].shape[1]

    # bbox_targets: one row per box; column 0 is the class label and
    # columns 1: are the regression targets.
    for entry in roidb:
        entry['bbox_targets'] = _compute_targets(
            entry['boxes'], entry['max_overlaps'], entry['max_classes'])

    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Use fixed / precomputed "means" and "stds" instead of empirical
        # values, replicated once per class.
        means = np.tile(
            np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
        stds = np.tile(
            np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
    else:
        # Compute values needed for means and stds:
        #   var(x) = E(x^2) - E(x)^2
        # cfg.EPS is added to the counts to avoid division by zero for
        # classes without any example.
        class_counts = np.zeros((num_classes, 1)) + cfg.EPS
        sums = np.zeros((num_classes, 4))
        squared_sums = np.zeros((num_classes, 4))
        for entry in roidb:
            targets = entry['bbox_targets']
            # Class 0 (background) is skipped.
            for cls in range(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                if cls_inds.size > 0:
                    class_counts[cls] += cls_inds.size
                    sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
                    squared_sums[cls, :] += \
                        (targets[cls_inds, 1:] ** 2).sum(axis=0)

        means = sums / class_counts
        # Round-off can push E(x^2) - E(x)^2 slightly below zero; clamp it
        # so the square root never yields NaN.
        variances = squared_sums / class_counts - means ** 2
        stds = np.sqrt(np.maximum(variances, 0.0))

    print('bbox target means:')
    print(means)
    print(means[1:, :].mean(axis=0))  # ignore bg class
    print('bbox target stdevs:')
    print(stds)
    print(stds[1:, :].mean(axis=0))  # ignore bg class

    # Normalize targets: subtract the class mean, divide by the class std.
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
        print("Normalizing targets")
        # A zero std (e.g. a class whose targets are all identical) would
        # turn the targets into inf/NaN; divide by 1 in those slots. The
        # returned stds are left untouched.
        safe_stds = np.where(stds > 0, stds, 1.0)
        for entry in roidb:
            targets = entry['bbox_targets']
            for cls in range(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                targets[cls_inds, 1:] -= means[cls, :]
                targets[cls_inds, 1:] /= safe_stds[cls, :]
    else:
        print("NOT normalizing targets")

    # These values will be needed for making predictions
    # (the predictions will need to be unnormalized and uncentered).
    return means.ravel(), stds.ravel()
def _compute_targets(rois, overlaps, labels):
#rois = roidb[i]['boxes'],overlaps = roidb[i]['max_overlaps',labels = roidb[i]['max_classes']
"""Compute bounding-box regression targets for an image."""
# Indices of ground-truth ROIs 这个函数主要是判断RPN产生的proposal的回归目标是哪一个gt_box
gt_inds = np.where(overlaps == 1)[0]
#这里的overlaps是roidb['max_overlaps'],得到索引值对应的是boxes中gt框的索引值
if len(gt_inds) == 0:
# Bail if the image has no ground-truth ROIs
# 不存在gt ROI,返回空数组
return np.zeros((rois.shape[0], 5), dtype=np.float32)
#rois.shape[0]是一张图中框的个数
# Indices of examples for which we try to make predictions
ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] #cfg.TRAIN.BBOX_THRESH=0.5
# BBOX阈值,只有ROI与gt的重叠度大于阈值,这样的ROI才能用作bb回归的训练样本
# Get IoU overlap between each ex ROI and gt ROI
# 计算ex ROI and gt ROI的IoU
ex_gt_overlaps = bbox_overlaps(
np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
#ex存储的是经过cfg.TRAIN.BBOX_THRESH筛选后的(ex)框和gt框的IOU数组
# Find which gt ROI each ex ROI has max overlap with:
# this will be the ex ROI's gt target找到ex框IOU最大时对应的gt框
# 这里每一行代表一个ex_roi,列代表gt_roi,元素数值代表两者的IoU
gt_assignment = ex_gt_overlaps.argmax(axis=1) #按行求最大,返回索引.
gt_rois = rois[gt_inds[gt_assignment], :] #得到每一个ex_rois对应的gt_rois,与下面ex_roi数量相同
ex_rois = rois[ex_inds, :]
targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
targets[ex_inds, 0] = labels[ex_inds] #得到满足条件后bbox的类别号
targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)#生成论文中需要的tx,ty,tw,th四个量,即ex_box与gt_box的4个方位的偏移
return targets