这里“C++版本的代码”是指: https://github.com/galian123/cpp_faster_rcnn_detect .
py-faster-rcnn中demo.py代码, 是指 https://github.com/rbgirshick/py-faster-rcnn/blob/master/tools/demo.py 以及
https://github.com/rbgirshick/py-faster-rcnn/tree/master/lib 目录下的一些代码.
涉及到的.py文件都是 https://github.com/rbgirshick/py-faster-rcnn/ 中的.
★ forward
♦ python代码
def im_detect(net, im, boxes=None):
# 将blob数据转换成float类型后, 赋值给forward_kwargs, 对应的key是'data'
forward_kwargs = {'data': blobs['data'].astype(np.float32, copy=False)}
if cfg.TEST.HAS_RPN: # HAS_RPN在demo.py中置为true了.
# 图片的信息, 由之前的分析可知, 'im_info'对应的数据是一维数组,其值为(高, 宽, 缩放倍率)
forward_kwargs['im_info'] = blobs['im_info'].astype(np.float32, copy=False)
else:
forward_kwargs['rois'] = blobs['rois'].astype(np.float32, copy=False)
blobs_out = net.forward(**forward_kwargs)
- 对 blobs_out = net.forward(**forward_kwargs) 的解释
(1) forward() 参数中的两个星号 ** 的含义是, 将字典(键值对)作为关键字参数传入.
python官方文档中对 ** 的解释:
If the syntax **expression appears in the function call, expression must evaluate to a mapping, the contents of which are treated as additional keyword arguments. In the case of a keyword appearing in both expression and as an explicit keyword argument, a TypeError exception is raised.
(2) net.forward 是在 python/caffe/pycaffe.py 中定义的:
Net.forward = _Net_forward
_Net_forward 的实现如下:
def _Net_forward(self, blobs=None, start=None, end=None, **kwargs):
"""
Forward pass: prepare inputs and run the net forward.
Parameters
----------
blobs : list of blobs to return in addition to output blobs.
kwargs : Keys are input blob names and values are blob ndarrays.
For formatting inputs for Caffe, see Net.preprocess().
If None, input is taken from data layers.
start : optional name of layer at which to begin the forward pass
end : optional name of layer at which to finish the forward pass
(inclusive)
Returns
-------
outs : {blob name: blob ndarray} dict.
"""
if blobs is None:
blobs = []
if start is not None:
start_ind = list(self._layer_names).index(start)
else:
start_ind = 0 # 执行这里
if end is not None:
end_ind = list(self._layer_names).index(end)
outputs = set(self.top_names[end] + blobs)
else: # 执行这里
end_ind = len(self.layers) - 1
outputs = set(self.outputs + blobs)
if kwargs:
# self.inputs是test.prototxt中的输入, input: "data" 和 input: "im_info"
if set(kwargs.keys()) != set(self.inputs):
raise Exception('Input blob arguments do not match net inputs.')
# Set input according to defined shapes and make arrays single and
# C-contiguous as Caffe expects.
for in_, blob in six.iteritems(kwargs):
if blob.shape[0] != self.blobs[in_].shape[0]:
raise Exception('Input is not batch sized')
self.blobs[in_].data[...] = blob
# _forward是在_caffe.cpp中实现的, 在声明Net类时,定义了:
# .def("_forward", &Net<Dtype>::ForwardFromTo)
# 所以将会执行 ForwardFromTo()
self._forward(start_ind, end_ind)
# Unpack blobs to extract
return {out: self.blobs[out].data for out in outputs}
♦ C++ 代码
// 这里的net_->input_blobs()[0] 与net_->blob_by_name("data")指向相同的地址
Blob<float> * input_blobs= net_->input_blobs()[0];
switch(Caffe::mode()){
case Caffe::CPU:
memcpy(input_blobs->mutable_cpu_data(), data_buf, sizeof(float) * input_blobs->count());
break;
case Caffe::GPU: // 执行这里
caffe_gpu_memcpy(sizeof(float)* input_blobs->count(), data_buf, input_blobs->mutable_gpu_data());
break;
}
float im_info[3];
im_info[0] = cv_resized.rows;
im_info[1] = cv_resized.cols;
im_info[2] = img_scale;
net_->blob_by_name("im_info")->set_cpu_data(im_info);
net_->ForwardFrom(0);
★ 获取roi的boxes
♦ python代码
def im_detect(net, im, boxes=None):
blobs_out = net.forward(**forward_kwargs)
if cfg.TEST.HAS_RPN:
# 每次只处理一张图片
assert len(im_scales) == 1, "Only single-image batch implemented"
# 获取所有的ROI
rois = net.blobs['rois'].data.copy()
# rois是缩放后的, 需要将rois还原为原始的大小.
boxes = rois[:, 1:5] / im_scales[0] # 取rois每一行中的后4个数据, 见"解释1"
- 解释1: rois的内容
打印rois的内容, 如下:
rois.shape: (300, 5)
rois: [[ 0. 483.77429199 0. 999. 467.63253784]
[ 0. 63.49487305 24.32870483 615.18811035 568.28393555]
[ 0. 452.25259399 278.47637939 999. 595. ]
...,
[ 0. 245.8664856 0. 585.41247559 470.83172607]
[ 0. 50.10795212 286.49438477 96.97434998 327.50134277]
[ 0. 299.90609741 130.08828735 358.32467651 211.07351685]]
rois[:, 1:5] 将第一列(下标为0的列)的值过滤掉了, 所以boxes的shape是(300, 4), 每一行4个数值, 含义为(xmin, ymin, xmax, ymax).
♦ C++代码
const float* rois;
num = net_->blob_by_name("rois")->num(); // num为300
// ROIs
rois = net_->blob_by_name("rois")->cpu_data();
boxes = new float[num*4];
for (int n = 0; n < num; n++){
for (int c = 0; c < 4; c++){
/*rois一行是5个数据,所以是n*5. 每行第一个数据为0, 所以取1~4位置的数据.
boxes一行是4个数据,所以是n*4.
rois记录的是缩放后的大小, 除以img_scale转换为原图中的大小.*/
boxes[n * 4 + c] = rois[n * 5 + c + 1] / img_scale;
}
}
★ 获取分数
♦ python代码
def im_detect(net, im, boxes=None):
if cfg.TEST.SVM: # cfg.TEST.SVM的值为False, 见"解释1".
# use the raw scores before softmax under the assumption they
# were trained as linear SVMs
scores = net.blobs['cls_score'].data # 见"解释2"
else:
# 执行这里
# use softmax estimated probabilities
scores = blobs_out['cls_prob']
- 解释1: cfg.TEST.SVM的值
py-faster-rcnn/lib/fast_rcnn/config.py
# Experimental: treat the (K+1) units in the cls_score layer as linear
# predictors (trained, eg, with one-vs-rest SVMs).
__C.TEST.SVM = False
- 解释2: scores的shape
scores的shape是(300, 21). 因为在test.prototxt中设置了cls_score layer的输出为21.
♦ C++代码
const float* scores;
scores = net_->blob_by_name("cls_prob")->cpu_data();
★ 转换boxes
BBOX_REG的值为True.
def im_detect(net, im, boxes=None):
if cfg.TEST.BBOX_REG: # 执行这里, 见"解释1"
# Apply bounding-box regression deltas
box_deltas = blobs_out['bbox_pred'] # 见"解释2"
pred_boxes = bbox_transform_inv(boxes, box_deltas) # 见下一节
pred_boxes = clip_boxes(pred_boxes, im.shape)
else:
# Simply repeat the boxes, once for each class
pred_boxes = np.tile(boxes, (1, scores.shape[1]))
- 解释1: cfg.TEST.BBOX_REG的值
py-faster-rcnn/lib/fast_rcnn/config.py
# Test using bounding-box regressors
__C.TEST.BBOX_REG = True
- 解释2: box_deltas的维度
box_deltas的shape是(300, 84), 这里的300是在config.py中设置的(如下), 只选取分数最高的前300个box.
## Number of top scoring boxes to keep after applying NMS to RPN proposals
__C.TEST.RPN_POST_NMS_TOP_N = 300
这里的84是test.prototxt中layer bbox_pred的输出.
在检测时,我使用的是下面这个路径的prototxt :
py-faster-rcnn/models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_end2end/test.prototxt
打开这个文件, 可以看到 num_output: 84, 这个84即是box_deltas的第二个维度.
layer {
name: "bbox_pred"
type: "InnerProduct"
bottom: "fc7"
top: "bbox_pred"
略
inner_product_param {
num_output: 84
略
}
}
- deltas的内容(举例)
deltas.shape: (300, 84), dtype: float32
deltas: [[ 9.60100806e-05 1.26006280e-03 -2.43484465e-04 ..., -4.72921133e-02
-2.12732166e-01 -1.51322618e-01]
[ -1.77061767e-04 7.68931466e-04 -6.74572366e-04 ..., -1.69632081e-02
-1.13817520e-01 -1.23578422e-01]
[ 2.23828043e-04 8.45583039e-04 -2.06545158e-03 ..., -1.12596154e-01
8.17453414e-02 1.14649057e-01]
...,
[ 1.48329753e-04 1.67545048e-03 -2.05304706e-03 ..., 4.25588563e-02
-3.89761664e-02 -8.92069712e-02]
[ 7.72749656e-04 -2.51855049e-03 -3.18117486e-03 ..., -8.37800950e-02
5.03335238e-01 1.02863863e-01]
[ 7.43390410e-04 -7.27616716e-04 -1.65828911e-03 ..., 1.51133044e-02
1.52454287e-01 1.12097539e-01]]
★ bbox_transform_inv
♦ python代码
from fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv
def im_detect(net, im, boxes=None):
if cfg.TEST.BBOX_REG: # 执行这里
box_deltas = blobs_out['bbox_pred']
pred_boxes = bbox_transform_inv(boxes, box_deltas)
py-faster-rcnn/lib/fast_rcnn/bbox_transform.py
def bbox_transform_inv(boxes, deltas):
    """Apply per-class regression deltas to boxes and return the new boxes.

    Parameters
    ----------
    boxes : ndarray of shape (N, 4); each row is (xmin, ymin, xmax, ymax).
            In the demo run N is 300.
    deltas : ndarray of shape (N, 4 * K); each row holds K consecutive
             (dx, dy, dw, dh) groups. In the demo run the shape is (300, 84)
             with dtype float32.

    Returns
    -------
    pred_boxes : ndarray with the shape and dtype of deltas; each row holds
                 K consecutive (x1, y1, x2, y2) groups. If boxes is empty,
                 an array of shape (0, deltas.shape[1]) is returned.
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    # Work in the dtype of deltas (float32 in the demo run).
    boxes = boxes.astype(deltas.dtype, copy=False)

    # Per-box size and center, each a 1-D array of length N.
    widths = boxes[:, 2] - boxes[:, 0] + 1.0   # xmax - xmin + 1
    heights = boxes[:, 3] - boxes[:, 1] + 1.0  # ymax - ymin + 1
    ctr_x = boxes[:, 0] + 0.5 * widths         # center along x
    ctr_y = boxes[:, 1] + 0.5 * heights        # center along y

    # Split deltas by component; with deltas of shape (N, 4 * K) each of
    # these has shape (N, K), i.e. (300, 21) in the demo run.
    dx = deltas[:, 0::4]  # columns 0, 4, 8, ...
    dy = deltas[:, 1::4]  # columns 1, 5, 9, ...
    dw = deltas[:, 2::4]  # columns 2, 6, 10, ...
    dh = deltas[:, 3::4]  # columns 3, 7, 11, ...

    # NumPy broadcasting: x[:, np.newaxis] turns the (N,) array into (N, 1),
    # which is then stretched across the K columns of dx / dy, so every
    # class in row i uses box i's width, height and center.
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    # np.exp is element-wise e**value; the result scales the box size.
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    # Same shape and dtype as deltas: (300, 84) float32 in the demo run.
    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
    # x2
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
    # y2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h

    return pred_boxes
♦ C++代码
注意: C++代码中的bbox_transform_inv同时实现了python中的bbox_transform_inv和clip_boxes.
所以介绍完python的clip_boxes之后, 再来说明C++中的bbox_transform_inv.
bbox_transform_inv(num, bbox_delt, scores, boxes, pred, cv_img.rows, cv_img.cols);
★ clip_boxes
♦ python代码
由上面可知, BBOX_REG的值为True.
def im_detect(net, im, boxes=None):
if cfg.TEST.BBOX_REG:
# Apply bounding-box regression deltas
box_deltas = blobs_out['bbox_pred']
pred_boxes = bbox_transform_inv(boxes, box_deltas)
# im.shape是(高,宽,3), pred_boxes的维度是(300, 84)
pred_boxes = clip_boxes(pred_boxes, im.shape) # 执行这里
py-faster-rcnn/lib/fast_rcnn/bbox_transform.py
注意: 这里的boxes的维度是(300, 84)
def clip_boxes(boxes, im_shape):
    """
    Clip boxes to image boundaries.

    Parameters
    ----------
    boxes : 2-D ndarray whose rows hold consecutive (x1, y1, x2, y2) groups;
            (300, 84) in the demo run. It is modified in place.
    im_shape : image shape; im_shape[0] is the height and im_shape[1] the
               width.

    Returns
    -------
    boxes : the same array object, with every x limited to
            [0, im_shape[1] - 1] and every y to [0, im_shape[0] - 1].
    """
    # np.minimum caps each value at the largest valid index on that axis;
    # np.maximum then replaces negative values with 0 (see the article's
    # notes "解释1" and "解释2").
    # x1 >= 0
    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
    # y1 >= 0
    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
    # x2 < im_shape[1]
    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
    # y2 < im_shape[0]
    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
    return boxes
- 解释1:
np.minimum(boxes[:, 0::4], im_shape[1] - 1)
numpy.minimum :
https://docs.scipy.org/doc/numpy/reference/generated/numpy.minimum.html#numpy.minimum
boxes的维度是(300, 84), 所以boxes[:, 0::4]的维度是(300, 21).
im_shape[1]是宽度. im_shape[1] - 1是宽度方向的最大下标值.
minimum首先将im_shape[1] - 1扩展(broadcast)到二维数组(300, 21), 其中每一个元素值都是im_shape[1] - 1.
np.minimum(boxes[:, 0::4], im_shape[1] - 1)的含义是, 使(300, 21)这个二维数组中每一个元素值都不超过im_shape[1] - 1, 超过im_shape[1] - 1的就取im_shape[1] - 1.
- 解释2:
np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
在minimum的基础之上, 去掉二维数组中的负值, 如果为负值, 则以0来取代.
最终, boxes[:, 0::4]中每一个的值将是在[0, im_shape[1] - 1]范围内的数.
其他几行的处理, 同理.
★ 返回分数和boxes
♦ python代码
def im_detect(net, im, boxes=None):
if cfg.TEST.BBOX_REG:
# Apply bounding-box regression deltas
box_deltas = blobs_out['bbox_pred']
pred_boxes = bbox_transform_inv(boxes, box_deltas)
pred_boxes = clip_boxes(pred_boxes, im.shape)
return scores, pred_boxes # 执行这里
★ C++代码: bbox_transform_inv
C++代码中的bbox_transform_inv同时实现了python中的bbox_transform_inv和clip_boxes.
// Applies the per-class regression deltas to every ROI box, clips the result
// to the image, and writes box plus score into pred. This combines what the
// Python code does in bbox_transform_inv and clip_boxes.
//
//   num         number of boxes (rows in boxes / box_deltas / scores)
//   box_deltas  num rows of CLASS_NUM * 4 values: (dx, dy, dw, dh) per class
//   scores      num rows of CLASS_NUM values
//   boxes       num rows of 4 values: (xmin, ymin, xmax, ymax); x grows to
//               the right, y grows downwards
//   pred        output, num * 5 * CLASS_NUM values; the entry for class j and
//               box i starts at (j * num + i) * 5 and holds
//               (x1, y1, x2, y2, score)
//   img_height, img_width   image size used for clipping
void Detector::bbox_transform_inv(int num, const float* box_deltas, const float* scores,
                                  float* boxes, float* pred, int img_height, int img_width)
{
    for (int i = 0; i < num; i++) {
        // Row i of boxes: 4 values per box.
        const float* box = boxes + i * 4;
        float width = box[2] - box[0] + 1.0;   // xmax - xmin + 1
        float height = box[3] - box[1] + 1.0;  // ymax - ymin + 1
        float ctr_x = box[0] + 0.5 * width;
        float ctr_y = box[1] + 0.5 * height;

        for (int j = 0; j < CLASS_NUM; j++) {
            // Row i of box_deltas has CLASS_NUM * 4 values; the group for
            // class j starts at i * CLASS_NUM * 4 + j * 4.
            const float* delta = box_deltas + (i * CLASS_NUM + j) * 4;
            float dx = delta[0];
            float dy = delta[1];
            float dw = delta[2];
            float dh = delta[3];

            float pred_ctr_x = ctr_x + width * dx;
            float pred_ctr_y = ctr_y + height * dy;
            float pred_w = width * exp(dw);
            float pred_h = height * exp(dh);

            // The writes below correspond to the code that demo.py runs
            // after im_detect returns. Coordinates are clipped to
            // [0, img_width - 1] and [0, img_height - 1].
            float* out = pred + (j * num + i) * 5;
            out[0] = max(min(pred_ctr_x - 0.5 * pred_w, img_width - 1), 0);
            out[1] = max(min(pred_ctr_y - 0.5 * pred_h, img_height - 1), 0);
            out[2] = max(min(pred_ctr_x + 0.5 * pred_w, img_width - 1), 0);
            out[3] = max(min(pred_ctr_y + 0.5 * pred_h, img_height - 1), 0);
            // scores holds num * CLASS_NUM values; i * CLASS_NUM + j is the
            // score of class j for box i.
            out[4] = scores[i * CLASS_NUM + j];
        }
    }
}
————– 分割线 ————–
本系列文章如下:
- (1) py-faster-rcnn中demo.py代码与C++版本的代码对比: part01 铺垫, demo.py引入的模块
- (2) py-faster-rcnn中demo.py代码与C++版本的代码对比: part02 初始化, 创建Net
- (3) py-faster-rcnn中demo.py代码与C++版本的代码对比: part03 处理图片:减掉平均值, resize
- (4) py-faster-rcnn中demo.py代码与C++版本的代码对比: part04 图片转存为blob
- (5) py-faster-rcnn中demo.py代码与C++版本的代码对比: part05 Reshape
- (6) py-faster-rcnn中demo.py代码与C++版本的代码对比: part06 forward, rois boxes transform
- (7) py-faster-rcnn中demo.py代码与C++版本的代码对比: part07 nms, 获取符合条件的boxes