py-faster-rcnn中demo.py代码与C++版本的代码对比: part06 forward, rois boxes transform

这里“C++版本的代码”是指: https://github.com/galian123/cpp_faster_rcnn_detect .

py-faster-rcnn中demo.py代码, 是指 https://github.com/rbgirshick/py-faster-rcnn/blob/master/tools/demo.py 以及
https://github.com/rbgirshick/py-faster-rcnn/tree/master/lib 目录下的一些代码.

涉及到的.py文件都是 https://github.com/rbgirshick/py-faster-rcnn/ 中的.

★ forward

♦ python代码

def im_detect(net, im, boxes=None):
    # 将blob数据转换成float类型后, 赋值给forward_kwargs, 对应的key是'data'
    forward_kwargs = {'data': blobs['data'].astype(np.float32, copy=False)}
    if cfg.TEST.HAS_RPN: # HAS_RPN在demo.py中置为true了.
        # 图片的信息, 由之前的分析可知, 'im_info'对应的数据是一维数组,其值为(高, 宽, 缩放倍率)
        forward_kwargs['im_info'] = blobs['im_info'].astype(np.float32, copy=False)
    else:
        forward_kwargs['rois'] = blobs['rois'].astype(np.float32, copy=False)
    blobs_out = net.forward(**forward_kwargs)
  • 对 blobs_out = net.forward(**forward_kwargs) 的解释

(1) forward()参数中的两个星号**的含义是, 将字典(键值对)作为参数传入.
python官方文档中对**的解释:

If the syntax **expression appears in the function call, expression must evaluate to a mapping, the contents of which are treated as additional keyword arguments. In the case of a keyword appearing in both expression and as an explicit keyword argument, a TypeError exception is raised.

(2) net.forward是在 python/caffe/pycaffe.py 中定义的,

Net.forward = _Net_forward

_Net_forward:

def _Net_forward(self, blobs=None, start=None, end=None, **kwargs):
    """
    Forward pass: prepare inputs and run the net forward.

    Parameters
    ----------
    blobs : list of blobs to return in addition to output blobs.
    kwargs : Keys are input blob names and values are blob ndarrays.
             For formatting inputs for Caffe, see Net.preprocess().
             If None, input is taken from data layers.
    start : optional name of layer at which to begin the forward pass
    end : optional name of layer at which to finish the forward pass
          (inclusive)

    Returns
    -------
    outs : {blob name: blob ndarray} dict.

    Raises
    ------
    Exception : if the kwargs keys are not exactly the net's input names, or
                if an input array's first dimension differs from that of the
                blob it is written into.
    """
    if blobs is None:
        blobs = []

    if start is not None:
        start_ind = list(self._layer_names).index(start)
    else:
        start_ind = 0   # branch taken in the demo: net.forward() is called without `start`

    if end is not None:
        end_ind = list(self._layer_names).index(end)
        outputs = set(self.top_names[end] + blobs)
    else: # branch taken in the demo: net.forward() is called without `end`
        end_ind = len(self.layers) - 1
        outputs = set(self.outputs + blobs)
    if kwargs:
        # self.inputs holds the inputs declared in test.prototxt; for the demo
        # these are input: "data" and input: "im_info".
        if set(kwargs.keys()) != set(self.inputs):
            raise Exception('Input blob arguments do not match net inputs.')
        # Set input according to defined shapes and make arrays single and
        # C-contiguous as Caffe expects.
        for in_, blob in six.iteritems(kwargs):
            # Only the leading (batch) dimension is checked before the copy.
            if blob.shape[0] != self.blobs[in_].shape[0]:
                raise Exception('Input is not batch sized')
            self.blobs[in_].data[...] = blob

    # _forward is implemented in _caffe.cpp. The Net class binding declares
    #     .def("_forward", &Net<Dtype>::ForwardFromTo)
    # so this call ends up running ForwardFromTo().
    self._forward(start_ind, end_ind)

    # Unpack blobs to extract
    return {out: self.blobs[out].data for out in outputs}

♦ C++ 代码

    // net_->input_blobs()[0] and net_->blob_by_name("data") point to the same address.
    Blob<float> * input_blobs= net_->input_blobs()[0];
    // Copy the preprocessed image (data_buf) into the net's "data" input blob,
    // on whichever device Caffe is currently running.
    switch(Caffe::mode()){
    case Caffe::CPU:
        memcpy(input_blobs->mutable_cpu_data(), data_buf, sizeof(float) * input_blobs->count());
        break;
    case Caffe::GPU: // this is the branch executed in this walkthrough
        caffe_gpu_memcpy(sizeof(float)* input_blobs->count(), data_buf, input_blobs->mutable_gpu_data());
        break;
    }
    // im_info = (height, width, scale factor) of the resized image; this is the
    // same content the Python side passes as the 'im_info' input.
    float im_info[3];
    im_info[0] = cv_resized.rows;
    im_info[1] = cv_resized.cols;
    im_info[2] = img_scale;
    // NOTE(review): presumably set_cpu_data() stores the pointer rather than
    // copying, so im_info must stay alive until the forward pass has finished
    // -- confirm against Caffe's Blob API.
    net_->blob_by_name("im_info")->set_cpu_data(im_info);
    // Run the forward pass starting from layer 0.
    net_->ForwardFrom(0);

★ 获取roi的boxes

♦ python代码

def im_detect(net, im, boxes=None):
    blobs_out = net.forward(**forward_kwargs)

    if cfg.TEST.HAS_RPN:
        # 每次只处理一张图片
        assert len(im_scales) == 1, "Only single-image batch implemented"
        # 获取所有的ROI
        rois = net.blobs['rois'].data.copy()
        # rois是缩放后的, 需要将rois还原为原始的大小.
        boxes = rois[:, 1:5] / im_scales[0]     # 取rois每一行中的后4个数据, 见"解释1"
  • 解释1: rois的内容

打印rois的内容, 如下:

rois.shape:  (300, 5)
rois:  [[   0.          483.77429199    0.          999.          467.63253784]
 [   0.           63.49487305   24.32870483  615.18811035  568.28393555]
 [   0.          452.25259399  278.47637939  999.          595.        ]
 ..., 
 [   0.          245.8664856     0.          585.41247559  470.83172607]
 [   0.           50.10795212  286.49438477   96.97434998  327.50134277]
 [   0.          299.90609741  130.08828735  358.32467651  211.07351685]]

rois[:, 1:5]将第一列(下标为0的列)的值过滤掉了, 所以boxes的shape是:(300, 4), 每一行4个数值, 含义为(xmin, ymin, xmax, ymax).

♦ C++代码

    const float* rois;
    num = net_->blob_by_name("rois")->num(); // num is 300 in this walkthrough
    // ROIs
    rois = net_->blob_by_name("rois")->cpu_data();
    // One (xmin, ymin, xmax, ymax) quadruple per ROI.
    // NOTE(review): the matching delete[] is not visible in this excerpt.
    boxes = new float[num*4];
    for (int n = 0; n < num; n++){
        for (int c = 0; c < 4; c++){
            /* A rois row holds 5 values, hence the n*5 stride. The first value of
            each row is 0, so the coordinates are read from positions 1..4.
            A boxes row holds 4 values, hence the n*4 stride.
            rois are expressed in the resized image; dividing by img_scale maps
            them back to the original image size. */
            boxes[n * 4 + c] = rois[n * 5 + c + 1] / img_scale;
        }
    }

★ 获取分数

♦ python代码

def im_detect(net, im, boxes=None):

    if cfg.TEST.SVM:    # cfg.TEST.SVM的值为False, 见"解释1".
        # use the raw scores before softmax under the assumption they
        # were trained as linear SVMs
        scores = net.blobs['cls_score'].data    # 见"解释2"
    else:
        # 执行这里
        # use softmax estimated probabilities
        scores = blobs_out['cls_prob']
  • 解释1: cfg.TEST.SVM的值
    py-faster-rcnn/lib/fast_rcnn/config.py
# Experimental: treat the (K+1) units in the cls_score layer as linear
# predictors (trained, eg, with one-vs-rest SVMs).
__C.TEST.SVM = False
  • 解释2: scores的shape
    scores的shape是(300, 21). 因为在test.prototxt中设置了cls_score layer的输出为21.

♦ C++代码

    const float* scores;
    scores = net_->blob_by_name("cls_prob")->cpu_data();

★ 转换boxes

BBOX_REG的值为True.

def im_detect(net, im, boxes=None):

    if cfg.TEST.BBOX_REG:   # 执行这里, 见"解释1"
        # Apply bounding-box regression deltas
        box_deltas = blobs_out['bbox_pred']     # 见"解释2"
        pred_boxes = bbox_transform_inv(boxes, box_deltas) # 见下一节
        pred_boxes = clip_boxes(pred_boxes, im.shape)
    else:
        # Simply repeat the boxes, once for each class
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))
  • 解释1: cfg.TEST.BBOX_REG的值
    py-faster-rcnn/lib/fast_rcnn/config.py
# Test using bounding-box regressors
__C.TEST.BBOX_REG = True
  • 解释2: box_deltas的维度

box_deltas的shape是(300, 84), 这里的300是在config.py中设置的(如下), 只选取分数最高的前300个box.

## Number of top scoring boxes to keep after applying NMS to RPN proposals
__C.TEST.RPN_POST_NMS_TOP_N = 300

这里的84是test.prototxt中layer bbox_pred的输出.
在检测时,我使用的是下面这个路径的prototxt :
py-faster-rcnn/models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_end2end/test.prototxt
打开这个文件,可以看到:
num_output: 84, 这个84即是box_deltas的第二个维度.

layer {
  name: "bbox_pred"
  type: "InnerProduct"
  bottom: "fc7"
  top: "bbox_pred"
略
  inner_product_param {
    num_output: 84}
}
  • deltas的内容(举例)
deltas.shape: (300, 84), dtype: float32
deltas:  [[  9.60100806e-05   1.26006280e-03  -2.43484465e-04 ...,  -4.72921133e-02
   -2.12732166e-01  -1.51322618e-01]
 [ -1.77061767e-04   7.68931466e-04  -6.74572366e-04 ...,  -1.69632081e-02
   -1.13817520e-01  -1.23578422e-01]
 [  2.23828043e-04   8.45583039e-04  -2.06545158e-03 ...,  -1.12596154e-01
    8.17453414e-02   1.14649057e-01]
 ..., 
 [  1.48329753e-04   1.67545048e-03  -2.05304706e-03 ...,   4.25588563e-02
   -3.89761664e-02  -8.92069712e-02]
 [  7.72749656e-04  -2.51855049e-03  -3.18117486e-03 ...,  -8.37800950e-02
    5.03335238e-01   1.02863863e-01]
 [  7.43390410e-04  -7.27616716e-04  -1.65828911e-03 ...,   1.51133044e-02
    1.52454287e-01   1.12097539e-01]]

★ bbox_transform_inv

♦ python代码

from fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv

def im_detect(net, im, boxes=None):

    if cfg.TEST.BBOX_REG:   # 执行这里
        box_deltas = blobs_out['bbox_pred']
        pred_boxes = bbox_transform_inv(boxes, box_deltas)

py-faster-rcnn/lib/fast_rcnn/bbox_transform.py

def bbox_transform_inv(boxes, deltas):
    """
    Apply bounding-box regression deltas to boxes.

    Parameters
    ----------
    boxes : 2-D array with one box per row, (xmin, ymin, xmax, ymax);
            shape (300, 4) in this walkthrough.
    deltas : 2-D array with one row per box; every consecutive group of 4
             columns is (dx, dy, dw, dh); shape (300, 84) in this walkthrough.

    Returns
    -------
    pred_boxes : array with the shape and dtype of `deltas`; every group of 4
                 columns is the predicted (x1, y1, x2, y2). An empty
                 (0, deltas.shape[1]) array is returned when `boxes` has no rows.
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    # Cast to the dtype of deltas (float32 in this walkthrough).
    boxes = boxes.astype(deltas.dtype, copy=False)

    # boxes has shape (300, 4); each row is (xmin, ymin, xmax, ymax).
    widths = boxes[:, 2] - boxes[:, 0] + 1.0    # xmax - xmin + 1: width of every box
    heights = boxes[:, 3] - boxes[:, 1] + 1.0   # ymax - ymin + 1: height of every box
    ctr_x = boxes[:, 0] + 0.5 * widths          # xmin + 0.5 * width: centre x of every box
    ctr_y = boxes[:, 1] + 0.5 * heights         # ymin + 0.5 * height: centre y of every box

    # deltas has shape (300, 84), so dx, dy, dw and dh each have shape (300, 21).
    dx = deltas[:, 0::4]    # columns 0, 4, 8, ... of every row
    dy = deltas[:, 1::4]    # columns 1, 5, 9, ... of every row
    dw = deltas[:, 2::4]    # columns 2, 6, 10, ... of every row
    dh = deltas[:, 3::4]    # columns 3, 7, 11, ... of every row

    # dx is 2-D with shape (300, 21); the arithmetic below relies on NumPy
    # broadcasting.
    # widths and ctr_x are 1-D with shape (300,); indexing with
    # [:, np.newaxis] turns each into a (300, 1) column, which broadcasting
    # then stretches across all 21 columns of dx.
    # pred_ctr_x is therefore 2-D: each element is the dx value times the box
    # width, plus the box centre.
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]

    # np.exp(dw) computes e ** value for every element of the 2-D array dw;
    # each result is then multiplied by the width of the corresponding box.
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    # deltas.shape is (300, 84) with dtype float32, so pred_boxes is (300, 84) too.
    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
    # x2
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
    # y2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h
    return pred_boxes

♦ C++代码

注意: C++代码中bbox_transform_inv同时实现了python中的bbox_transform_inv和clip_boxes.
所以介绍完python的clip_boxes之后,再来说明C++中的bbox_transform_inv.

    bbox_transform_inv(num, bbox_delt, scores, boxes, pred, cv_img.rows, cv_img.cols);

★ clip_boxes

♦ python代码

由上面可知, BBOX_REG的值为True.

def im_detect(net, im, boxes=None):

    if cfg.TEST.BBOX_REG:   
        # Apply bounding-box regression deltas
        box_deltas = blobs_out['bbox_pred']
        pred_boxes = bbox_transform_inv(boxes, box_deltas)
        # im.shape是(高,宽,3), pred_boxes的维度是(300, 84)
        pred_boxes = clip_boxes(pred_boxes, im.shape) # 执行这里

py-faster-rcnn/lib/fast_rcnn/bbox_transform.py

注意: 这里的boxes的维度是(300, 84)

def clip_boxes(boxes, im_shape):
    """
    Clip boxes to image boundaries.

    `boxes` is a 2-D array whose columns repeat (x1, y1, x2, y2) in groups of
    4; `im_shape` is indexed as (height, width, ...). x values are clamped to
    [0, im_shape[1] - 1] and y values to [0, im_shape[0] - 1]. `boxes` is
    modified in place and also returned.
    """
    # x1 clamped to [0, width - 1]
    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) # see "解释1" and "解释2" below
    # y1 clamped to [0, height - 1]
    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
    # x2 clamped to [0, width - 1], i.e. x2 < im_shape[1]
    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
    # y2 clamped to [0, height - 1], i.e. y2 < im_shape[0]
    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
    return boxes
  • 解释1: np.minimum(boxes[:, 0::4], im_shape[1] - 1)

numpy.minimum :
https://docs.scipy.org/doc/numpy/reference/generated/numpy.minimum.html#numpy.minimum

boxes的维度是(300, 84), 所以boxes[:, 0::4]的维度是(300, 21).
im_shape[1]是宽度. im_shape[1] - 1是宽度方向的最大下标值.

minimum首先将im_shape[1] - 1扩展到二维数组(300, 21), 其中每一个元素值都是im_shape[1] - 1.
np.minimum(boxes[:, 0::4], im_shape[1] - 1)的含义是: 对(300, 21)这个二维数组逐元素取 min(元素值, im_shape[1] - 1), 即不超过im_shape[1] - 1的元素保持不变, 超过im_shape[1] - 1的元素则取im_shape[1] - 1.

  • 解释2: np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
    在minimum的基础之上, 去掉二维数组中的负值, 如果为负值, 则以0来取代.

最终, boxes[:, 0::4]中每一个元素的值都将落在[0, im_shape[1] - 1]范围内.
其他几行的处理, 同理.

★ 返回分数和boxes

♦ python代码

def im_detect(net, im, boxes=None):

    if cfg.TEST.BBOX_REG: 
        # Apply bounding-box regression deltas
        box_deltas = blobs_out['bbox_pred']
        pred_boxes = bbox_transform_inv(boxes, box_deltas)
        pred_boxes = clip_boxes(pred_boxes, im.shape)

    return scores, pred_boxes   # 执行这里

★ C++代码: bbox_transform_inv

C++代码中bbox_transform_inv同时实现了python中的bbox_transform_inv和clip_boxes.

// Decode the regression deltas of every (box, class) pair into a predicted
// box, clamp it to the image, and store it together with its score.
//   num        - number of input boxes
//   box_deltas - num rows of CLASS_NUM * 4 values, (dx, dy, dw, dh) per class
//   scores     - num rows of CLASS_NUM values
//   boxes      - num rows of 4 values, (xmin, ymin, xmax, ymax)
//   pred       - output, num * CLASS_NUM * 5 values grouped by class: entry
//                (j * num + i) holds (x1, y1, x2, y2, score) for box i, class j
//   img_height, img_width - image size used for clamping
void Detector::bbox_transform_inv(int num, const float* box_deltas, const float* scores,
    float* boxes, float* pred, int img_height, int img_width)
{
    float width, height, ctr_x, ctr_y, dx, dy, dw, dh, pred_ctr_x, pred_ctr_y, pred_w, pred_h;

    for (int i = 0; i < num; i++) {
        // boxes holds num*4 values, 4 per row: (xmin, ymin, xmax, ymax); x grows to the right, y grows downward
        width = boxes[i * 4 + 2] - boxes[i * 4 + 0] + 1.0;// xmax - xmin + 1
        height = boxes[i * 4 + 3] - boxes[i * 4 + 1] + 1.0;// ymax - ymin + 1
        ctr_x = boxes[i * 4 + 0] + 0.5 * width;
        ctr_y = boxes[i * 4 + 1] + 0.5 * height;

        for (int j = 0; j < CLASS_NUM; j++) {
            // box_deltas holds num * CLASS_NUM * 4 values: num rows of CLASS_NUM * 4 values each
            // dx sits at positions 0, 4, 8, ... of a row, dy at 1, 5, 9, ...
            // dw sits at positions 2, 6, 10, ... of a row, dh at 3, 7, 11, ...
            dx = box_deltas[(i * CLASS_NUM + j) * 4 + 0];// i * CLASS_NUM * 4 + j * 4 is position 4*j within row i
            dy = box_deltas[(i * CLASS_NUM + j) * 4 + 1];
            dw = box_deltas[(i * CLASS_NUM + j) * 4 + 2];
            dh = box_deltas[(i * CLASS_NUM + j) * 4 + 3];

            pred_ctr_x = ctr_x + width * dx;
            pred_ctr_y = ctr_y + height * dy;
            pred_w = width * exp(dw);
            pred_h = height * exp(dh);

            // According to the article, the lines below correspond to the code that
            // demo.py runs after im_detect returns; the max(min(...)) clamping applies
            // the same [0, size - 1] limits as Python's clip_boxes.
            // pred holds num*5*CLASS_NUM values.
            pred[(j * num + i) * 5 + 0] = max(min(pred_ctr_x - 0.5 * pred_w, img_width - 1), 0);
            pred[(j * num + i) * 5 + 1] = max(min(pred_ctr_y - 0.5 * pred_h, img_height - 1), 0);
            pred[(j * num + i) * 5 + 2] = max(min(pred_ctr_x + 0.5 * pred_w, img_width - 1), 0);
            pred[(j * num + i) * 5 + 3] = max(min(pred_ctr_y + 0.5 * pred_h, img_height - 1), 0);
            // scores holds num * CLASS_NUM values; i*CLASS_NUM+j is the score of class j in row i
            pred[(j * num + i) * 5 + 4] = scores[i * CLASS_NUM + j];
        }
   }
}

————– 分割线 ————–
本系列文章如下:

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值