转自:http://blog.csdn.net/u014568921/article/details/53188559
理解faster rcnn的源码有几个关键点
1.算法原理、网络结构、训练过程这是基本
2.要弄懂源码里训练数据是怎么组织起来的,imdb,roidb,blob很关键,弄清它们的数据结构以及各个阶段是如何产生的
3.一定的python、numpy基础知识
rpn_train.pt
- #stage 1训练RPN时用的网络结构
- name: "ZF"
- layer {
- name: 'input-data'
- type: 'Python'
- top: 'data'
- top: 'im_info'
- top: 'gt_boxes'
- python_param {
- module: 'roi_data_layer.layer'#对应lib/roi_data_layer/layer.py
- #为训练RPN时为网络输入roi,此时为gt box
- layer: 'RoIDataLayer'
- param_str: "'num_classes': 21"
- }
- }
- #前面是ZF网,提取特征用,各个阶段共享
- #========= conv1-conv5 ============
- layer {
- name: "conv1"
- type: "Convolution"
- bottom: "data"
- top: "conv1"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 96
- kernel_size: 7
- pad: 3
- stride: 2
- }
- }
- layer {
- name: "relu1"
- type: "ReLU"
- bottom: "conv1"
- top: "conv1"
- }
- layer {
- name: "norm1"
- type: "LRN"
- bottom: "conv1"
- top: "norm1"
- lrn_param {
- local_size: 3
- alpha: 0.00005
- beta: 0.75
- norm_region: WITHIN_CHANNEL
- engine: CAFFE
- }
- }
- layer {
- name: "pool1"
- type: "Pooling"
- bottom: "norm1"
- top: "pool1"
- pooling_param {
- kernel_size: 3
- stride: 2
- pad: 1
- pool: MAX
- }
- }
- layer {
- name: "conv2"
- type: "Convolution"
- bottom: "pool1"
- top: "conv2"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 256
- kernel_size: 5
- pad: 2
- stride: 2
- }
- }
- layer {
- name: "relu2"
- type: "ReLU"
- bottom: "conv2"
- top: "conv2"
- }
- layer {
- name: "norm2"
- type: "LRN"
- bottom: "conv2"
- top: "norm2"
- lrn_param {
- local_size: 3
- alpha: 0.00005
- beta: 0.75
- norm_region: WITHIN_CHANNEL
- engine: CAFFE
- }
- }
- layer {
- name: "pool2"
- type: "Pooling"
- bottom: "norm2"
- top: "pool2"
- pooling_param {
- kernel_size: 3
- stride: 2
- pad: 1
- pool: MAX
- }
- }
- layer {
- name: "conv3"
- type: "Convolution"
- bottom: "pool2"
- top: "conv3"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 384
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu3"
- type: "ReLU"
- bottom: "conv3"
- top: "conv3"
- }
- layer {
- name: "conv4"
- type: "Convolution"
- bottom: "conv3"
- top: "conv4"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 384
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu4"
- type: "ReLU"
- bottom: "conv4"
- top: "conv4"
- }
- layer {
- name: "conv5"
- type: "Convolution"
- bottom: "conv4"
- top: "conv5"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 256
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu5"
- type: "ReLU"
- bottom: "conv5"
- top: "conv5"
- }
- #========= RPN ============
- layer {
- name: "rpn_conv1"
- type: "Convolution"
- bottom: "conv5"
- top: "rpn_conv1"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 256
- kernel_size: 3 pad: 1 stride: 1
- weight_filler { type: "gaussian" std: 0.01 }
- bias_filler { type: "constant" value: 0 }
- }
- }
- layer {
- name: "rpn_relu1"
- type: "ReLU"
- bottom: "rpn_conv1"
- top: "rpn_conv1"
- }
- layer {
- name: "rpn_cls_score"
- type: "Convolution"
- bottom: "rpn_conv1"
- top: "rpn_cls_score"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 18 # 2(bg/fg) * 9(anchors)
- kernel_size: 1 pad: 0 stride: 1
- weight_filler { type: "gaussian" std: 0.01 }
- bias_filler { type: "constant" value: 0 }
- }
- }
- layer {
- name: "rpn_bbox_pred"
- type: "Convolution"
- bottom: "rpn_conv1"
- top: "rpn_bbox_pred"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 36 # 4 * 9(anchors)
- kernel_size: 1 pad: 0 stride: 1
- weight_filler { type: "gaussian" std: 0.01 }
- bias_filler { type: "constant" value: 0 }
- }
- }
- layer {
- bottom: "rpn_cls_score"
- top: "rpn_cls_score_reshape"
- name: "rpn_cls_score_reshape"
- type: "Reshape"
- reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }
- }
- layer {
- name: 'rpn-data'
- type: 'Python'
- bottom: 'rpn_cls_score'
- bottom: 'gt_boxes'
- bottom: 'im_info'
- bottom: 'data'
- top: 'rpn_labels'
- top: 'rpn_bbox_targets'
- top: 'rpn_bbox_inside_weights'
- top: 'rpn_bbox_outside_weights'
- python_param {
- module: 'rpn.anchor_target_layer'#对应文件lib/rpn/anchor_target_layer.py
- #用于在原图上产生anchor,结合gt box训练rpn做box cls和box reg
- layer: 'AnchorTargetLayer'
- param_str: "'feat_stride': 16"
- }
- }
- layer {
- name: "rpn_loss_cls"
- type: "SoftmaxWithLoss"
- bottom: "rpn_cls_score_reshape"
- bottom: "rpn_labels"
- propagate_down: 1
- propagate_down: 0
- top: "rpn_cls_loss"
- loss_weight: 1
- loss_param {
- ignore_label: -1
- normalize: true
- }
- }
- layer {
- name: "rpn_loss_bbox"
- type: "SmoothL1Loss"
- bottom: "rpn_bbox_pred"
- bottom: "rpn_bbox_targets"
- bottom: "rpn_bbox_inside_weights"
- bottom: "rpn_bbox_outside_weights"
- top: "rpn_loss_bbox"
- loss_weight: 1
- smooth_l1_loss_param { sigma: 3.0 }
- }
- #========= RCNN ============
- # Dummy layers so that initial parameters are saved into the output net
- layer {
- name: "dummy_roi_pool_conv5"
- type: "DummyData"
- top: "dummy_roi_pool_conv5"
- dummy_data_param {
- shape { dim: 1 dim: 9216 }
- data_filler { type: "gaussian" std: 0.01 }
- }
- }
- layer {
- name: "fc6"
- type: "InnerProduct"
- bottom: "dummy_roi_pool_conv5"
- top: "fc6"
- param { lr_mult: 0 decay_mult: 0 }
- param { lr_mult: 0 decay_mult: 0 }
- inner_product_param {
- num_output: 4096
- }
- }
- layer {
- name: "relu6"
- type: "ReLU"
- bottom: "fc6"
- top: "fc6"
- }
- layer {
- name: "fc7"
- type: "InnerProduct"
- bottom: "fc6"
- top: "fc7"
- param { lr_mult: 0 decay_mult: 0 }
- param { lr_mult: 0 decay_mult: 0 }
- inner_product_param {
- num_output: 4096
- }
- }
- layer {
- name: "silence_fc7"
- type: "Silence"
- bottom: "fc7"
- }
上面需要注意的是rpn_cls_score层为每个位置的9个anchor做的只是bg/fg的二分类,而不管具体是fg的话属于哪一类别,rpn阶段完成这个任务就够了,后面fast rcnn可以对region proposal进行细分和位置精修
roi_data_layer/layer.py
- #coding:utf-8
- # --------------------------------------------------------
- # Fast R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- """The data layer used during training to train a Fast R-CNN network.
- RoIDataLayer implements a Caffe Python layer.
- """
- import caffe
- from fast_rcnn.config import cfg
- from roi_data_layer.minibatch import get_minibatch
- import numpy as np
- import yaml
- from multiprocessing import Process, Queue
- #为网络输入roi
- class RoIDataLayer(caffe.Layer):
- """Fast R-CNN data layer used for training."""
- def _shuffle_roidb_inds(self):
- """Randomly permute the training roidb."""
- if cfg.TRAIN.ASPECT_GROUPING:
- widths = np.array([r['width'] for r in self._roidb])
- heights = np.array([r['height'] for r in self._roidb])
- horz = (widths >= heights)
- vert = np.logical_not(horz)
- horz_inds = np.where(horz)[0]
- vert_inds = np.where(vert)[0]
- inds = np.hstack((
- np.random.permutation(horz_inds),
- np.random.permutation(vert_inds)))
- inds = np.reshape(inds, (-1, 2))
- row_perm = np.random.permutation(np.arange(inds.shape[0]))
- inds = np.reshape(inds[row_perm, :], (-1,))
- self._perm = inds
- else:
- self._perm = np.random.permutation(np.arange(len(self._roidb)))
- self._cur = 0
- #得到下一个batch训练用的图像的index,默认一次两张图片
- def _get_next_minibatch_inds(self):
- """Return the roidb indices for the next minibatch."""
- #如果所有图片都用完了,打乱顺序,roidb由每张图片的rois集合构成
- if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
- self._shuffle_roidb_inds()
- #从_cur记录的位置开始选择cfg.TRAIN.IMS_PER_BATCH张图片作为训练用
- db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]
- self._cur += cfg.TRAIN.IMS_PER_BATCH
- return db_inds
- #取得训练用的blob
- def _get_next_minibatch(self):
- """Return the blobs to be used for the next minibatch.
- If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
- separate process and made available through self._blob_queue.
- """
- if cfg.TRAIN.USE_PREFETCH:
- return self._blob_queue.get()
- else:
- db_inds = self._get_next_minibatch_inds()
- minibatch_db = [self._roidb[i] for i in db_inds]
- #函数在lib/roi_data_layer/minibatch.py里实现
- return get_minibatch(minibatch_db, self._num_classes)
- def set_roidb(self, roidb):
- """Set the roidb to be used by this layer during training."""
- self._roidb = roidb
- self._shuffle_roidb_inds()
- if cfg.TRAIN.USE_PREFETCH:
- self._blob_queue = Queue(10)
- self._prefetch_process = BlobFetcher(self._blob_queue,
- self._roidb,
- self._num_classes)
- self._prefetch_process.start()
- # Terminate the child process when the parent exists
- def cleanup():
- print 'Terminating BlobFetcher'
- self._prefetch_process.terminate()
- self._prefetch_process.join()
- import atexit
- atexit.register(cleanup)
- #该层初始化时调用
- def setup(self, bottom, top):
- """Setup the RoIDataLayer."""
- # parse the layer parameter string, which must be valid YAML
- layer_params = yaml.load(self.param_str_)
- self._num_classes = layer_params['num_classes']
- self._name_to_top_map = {}
- # data blob: holds a batch of N images, each with 3 channels
- idx = 0
- top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3,
- max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
- self._name_to_top_map['data'] = idx
- idx += 1
- #如果要训练RPN网,roi是gt box
- if cfg.TRAIN.HAS_RPN:
- top[idx].reshape(1, 3)
- self._name_to_top_map['im_info'] = idx
- idx += 1
- top[idx].reshape(1, 4)
- self._name_to_top_map['gt_boxes'] = idx
- idx += 1
- #如果是训练fast rcnn则roi是之前RPN提取的region proposal
- else: # not using RPN
- # rois blob: holds R regions of interest, each is a 5-tuple
- # (n, x1, y1, x2, y2) specifying an image batch index n and a
- # rectangle (x1, y1, x2, y2)
- top[idx].reshape(1, 5)
- self._name_to_top_map['rois'] = idx
- idx += 1
- # labels blob: R categorical labels in [0, ..., K] for K foreground
- # classes plus background
- top[idx].reshape(1)
- self._name_to_top_map['labels'] = idx
- idx += 1
- if cfg.TRAIN.BBOX_REG:
- # bbox_targets blob: R bounding-box regression targets with 4
- # targets per class
- top[idx].reshape(1, self._num_classes * 4)
- self._name_to_top_map['bbox_targets'] = idx
- idx += 1
- # bbox_inside_weights blob: At most 4 targets per roi are active;
- # this binary vector specifies the subset of active targets
- top[idx].reshape(1, self._num_classes * 4)
- self._name_to_top_map['bbox_inside_weights'] = idx
- idx += 1
- top[idx].reshape(1, self._num_classes * 4)
- self._name_to_top_map['bbox_outside_weights'] = idx
- idx += 1
- print 'RoiDataLayer: name_to_top:', self._name_to_top_map
- assert len(top) == len(self._name_to_top_map)
- #作为输入前向计算
- def forward(self, bottom, top):
- """Get blobs and copy them into this layer's top blob vector."""
- blobs = self._get_next_minibatch()
- for blob_name, blob in blobs.iteritems():
- top_ind = self._name_to_top_map[blob_name]
- # Reshape net's input blobs
- top[top_ind].reshape(*(blob.shape))
- # Copy data into net's input blobs
- top[top_ind].data[...] = blob.astype(np.float32, copy=False)
- #不用反向传播
- def backward(self, top, propagate_down, bottom):
- """This layer does not propagate gradients."""
- pass
- def reshape(self, bottom, top):
- """Reshaping happens during the call to forward."""
- pass
- class BlobFetcher(Process):
- """Experimental class for prefetching blobs in a separate process."""
- def __init__(self, queue, roidb, num_classes):
- super(BlobFetcher, self).__init__()
- self._queue = queue
- self._roidb = roidb
- self._num_classes = num_classes
- self._perm = None
- self._cur = 0
- self._shuffle_roidb_inds()
- # fix the random seed for reproducibility
- np.random.seed(cfg.RNG_SEED)
- def _shuffle_roidb_inds(self):
- """Randomly permute the training roidb."""
- # TODO(rbg): remove duplicated code
- self._perm = np.random.permutation(np.arange(len(self._roidb)))
- self._cur = 0
- def _get_next_minibatch_inds(self):
- """Return the roidb indices for the next minibatch."""
- # TODO(rbg): remove duplicated code
- if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
- self._shuffle_roidb_inds()
- db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]
- self._cur += cfg.TRAIN.IMS_PER_BATCH
- return db_inds
- def run(self):
- print 'BlobFetcher started'
- while True:
- db_inds = self._get_next_minibatch_inds()
- minibatch_db = [self._roidb[i] for i in db_inds]
- blobs = get_minibatch(minibatch_db, self._num_classes)
- self._queue.put(blobs)
- #coding:utf-8
- # --------------------------------------------------------
- # Fast R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- """Compute minibatch blobs for training a Fast R-CNN network."""
- import numpy as np
- import numpy.random as npr
- import cv2
- from fast_rcnn.config import cfg
- from utils.blob import prep_im_for_blob, im_list_to_blob
- #采样产生训练用的rois的blob,可以直接作为caffe的输入
- def get_minibatch(roidb, num_classes):
- """Given a roidb, construct a minibatch sampled from it."""
- num_images = len(roidb)
- #从预设的训练尺度里随机抽样用作此次产生的batch里用的roi的尺度
- # Sample random scales to use for each image in this batch
- random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
- size=num_images)
- #BATCH_SIZE为一个minibatch里训练用的roi的数量
- assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
- 'num_images ({}) must divide BATCH_SIZE ({})'. \
- format(num_images, cfg.TRAIN.BATCH_SIZE)
- #每张图片上应该抽样得到的roi的数量
- rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images
- #前景roi的数量
- fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
- #产生caffe能用的blob
- # Get the input image blob, formatted for caffe
- #_get_image_blob的实现在本文件的后面
- im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)
- blobs = {'data': im_blob}
- #训练RPN时
- if cfg.TRAIN.HAS_RPN:
- assert len(im_scales) == 1, "Single batch only"
- assert len(roidb) == 1, "Single batch only"
- # gt boxes: (x1, y1, x2, y2, cls)
- #属于前景的roi的真实类别
- gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
- gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
- #gt_boxes[i]类似于(x1,y1,x2,y2,cls)
- gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
- gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
- blobs['gt_boxes'] = gt_boxes
- blobs['im_info'] = np.array(
- [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]],
- dtype=np.float32)
- #训练fast rcnn时
- else: # not using RPN
- # Now, build the region of interest and label blobs
- rois_blob = np.zeros((0, 5), dtype=np.float32)
- labels_blob = np.zeros((0), dtype=np.float32)
- bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
- bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
- # all_overlaps = []
- for im_i in xrange(num_images):
- #_sample_rois实现在下面,实现从每张图片的rois里采样
- labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \
- = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
- num_classes)
- # Add to RoIs blob
- rois = _project_im_rois(im_rois, im_scales[im_i])
- batch_ind = im_i * np.ones((rois.shape[0], 1))
- rois_blob_this_image = np.hstack((batch_ind, rois))
- rois_blob = np.vstack((rois_blob, rois_blob_this_image))
- # Add to labels, bbox targets, and bbox loss blobs
- labels_blob = np.hstack((labels_blob, labels))
- bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
- bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights))
- # all_overlaps = np.hstack((all_overlaps, overlaps))
- # For debug visualizations
- # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps)
- blobs['rois'] = rois_blob
- blobs['labels'] = labels_blob
- if cfg.TRAIN.BBOX_REG:
- blobs['bbox_targets'] = bbox_targets_blob
- blobs['bbox_inside_weights'] = bbox_inside_blob
- blobs['bbox_outside_weights'] = \
- np.array(bbox_inside_blob > 0).astype(np.float32)
- return blobs
- #从一张图片的rois里采样得到roi
- def _sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes):
- """Generate a random sample of RoIs comprising foreground and background
- examples.
- """
- # label = class RoI has max overlap with
- labels = roidb['max_classes']
- overlaps = roidb['max_overlaps']
- rois = roidb['boxes']
- # Select foreground RoIs as those with >= FG_THRESH overlap
- fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
- # Guard against the case when an image has fewer than fg_rois_per_image
- # foreground RoIs
- #fg_rois_per_this_image取fg_rois_per_image和fg_inds.size中较小的一个
- fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
- # Sample foreground regions without replacement
- if fg_inds.size > 0:
- fg_inds = npr.choice(
- fg_inds, size=fg_rois_per_this_image, replace=False)
- # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
- bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
- (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
- # Compute number of background RoIs to take from this image (guarding
- # against there being fewer than desired)
- bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
- bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
- bg_inds.size)
- #这里如果正负样本数量相差太大会出问题,此时应该做正负样本平衡,这里没有做
- # Sample background regions without replacement
- if bg_inds.size > 0:
- bg_inds = npr.choice(
- bg_inds, size=bg_rois_per_this_image, replace=False)
- # The indices that we're selecting (both fg and bg)
- keep_inds = np.append(fg_inds, bg_inds)
- # Select sampled values from various arrays:
- labels = labels[keep_inds]
- # Clamp labels for the background RoIs to 0
- #设定背景roi的label为0
- labels[fg_rois_per_this_image:] = 0
- overlaps = overlaps[keep_inds]
- rois = rois[keep_inds]
- bbox_targets, bbox_inside_weights = _get_bbox_regression_labels(
- roidb['bbox_targets'][keep_inds, :], num_classes)
- return labels, overlaps, rois, bbox_targets, bbox_inside_weights
- def _get_image_blob(roidb, scale_inds):
- """Builds an input blob from the images in the roidb at the specified
- scales.
- """
- num_images = len(roidb)
- processed_ims = []
- im_scales = []
- for i in xrange(num_images):
- #读取roi所在的图像
- im = cv2.imread(roidb[i]['image'])
- #判断该roi是否是由水平翻转得到的
- if roidb[i]['flipped']:
- #实现水平翻转
- im = im[:, ::-1, :]
- #得到尺度
- target_size = cfg.TRAIN.SCALES[scale_inds[i]]
- im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
- cfg.TRAIN.MAX_SIZE)
- im_scales.append(im_scale)
- processed_ims.append(im)
- #在lib/util/blob.py里实现
- # Create a blob to hold the input images
- blob = im_list_to_blob(processed_ims)
- return blob, im_scales
- def _project_im_rois(im_rois, im_scale_factor):
- """Project image RoIs into the rescaled training image."""
- rois = im_rois * im_scale_factor
- return rois
- def _get_bbox_regression_labels(bbox_target_data, num_classes):
- """Bounding-box regression targets are stored in a compact form in the
- roidb.
- This function expands those targets into the 4-of-4*K representation used
- by the network (i.e. only one class has non-zero targets). The loss weights
- are similarly expanded.
- Returns:
- bbox_target_data (ndarray): N x 4K blob of regression targets
- bbox_inside_weights (ndarray): N x 4K blob of loss weights
- """
- clss = bbox_target_data[:, 0]
- bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
- bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
- inds = np.where(clss > 0)[0]
- for ind in inds:
- cls = clss[ind]
- start = 4 * cls
- end = start + 4
- bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
- bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
- return bbox_targets, bbox_inside_weights
- def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps):
- """Visualize a mini-batch for debugging."""
- import matplotlib.pyplot as plt
- for i in xrange(rois_blob.shape[0]):
- rois = rois_blob[i, :]
- im_ind = rois[0]
- roi = rois[1:]
- im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy()
- im += cfg.PIXEL_MEANS
- im = im[:, :, (2, 1, 0)]
- im = im.astype(np.uint8)
- cls = labels_blob[i]
- plt.imshow(im)
- print 'class: ', cls, ' overlap: ', overlaps[i]
- plt.gca().add_patch(
- plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0],
- roi[3] - roi[1], fill=False,
- edgecolor='r', linewidth=3)
- )
- plt.show()
lib/utils/blob.py
- # --------------------------------------------------------
- # Fast R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- """Blob helper functions."""
- import numpy as np
- import cv2
- def im_list_to_blob(ims):
- """Convert a list of images into a network input.
- Assumes images are already prepared (means subtracted, BGR order, ...).
- """
- max_shape = np.array([im.shape for im in ims]).max(axis=0)
- num_images = len(ims)
- blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
- dtype=np.float32)
- for i in xrange(num_images):
- im = ims[i]
- blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
- # Move channels (axis 3) to axis 1
- # Axis order will become: (batch elem, channel, height, width)
- channel_swap = (0, 3, 1, 2)
- blob = blob.transpose(channel_swap)
- return blob
- def prep_im_for_blob(im, pixel_means, target_size, max_size):
- """Mean subtract and scale an image for use in a blob."""
- im = im.astype(np.float32, copy=False)
- im -= pixel_means
- im_shape = im.shape
- im_size_min = np.min(im_shape[0:2])
- im_size_max = np.max(im_shape[0:2])
- im_scale = float(target_size) / float(im_size_min)
- # Prevent the biggest axis from being more than MAX_SIZE
- if np.round(im_scale * im_size_max) > max_size:
- im_scale = float(max_size) / float(im_size_max)
- im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
- interpolation=cv2.INTER_LINEAR)
- return im, im_scale
lib/rpn/anchor_target_layer.py
- #coding:utf-8
- # --------------------------------------------------------
- # Faster R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick and Sean Bell
- # --------------------------------------------------------
- import os
- import caffe
- import yaml
- from fast_rcnn.config import cfg
- import numpy as np
- import numpy.random as npr
- from generate_anchors import generate_anchors
- from utils.cython_bbox import bbox_overlaps
- from fast_rcnn.bbox_transform import bbox_transform
- DEBUG = False
- class AnchorTargetLayer(caffe.Layer):
- """
- Assign anchors to ground-truth targets. Produces anchor classification
- labels and bounding-box regression targets.
- """
- def setup(self, bottom, top):
- layer_params = yaml.load(self.param_str_)
- #设定anchor的三个尺度
- anchor_scales = layer_params.get('scales', (8, 16, 32))
- #以(8.5,8.5)为中心产生9个基准anchor
- self._anchors = generate_anchors(scales=np.array(anchor_scales))
- self._num_anchors = self._anchors.shape[0]
- #其余的anchor以feat_stride为步长上下滑动产生,config.py里feat_stride设为16,为什么是16,
- #因为不管是VGG还是ZF,conv5之后的scale是原图的1/16,这样产生的anchor基本均匀分布在整个原图
- self._feat_stride = layer_params['feat_stride']
- if DEBUG:
- print 'anchors:'
- print self._anchors
- print 'anchor shapes:'
- print np.hstack((
- self._anchors[:, 2::4] - self._anchors[:, 0::4],
- self._anchors[:, 3::4] - self._anchors[:, 1::4],
- ))
- self._counts = cfg.EPS
- self._sums = np.zeros((1, 4))
- self._squared_sums = np.zeros((1, 4))
- self._fg_sum = 0
- self._bg_sum = 0
- self._count = 0
- # allow boxes to sit over the edge by a small amount
- self._allowed_border = layer_params.get('allowed_border', 0)
- #获得featuremap的宽高
- height, width = bottom[0].data.shape[-2:]
- if DEBUG:
- print 'AnchorTargetLayer: height', height, 'width', width
- A = self._num_anchors
- # labels
- top[0].reshape(1, 1, A * height, width)
- # bbox_targets
- top[1].reshape(1, A * 4, height, width)
- # bbox_inside_weights
- top[2].reshape(1, A * 4, height, width)
- # bbox_outside_weights
- top[3].reshape(1, A * 4, height, width)
- def forward(self, bottom, top):
- # Algorithm:
- #
- # for each (H, W) location i
- # generate 9 anchor boxes centered on cell i
- # apply predicted bbox deltas at cell i to each of the 9 anchors
- # filter out-of-image anchors
- # measure GT overlap
- assert bottom[0].data.shape[0] == 1, \
- 'Only single item batches are supported'
- # map of shape (..., H, W)
- height, width = bottom[0].data.shape[-2:]
- # GT boxes (x1, y1, x2, y2, label)
- gt_boxes = bottom[1].data
- # im_info
- im_info = bottom[2].data[0, :]
- if DEBUG:
- print ''
- print 'im_size: ({}, {})'.format(im_info[0], im_info[1])
- print 'scale: {}'.format(im_info[2])
- print 'height, width: ({}, {})'.format(height, width)
- print 'rpn: gt_boxes.shape', gt_boxes.shape
- print 'rpn: gt_boxes', gt_boxes
- # 1. Generate proposals from bbox deltas and shifted anchors
- shift_x = np.arange(0, width) * self._feat_stride
- shift_y = np.arange(0, height) * self._feat_stride
- shift_x, shift_y = np.meshgrid(shift_x, shift_y)
- shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
- shift_x.ravel(), shift_y.ravel())).transpose()
- # add A anchors (1, A, 4) to
- # cell K shifts (K, 1, 4) to get
- # shift anchors (K, A, 4)
- # reshape to (K*A, 4) shifted anchors
- A = self._num_anchors
- K = shifts.shape[0]
- all_anchors = (self._anchors.reshape((1, A, 4)) +
- shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
- all_anchors = all_anchors.reshape((K * A, 4))
- total_anchors = int(K * A)
- # only keep anchors inside the image
- inds_inside = np.where(
- (all_anchors[:, 0] >= -self._allowed_border) &
- (all_anchors[:, 1] >= -self._allowed_border) &
- (all_anchors[:, 2] < im_info[1] + self._allowed_border) & # width
- (all_anchors[:, 3] < im_info[0] + self._allowed_border) # height
- )[0]
- if DEBUG:
- print 'total_anchors', total_anchors
- print 'inds_inside', len(inds_inside)
- #裁掉大小超出图片的anchor,inds_inside是在图像内部的anchor的索引数组
- # keep only inside anchors
- anchors = all_anchors[inds_inside, :]
- if DEBUG:
- print 'anchors.shape', anchors.shape
- # label: 1 is positive, 0 is negative, -1 is dont care
- labels = np.empty((len(inds_inside), ), dtype=np.float32)
- labels.fill(-1)
- # overlaps between the anchors and the gt boxes
- # overlaps (ex, gt)
- overlaps = bbox_overlaps(
- np.ascontiguousarray(anchors, dtype=np.float),
- np.ascontiguousarray(gt_boxes, dtype=np.float))
- argmax_overlaps = overlaps.argmax(axis=1)
- max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
- gt_argmax_overlaps = overlaps.argmax(axis=0)
- gt_max_overlaps = overlaps[gt_argmax_overlaps,
- np.arange(overlaps.shape[1])]
- gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
- if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
- # assign bg labels first so that positive labels can clobber them
- labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
- # fg label: for each gt, anchor with highest overlap
- labels[gt_argmax_overlaps] = 1
- # fg label: above threshold IOU
- labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
- if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
- # assign bg labels last so that negative labels can clobber positives
- labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
- #采样正负anchor,如果正负样本数量不均衡,需要保持正负样本的比例基本为1:1,太悬殊
- #会使得算法漏检严重,下面的算法没有实现保持正负样本均衡
- # subsample positive labels if we have too many
- num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
- fg_inds = np.where(labels == 1)[0]
- if len(fg_inds) > num_fg:
- disable_inds = npr.choice(
- fg_inds, size=(len(fg_inds) - num_fg), replace=False)
- labels[disable_inds] = -1
- # subsample negative labels if we have too many
- num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
- bg_inds = np.where(labels == 0)[0]
- if len(bg_inds) > num_bg:
- disable_inds = npr.choice(
- bg_inds, size=(len(bg_inds) - num_bg), replace=False)
- labels[disable_inds] = -1
- #print "was %s inds, disabling %s, now %s inds" % (
- #len(bg_inds), len(disable_inds), np.sum(labels == 0))
- bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
- bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])
- bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
- bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)
- bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
- if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
- # uniform weighting of examples (given non-uniform sampling)
- num_examples = np.sum(labels >= 0)
- positive_weights = np.ones((1, 4)) * 1.0 / num_examples
- negative_weights = np.ones((1, 4)) * 1.0 / num_examples
- else:
- assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
- (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
- positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
- np.sum(labels == 1))
- negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
- np.sum(labels == 0))
- bbox_outside_weights[labels == 1, :] = positive_weights
- bbox_outside_weights[labels == 0, :] = negative_weights
- if DEBUG:
- self._sums += bbox_targets[labels == 1, :].sum(axis=0)
- self._squared_sums += (bbox_targets[labels == 1, :] ** 2).sum(axis=0)
- self._counts += np.sum(labels == 1)
- means = self._sums / self._counts
- stds = np.sqrt(self._squared_sums / self._counts - means ** 2)
- print 'means:'
- print means
- print 'stdevs:'
- print stds
- # map up to original set of anchors
- labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
- bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
- bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
- bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)
- if DEBUG:
- print 'rpn: max max_overlap', np.max(max_overlaps)
- print 'rpn: num_positive', np.sum(labels == 1)
- print 'rpn: num_negative', np.sum(labels == 0)
- self._fg_sum += np.sum(labels == 1)
- self._bg_sum += np.sum(labels == 0)
- self._count += 1
- print 'rpn: num_positive avg', self._fg_sum / self._count
- print 'rpn: num_negative avg', self._bg_sum / self._count
- # labels
- labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
- labels = labels.reshape((1, 1, A * height, width))
- top[0].reshape(*labels.shape)
- top[0].data[...] = labels
- # bbox_targets
- bbox_targets = bbox_targets \
- .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
- top[1].reshape(*bbox_targets.shape)
- top[1].data[...] = bbox_targets
- # bbox_inside_weights
- bbox_inside_weights = bbox_inside_weights \
- .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
- assert bbox_inside_weights.shape[2] == height
- assert bbox_inside_weights.shape[3] == width
- top[2].reshape(*bbox_inside_weights.shape)
- top[2].data[...] = bbox_inside_weights
- # bbox_outside_weights
- bbox_outside_weights = bbox_outside_weights \
- .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
- assert bbox_outside_weights.shape[2] == height
- assert bbox_outside_weights.shape[3] == width
- top[3].reshape(*bbox_outside_weights.shape)
- top[3].data[...] = bbox_outside_weights
- def backward(self, top, propagate_down, bottom):
- """This layer does not propagate gradients."""
- pass
- def reshape(self, bottom, top):
- """Reshaping happens during the call to forward."""
- pass
- def _unmap(data, count, inds, fill=0):
- """ Unmap a subset of item (data) back to the original set of items (of
- size count) """
- if len(data.shape) == 1:
- ret = np.empty((count, ), dtype=np.float32)
- ret.fill(fill)
- ret[inds] = data
- else:
- ret = np.empty((count, ) + data.shape[1:], dtype=np.float32)
- ret.fill(fill)
- ret[inds, :] = data
- return ret
- def _compute_targets(ex_rois, gt_rois):
- """Compute bounding-box regression targets for an image."""
- assert ex_rois.shape[0] == gt_rois.shape[0]
- assert ex_rois.shape[1] == 4
- assert gt_rois.shape[1] == 5
- return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)
用到了lib/rpn/generate_anchors.py里的函数
- #coding:utf-8
- # --------------------------------------------------------
- # Faster R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick and Sean Bell
- # --------------------------------------------------------
- import numpy as np
- #下面是产生的9个anchor的坐标,每个box为(xmin,ymin,xmax,ymax)。注意下表是MATLAB版(1-indexed)的输出,中心在(8.5,8.5);这份Python代码的实际输出整体小1(如第一行为 -84,-40,99,55),中心在(7.5,7.5)。大anchor会超出基准窗口,所以坐标会有负值
- # Verify that we compute the same anchors as Shaoqing's matlab implementation:
- #
- # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
- # >> anchors
- #
- # anchors =
- #
- # -83 -39 100 56
- # -175 -87 192 104
- # -359 -183 376 200
- # -55 -55 72 72
- # -119 -119 136 136
- # -247 -247 264 264
- # -35 -79 52 96
- # -79 -167 96 184
- # -167 -343 184 360
- #array([[ -83., -39., 100., 56.],
- # [-175., -87., 192., 104.],
- # [-359., -183., 376., 200.],
- # [ -55., -55., 72., 72.],
- # [-119., -119., 136., 136.],
- # [-247., -247., 264., 264.],
- # [ -35., -79., 52., 96.],
- # [ -79., -167., 96., 184.],
- # [-167., -343., 184., 360.]])
def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6)):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, 15, 15) window.

    Returns a (len(ratios) * len(scales), 4) array of (x1, y1, x2, y2)
    boxes, all centered on (7.5, 7.5); large anchors therefore extend into
    negative coordinates.
    """
    # Reference anchor (0, 0, 15, 15): a base_size x base_size box.
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    # Vary aspect ratio at (approximately) constant area and fixed center.
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    # FIX: use range() instead of the Python-2-only xrange() so the module
    # also runs under Python 3 (range() is valid on both interpreters).
    anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
                         for i in range(ratio_anchors.shape[0])])
    return anchors

def _whctrs(anchor):
    """
    Return width, height, x center, and y center for an anchor (window).
    """
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr

def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows) as (x1, y1, x2, y2).
    """
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors

def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.

    Area is (approximately, due to rounding) preserved and the center is
    unchanged; only the width/height ratio varies.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors

def _scale_enum(anchor, scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.

    Aspect ratio and center are unchanged; width and height are both
    multiplied by each scale.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors
if __name__ == '__main__':
    # Quick manual check: time anchor generation, print the 9 anchors,
    # then drop into an interactive shell for inspection.
    import time
    t = time.time()
    a = generate_anchors()
    # FIX: single-argument print() calls are valid on both Python 2 and
    # Python 3, unlike the original Python-2-only print statements.
    print(time.time() - t)
    print(a)
    from IPython import embed; embed()
rpn_test.pt
- #用RPN产生region proposal时的网络结构,这个网络只用前向计算
- name: "ZF"
- input: "data"
- input_shape {
- dim: 1
- dim: 3
- dim: 224
- dim: 224
- }
- input: "im_info"
- input_shape {
- dim: 1
- dim: 3
- }
- #前面是ZF网,特征提取用,共享
- # ------------------------ layer 1 -----------------------------
- layer {
- name: "conv1"
- type: "Convolution"
- bottom: "data"
- top: "conv1"
- convolution_param {
- num_output: 96
- kernel_size: 7
- pad: 3
- stride: 2
- }
- }
- layer {
- name: "relu1"
- type: "ReLU"
- bottom: "conv1"
- top: "conv1"
- }
- layer {
- name: "norm1"
- type: "LRN"
- bottom: "conv1"
- top: "norm1"
- lrn_param {
- local_size: 3
- alpha: 0.00005
- beta: 0.75
- norm_region: WITHIN_CHANNEL
- engine: CAFFE
- }
- }
- layer {
- name: "pool1"
- type: "Pooling"
- bottom: "norm1"
- top: "pool1"
- pooling_param {
- kernel_size: 3
- stride: 2
- pad: 1
- pool: MAX
- }
- }
- layer {
- name: "conv2"
- type: "Convolution"
- bottom: "pool1"
- top: "conv2"
- convolution_param {
- num_output: 256
- kernel_size: 5
- pad: 2
- stride: 2
- }
- }
- layer {
- name: "relu2"
- type: "ReLU"
- bottom: "conv2"
- top: "conv2"
- }
- layer {
- name: "norm2"
- type: "LRN"
- bottom: "conv2"
- top: "norm2"
- lrn_param {
- local_size: 3
- alpha: 0.00005
- beta: 0.75
- norm_region: WITHIN_CHANNEL
- engine: CAFFE
- }
- }
- layer {
- name: "pool2"
- type: "Pooling"
- bottom: "norm2"
- top: "pool2"
- pooling_param {
- kernel_size: 3
- stride: 2
- pad: 1
- pool: MAX
- }
- }
- layer {
- name: "conv3"
- type: "Convolution"
- bottom: "pool2"
- top: "conv3"
- convolution_param {
- num_output: 384
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu3"
- type: "ReLU"
- bottom: "conv3"
- top: "conv3"
- }
- layer {
- name: "conv4"
- type: "Convolution"
- bottom: "conv3"
- top: "conv4"
- convolution_param {
- num_output: 384
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu4"
- type: "ReLU"
- bottom: "conv4"
- top: "conv4"
- }
- layer {
- name: "conv5"
- type: "Convolution"
- bottom: "conv4"
- top: "conv5"
- convolution_param {
- num_output: 256#经过最后一层,产生256个特征图
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu5"
- type: "ReLU"
- bottom: "conv5"
- top: "conv5"
- }
- #-----------------------layer +-------------------------
- #RPN在conv5上滑动窗口,256*3*3*256卷积核,预测每个位置9个anchor是否属于前景,
- #如果属于前景,box的修正位置
- layer {
- name: "rpn_conv1"
- type: "Convolution"
- bottom: "conv5"
- top: "rpn_conv1"
- convolution_param {
- num_output: 256
- kernel_size: 3 pad: 1 stride: 1
- }
- }
- layer {
- name: "rpn_relu1"
- type: "ReLU"
- bottom: "rpn_conv1"
- top: "rpn_conv1"
- }
- layer {
- name: "rpn_cls_score"
- type: "Convolution"
- bottom: "rpn_conv1"
- top: "rpn_cls_score"
- convolution_param {
- num_output: 18 # 2(bg/fg) * 9(anchors)#输出预测每个位置9个anchor,属于bg或fg
- kernel_size: 1 pad: 0 stride: 1
- }
- }
- layer {
- name: "rpn_bbox_pred"
- type: "Convolution"
- bottom: "rpn_conv1"
- top: "rpn_bbox_pred"
- convolution_param {
- num_output: 36 # 4 * 9(anchors)#输出预测9个anchor的修正坐标
- kernel_size: 1 pad: 0 stride: 1
- }
- }
- layer {
- bottom: "rpn_cls_score"
- top: "rpn_cls_score_reshape"
- name: "rpn_cls_score_reshape"
- type: "Reshape"
- reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }
- }
- #-----------------------output------------------------
- layer {
- name: "rpn_cls_prob"
- type: "Softmax"
- bottom: "rpn_cls_score_reshape"
- top: "rpn_cls_prob"
- }
- layer {
- name: 'rpn_cls_prob_reshape'
- type: 'Reshape'
- bottom: 'rpn_cls_prob'
- top: 'rpn_cls_prob_reshape'
- reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } }
- }
- layer {
- name: 'proposal'
- type: 'Python'
- bottom: 'rpn_cls_prob_reshape'
- bottom: 'rpn_bbox_pred'
- bottom: 'im_info'
- top: 'rois'
- top: 'scores'
- python_param {
- module: 'rpn.proposal_layer'#对应lib/rpn/proposal_layer.py
- layer: 'ProposalLayer'
- param_str: "'feat_stride': 16"
- }
- }
lib/rpn/proposal_layer.py
这一层用来由RPN产生region proposal
- #coding:utf-8
- # --------------------------------------------------------
- # Faster R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick and Sean Bell
- # --------------------------------------------------------
- import caffe
- import numpy as np
- import yaml
- from fast_rcnn.config import cfg
- from generate_anchors import generate_anchors
- from fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes
- from fast_rcnn.nms_wrapper import nms
- DEBUG = False
class ProposalLayer(caffe.Layer):
    """
    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").
    """
    def setup(self, bottom, top):
        # parse the layer parameter string, which must be valid YAML
        # NOTE(review): yaml.load is unsafe on untrusted input; the string
        # comes from the prototxt here, but yaml.safe_load would be safer.
        layer_params = yaml.load(self.param_str_)
        # Feature stride of the backbone (16: the conv5 feature map is
        # 1/16 the size of the input image).
        self._feat_stride = layer_params['feat_stride']
        anchor_scales = layer_params.get('scales', (8, 16, 32))
        # Generate the reference anchors once at setup; forward() shifts
        # copies of them over the feature map.
        self._anchors = generate_anchors(scales=np.array(anchor_scales))
        self._num_anchors = self._anchors.shape[0]
        if DEBUG:
            print 'feat_stride: {}'.format(self._feat_stride)
            print 'anchors:'
            print self._anchors
        # rois blob: holds R regions of interest, each is a 5-tuple
        # (n, x1, y1, x2, y2) specifying an image batch index n and a
        # rectangle (x1, y1, x2, y2)
        top[0].reshape(1, 5)
        # scores blob: holds scores for R regions of interest
        if len(top) > 1:
            top[1].reshape(1, 1, 1, 1)
    def forward(self, bottom, top):
        # Algorithm:
        #
        # for each (H, W) location i
        #   1. generate A anchor boxes centered on cell i
        #   2. apply predicted bbox deltas at cell i to each of the A anchors
        #   3. clip predicted boxes to image
        #   4. remove predicted boxes with either height or width < threshold
        #   5. sort all (proposal, score) pairs by score from highest to lowest
        #   6. take top pre_nms_topN proposals before NMS
        #   7. apply NMS with threshold 0.7 to remaining proposals
        #   8. take after_nms_topN proposals after NMS
        #   9. return the top proposals (-> RoIs top, scores top)
        assert bottom[0].data.shape[0] == 1, \
            'Only single item batches are supported'
        cfg_key = str(self.phase) # either 'TRAIN' or 'TEST'
        pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
        post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
        nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
        min_size = cfg[cfg_key].RPN_MIN_SIZE
        # the first set of _num_anchors channels are bg probs
        # the second set are the fg probs, which we want
        scores = bottom[0].data[:, self._num_anchors:, :, :]
        bbox_deltas = bottom[1].data
        im_info = bottom[2].data[0, :]
        if DEBUG:
            print 'im_size: ({}, {})'.format(im_info[0], im_info[1])
            print 'scale: {}'.format(im_info[2])
        # 1. Generate proposals from bbox deltas and shifted anchors
        height, width = scores.shape[-2:]
        if DEBUG:
            print 'score map size: {}'.format(scores.shape)
        # Enumerate all shifts: one (dx, dy, dx, dy) row per feature-map
        # cell, expressed in input-image coordinates.
        shift_x = np.arange(0, width) * self._feat_stride
        shift_y = np.arange(0, height) * self._feat_stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                            shift_x.ravel(), shift_y.ravel())).transpose()
        # Enumerate all shifted anchors:
        #
        # add A anchors (1, A, 4) to
        # cell K shifts (K, 1, 4) to get
        # shift anchors (K, A, 4)
        # reshape to (K*A, 4) shifted anchors
        A = self._num_anchors
        K = shifts.shape[0]
        anchors = self._anchors.reshape((1, A, 4)) + \
                  shifts.reshape((1, K, 4)).transpose((1, 0, 2))
        anchors = anchors.reshape((K * A, 4))
        # Transpose and reshape predicted bbox transformations to get them
        # into the same order as the anchors:
        #
        # bbox deltas will be (1, 4 * A, H, W) format
        # transpose to (1, H, W, 4 * A)
        # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
        # in slowest to fastest order
        bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))
        # Same story for the scores:
        #
        # scores are (1, A, H, W) format
        # transpose to (1, H, W, A)
        # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
        scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))
        # Convert anchors into proposals via bbox transformations
        proposals = bbox_transform_inv(anchors, bbox_deltas)
        # 2. clip predicted boxes to image
        proposals = clip_boxes(proposals, im_info[:2])
        # 3. remove predicted boxes with either height or width < threshold
        # (NOTE: convert min_size to input image scale stored in im_info[2])
        keep = _filter_boxes(proposals, min_size * im_info[2])
        proposals = proposals[keep, :]
        scores = scores[keep]
        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take top pre_nms_topN (e.g. 6000)
        order = scores.ravel().argsort()[::-1]
        if pre_nms_topN > 0:
            order = order[:pre_nms_topN]
        proposals = proposals[order, :]
        scores = scores[order]
        # 6. apply nms (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        keep = nms(np.hstack((proposals, scores)), nms_thresh)
        if post_nms_topN > 0:
            keep = keep[:post_nms_topN]
        proposals = proposals[keep, :]
        scores = scores[keep]
        # Output rois blob
        # Our RPN implementation only supports a single input image, so all
        # batch inds are 0
        batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
        blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
        top[0].reshape(*(blob.shape))
        top[0].data[...] = blob
        # [Optional] output scores blob
        if len(top) > 1:
            top[1].reshape(*(scores.shape))
            top[1].data[...] = scores
    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass
    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass
- def _filter_boxes(boxes, min_size):
- """Remove all boxes with any side smaller than min_size."""
- ws = boxes[:, 2] - boxes[:, 0] + 1
- hs = boxes[:, 3] - boxes[:, 1] + 1
- keep = np.where((ws >= min_size) & (hs >= min_size))[0]
- return keep
fast_rcnn_train.pt
- #stage 1训练fast rcnn网络,输入是rpn提取的roi以及gt box
- name: "ZF"
- layer {
- name: 'data'
- type: 'Python'
- top: 'data'
- top: 'rois'
- top: 'labels'
- top: 'bbox_targets'
- top: 'bbox_inside_weights'
- top: 'bbox_outside_weights'
- python_param {
- module: 'roi_data_layer.layer'#对应lib/roi_data_layer/layer.py
- #为训练fast rcnn时为网络输入roi,此时为roi是region proposal
- layer: 'RoIDataLayer'
- param_str: "'num_classes': 21"
- }
- }
- #ZF网,特征提取用,共享
- #========= conv1-conv5 ============
- layer {
- name: "conv1"
- type: "Convolution"
- bottom: "data"
- top: "conv1"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 96
- kernel_size: 7
- pad: 3
- stride: 2
- }
- }
- layer {
- name: "relu1"
- type: "ReLU"
- bottom: "conv1"
- top: "conv1"
- }
- layer {
- name: "norm1"
- type: "LRN"
- bottom: "conv1"
- top: "norm1"
- lrn_param {
- local_size: 3
- alpha: 0.00005
- beta: 0.75
- norm_region: WITHIN_CHANNEL
- engine: CAFFE
- }
- }
- layer {
- name: "pool1"
- type: "Pooling"
- bottom: "norm1"
- top: "pool1"
- pooling_param {
- kernel_size: 3
- stride: 2
- pad: 1
- pool: MAX
- }
- }
- layer {
- name: "conv2"
- type: "Convolution"
- bottom: "pool1"
- top: "conv2"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 256
- kernel_size: 5
- pad: 2
- stride: 2
- }
- }
- layer {
- name: "relu2"
- type: "ReLU"
- bottom: "conv2"
- top: "conv2"
- }
- layer {
- name: "norm2"
- type: "LRN"
- bottom: "conv2"
- top: "norm2"
- lrn_param {
- local_size: 3
- alpha: 0.00005
- beta: 0.75
- norm_region: WITHIN_CHANNEL
- engine: CAFFE
- }
- }
- layer {
- name: "pool2"
- type: "Pooling"
- bottom: "norm2"
- top: "pool2"
- pooling_param {
- kernel_size: 3
- stride: 2
- pad: 1
- pool: MAX
- }
- }
- layer {
- name: "conv3"
- type: "Convolution"
- bottom: "pool2"
- top: "conv3"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 384
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu3"
- type: "ReLU"
- bottom: "conv3"
- top: "conv3"
- }
- layer {
- name: "conv4"
- type: "Convolution"
- bottom: "conv3"
- top: "conv4"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 384
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu4"
- type: "ReLU"
- bottom: "conv4"
- top: "conv4"
- }
- layer {
- name: "conv5"
- type: "Convolution"
- bottom: "conv4"
- top: "conv5"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- convolution_param {
- num_output: 256
- kernel_size: 3
- pad: 1
- stride: 1
- }
- }
- layer {
- name: "relu5"
- type: "ReLU"
- bottom: "conv5"
- top: "conv5"
- }
- #========= RCNN ============
- layer {
- name: "roi_pool_conv5"
- type: "ROIPooling"#这个层在caffe-fast-rcnn里实现
- bottom: "conv5"
- bottom: "rois"
- top: "roi_pool_conv5"
- roi_pooling_param {#每个roi做max pooling后的大小为6*6
- pooled_w: 6
- pooled_h: 6
- spatial_scale: 0.0625 # 1/16
- }
- }
- layer {
- name: "fc6"
- type: "InnerProduct"
- bottom: "roi_pool_conv5"
- top: "fc6"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- inner_product_param {
- num_output: 4096
- }
- }
- layer {
- name: "relu6"
- type: "ReLU"
- bottom: "fc6"
- top: "fc6"
- }
- layer {
- name: "drop6"
- type: "Dropout"
- bottom: "fc6"
- top: "fc6"
- dropout_param {
- dropout_ratio: 0.5
- scale_train: false
- }
- }
- layer {
- name: "fc7"
- type: "InnerProduct"
- bottom: "fc6"
- top: "fc7"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- inner_product_param {
- num_output: 4096
- }
- }
- layer {
- name: "relu7"
- type: "ReLU"
- bottom: "fc7"
- top: "fc7"
- }
- layer {
- name: "drop7"
- type: "Dropout"
- bottom: "fc7"
- top: "fc7"
- dropout_param {
- dropout_ratio: 0.5
- scale_train: false
- }
- }
- layer {
- name: "cls_score"
- type: "InnerProduct"
- bottom: "fc7"
- top: "cls_score"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- inner_product_param {
- num_output: 21
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 0
- }
- }
- }
- layer {
- name: "bbox_pred"
- type: "InnerProduct"
- bottom: "fc7"
- top: "bbox_pred"
- param { lr_mult: 1.0 }
- param { lr_mult: 2.0 }
- inner_product_param {
- num_output: 84
- weight_filler {
- type: "gaussian"
- std: 0.001
- }
- bias_filler {
- type: "constant"
- value: 0
- }
- }
- }
- layer {
- name: "loss_cls"
- type: "SoftmaxWithLoss"
- bottom: "cls_score"
- bottom: "labels"
- propagate_down: 1
- propagate_down: 0
- top: "cls_loss"
- loss_weight: 1
- loss_param {
- ignore_label: -1
- normalize: true
- }
- }
- layer {
- name: "loss_bbox"
- type: "SmoothL1Loss"
- bottom: "bbox_pred"
- bottom: "bbox_targets"
- bottom: "bbox_inside_weights"
- bottom: "bbox_outside_weights"
- top: "bbox_loss"
- loss_weight: 1
- }
- #========= RPN ============
- # Dummy layers so that initial parameters are saved into the output net
- layer {
- name: "rpn_conv1"
- type: "Convolution"
- bottom: "conv5"
- top: "rpn_conv1"
- param { lr_mult: 0 decay_mult: 0 }
- param { lr_mult: 0 decay_mult: 0 }
- convolution_param {
- num_output: 256
- kernel_size: 3 pad: 1 stride: 1
- weight_filler { type: "gaussian" std: 0.01 }
- bias_filler { type: "constant" value: 0 }
- }
- }
- layer {
- name: "rpn_relu1"
- type: "ReLU"
- bottom: "rpn_conv1"
- top: "rpn_conv1"
- }
- layer {
- name: "rpn_cls_score"
- type: "Convolution"
- bottom: "rpn_conv1"
- top: "rpn_cls_score"
- param { lr_mult: 0 decay_mult: 0 }
- param { lr_mult: 0 decay_mult: 0 }
- convolution_param {
- num_output: 18 # 2(bg/fg) * 9(anchors)
- kernel_size: 1 pad: 0 stride: 1
- weight_filler { type: "gaussian" std: 0.01 }
- bias_filler { type: "constant" value: 0 }
- }
- }
- layer {
- name: "rpn_bbox_pred"
- type: "Convolution"
- bottom: "rpn_conv1"
- top: "rpn_bbox_pred"
- param { lr_mult: 0 decay_mult: 0 }
- param { lr_mult: 0 decay_mult: 0 }
- convolution_param {
- num_output: 36 # 4 * 9(anchors)
- kernel_size: 1 pad: 0 stride: 1
- weight_filler { type: "gaussian" std: 0.01 }
- bias_filler { type: "constant" value: 0 }
- }
- }
- layer {
- name: "silence_rpn_cls_score"
- type: "Silence"
- bottom: "rpn_cls_score"
- }
- layer {
- name: "silence_rpn_bbox_pred"
- type: "Silence"
- bottom: "rpn_bbox_pred"
- }
其中roi pooling layer在 caffe-fast-rcnn/src/caffe/layers/roi_pooling_layer.cpp 里实现
- // ------------------------------------------------------------------
- // Fast R-CNN
- // Copyright (c) 2015 Microsoft
- // Licensed under The MIT License [see fast-rcnn/LICENSE for details]
- // Written by Ross Girshick
- // ------------------------------------------------------------------
- #include <cfloat>
- #include "caffe/fast_rcnn_layers.hpp"
- using std::max;
- using std::min;
- using std::floor;
- using std::ceil;
- namespace caffe {
template <typename Dtype>
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  // Read the pooled output grid size and the feature-map/input-image scale
  // (e.g. 6x6 and 1/16 for ZF conv5) from the layer prototxt, validating
  // that both pooled dimensions are positive.
  ROIPoolingParameter roi_pool_param = this->layer_param_.roi_pooling_param();
  CHECK_GT(roi_pool_param.pooled_h(), 0)
      << "pooled_h must be > 0";
  CHECK_GT(roi_pool_param.pooled_w(), 0)
      << "pooled_w must be > 0";
  pooled_height_ = roi_pool_param.pooled_h();
  pooled_width_ = roi_pool_param.pooled_w();
  spatial_scale_ = roi_pool_param.spatial_scale();
  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  // bottom[0] is the conv feature map, bottom[1] the ROIs; the output has
  // one pooled_h x pooled_w grid per ROI per channel.
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  top[0]->Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
  // max_idx_ stores, for every output element, the winning (argmax) index
  // inside the feature map, for use by the backward pass.
  max_idx_.Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
}
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  // Number of ROIs
  int num_rois = bottom[1]->num();
  int batch_size = bottom[0]->num();
  int top_count = top[0]->count();
  Dtype* top_data = top[0]->mutable_cpu_data();
  // Initialize output to -FLT_MAX so any real activation wins the max,
  // and argmax indices to -1 (meaning "no element selected").
  caffe_set(top_count, Dtype(-FLT_MAX), top_data);
  int* argmax_data = max_idx_.mutable_cpu_data();
  caffe_set(top_count, -1, argmax_data);
  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
  for (int n = 0; n < num_rois; ++n) {
    int roi_batch_ind = bottom_rois[0];
    // Scale ROI coordinates from input-image space down to feature-map
    // space (spatial_scale_ is e.g. 1/16).
    int roi_start_w = round(bottom_rois[1] * spatial_scale_);
    int roi_start_h = round(bottom_rois[2] * spatial_scale_);
    int roi_end_w = round(bottom_rois[3] * spatial_scale_);
    int roi_end_h = round(bottom_rois[4] * spatial_scale_);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);
    // Force malformed ROIs to be at least 1x1.
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    // Each output bin covers roi_height/pooled_height_ by
    // roi_width/pooled_width_ feature-map cells (fractional).
    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width_);
    const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);
    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Compute pooling region for this output unit:
          //  start (included) = floor(ph * roi_height / pooled_height_)
          //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
          int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                              * bin_size_h));
          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                              * bin_size_w));
          int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                           * bin_size_h));
          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                           * bin_size_w));
          // Shift the bin by the ROI origin and clip it to the feature map.
          hstart = min(max(hstart + roi_start_h, 0), height_);
          hend = min(max(hend + roi_start_h, 0), height_);
          wstart = min(max(wstart + roi_start_w, 0), width_);
          wend = min(max(wend + roi_start_w, 0), width_);
          // An empty bin pools to 0 (the loops below simply don't run).
          bool is_empty = (hend <= hstart) || (wend <= wstart);
          const int pool_index = ph * pooled_width_ + pw;
          if (is_empty) {
            top_data[pool_index] = 0;
            argmax_data[pool_index] = -1;
          }
          // Max over the bin, remembering the argmax for backward.
          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = h * width_ + w;
              if (batch_data[index] > top_data[pool_index]) {
                top_data[pool_index] = batch_data[index];
                argmax_data[pool_index] = index;
              }
            }
          }
        }
      }
      // Increment all data pointers by one channel
      batch_data += bottom[0]->offset(0, 1);
      top_data += top[0]->offset(0, 1);
      argmax_data += max_idx_.offset(0, 1);
    }
    // Increment ROI data pointer
    bottom_rois += bottom[1]->offset(1);
  }
}
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  // No CPU backward pass: training routes gradients through the GPU
  // implementation, so calling this aborts via NOT_IMPLEMENTED.
  NOT_IMPLEMENTED;
}
- #ifdef CPU_ONLY
- STUB_GPU(ROIPoolingLayer);
- #endif
- INSTANTIATE_CLASS(ROIPoolingLayer);
- REGISTER_LAYER_CLASS(ROIPooling);
- } // namespace caffe
大致结构看明白了来看具体训练流程
首先看tools/train_faster_rcnn_alt_opt.py
- #coding:utf-8
- #!/usr/bin/env python
- # --------------------------------------------------------
- # Faster R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- """Train a Faster R-CNN network using alternating optimization.
- This tool implements the alternating optimization algorithm described in our
- NIPS 2015 paper ("Faster R-CNN: Towards Real-time Object Detection with Region
- Proposal Networks." Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun.)
- """
- import _init_paths
- from fast_rcnn.train import get_training_roidb, train_net
- from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir
- from datasets.factory import get_imdb
- from rpn.generate import imdb_proposals
- import argparse
- import pprint
- import numpy as np
- import sys, os
- import multiprocessing as mp
- import cPickle
- import shutil
def parse_args():
    """Parse the command-line arguments for alternating-optimization training.

    With no arguments at all, prints usage and exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Train a Faster R-CNN network')
    # Which GPU device to train on.
    parser.add_argument('--gpu', dest='gpu_id',
                        help='GPU device id to use [0]',
                        default=0, type=int)
    # Which network architecture to train (e.g. ZF, VGG16).
    parser.add_argument('--net_name', dest='net_name',
                        help='network name (e.g., "ZF")',
                        default=None, type=str)
    # Pretrained (ImageNet) weights used to initialize each stage.
    parser.add_argument('--weights', dest='pretrained_model',
                        help='initialize with pretrained model weights',
                        default=None, type=str)
    # Optional YAML config overriding the defaults in fast_rcnn.config.
    parser.add_argument('--cfg', dest='cfg_file',
                        help='optional config file',
                        default=None, type=str)
    # Which image database (imdb) to train on.
    parser.add_argument('--imdb', dest='imdb_name',
                        help='dataset to train on',
                        default='voc_2007_trainval', type=str)
    parser.add_argument('--set', dest='set_cfgs',
                        help='set config keys', default=None,
                        nargs=argparse.REMAINDER)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    return parser.parse_args()
def get_roidb(imdb_name, rpn_file=None):
    """Load an imdb by name and build its training roidb.

    Returns (roidb, imdb).  If `rpn_file` is given, the imdb is pointed at
    that pickled proposal file (used when training Fast R-CNN on
    RPN-generated proposals).
    """
    # Look up the image database by name, e.g. 'voc_2007_trainval'.
    imdb = get_imdb(imdb_name)
    print 'Loaded dataset `{:s}` for training'.format(imdb.name)
    # Select how proposals are obtained: 'gt' (RPN training), 'rpn'
    # (Fast R-CNN training); selective search is legacy/unused here.
    imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
    print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
    # Reuse previously generated RPN proposals if a file is provided.
    if rpn_file is not None:
        imdb.config['rpn_file'] = rpn_file
    roidb = get_training_roidb(imdb)
    return roidb, imdb
def get_solvers(net_name):
    """Return the per-stage solver paths, iteration budgets, and RPN test
    prototxt path for the alternating-optimization schedule."""
    method = 'faster_rcnn_alt_opt'
    # One solver per training stage, in execution order:
    # stage1 RPN, stage1 Fast R-CNN, stage2 RPN, stage2 Fast R-CNN.
    stage_files = ['stage1_rpn_solver60k80k.pt',
                   'stage1_fast_rcnn_solver30k40k.pt',
                   'stage2_rpn_solver60k80k.pt',
                   'stage2_fast_rcnn_solver30k40k.pt']
    solvers = [os.path.join(cfg.MODELS_DIR, net_name, method, f)
               for f in stage_files]
    # Maximum iterations for each stage (drop these to e.g. 100 each for a
    # quick smoke test of the pipeline).
    max_iters = [80000, 40000, 80000, 40000]
    # Prototxt used for proposal generation with a trained RPN.
    rpn_test_prototxt = os.path.join(
        cfg.MODELS_DIR, net_name, method, 'rpn_test.pt')
    return solvers, max_iters, rpn_test_prototxt
- # ------------------------------------------------------------------------------
- # Pycaffe doesn't reliably free GPU memory when instantiated nets are discarded
- # (e.g. "del net" in Python code). To work around this issue, each training
- # stage is executed in a separate process using multiprocessing.Process.
- # ------------------------------------------------------------------------------
def _init_caffe(cfg):
    """Initialize pycaffe in a training process.

    Seeds numpy and caffe RNGs for reproducibility, then puts caffe in GPU
    mode on the configured device.  Called once inside each spawned
    training/generation process.
    """
    import caffe
    # fix the random seeds (numpy and caffe) for reproducibility
    np.random.seed(cfg.RNG_SEED)
    caffe.set_random_seed(cfg.RNG_SEED)
    # set up caffe
    caffe.set_mode_gpu()
    caffe.set_device(cfg.GPU_ID)
# Train the RPN stage in a child process (pycaffe does not reliably free
# GPU memory, so each stage runs in its own process).
def train_rpn(queue=None, imdb_name=None, init_model=None, solver=None,
              max_iters=None, cfg=None):
    """Train a Region Proposal Network in a separate training process.

    Puts {'model_path': <final snapshot path>} on `queue` when finished.
    """
    # Not using any proposals, just ground-truth boxes
    cfg.TRAIN.HAS_RPN = True
    cfg.TRAIN.BBOX_REG = False # applies only to Fast R-CNN bbox regression
    # RPN training uses the ground-truth boxes as its "proposals".
    cfg.TRAIN.PROPOSAL_METHOD = 'gt'
    # One image per minibatch when training the RPN.
    cfg.TRAIN.IMS_PER_BATCH = 1
    print 'Init model: {}'.format(init_model)
    print('Using config:')
    pprint.pprint(cfg)
    import caffe
    _init_caffe(cfg)
    roidb, imdb = get_roidb(imdb_name)
    print 'roidb len: {}'.format(len(roidb))
    output_dir = get_output_dir(imdb)
    print 'Output will be saved to `{:s}`'.format(output_dir)
    # Train the RPN, initialized from the ImageNet-pretrained model.
    model_paths = train_net(solver, roidb, output_dir,
                            pretrained_model=init_model,
                            max_iters=max_iters)
    # Cleanup all but the final model
    for i in model_paths[:-1]:
        os.remove(i)
    rpn_model_path = model_paths[-1]
    # Send final model path through the multiprocessing queue
    queue.put({'model_path': rpn_model_path})
# Run the trained RPN over the whole imdb and save its proposals to disk.
def rpn_generate(queue=None, imdb_name=None, rpn_model_path=None, cfg=None,
                 rpn_test_prototxt=None):
    """Use a trained RPN to generate proposals.

    Puts {'proposal_path': <pickle path>} on `queue` when finished.
    """
    cfg.TEST.RPN_PRE_NMS_TOP_N = -1 # no pre NMS filtering
    cfg.TEST.RPN_POST_NMS_TOP_N = 2000 # limit top boxes after NMS
    print 'RPN model: {}'.format(rpn_model_path)
    print('Using config:')
    pprint.pprint(cfg)
    import caffe
    _init_caffe(cfg)
    # NOTE: the matlab implementation computes proposals on flipped images, too.
    # We compute them on the image once and then flip the already computed
    # proposals. This might cause a minor loss in mAP (less proposal jittering).
    imdb = get_imdb(imdb_name)
    print 'Loaded dataset `{:s}` for proposal generation'.format(imdb.name)
    # Load RPN and configure output directory
    rpn_net = caffe.Net(rpn_test_prototxt, rpn_model_path, caffe.TEST)
    output_dir = get_output_dir(imdb)
    print 'Output will be saved to `{:s}`'.format(output_dir)
    # Generate proposals on the imdb
    rpn_proposals = imdb_proposals(rpn_net, imdb)
    # Write proposals to disk and send the proposal file path through the
    # multiprocessing queue
    rpn_net_name = os.path.splitext(os.path.basename(rpn_model_path))[0]
    rpn_proposals_path = os.path.join(
        output_dir, rpn_net_name + '_proposals.pkl')
    with open(rpn_proposals_path, 'wb') as f:
        cPickle.dump(rpn_proposals, f, cPickle.HIGHEST_PROTOCOL)
    print 'Wrote RPN proposals to {}'.format(rpn_proposals_path)
    queue.put({'proposal_path': rpn_proposals_path})
# Train the Fast R-CNN stage on previously generated RPN proposals.
def train_fast_rcnn(queue=None, imdb_name=None, init_model=None, solver=None,
                    max_iters=None, cfg=None, rpn_file=None):
    """Train a Fast R-CNN using proposals generated by an RPN.

    Puts {'model_path': <final snapshot path>} on `queue` when finished.
    """
    # The detection head is trained here, not the RPN.
    cfg.TRAIN.HAS_RPN = False # not generating prosals on-the-fly
    # The roidb comes from the proposals the just-trained RPN wrote to disk.
    cfg.TRAIN.PROPOSAL_METHOD = 'rpn' # use pre-computed RPN proposals instead
    # Two images per minibatch when training Fast R-CNN.
    cfg.TRAIN.IMS_PER_BATCH = 2
    print 'Init model: {}'.format(init_model)
    print 'RPN proposals: {}'.format(rpn_file)
    print('Using config:')
    pprint.pprint(cfg)
    import caffe
    _init_caffe(cfg)
    roidb, imdb = get_roidb(imdb_name, rpn_file=rpn_file)
    output_dir = get_output_dir(imdb)
    print 'Output will be saved to `{:s}`'.format(output_dir)
    # Train Fast R-CNN
    model_paths = train_net(solver, roidb, output_dir,
                            pretrained_model=init_model,
                            max_iters=max_iters)
    # Cleanup all but the final model
    for i in model_paths[:-1]:
        os.remove(i)
    fast_rcnn_model_path = model_paths[-1]
    # Send Fast R-CNN model path over the multiprocessing queue
    queue.put({'model_path': fast_rcnn_model_path})
- # Entry point: runs the 4-stage alternating optimization of Faster R-CNN
- # (RPN -> proposals -> Fast R-CNN, twice), each stage in its own process.
- if __name__ == '__main__':
- args = parse_args()
- print('Called with args:')
- print(args)
- # optionally override the default config from a file and/or command line
- if args.cfg_file is not None:
- cfg_from_file(args.cfg_file)
- if args.set_cfgs is not None:
- cfg_from_list(args.set_cfgs)
- cfg.GPU_ID = args.gpu_id
- # --------------------------------------------------------------------------
- # Pycaffe doesn't reliably free GPU memory when instantiated nets are
- # discarded (e.g. "del net" in Python code). To work around this issue, each
- # training stage is executed in a separate process using
- # multiprocessing.Process.
- # --------------------------------------------------------------------------
- # queue for communicated results between processes
- mp_queue = mp.Queue()
- # solves, iters, etc. for each training stage
- solvers, max_iters, rpn_test_prototxt = get_solvers(args.net_name)
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- print 'Stage 1 RPN, init from ImageNet model'
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- cfg.TRAIN.SNAPSHOT_INFIX = 'stage1'
- mp_kwargs = dict(
- queue=mp_queue,
- imdb_name=args.imdb_name,
- init_model=args.pretrained_model,
- solver=solvers[0],
- max_iters=max_iters[0],
- cfg=cfg)
- p = mp.Process(target=train_rpn, kwargs=mp_kwargs)
- p.start()
- # mp_queue.get() blocks until the child posts its result, so it is safe
- # to call before p.join()
- rpn_stage1_out = mp_queue.get()
- p.join()
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- print 'Stage 1 RPN, generate proposals'
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- mp_kwargs = dict(
- queue=mp_queue,
- imdb_name=args.imdb_name,
- rpn_model_path=str(rpn_stage1_out['model_path']),
- cfg=cfg,
- rpn_test_prototxt=rpn_test_prototxt)
- p = mp.Process(target=rpn_generate, kwargs=mp_kwargs)
- p.start()
- rpn_stage1_out['proposal_path'] = mp_queue.get()['proposal_path']
- p.join()
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- print 'Stage 1 Fast R-CNN using RPN proposals, init from ImageNet model'
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- cfg.TRAIN.SNAPSHOT_INFIX = 'stage1'
- mp_kwargs = dict(
- queue=mp_queue,
- imdb_name=args.imdb_name,
- init_model=args.pretrained_model,
- solver=solvers[1],
- max_iters=max_iters[1],
- cfg=cfg,
- rpn_file=rpn_stage1_out['proposal_path'])
- p = mp.Process(target=train_fast_rcnn, kwargs=mp_kwargs)
- p.start()
- fast_rcnn_stage1_out = mp_queue.get()
- p.join()
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- print 'Stage 2 RPN, init from stage 1 Fast R-CNN model'
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- cfg.TRAIN.SNAPSHOT_INFIX = 'stage2'
- mp_kwargs = dict(
- queue=mp_queue,
- imdb_name=args.imdb_name,
- init_model=str(fast_rcnn_stage1_out['model_path']),
- solver=solvers[2],
- max_iters=max_iters[2],
- cfg=cfg)
- p = mp.Process(target=train_rpn, kwargs=mp_kwargs)
- p.start()
- rpn_stage2_out = mp_queue.get()
- p.join()
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- print 'Stage 2 RPN, generate proposals'
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- mp_kwargs = dict(
- queue=mp_queue,
- imdb_name=args.imdb_name,
- rpn_model_path=str(rpn_stage2_out['model_path']),
- cfg=cfg,
- rpn_test_prototxt=rpn_test_prototxt)
- p = mp.Process(target=rpn_generate, kwargs=mp_kwargs)
- p.start()
- rpn_stage2_out['proposal_path'] = mp_queue.get()['proposal_path']
- p.join()
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- print 'Stage 2 Fast R-CNN, init from stage 2 RPN R-CNN model'
- print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
- cfg.TRAIN.SNAPSHOT_INFIX = 'stage2'
- mp_kwargs = dict(
- queue=mp_queue,
- imdb_name=args.imdb_name,
- init_model=str(rpn_stage2_out['model_path']),
- solver=solvers[3],
- max_iters=max_iters[3],
- cfg=cfg,
- rpn_file=rpn_stage2_out['proposal_path'])
- p = mp.Process(target=train_fast_rcnn, kwargs=mp_kwargs)
- p.start()
- fast_rcnn_stage2_out = mp_queue.get()
- p.join()
- # Create final model (just a copy of the last stage)
- final_path = os.path.join(
- os.path.dirname(fast_rcnn_stage2_out['model_path']),
- args.net_name + '_faster_rcnn_final.caffemodel')
- print 'cp {} -> {}'.format(
- fast_rcnn_stage2_out['model_path'], final_path)
- shutil.copy(fast_rcnn_stage2_out['model_path'], final_path)
- print 'Final model: {}'.format(final_path)
lib/rpn/generate.py:利用RPN网络前向计算产生region proposal
- #coding:utf-8
- # --------------------------------------------------------
- # Faster R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- from fast_rcnn.config import cfg
- from utils.blob import im_list_to_blob
- from utils.timer import Timer
- import numpy as np
- import cv2
- # Visualize proposals scoring at least `thresh` on top of image `im`.
- # NOTE(review): relies on `plt` (matplotlib.pyplot) being imported at module
- # level -- the import is not visible in this snippet; confirm before running.
- def _vis_proposals(im, dets, thresh=0.5):
- """Draw detected bounding boxes."""
- # dets is an N x 5 array [x1, y1, x2, y2, score]; keep rows above thresh
- inds = np.where(dets[:, -1] >= thresh)[0]
- if len(inds) == 0:
- return
- class_name = 'obj'
- # BGR (OpenCV) -> RGB (matplotlib) channel order
- im = im[:, :, (2, 1, 0)]
- fig, ax = plt.subplots(figsize=(12, 12))
- ax.imshow(im, aspect='equal')
- for i in inds:
- bbox = dets[i, :4]
- score = dets[i, -1]
- # draw the proposal rectangle
- ax.add_patch(
- plt.Rectangle((bbox[0], bbox[1]),
- bbox[2] - bbox[0],
- bbox[3] - bbox[1], fill=False,
- edgecolor='red', linewidth=3.5)
- )
- # label it with its objectness score
- ax.text(bbox[0], bbox[1] - 2,
- '{:s} {:.3f}'.format(class_name, score),
- bbox=dict(facecolor='blue', alpha=0.5),
- fontsize=14, color='white')
- ax.set_title(('{} detections with '
- 'p({} | box) >= {:.1f}').format(class_name, class_name,
- thresh),
- fontsize=14)
- plt.axis('off')
- plt.tight_layout()
- plt.draw()
- def _get_image_blob(im):
- """Converts an image into a network input.
- Arguments:
- im (ndarray): a color image in BGR order
- Returns:
- blob (ndarray): a data blob holding an image pyramid
- im_scale_factors (list): list of image scales (relative to im) used
- in the image pyramid
- """
- # mean-subtract in float32 without mutating the caller's array
- im_orig = im.astype(np.float32, copy=True)
- im_orig -= cfg.PIXEL_MEANS
- im_shape = im_orig.shape
- im_size_min = np.min(im_shape[0:2])
- im_size_max = np.max(im_shape[0:2])
- processed_ims = []
- # RPN proposal generation uses a single scale (no pyramid at test time)
- assert len(cfg.TEST.SCALES) == 1
- target_size = cfg.TEST.SCALES[0]
- # scale so the shortest side equals target_size ...
- im_scale = float(target_size) / float(im_size_min)
- # Prevent the biggest axis from being more than MAX_SIZE
- if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
- im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
- im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
- interpolation=cv2.INTER_LINEAR)
- # im_info is a 1 x 3 row: [height, width, scale]
- im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :]
- processed_ims.append(im)
- # Create a blob to hold the input images
- blob = im_list_to_blob(processed_ims)
- return blob, im_info
- # Run a forward pass of the RPN on one image to produce region proposals.
- def im_proposals(net, im):
- """Generate RPN proposals on a single image."""
- blobs = {}
- blobs['data'], blobs['im_info'] = _get_image_blob(im)
- # resize the network's input blobs to match this image
- net.blobs['data'].reshape(*(blobs['data'].shape))
- net.blobs['im_info'].reshape(*(blobs['im_info'].shape))
- blobs_out = net.forward(
- data=blobs['data'].astype(np.float32, copy=False),
- im_info=blobs['im_info'].astype(np.float32, copy=False))
- # im_info[0, 2] is the scale applied by _get_image_blob
- scale = blobs['im_info'][0, 2]
- # proposal box coordinates; drop the leading batch-index column of 'rois'
- # and rescale back to original-image coordinates
- boxes = blobs_out['rois'][:, 1:].copy() / scale
- scores = blobs_out['scores'].copy()
- return boxes, scores
- # Compute region proposals for every image in the imdb.
- def imdb_proposals(net, imdb):
- """Generate RPN proposals on all images in an imdb."""
- _t = Timer()
- # one (boxes, ...) entry per image
- imdb_boxes = [[] for _ in xrange(imdb.num_images)]
- for i in xrange(imdb.num_images):
- im = cv2.imread(imdb.image_path_at(i))
- _t.tic()
- imdb_boxes[i], scores = im_proposals(net, im)
- _t.toc()
- print 'im_proposals: {:d}/{:d} {:.3f}s' \
- .format(i + 1, imdb.num_images, _t.average_time)
- # dead debug branch: visualize the top proposals (disabled by `if 0`)
- if 0:
- dets = np.hstack((imdb_boxes[i], scores))
- # from IPython import embed; embed()
- _vis_proposals(im, dets[:3, :], thresh=0.9)
- plt.show()
- return imdb_boxes
lib/fast_rcnn/train.py
- #coding:utf-8
- # --------------------------------------------------------
- # Fast R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- """Train a Fast R-CNN network."""
- import caffe
- from fast_rcnn.config import cfg
- import roi_data_layer.roidb as rdl_roidb
- from utils.timer import Timer
- import numpy as np
- import os
- from caffe.proto import caffe_pb2
- import google.protobuf as pb2
- class SolverWrapper(object):
- """A simple wrapper around Caffe's solver.
- This wrapper gives us control over the snapshotting process, which we
- use to unnormalize the learned bounding-box regression weights.
- """
- def __init__(self, solver_prototxt, roidb, output_dir,
- pretrained_model=None):
- """Initialize the SolverWrapper."""
- self.output_dir = output_dir
- if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
- cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
- # RPN can only use precomputed normalization because there are no
- # fixed statistics to compute a priori
- assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED
- if cfg.TRAIN.BBOX_REG:
- print 'Computing bounding-box regression targets...'
- # bbox_means/bbox_stds: per-class mean and std of the regression
- # targets, kept so snapshot() can unnormalize the learned weights
- self.bbox_means, self.bbox_stds = \
- rdl_roidb.add_bbox_regression_targets(roidb)
- print 'done'
- self.solver = caffe.SGDSolver(solver_prototxt)
- # load pretrained weights (e.g. an ImageNet-trained model)
- if pretrained_model is not None:
- print ('Loading pretrained model '
- 'weights from {:s}').format(pretrained_model)
- self.solver.net.copy_from(pretrained_model)
- # parse the solver prototxt for training hyper-parameters
- # (snapshot prefix, display interval, etc.)
- self.solver_param = caffe_pb2.SolverParameter()
- with open(solver_prototxt, 'rt') as f:
- pb2.text_format.Merge(f.read(), self.solver_param)
- # hand the roidb to the input data layer (layer 0)
- self.solver.net.layers[0].set_roidb(roidb)
- # called every SNAPSHOT_ITERS iterations (and once at the end)
- def snapshot(self):
- """Take a snapshot of the network after unnormalizing the learned
- bounding-box regression weights. This enables easy use at test-time.
- """
- net = self.solver.net
- scale_bbox_params = (cfg.TRAIN.BBOX_REG and
- cfg.TRAIN.BBOX_NORMALIZE_TARGETS and
- net.params.has_key('bbox_pred'))
- if scale_bbox_params:
- # save original values
- orig_0 = net.params['bbox_pred'][0].data.copy()
- orig_1 = net.params['bbox_pred'][1].data.copy()
- # scale and shift with bbox reg unnormalization; then save snapshot
- net.params['bbox_pred'][0].data[...] = \
- (net.params['bbox_pred'][0].data *
- self.bbox_stds[:, np.newaxis])
- net.params['bbox_pred'][1].data[...] = \
- (net.params['bbox_pred'][1].data *
- self.bbox_stds + self.bbox_means)
- infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX
- if cfg.TRAIN.SNAPSHOT_INFIX != '' else '')
- filename = (self.solver_param.snapshot_prefix + infix +
- '_iter_{:d}'.format(self.solver.iter) + '.caffemodel')
- filename = os.path.join(self.output_dir, filename)
- net.save(str(filename))
- print 'Wrote snapshot to: {:s}'.format(filename)
- if scale_bbox_params:
- # restore net to original state
- net.params['bbox_pred'][0].data[...] = orig_0
- net.params['bbox_pred'][1].data[...] = orig_1
- return filename
- # main SGD training loop
- def train_model(self, max_iters):
- """Network training loop."""
- last_snapshot_iter = -1
- timer = Timer()
- model_paths = []
- while self.solver.iter < max_iters:
- # Make one SGD update
- timer.tic()
- self.solver.step(1)
- timer.toc()
- if self.solver.iter % (10 * self.solver_param.display) == 0:
- print 'speed: {:.3f}s / iter'.format(timer.average_time)
- if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0:
- last_snapshot_iter = self.solver.iter
- model_paths.append(self.snapshot())
- # make sure the final iteration is also snapshotted
- if last_snapshot_iter != self.solver.iter:
- model_paths.append(self.snapshot())
- return model_paths
- def get_training_roidb(imdb):
- """Returns a roidb (Region of Interest database) for use in training."""
- # optionally augment with horizontally flipped images
- if cfg.TRAIN.USE_FLIPPED:
- print 'Appending horizontally-flipped training examples...'
- # append a flipped copy of every image in the image database
- imdb.append_flipped_images()
- print 'done'
- print 'Preparing training data...'
- # add derived metadata (image path/size, max_overlaps, max_classes)
- rdl_roidb.prepare_roidb(imdb)
- print 'done'
- return imdb.roidb
- # Filter the roidb down to entries that are usable for training.
- def filter_roidb(roidb):
- """Remove roidb entries that have no usable RoIs."""
- def is_valid(entry):
- # an entry is valid only if it has at least one foreground or
- # one background RoI
- # Valid images have:
- # (1) At least one foreground RoI OR
- # (2) At least one background RoI
- overlaps = entry['max_overlaps']
- # find boxes with sufficient overlap
- fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
- # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
- bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
- (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
- # image is only valid if such boxes exist
- valid = len(fg_inds) > 0 or len(bg_inds) > 0
- return valid
- num = len(roidb)
- filtered_roidb = [entry for entry in roidb if is_valid(entry)]
- num_after = len(filtered_roidb)
- print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after,
- num, num_after)
- return filtered_roidb
- def train_net(solver_prototxt, roidb, output_dir,
- pretrained_model=None, max_iters=40000):
- """Train a Fast R-CNN network."""
- # drop images with no usable foreground/background RoIs
- roidb = filter_roidb(roidb)
- sw = SolverWrapper(solver_prototxt, roidb, output_dir,
- pretrained_model=pretrained_model)
- print 'Solving...'
- # returns the list of snapshot paths written during training
- model_paths = sw.train_model(max_iters)
- print 'done solving'
- return model_paths
lib/roi_data_layer/roidb.py
roidb是一个重要的数据结构,roidb是一个列表,里面的每个元素是字典,对应一张图片的所有roi信息
{'image':图片路径,'width':w,'height':h,'gt_overlaps':二维array,每张图片上所有roi与各个类别gt的overlap,'max_classes':每个roi与其重叠率最大的gt所属的类别,'max_overlaps':每个roi与gt的最大重叠率}
- #coding:utf-8
- # --------------------------------------------------------
- # Fast R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- """Transform a roidb into a trainable roidb by adding a bunch of metadata."""
- import numpy as np
- from fast_rcnn.config import cfg
- from fast_rcnn.bbox_transform import bbox_transform
- from utils.cython_bbox import bbox_overlaps
- import PIL
- # Enrich the roidb entries with derived, training-time metadata.
- def prepare_roidb(imdb):
- """Enrich the imdb's roidb by adding some derived quantities that
- are useful for training. This function precomputes the maximum
- overlap, taken over ground-truth boxes, between each ROI and
- each ground-truth box. The class with maximum overlap is also
- recorded.
- """
- # width and height of every image in the imdb
- sizes = [PIL.Image.open(imdb.image_path_at(i)).size
- for i in xrange(imdb.num_images)]
- roidb = imdb.roidb
- # roidb is a list with one dict per image holding all its RoI info
- for i in xrange(len(imdb.image_index)):
- # per-image dict keys added here: 'image' (path), 'width', 'height',
- # 'max_classes' and 'max_overlaps' (derived from 'gt_overlaps')
- roidb[i]['image'] = imdb.image_path_at(i)
- roidb[i]['width'] = sizes[i][0]
- roidb[i]['height'] = sizes[i][1]
- # need gt_overlaps as a dense array for argmax
- gt_overlaps = roidb[i]['gt_overlaps'].toarray()
- # max overlap with gt over classes (columns)
- # the incoming roidb has not yet been reduced with these maxima;
- # compute each RoI's best IoU with any ground-truth box
- max_overlaps = gt_overlaps.max(axis=1)
- # gt class that had the max overlap
- # i.e. the class of the best-overlapping ground-truth box
- max_classes = gt_overlaps.argmax(axis=1)
- roidb[i]['max_classes'] = max_classes
- roidb[i]['max_overlaps'] = max_overlaps
- # sanity checks
- # max overlap of 0 => class should be zero (background)
- # ensure every zero-overlap box is labeled background
- zero_inds = np.where(max_overlaps == 0)[0]
- assert all(max_classes[zero_inds] == 0)
- # max overlap > 0 => class should not be zero (must be a fg class)
- nonzero_inds = np.where(max_overlaps > 0)[0]
- assert all(max_classes[nonzero_inds] != 0)
- def add_bbox_regression_targets(roidb):
- """Add information needed to train bounding-box regressors.
- Returns the flattened per-class (means, stds) used to normalize the
- regression targets; SolverWrapper keeps them to unnormalize weights
- at snapshot time.
- """
- assert len(roidb) > 0
- assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'
- num_images = len(roidb)
- # Infer number of classes from the number of columns in gt_overlaps
- num_classes = roidb[0]['gt_overlaps'].shape[1]
- # per-image N x 5 target matrix: [class, dx, dy, dw, dh]
- for im_i in xrange(num_images):
- rois = roidb[im_i]['boxes']
- max_overlaps = roidb[im_i]['max_overlaps']
- max_classes = roidb[im_i]['max_classes']
- roidb[im_i]['bbox_targets'] = \
- _compute_targets(rois, max_overlaps, max_classes)
- if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
- # Use fixed / precomputed "means" and "stds" instead of empirical values
- means = np.tile(
- np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
- stds = np.tile(
- np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
- else:
- # Compute values needed for means and stds
- # var(x) = E(x^2) - E(x)^2
- class_counts = np.zeros((num_classes, 1)) + cfg.EPS
- sums = np.zeros((num_classes, 4))
- squared_sums = np.zeros((num_classes, 4))
- for im_i in xrange(num_images):
- targets = roidb[im_i]['bbox_targets']
- # class 0 (background) is skipped -- it has no regression targets
- for cls in xrange(1, num_classes):
- cls_inds = np.where(targets[:, 0] == cls)[0]
- if cls_inds.size > 0:
- class_counts[cls] += cls_inds.size
- sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
- squared_sums[cls, :] += \
- (targets[cls_inds, 1:] ** 2).sum(axis=0)
- means = sums / class_counts
- stds = np.sqrt(squared_sums / class_counts - means ** 2)
- print 'bbox target means:'
- print means
- print means[1:, :].mean(axis=0) # ignore bg class
- print 'bbox target stdevs:'
- print stds
- print stds[1:, :].mean(axis=0) # ignore bg class
- # Normalize targets
- if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
- print "Normalizing targets"
- for im_i in xrange(num_images):
- targets = roidb[im_i]['bbox_targets']
- for cls in xrange(1, num_classes):
- cls_inds = np.where(targets[:, 0] == cls)[0]
- roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
- roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
- else:
- print "NOT normalizing targets"
- # These values will be needed for making predictions
- # (the predicts will need to be unnormalized and uncentered)
- return means.ravel(), stds.ravel()
- # Compute the bbox-regression target values: each proposal's offsets and
- # log-scale factors relative to its assigned ground-truth box
- # (normalization happens later in add_bbox_regression_targets).
- def _compute_targets(rois, overlaps, labels):
- """Compute bounding-box regression targets for an image."""
- # Indices of ground-truth ROIs
- # gt boxes are identified by a perfect overlap of exactly 1
- gt_inds = np.where(overlaps == 1)[0]
- if len(gt_inds) == 0:
- # Bail if the image has no ground-truth ROIs
- # (all-background image: return an all-zero N x 5 matrix)
- return np.zeros((rois.shape[0], 5), dtype=np.float32)
- # Indices of examples for which we try to make predictions
- ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]
- # Get IoU overlap between each ex ROI and gt ROI
- ex_gt_overlaps = bbox_overlaps(
- np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
- np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
- # Find which gt ROI each ex ROI has max overlap with:
- # this will be the ex ROI's gt target
- gt_assignment = ex_gt_overlaps.argmax(axis=1)
- gt_rois = rois[gt_inds[gt_assignment], :]
- ex_rois = rois[ex_inds, :]
- targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
- # column 0 holds the class label
- targets[ex_inds, 0] = labels[ex_inds]
- # columns 1:5 hold the regression targets (dx, dy, dw, dh)
- targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
- return targets
lib/datasets/imdb.py
- #coding:utf-8
- # --------------------------------------------------------
- # Fast R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- import os
- import os.path as osp
- import PIL
- from utils.cython_bbox import bbox_overlaps
- import numpy as np
- import scipy.sparse
- from fast_rcnn.config import cfg
- class imdb(object):
- """Image database. Abstract base class for dataset wrappers
- (e.g. pascal_voc); subclasses implement image_path_at / default_roidb /
- evaluate_detections.
- """
- def __init__(self, name):
- # basic imdb attributes
- self._name = name
- self._num_classes = 0
- self._classes = []
- self._image_index = []
- self._obj_proposer = 'selective_search'
- self._roidb = None
- # callable that lazily builds the roidb (see the roidb property)
- self._roidb_handler = self.default_roidb
- # Use this dict for storing dataset specific config options
- self.config = {}
- @property
- def name(self):
- return self._name
- @property
- def num_classes(self):
- return len(self._classes)
- @property
- def classes(self):
- return self._classes
- @property
- def image_index(self):
- return self._image_index
- @property
- def roidb_handler(self):
- return self._roidb_handler
- @roidb_handler.setter
- def roidb_handler(self, val):
- self._roidb_handler = val
- def set_proposal_method(self, method):
- # resolves e.g. 'rpn' -> self.rpn_roidb
- # NOTE(review): eval on a method-name string; assumes `method` comes
- # from trusted config, never from untrusted input
- method = eval('self.' + method + '_roidb')
- self.roidb_handler = method
- @property
- def roidb(self):
- # A roidb is a list of dictionaries, each with the following keys:
- # boxes
- # gt_overlaps
- # gt_classes
- # flipped
- # built lazily on first access via roidb_handler, then cached
- if self._roidb is not None:
- return self._roidb
- self._roidb = self.roidb_handler()
- return self._roidb
- @property
- def cache_path(self):
- # directory for pickled roidb caches; created on first use
- cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache'))
- if not os.path.exists(cache_path):
- os.makedirs(cache_path)
- return cache_path
- @property
- def num_images(self):
- return len(self.image_index)
- def image_path_at(self, i):
- raise NotImplementedError
- def default_roidb(self):
- raise NotImplementedError
- def evaluate_detections(self, all_boxes, output_dir=None):
- """
- all_boxes is a list of length number-of-classes.
- Each list element is a list of length number-of-images.
- Each of those list elements is either an empty list []
- or a numpy array of detection.
- all_boxes[class][image] = [] or np.array of shape #dets x 5
- """
- raise NotImplementedError
- def _get_widths(self):
- return [PIL.Image.open(self.image_path_at(i)).size[0]
- for i in xrange(self.num_images)]
- # append horizontally flipped copies of every image's RoIs
- def append_flipped_images(self):
- num_images = self.num_images
- # widths of all images, one per image
- widths = self._get_widths()
- for i in xrange(num_images):
- # copy all box coordinates of this image; boxes is an N x 4
- # array of (xmin, ymin, xmax, ymax)
- boxes = self.roidb[i]['boxes'].copy()
- oldx1 = boxes[:, 0].copy()
- oldx2 = boxes[:, 2].copy()
- # a horizontal flip only changes x: x' = width - x - 1
- # (0-based pixel indexes); x1/x2 also swap roles
- boxes[:, 0] = widths[i] - oldx2 - 1
- boxes[:, 2] = widths[i] - oldx1 - 1
- assert (boxes[:, 2] >= boxes[:, 0]).all()
- # entry mirrors the original: flipped boxes plus the unchanged
- # overlaps and classes (flipping does not affect them)
- entry = {'boxes' : boxes,
- 'gt_overlaps' : self.roidb[i]['gt_overlaps'],
- 'gt_classes' : self.roidb[i]['gt_classes'],
- 'flipped' : True}
- # append the flipped entry to the roidb
- self.roidb.append(entry)
- # the image index doubles: original list followed by flipped copies
- self._image_index = self._image_index * 2
- def evaluate_recall(self, candidate_boxes=None, thresholds=None,
- area='all', limit=None):
- """Evaluate detection proposal recall metrics.
- Returns:
- results: dictionary of results with keys
- 'ar': average recall
- 'recalls': vector recalls at each IoU overlap threshold
- 'thresholds': vector of IoU overlap thresholds
- 'gt_overlaps': vector of all ground-truth overlaps
- """
- # Record max overlap value for each gt box
- # Return vector of overlap values
- areas = { 'all': 0, 'small': 1, 'medium': 2, 'large': 3,
- '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7}
- area_ranges = [ [0**2, 1e5**2], # all
- [0**2, 32**2], # small
- [32**2, 96**2], # medium
- [96**2, 1e5**2], # large
- [96**2, 128**2], # 96-128
- [128**2, 256**2], # 128-256
- [256**2, 512**2], # 256-512
- [512**2, 1e5**2], # 512-inf
- ]
- assert areas.has_key(area), 'unknown area range: {}'.format(area)
- area_range = area_ranges[areas[area]]
- gt_overlaps = np.zeros(0)
- num_pos = 0
- for i in xrange(self.num_images):
- # Checking for max_overlaps == 1 avoids including crowd annotations
- # (...pretty hacking :/)
- max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1)
- gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) &
- (max_gt_overlaps == 1))[0]
- gt_boxes = self.roidb[i]['boxes'][gt_inds, :]
- gt_areas = self.roidb[i]['seg_areas'][gt_inds]
- # keep only gt boxes within the requested area range
- valid_gt_inds = np.where((gt_areas >= area_range[0]) &
- (gt_areas <= area_range[1]))[0]
- gt_boxes = gt_boxes[valid_gt_inds, :]
- num_pos += len(valid_gt_inds)
- if candidate_boxes is None:
- # If candidate_boxes is not supplied, the default is to use the
- # non-ground-truth boxes from this roidb
- non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0]
- boxes = self.roidb[i]['boxes'][non_gt_inds, :]
- else:
- boxes = candidate_boxes[i]
- if boxes.shape[0] == 0:
- continue
- if limit is not None and boxes.shape[0] > limit:
- boxes = boxes[:limit, :]
- overlaps = bbox_overlaps(boxes.astype(np.float),
- gt_boxes.astype(np.float))
- # greedy one-to-one matching: repeatedly take the gt box with the
- # best remaining coverage and retire it and its proposal
- _gt_overlaps = np.zeros((gt_boxes.shape[0]))
- for j in xrange(gt_boxes.shape[0]):
- # find which proposal box maximally covers each gt box
- argmax_overlaps = overlaps.argmax(axis=0)
- # and get the iou amount of coverage for each gt box
- max_overlaps = overlaps.max(axis=0)
- # find which gt box is 'best' covered (i.e. 'best' = most iou)
- gt_ind = max_overlaps.argmax()
- gt_ovr = max_overlaps.max()
- assert(gt_ovr >= 0)
- # find the proposal box that covers the best covered gt box
- box_ind = argmax_overlaps[gt_ind]
- # record the iou coverage of this gt box
- _gt_overlaps[j] = overlaps[box_ind, gt_ind]
- assert(_gt_overlaps[j] == gt_ovr)
- # mark the proposal box and the gt box as used
- overlaps[box_ind, :] = -1
- overlaps[:, gt_ind] = -1
- # append recorded iou coverage level
- gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps))
- gt_overlaps = np.sort(gt_overlaps)
- if thresholds is None:
- step = 0.05
- thresholds = np.arange(0.5, 0.95 + 1e-5, step)
- recalls = np.zeros_like(thresholds)
- # compute recall for each iou threshold
- for i, t in enumerate(thresholds):
- recalls[i] = (gt_overlaps >= t).sum() / float(num_pos)
- # ar = 2 * np.trapz(recalls, thresholds)
- ar = recalls.mean()
- return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds,
- 'gt_overlaps': gt_overlaps}
- def create_roidb_from_box_list(self, box_list, gt_roidb):
- # box_list must have one entry per image; box_list[i] holds all the
- # proposal box coordinates for image i
- assert len(box_list) == self.num_images, \
- 'Number of boxes must match number of ground-truth images'
- # roidb: one dict per image with 'boxes', 'gt_classes',
- # 'gt_overlaps', 'flipped' and 'seg_areas'
- roidb = []
- for i in xrange(self.num_images):
- boxes = box_list[i]
- num_boxes = boxes.shape[0]
- # overlap of each box with each object class
- overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)
- if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
- # ground-truth box coordinates for this image
- gt_boxes = gt_roidb[i]['boxes']
- # class of each ground-truth box
- gt_classes = gt_roidb[i]['gt_classes']
- # IoU between each proposal box and each ground-truth box
- gt_overlaps = bbox_overlaps(boxes.astype(np.float),
- gt_boxes.astype(np.float))
- # which gt box (hence which class) each proposal overlaps most
- argmaxes = gt_overlaps.argmax(axis=1)
- maxes = gt_overlaps.max(axis=1)
- I = np.where(maxes > 0)[0]
- overlaps[I, gt_classes[argmaxes[I]]] = maxes[I]
- # store sparsely: most entries are zero
- overlaps = scipy.sparse.csr_matrix(overlaps)
- roidb.append({
- 'boxes' : boxes,
- 'gt_classes' : np.zeros((num_boxes,), dtype=np.int32),
- 'gt_overlaps' : overlaps,
- 'flipped' : False,
- 'seg_areas' : np.zeros((num_boxes,), dtype=np.float32),
- })
- return roidb
- @staticmethod
- def merge_roidbs(a, b):
- # merge per-image entries of two parallel roidbs (e.g. gt + proposals)
- # by stacking their boxes/classes/overlaps/areas; mutates and returns a
- assert len(a) == len(b)
- for i in xrange(len(a)):
- a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes']))
- a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'],
- b[i]['gt_classes']))
- a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'],
- b[i]['gt_overlaps']])
- a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'],
- b[i]['seg_areas']))
- return a
- def competition_mode(self, on):
- """Turn competition mode on or off."""
- pass
lib/datasets/pascal_voc.py
- #coding:utf-8
- # --------------------------------------------------------
- # Fast R-CNN
- # Copyright (c) 2015 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Ross Girshick
- # --------------------------------------------------------
- import os
- from datasets.imdb import imdb
- import datasets.ds_utils as ds_utils
- import xml.etree.ElementTree as ET
- import numpy as np
- import scipy.sparse
- import scipy.io as sio
- import utils.cython_bbox
- import cPickle
- import subprocess
- import uuid
- from voc_eval import voc_eval
- from fast_rcnn.config import cfg
- class pascal_voc(imdb):
- def __init__(self, image_set, year, devkit_path=None):
- imdb.__init__(self, 'voc_' + year + '_' + image_set)
- self._year = year
- self._image_set = image_set
- self._devkit_path = self._get_default_path() if devkit_path is None \
- else devkit_path
- self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
- self._classes = ('__background__', # always index 0
- 'aeroplane', 'bicycle', 'bird', 'boat',
- 'bottle', 'bus', 'car', 'cat', 'chair',
- 'cow', 'diningtable', 'dog', 'horse',
- 'motorbike', 'person', 'pottedplant',
- 'sheep', 'sofa', 'train', 'tvmonitor')
- self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
- self._image_ext = '.jpg'
- self._image_index = self._load_image_set_index()
- # Default to roidb handler
- self._roidb_handler = self.selective_search_roidb
- self._salt = str(uuid.uuid4())
- self._comp_id = 'comp4'
- # PASCAL specific config options
- self.config = {'cleanup' : True,
- 'use_salt' : True,
- 'use_diff' : False,
- 'matlab_eval' : False,
- 'rpn_file' : None,
- 'min_size' : 2}
- assert os.path.exists(self._devkit_path), \
- 'VOCdevkit path does not exist: {}'.format(self._devkit_path)
- assert os.path.exists(self._data_path), \
- 'Path does not exist: {}'.format(self._data_path)
- def image_path_at(self, i):
- """
- Return the absolute path to image i in the image sequence.
- """
- return self.image_path_from_index(self._image_index[i])
- def image_path_from_index(self, index):
- """
- Construct an image path from the image's "index" identifier.
- """
- image_path = os.path.join(self._data_path, 'JPEGImages',
- index + self._image_ext)
- assert os.path.exists(image_path), \
- 'Path does not exist: {}'.format(image_path)
- return image_path
- def _load_image_set_index(self):
- """
- Load the indexes listed in this dataset's image set file.
- """
- # Example path to image set file:
- # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
- image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
- self._image_set + '.txt')
- assert os.path.exists(image_set_file), \
- 'Path does not exist: {}'.format(image_set_file)
- with open(image_set_file) as f:
- image_index = [x.strip() for x in f.readlines()]
- return image_index
- def _get_default_path(self):
- """
- Return the default path where PASCAL VOC is expected to be installed.
- """
- return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year)
- def gt_roidb(self):
- """
- Return the database of ground-truth regions of interest.
- This function loads/saves from/to a cache file to speed up future calls.
- """
- cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
- if os.path.exists(cache_file):
- with open(cache_file, 'rb') as fid:
- roidb = cPickle.load(fid)
- print '{} gt roidb loaded from {}'.format(self.name, cache_file)
- return roidb
- gt_roidb = [self._load_pascal_annotation(index)
- for index in self.image_index]
- with open(cache_file, 'wb') as fid:
- cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
- print 'wrote gt roidb to {}'.format(cache_file)
- return gt_roidb
- def selective_search_roidb(self):
- """
- Return the database of selective search regions of interest.
- Ground-truth ROIs are also included.
- This function loads/saves from/to a cache file to speed up future calls.
- """
- cache_file = os.path.join(self.cache_path,
- self.name + '_selective_search_roidb.pkl')
- if os.path.exists(cache_file):
- with open(cache_file, 'rb') as fid:
- roidb = cPickle.load(fid)
- print '{} ss roidb loaded from {}'.format(self.name, cache_file)
- return roidb
- if int(self._year) == 2007 or self._image_set != 'test':
- gt_roidb = self.gt_roidb()
- ss_roidb = self._load_selective_search_roidb(gt_roidb)
- roidb = imdb.merge_roidbs(gt_roidb, ss_roidb)
- else:
- roidb = self._load_selective_search_roidb(None)
- with open(cache_file, 'wb') as fid:
- cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL)
- print 'wrote ss roidb to {}'.format(cache_file)
- return roidb
- def rpn_roidb(self):
- if int(self._year) == 2007 or self._image_set != 'test':
- gt_roidb = self.gt_roidb()
- rpn_roidb = self._load_rpn_roidb(gt_roidb)
- roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb)
- else:
- roidb = self._load_rpn_roidb(None)
- return roidb
- def _load_rpn_roidb(self, gt_roidb):
- filename = self.config['rpn_file']
- print 'loading {}'.format(filename)
- assert os.path.exists(filename), \
- 'rpn data not found at: {}'.format(filename)
- #得到rpn产生的box
- with open(filename, 'rb') as f:
- box_list = cPickle.load(f)
- #由box_list产生roidb
- return self.create_roidb_from_box_list(box_list, gt_roidb)
def _load_selective_search_roidb(self, gt_roidb):
    """
    Load precomputed selective-search boxes from the per-dataset .mat
    file, deduplicate and size-filter them, and build a roidb.

    The MATLAB data stores boxes 1-based as (y1, x1, y2, x2); they are
    converted here to 0-based (x1, y1, x2, y2).
    """
    filename = os.path.abspath(os.path.join(cfg.DATA_DIR,
                                            'selective_search_data',
                                            self.name + '.mat'))
    assert os.path.exists(filename), \
        'Selective search data not found at: {}'.format(filename)
    raw_data = sio.loadmat(filename)['boxes'].ravel()
    box_list = []
    # Direct iteration replaces the Py2-only `xrange(raw_data.shape[0])`.
    for raw_boxes in raw_data:
        # Reorder columns to (x1, y1, x2, y2) and shift to 0-based.
        boxes = raw_boxes[:, (1, 0, 3, 2)] - 1
        keep = ds_utils.unique_boxes(boxes)
        boxes = boxes[keep, :]
        keep = ds_utils.filter_small_boxes(boxes, self.config['min_size'])
        box_list.append(boxes[keep, :])
    return self.create_roidb_from_box_list(box_list, gt_roidb)
- def _load_pascal_annotation(self, index):
- """
- Load image and bounding boxes info from XML file in the PASCAL VOC
- format.
- """
- #xml文件名
- filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
- #解析xml
- tree = ET.parse(filename)
- #找到所有object属性项
- objs = tree.findall('object')
- if not self.config['use_diff']:
- # Exclude the samples labeled as difficult
- non_diff_objs = [
- obj for obj in objs if int(obj.find('difficult').text) == 0]
- # if len(non_diff_objs) != len(objs):
- # print 'Removed {} difficult objects'.format(
- # len(objs) - len(non_diff_objs))
- objs = non_diff_objs
- num_objs = len(objs)
- #boxes存储这张图片上所有bbox的坐标
- boxes = np.zeros((num_objs, 4), dtype=np.uint16)
- #gt_classes存储每个bbox的类别
- gt_classes = np.zeros((num_objs), dtype=np.int32)
- overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
- # "Seg" area for pascal is just the box area
- seg_areas = np.zeros((num_objs), dtype=np.float32)
- # Load object bounding boxes into a data frame.
- for ix, obj in enumerate(objs):
- bbox = obj.find('bndbox')
- # Make pixel indexes 0-based
- x1 = float(bbox.find('xmin').text) - 1
- y1 = float(bbox.find('ymin').text) - 1
- x2 = float(bbox.find('xmax').text) - 1
- y2 = float(bbox.find('ymax').text) - 1
- #从类别名字映射到ID
- cls = self._class_to_ind[obj.find('name').text.lower().strip()]
- boxes[ix, :] = [x1, y1, x2, y2]
- gt_classes[ix] = cls
- #因为是groud-truth,所以重叠率设置为1
- overlaps[ix, cls] = 1.0
- seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
- overlaps = scipy.sparse.csr_matrix(overlaps)
- #返回一个字典
- return {'boxes' : boxes,
- 'gt_classes': gt_classes,
- 'gt_overlaps' : overlaps,
- 'flipped' : False,
- 'seg_areas' : seg_areas}
- def _get_comp_id(self):
- comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt']
- else self._comp_id)
- return comp_id
- def _get_voc_results_file_template(self):
- # VOCdevkit/results/VOC2007/Main/<comp_id>_det_test_aeroplane.txt
- filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt'
- path = os.path.join(
- self._devkit_path,
- 'results',
- 'VOC' + self._year,
- 'Main',
- filename)
- return path
- def _write_voc_results_file(self, all_boxes):
- for cls_ind, cls in enumerate(self.classes):
- if cls == '__background__':
- continue
- print 'Writing {} VOC results file'.format(cls)
- filename = self._get_voc_results_file_template().format(cls)
- with open(filename, 'wt') as f:
- for im_ind, index in enumerate(self.image_index):
- dets = all_boxes[cls_ind][im_ind]
- if dets == []:
- continue
- # the VOCdevkit expects 1-based indices
- for k in xrange(dets.shape[0]):
- f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
- format(index, dets[k, -1],
- dets[k, 0] + 1, dets[k, 1] + 1,
- dets[k, 2] + 1, dets[k, 3] + 1))
def _do_python_eval(self, output_dir='output'):
    """
    Evaluate the previously written per-class result files with the
    Python VOC AP code (``voc_eval``), print per-class and mean AP,
    and dump each class's PR curve to <output_dir>/<cls>_pr.pkl.
    """
    annopath = os.path.join(self._devkit_path, 'VOC' + self._year,
                            'Annotations', '{:s}.xml')
    imagesetfile = os.path.join(self._devkit_path, 'VOC' + self._year,
                                'ImageSets', 'Main',
                                self._image_set + '.txt')
    cachedir = os.path.join(self._devkit_path, 'annotations_cache')
    aps = []
    # The PASCAL VOC metric changed in 2010 (11-point -> all-point AP).
    use_07_metric = int(self._year) < 2010
    print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for cls in self._classes:
        if cls == '__background__':
            continue
        filename = self._get_voc_results_file_template().format(cls)
        rec, prec, ap = voc_eval(
            filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
            use_07_metric=use_07_metric)
        aps += [ap]
        print('AP for {} = {:.4f}'.format(cls, ap))
        # 'wb', not 'w': pickle needs a binary handle (text mode fails
        # on Py3 and corrupts the stream on Windows).
        with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f:
            cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
    print('Mean AP = {:.4f}'.format(np.mean(aps)))
    print('~~~~~~~~')
    print('Results:')
    for ap in aps:
        print('{:.3f}'.format(ap))
    print('{:.3f}'.format(np.mean(aps)))
    print('~~~~~~~~')
    print('')
    print('--------------------------------------------------------------')
    print('Results computed with the **unofficial** Python eval code.')
    print('Results should be very close to the official MATLAB eval code.')
    print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
    print('-- Thanks, The Management')
    print('--------------------------------------------------------------')
def _do_matlab_eval(self, output_dir='output'):
    """
    Shell out to the official MATLAB VOC evaluation wrapper
    (lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m).
    """
    # print() function form: identical output on Py2, valid on Py3.
    print('-----------------------------------------------------')
    print('Computing results with the official MATLAB eval code.')
    print('-----------------------------------------------------')
    path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets',
                        'VOCdevkit-matlab-wrapper')
    cmd = 'cd {} && '.format(path)
    cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB)
    cmd += '-r "dbstop if error; '
    cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \
        .format(self._devkit_path, self._get_comp_id(),
                self._image_set, output_dir)
    print('Running:\n{}'.format(cmd))
    # Return code was never inspected; drop the unused `status` local.
    subprocess.call(cmd, shell=True)
def evaluate_detections(self, all_boxes, output_dir):
    """Write the per-class result files, run the Python (and
    optionally MATLAB) evaluation, then delete the result files when
    cleanup is configured."""
    self._write_voc_results_file(all_boxes)
    self._do_python_eval(output_dir)
    if self.config['matlab_eval']:
        self._do_matlab_eval(output_dir)
    if not self.config['cleanup']:
        return
    for cls in self._classes:
        if cls != '__background__':
            os.remove(self._get_voc_results_file_template().format(cls))
def competition_mode(self, on):
    """Toggle competition mode: *on* disables the filename salt and
    result-file cleanup; off restores both defaults."""
    enabled = not on
    self.config['use_salt'] = enabled
    self.config['cleanup'] = enabled
- if __name__ == '__main__':
- # Smoke test: build the VOC 2007 trainval imdb, force the roidb to
- # load, then drop into an interactive IPython shell for inspection.
- from datasets.pascal_voc import pascal_voc
- d = pascal_voc('trainval', '2007')
- res = d.roidb
- from IPython import embed; embed()
http://blog.csdn.net/iamzhangzhuping/article/category/6230157
http://blog.csdn.net/u010668907/article/category/6237110