目录
背景:
从之前的知识可知,在得到一个RoI后,Faster RCNN通过RoI与标签的IoU值来判断该RoI是正样本还是负样本,默认的IoU阈值为0.5,这个阈值是一个超参数,对于检测的精度有较大影响。 如何选择合适的阈值是一个矛盾的问题。一方面,阈值越高,选出的RoI会更接近真实物体,检测器的定位会更加准确,但此时符合条件的RoI会变少,正、负样本会更加不均衡,容易导致训练过拟合;另一方面,阈值越低,正样本会更多,有利于模型训练,但这时误检也会增多,从而增大了分类的误差。 对于阈值的问题,通过实验可以发现两个现象:
-
一个检测器如果采用某个阈值界定正负样本时,那么当输入Proposal的IoU在这个阈值附近时,检测效果要比基于其他阈值时好,也就是很难让一个在指定阈值界定正、负样本的检测模型对所有IoU的输入Proposal检测效果都最佳。
-
经过回归之后的候选框与标签之间的IoU会有所提升。
论文地址:
https://arxiv.org/pdf/1712.00726.pdf
网络结构:
基于以上结果,2018年CVPR上的Cascade RCNN算法通过级联多个检测器来不断优化结果,每个检测器都基于不同的IoU阈值来界定正负样本,前一个检测器的输出作为后一个检测器的输入,并且检测器越靠后,IoU的阈值越高。
级联检测器可以有多种形式,如上图所示为迭代式的边框回归模型示意图,图中的Pooling代表了RoI Pooling过程,H1表示RCNN部分网络,C与B分别表示分类与回归部分网络。从图中可以看出,这种方法将前一个回归网络输出的边框作为下一个检测器的输入继续进行回归,连续迭代3次才得到结果。 从前面的实验可以得知,经过一个固定IoU阈值的检测器后,边框的IoU会提升,分布也发生了变化,即越来越靠近真实物体。如果下一个检测器仍然还是这个IoU阈值的话,显然不是一个最优的选择,这也是上述这种级联器的问题所在。
上图是另一种多个检测器的组合方式,称为Integral Loss。图中H1、H2与H3分别代表不同的IoU阈值界定正负样本的检测器,当阈值较高时,预测的边框会更为精准,但会损失一些正样本。但是这种方法中多个检测器相互独立,没有反馈优化的思想,仅仅是利用了多个IoU阈值的检测器。
上图的结构则是Cascade RCNN采用的方法,可以看到每一个检测器的边框输出作为下一个检测器的输入,并且检测器的IoU阈值是逐渐提升的,因此这种方法可以逐步过滤掉一些误检框,并且提升边框的定位精度。 总体来看,Cascade RCNN算法深入探讨了IoU阈值对检测器性能的影响,并且在不增加任何tricks的前提下,在多个数据集上都有了明显的精度提升,是一个性能优越的高精度物体检测器。
良好表现:
相关代码:(Pytorch实现)
整体结构:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable, gradcheck
from torch.autograd.gradcheck import gradgradcheck
import torchvision.models as models
from torch.autograd import Variable
import numpy as np
import torchvision.utils as vutils
from model.utils.config import cfg # rm 'lib.', or cfg will create a new copy
from model.rpn.rpn_fpn import _RPN_FPN
from model.roi_pooling.modules.roi_pool import _RoIPooling
from model.roi_crop.modules.roi_crop import _RoICrop
from model.roi_align.modules.roi_align import RoIAlignAvg
from model.rpn.proposal_target_layer import _ProposalTargetLayer
from model.utils.net_utils import _smooth_l1_loss, _crop_pool_layer, _affine_grid_gen, _affine_theta
from model.rpn.bbox_transform import bbox_transform_inv, clip_boxes, bbox_decode
import time
import pdb
class _FPN(nn.Module):
    """FPN backbone + three-stage Cascade R-CNN head.

    All three stages share the same feature pyramid but use separate heads
    and predictors (plain / ``*_2nd`` / ``*_3rd``); each stage decodes its
    box regressions and feeds the refined boxes to the next stage as
    proposals. The backbone layers (``RCNN_layer0`` ... ``RCNN_layer5``),
    lateral/smooth layers, heads (``_head_to_tail*``) and predictors
    (``RCNN_cls_score*`` / ``RCNN_bbox_pred*``) are expected to be created
    by the subclass in ``_init_modules`` — presumably a DetNet/ResNet
    variant; confirm against the concrete backbone file.
    """

    def __init__(self, classes, class_agnostic):
        """
        Args:
            classes: sequence of class names (background included);
                its length defines the number of output classes.
            class_agnostic: if True, a single box regressor is shared by
                all classes instead of one per class.
        """
        super(_FPN, self).__init__()
        self.classes = classes
        self.n_classes = len(classes)
        self.class_agnostic = class_agnostic
        # loss
        self.RCNN_loss_cls = 0
        self.RCNN_loss_bbox = 0
        # define rpn
        # NOTE(review): self.dout_base_model is assumed to be provided by
        # the subclass before this constructor runs — confirm.
        self.RCNN_rpn = _RPN_FPN(self.dout_base_model)
        self.RCNN_proposal_target = _ProposalTargetLayer(self.n_classes)
        # NOTE: the original paper used pool_size = 7 for cls branch, and 14 for mask branch, to save the
        # computation time, we first use 14 as the pool_size, and then do stride=2 pooling for cls branch.
        self.RCNN_roi_pool = _RoIPooling(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0)
        self.RCNN_roi_align = RoIAlignAvg(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0)
        self.grid_size = cfg.POOLING_SIZE * 2 if cfg.CROP_RESIZE_WITH_MAX_POOL else cfg.POOLING_SIZE
        self.RCNN_roi_crop = _RoICrop()

    def _init_weights(self):
        """Initialize FPN, RPN and R-CNN head layers with Gaussian noise."""
        def normal_init(m, mean, stddev, truncated=False):
            """
            weight initalizer: truncated normal and random normal.
            """
            # x is a parameter
            if truncated:
                m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
            else:
                m.weight.data.normal_(mean, stddev)
                m.bias.data.zero_()

        # custom weights initialization called on netG and netD
        def weights_init(m, mean, stddev, truncated=False):
            classname = m.__class__.__name__
            if classname.find('Conv') != -1:
                m.weight.data.normal_(0.0, 0.02)
                m.bias.data.fill_(0)
            elif classname.find('BatchNorm') != -1:
                m.weight.data.normal_(1.0, 0.02)
                m.bias.data.fill_(0)

        normal_init(self.RCNN_toplayer, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_smooth1, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_smooth2, 0, 0.01, cfg.TRAIN.TRUNCATED)
        # normal_init(self.RCNN_smooth3, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_latlayer1, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_latlayer2, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_latlayer3, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_latlayer4, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_rpn.RPN_Conv, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_rpn.RPN_cls_score, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_rpn.RPN_bbox_pred, 0, 0.01, cfg.TRAIN.TRUNCATED)
        normal_init(self.RCNN_cls_score, 0, 0.01, cfg.TRAIN.TRUNCATED)
        # bbox regressor uses a smaller std, as in the original Faster R-CNN
        normal_init(self.RCNN_bbox_pred, 0, 0.001, cfg.TRAIN.TRUNCATED)
        weights_init(self.RCNN_top, 0, 0.01, cfg.TRAIN.TRUNCATED)

    def create_architecture(self):
        """Build the subclass-defined modules, then initialize their weights."""
        self._init_modules()
        self._init_weights()

    def _upsample_add(self, x, y):
        '''Upsample and add two feature maps.
        Args:
            x: (Variable) top feature map to be upsampled.
            y: (Variable) lateral feature map.
        Returns:
            (Variable) added feature map.
        Note in PyTorch, when input size is odd, the upsampled feature map
        with `F.upsample(..., scale_factor=2, mode='nearest')`
        maybe not equal to the lateral feature map size.
        e.g.
        original input size: [N,_,15,15] ->
        conv2d feature map size: [N,_,8,8] ->
        upsampled feature map size: [N,_,16,16]
        So we choose bilinear upsample which supports arbitrary output sizes.
        '''
        _, _, H, W = y.size()
        return F.upsample(x, size=(H, W), mode='bilinear') + y

    def _PyramidRoI_Feat(self, feat_maps, rois, im_info):
        ''' roi pool on pyramid feature maps'''
        # do roi pooling based on predicted rois
        # NOTE(review): img_area is computed but never used below.
        img_area = im_info[0][0] * im_info[0][1]
        h = rois.data[:, 4] - rois.data[:, 2] + 1
        w = rois.data[:, 3] - rois.data[:, 1] + 1
        # FPN level assignment: k = k0 + log2(sqrt(wh)/224) with k0 = 4,
        # clamped to pyramid levels [2, 5] below.
        roi_level = torch.log(torch.sqrt(h * w) / 224.0) / np.log(2)
        roi_level = torch.floor(roi_level + 4)
        # --------
        # roi_level = torch.log(torch.sqrt(h * w) / 224.0)
        # roi_level = torch.round(roi_level + 4)
        # ------
        roi_level[roi_level < 2] = 2
        roi_level[roi_level > 5] = 5
        # roi_level.fill_(5)
        if cfg.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            # NOTE: need to add pyramid
            grid_xy = _affine_grid_gen(rois, feat_maps.size()[2:], self.grid_size)  ##
            grid_yx = torch.stack([grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous()
            roi_pool_feat = self.RCNN_roi_crop(feat_maps, Variable(grid_yx).detach())  ##
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                roi_pool_feat = F.max_pool2d(roi_pool_feat, 2, 2)
        elif cfg.POOLING_MODE == 'align':
            roi_pool_feats = []
            box_to_levels = []
            # Pool each RoI from its assigned pyramid level, then restore
            # the original RoI order via the sort at the end.
            for i, l in enumerate(range(2, 6)):
                if (roi_level == l).sum() == 0:
                    continue
                idx_l = (roi_level == l).nonzero().squeeze()
                box_to_levels.append(idx_l)
                scale = feat_maps[i].size(2) / im_info[0][0]
                feat = self.RCNN_roi_align(feat_maps[i], rois[idx_l], scale)
                roi_pool_feats.append(feat)
            roi_pool_feat = torch.cat(roi_pool_feats, 0)
            box_to_level = torch.cat(box_to_levels, 0)
            idx_sorted, order = torch.sort(box_to_level)
            roi_pool_feat = roi_pool_feat[order]
        elif cfg.POOLING_MODE == 'pool':
            roi_pool_feats = []
            box_to_levels = []
            for i, l in enumerate(range(2, 6)):
                if (roi_level == l).sum() == 0:
                    continue
                idx_l = (roi_level == l).nonzero().squeeze()
                box_to_levels.append(idx_l)
                scale = feat_maps[i].size(2) / im_info[0][0]
                feat = self.RCNN_roi_pool(feat_maps[i], rois[idx_l], scale)
                roi_pool_feats.append(feat)
            roi_pool_feat = torch.cat(roi_pool_feats, 0)
            box_to_level = torch.cat(box_to_levels, 0)
            idx_sorted, order = torch.sort(box_to_level)
            roi_pool_feat = roi_pool_feat[order]
        return roi_pool_feat

    def forward(self, im_data, im_info, gt_boxes, num_boxes):
        """Run RPN plus the three cascaded R-CNN stages.

        Returns (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox,
        RCNN_loss_cls, RCNN_loss_bbox) — note only the *first* stage's
        R-CNN losses are returned; see the review note at the return.
        """
        batch_size = im_data.size(0)
        im_info = im_info.data
        gt_boxes = gt_boxes.data
        num_boxes = num_boxes.data
        # feed image data to base model to obtain base feature map
        # Bottom-up
        c1 = self.RCNN_layer0(im_data)
        c2 = self.RCNN_layer1(c1)
        c3 = self.RCNN_layer2(c2)
        c4 = self.RCNN_layer3(c3)
        c5 = self.RCNN_layer4(c4)
        c6 = self.RCNN_layer5(c5)
        # Top-down
        # NOTE(review): p6->p5->p4 are added without upsampling — this
        # assumes c4, c5 and c6 share the same spatial size (true for a
        # DetNet-style backbone); confirm against the backbone definition.
        p6 = self.RCNN_toplayer(c6)
        p5 = self.RCNN_latlayer1(c5) + p6
        p4 = self.RCNN_latlayer2(c4) + p5
        p3 = self._upsample_add(p4, self.RCNN_latlayer3(c3))
        p3 = self.RCNN_smooth1(p3)
        p2 = self._upsample_add(p3, self.RCNN_latlayer4(c2))
        p2 = self.RCNN_smooth2(p2)
        rpn_feature_maps = [p2, p3, p4, p5, p6]
        mrcnn_feature_maps = [p2, p3, p4, p5]
        rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(rpn_feature_maps, im_info, gt_boxes, num_boxes)
        # if it is the training phase, then use ground truth bboxes for refining
        if self.training:
            roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
            rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data
            ## NOTE: additionally, normalize proposals to range [0, 1],
            # this is necessary so that the following roi pooling
            # is correct on different feature maps
            # rois[:, :, 1::2] /= im_info[0][1]
            # rois[:, :, 2::2] /= im_info[0][0]
            rois = rois.view(-1, 5)
            rois_label = rois_label.view(-1).long()
            gt_assign = gt_assign.view(-1).long()
            pos_id = rois_label.nonzero().squeeze()
            gt_assign_pos = gt_assign[pos_id]
            rois_label_pos = rois_label[pos_id]
            rois_label_pos_ids = pos_id
            rois_pos = Variable(rois[pos_id])
            rois = Variable(rois)
            rois_label = Variable(rois_label)
            rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
            rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
            rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
        else:
            ## NOTE: additionally, normalize proposals to range [0, 1],
            # this is necessary so that the following roi pooling
            # is correct on different feature maps
            # rois[:, :, 1::2] /= im_info[0][1]
            # rois[:, :, 2::2] /= im_info[0][0]
            rois_label = None
            gt_assign = None
            rois_target = None
            rois_inside_ws = None
            rois_outside_ws = None
            rpn_loss_cls = 0
            rpn_loss_bbox = 0
            rois = rois.view(-1, 5)
            pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
            rois_label_pos_ids = pos_id
            rois_pos = Variable(rois[pos_id])
            rois = Variable(rois)
        # print('before pooling, cfg', cfg.POOLING_MODE)
        # print('before pooling, get_cfg', get_cfg().POOLING_MODE)
        # pooling features based on rois, output 14x14 map
        roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)
        # feed pooled features to top model
        pooled_feat = self._head_to_tail(roi_pool_feat)
        # compute bbox offset
        bbox_pred = self.RCNN_bbox_pred(pooled_feat)
        if self.training and not self.class_agnostic:
            # select the corresponding columns according to roi labels
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
            bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.long().view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
            bbox_pred = bbox_pred_select.squeeze(1)
        # compute object classification probability
        cls_score = self.RCNN_cls_score(pooled_feat)
        # cls_prob = F.softmax(cls_score) ----------------not be used ---------------
        RCNN_loss_cls = 0
        RCNN_loss_bbox = 0
        if self.training:
            # loss (cross entropy) for object classification
            RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
            # loss (l1-norm) for bounding box regression
            RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
        rois = rois.view(batch_size, -1, rois.size(1))
        # cls_prob = cls_prob.view(batch_size, -1, cls_prob.size(1)) ----------------not be used ---------------
        bbox_pred = bbox_pred.view(batch_size, -1, bbox_pred.size(1))
        if self.training:
            rois_label = rois_label.view(batch_size, -1)
        # 2nd stage -----------------------------
        # decode stage-1 regressions into absolute boxes; these become the
        # proposals of stage 2 (higher IoU threshold via stage=2 below)
        rois = bbox_decode(rois, bbox_pred, batch_size, self.class_agnostic, self.n_classes, im_info, self.training)
        # proposal_target
        if self.training:
            roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes, stage=2)
            rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data
            rois = rois.view(-1, 5)
            rois_label = rois_label.view(-1).long()
            gt_assign = gt_assign.view(-1).long()
            pos_id = rois_label.nonzero().squeeze()
            gt_assign_pos = gt_assign[pos_id]
            rois_label_pos = rois_label[pos_id]
            rois_label_pos_ids = pos_id
            rois_pos = Variable(rois[pos_id])
            rois = Variable(rois)
            rois_label = Variable(rois_label)
            rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
            rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
            rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
        else:
            rois_label = None
            gt_assign = None
            rois_target = None
            rois_inside_ws = None
            rois_outside_ws = None
            rpn_loss_cls = 0
            rpn_loss_bbox = 0
            rois = rois.view(-1, 5)
            pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
            rois_label_pos_ids = pos_id
            rois_pos = Variable(rois[pos_id])
            rois = Variable(rois)
        roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)
        # feed pooled features to top model
        pooled_feat = self._head_to_tail_2nd(roi_pool_feat)
        # compute bbox offset
        bbox_pred = self.RCNN_bbox_pred_2nd(pooled_feat)
        if self.training and not self.class_agnostic:
            # select the corresponding columns according to roi labels
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
            bbox_pred_select = torch.gather(bbox_pred_view, 1,
                                            rois_label.long().view(rois_label.size(0), 1, 1).expand(rois_label.size(0),
                                                                                                    1, 4))
            bbox_pred = bbox_pred_select.squeeze(1)
        # compute object classification probability
        cls_score = self.RCNN_cls_score_2nd(pooled_feat)
        # cls_prob_2nd = F.softmax(cls_score) ----------------not be used ---------------
        RCNN_loss_cls_2nd = 0
        RCNN_loss_bbox_2nd = 0
        if self.training:
            # loss (cross entropy) for object classification
            RCNN_loss_cls_2nd = F.cross_entropy(cls_score, rois_label)
            # loss (l1-norm) for bounding box regression
            RCNN_loss_bbox_2nd = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
        rois = rois.view(batch_size, -1, rois.size(1))
        # cls_prob_2nd = cls_prob_2nd.view(batch_size, -1, cls_prob_2nd.size(1)) ----------------not be used ---------
        bbox_pred_2nd = bbox_pred.view(batch_size, -1, bbox_pred.size(1))
        if self.training:
            rois_label = rois_label.view(batch_size, -1)
        # 3rd stage ---------------
        # decode stage-2 regressions into the proposals of stage 3
        rois = bbox_decode(rois, bbox_pred_2nd, batch_size, self.class_agnostic, self.n_classes, im_info, self.training)
        # proposal_target
        # if it is the training phase, then use ground truth bboxes for refining
        if self.training:
            roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes, stage=3)
            rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data
            rois = rois.view(-1, 5)
            rois_label = rois_label.view(-1).long()
            gt_assign = gt_assign.view(-1).long()
            pos_id = rois_label.nonzero().squeeze()
            gt_assign_pos = gt_assign[pos_id]
            rois_label_pos = rois_label[pos_id]
            rois_label_pos_ids = pos_id
            rois_pos = Variable(rois[pos_id])
            rois = Variable(rois)
            rois_label = Variable(rois_label)
            rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
            rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
            rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
        else:
            rois_label = None
            gt_assign = None
            rois_target = None
            rois_inside_ws = None
            rois_outside_ws = None
            rpn_loss_cls = 0
            rpn_loss_bbox = 0
            rois = rois.view(-1, 5)
            pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
            rois_label_pos_ids = pos_id
            rois_pos = Variable(rois[pos_id])
            rois = Variable(rois)
        roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)
        # feed pooled features to top model
        pooled_feat = self._head_to_tail_3rd(roi_pool_feat)
        # compute bbox offset
        bbox_pred = self.RCNN_bbox_pred_3rd(pooled_feat)
        if self.training and not self.class_agnostic:
            # select the corresponding columns according to roi labels
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
            bbox_pred_select = torch.gather(bbox_pred_view, 1,
                                            rois_label.long().view(rois_label.size(0), 1, 1).expand(
                                                rois_label.size(0),
                                                1, 4))
            bbox_pred = bbox_pred_select.squeeze(1)
        # compute object classification probability
        cls_score = self.RCNN_cls_score_3rd(pooled_feat)
        cls_prob_3rd = F.softmax(cls_score)
        RCNN_loss_cls_3rd = 0
        RCNN_loss_bbox_3rd = 0
        if self.training:
            # loss (cross entropy) for object classification
            RCNN_loss_cls_3rd = F.cross_entropy(cls_score, rois_label)
            # loss (l1-norm) for bounding box regression
            RCNN_loss_bbox_3rd = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
        rois = rois.view(batch_size, -1, rois.size(1))
        cls_prob_3rd = cls_prob_3rd.view(batch_size, -1, cls_prob_3rd.size(1))
        bbox_pred_3rd = bbox_pred.view(batch_size, -1, bbox_pred.size(1))
        if self.training:
            rois_label = rois_label.view(batch_size, -1)
        if not self.training:
            # At test time, ensemble the three classifiers: run the 1st and
            # 2nd stage heads on the 3rd-stage pooled features and average
            # the three softmax outputs (the Cascade R-CNN inference trick).
            # 1st_3rd
            pooled_feat_1st_3rd = self._head_to_tail(roi_pool_feat)
            cls_score_1st_3rd = self.RCNN_cls_score(pooled_feat_1st_3rd)
            cls_prob_1st_3rd = F.softmax(cls_score_1st_3rd)
            cls_prob_1st_3rd = cls_prob_1st_3rd.view(batch_size, -1, cls_prob_1st_3rd.size(1))
            # 2nd_3rd
            pooled_feat_2nd_3rd = self._head_to_tail_2nd(roi_pool_feat)
            cls_score_2nd_3rd = self.RCNN_cls_score_2nd(pooled_feat_2nd_3rd)
            cls_prob_2nd_3rd = F.softmax(cls_score_2nd_3rd)
            cls_prob_2nd_3rd = cls_prob_2nd_3rd.view(batch_size, -1, cls_prob_2nd_3rd.size(1))
            cls_prob_3rd_avg = (cls_prob_1st_3rd + cls_prob_2nd_3rd + cls_prob_3rd) / 3
        else:
            cls_prob_3rd_avg = cls_prob_3rd
        # NOTE(review): only the stage-1 R-CNN losses are returned; the
        # *_2nd and *_3rd losses are computed but dropped here, and the
        # trailing comma is redundant (same tuple either way). A training
        # script would need these returned — confirm against the trainer.
        return rois, cls_prob_3rd_avg, bbox_pred_3rd, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox,
demo代码:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import _init_paths
import os
import sys
import numpy as np
import argparse
import pprint
import pdb
import time
import cv2
import torch
from torch.autograd import Variable
from PIL import Image
from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir
from model.fpn.cascade.detnet_backbone import detnet as detnet_cascade
from model.fpn.non_cascade.detnet_backbone import detnet as detnet_noncascade
from model.rpn.bbox_transform import clip_boxes
from model.nms.nms_wrapper import nms, soft_nms
from model.rpn.bbox_transform import bbox_transform_inv
from model.utils.net_utils import save_net, load_net, vis_detections
from model.utils.blob import im_list_to_blob
import pdb
# Py2/Py3 compatibility shim: under Python 3 the builtin xrange does not
# exist, so alias it to range; under Python 2 the bare name lookup succeeds
# and nothing changes.
try:
    xrange  # Python 2
except NameError:
    xrange = range  # Python 3
def parse_args():
    """
    Parse input arguments for the demo script.

    Returns:
        argparse.Namespace with dataset/net/checkpoint selection, I/O
        directories and the cuda / soft-nms / cascade / class-agnostic flags.
    """
    parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
    # BUG FIX: exp_name was a required positional (argparse ignores `default`
    # on positionals without nargs='?'), yet the main script checks
    # `if args.exp_name is not None` — make it genuinely optional.
    parser.add_argument('exp_name', nargs='?', type=str, default=None, help='experiment name')
    parser.add_argument('--dataset', dest='dataset',
                        help='training dataset',
                        default='pascal_voc', type=str)
    parser.add_argument('--cfg', dest='cfg_file',
                        help='optional config file',
                        default='cfgs/vgg16.yml', type=str)
    parser.add_argument('--net', dest='net',
                        help='detnet59, etc',
                        default='detnet59', type=str)
    parser.add_argument('--set', dest='set_cfgs',
                        help='set config keys', default=None,
                        nargs=argparse.REMAINDER)
    parser.add_argument('--load_dir', dest='load_dir',
                        help='directory to load models', default="weights")
    parser.add_argument('--image_dir', dest='image_dir',
                        help='directory to load images', default="demo_images/",
                        type=str)
    parser.add_argument('--result_dir', dest='result_dir',
                        help='directory to save visual result', default="vis_results/",
                        type=str)
    parser.add_argument('--cuda', dest='cuda',
                        help='whether use CUDA',
                        action='store_true')
    parser.add_argument('--checksession', dest='checksession',
                        help='checksession to load model',
                        default=4, type=int)
    parser.add_argument('--checkepoch', dest='checkepoch',
                        help='checkepoch to load network',
                        default=6, type=int)
    parser.add_argument('--checkpoint', dest='checkpoint',
                        help='checkpoint to load network',
                        default=10000, type=int)
    parser.add_argument('--soft_nms', help='whether use soft nms', action='store_true')
    parser.add_argument('--cascade', help='whether use cascade', action='store_true')
    parser.add_argument('--cag', dest='class_agnostic',
                        help='whether perform class_agnostic bbox regression',
                        action='store_true')
    args = parser.parse_args()
    return args
# Optimizer hyper-parameters pulled from the global config.
# NOTE(review): none of these is referenced anywhere in this demo script —
# they look copied from the training script; confirm before removing.
lr = cfg.TRAIN.LEARNING_RATE
momentum = cfg.TRAIN.MOMENTUM
weight_decay = cfg.TRAIN.WEIGHT_DECAY
def _get_image_blob(im):
    """Convert an image into a network input blob (an image pyramid).

    Arguments:
        im (ndarray): a color image (HxWx3); the caller loads it via PIL,
            so channel order is presumably RGB — confirm against
            cfg.PIXEL_MEANS, which must match.
    Returns:
        blob (ndarray): data blob holding one resized copy of the image
            per entry in cfg.TEST.SCALES
        im_scale_factors (ndarray): scale applied to each pyramid entry,
            relative to the original image
    """
    # Normalize once: scale to [0, 1] and standardize per channel.
    normalized = im.astype(np.float32, copy=True)  # RGB
    normalized /= 255.0
    normalized -= cfg.PIXEL_MEANS
    normalized /= cfg.PIXEL_STDS

    shorter_side = np.min(normalized.shape[0:2])
    longer_side = np.max(normalized.shape[0:2])

    pyramid = []
    scale_factors = []
    for target_size in cfg.TEST.SCALES:
        # Scale so the short side hits target_size, but never let the long
        # side exceed cfg.TEST.MAX_SIZE.
        scale = float(target_size) / float(shorter_side)
        if np.round(scale * longer_side) > cfg.TEST.MAX_SIZE:
            scale = float(cfg.TEST.MAX_SIZE) / float(longer_side)
        resized = cv2.resize(normalized, None, None, fx=scale, fy=scale,
                             interpolation=cv2.INTER_LINEAR)
        scale_factors.append(scale)
        pyramid.append(resized)

    # Pack all scaled copies into a single batch blob.
    return im_list_to_blob(pyramid), np.array(scale_factors)
if __name__ == '__main__':
    args = parse_args()
    print('Called with args:')
    print(args)
    # NOTE(review): this unconditionally overwrites any --cfg the user passed,
    # deriving the config file from --net instead — kept as-is.
    args.cfg_file = "cfgs/{}.yml".format(args.net)
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    if not os.path.exists(args.result_dir):
        os.mkdir(args.result_dir)
    print('Using config:')
    pprint.pprint(cfg)
    np.random.seed(cfg.RNG_SEED)
    # Locate the checkpoint: weights/<net>/<dataset>[/<exp_name>]/fpn_S_E_P.pth
    if args.exp_name is not None:
        input_dir = args.load_dir + "/" + args.net + "/" + args.dataset + '/' + args.exp_name
    else:
        input_dir = args.load_dir + "/" + args.net + "/" + args.dataset
    if not os.path.exists(input_dir):
        raise Exception('There is no input directory for loading network from ' + input_dir)
    load_name = os.path.join(input_dir,
                             'fpn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint))
    # PASCAL VOC class list (index 0 is background).
    classes = np.asarray(['__background__',
                          'aeroplane', 'bicycle', 'bird', 'boat',
                          'bottle', 'bus', 'car', 'cat', 'chair',
                          'cow', 'diningtable', 'dog', 'horse',
                          'motorbike', 'person', 'pottedplant',
                          'sheep', 'sofa', 'train', 'tvmonitor'])
    # Build the model (cascade or plain FPN head on a DetNet-59 backbone).
    if args.cascade:
        if args.net == 'detnet59':
            fpn = detnet_cascade(classes, 59, pretrained=False, class_agnostic=args.class_agnostic)
        else:
            print("network is not defined")
            pdb.set_trace()
    else:
        if args.net == 'detnet59':
            fpn = detnet_noncascade(classes, 59, pretrained=False, class_agnostic=args.class_agnostic)
        else:
            print("network is not defined")
            pdb.set_trace()
    fpn.create_architecture()
    checkpoint = torch.load(load_name)
    fpn.load_state_dict(checkpoint['model'])
    print('load model successfully!')
    # pdb.set_trace()
    print("load checkpoint %s" % (load_name))

    # initilize the tensor holder here.
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)
    # ship to cuda
    if args.cuda:
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()
    # make variable (volatile=True: old-PyTorch way of disabling autograd
    # for inference)
    im_data = Variable(im_data, volatile=True)
    im_info = Variable(im_info, volatile=True)
    num_boxes = Variable(num_boxes, volatile=True)
    gt_boxes = Variable(gt_boxes, volatile=True)
    if args.cuda:
        cfg.CUDA = True
    if args.cuda:
        fpn.cuda()
    fpn.eval()

    start = time.time()
    max_per_image = 100   # kept for parity with the eval script; not enforced here
    thresh = 0.05         # score threshold for keeping a detection
    vis = True
    imglist = os.listdir(args.image_dir)
    num_images = len(imglist)
    print('Loaded Photo: {} images.'.format(num_images))

    for i in range(num_images):
        # Load the demo image (PIL -> RGB ndarray; grayscale is stacked to 3 channels)
        im_file = os.path.join(args.image_dir, imglist[i])
        # im = cv2.imread(im_file)
        im = np.array(Image.open(im_file))
        if len(im.shape) == 2:
            im = im[:, :, np.newaxis]
            im = np.concatenate((im, im, im), axis=2)
        blobs, im_scales = _get_image_blob(im)
        assert len(im_scales) == 1, "Only single-image batch implemented"
        im_blob = blobs
        im_info_np = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32)
        im_data_pt = torch.from_numpy(im_blob)
        im_data_pt = im_data_pt.permute(0, 3, 1, 2)  # NHWC -> NCHW
        im_info_pt = torch.from_numpy(im_info_np)
        im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt)
        im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt)
        gt_boxes.data.resize_(1, 1, 5).zero_()
        num_boxes.data.resize_(1).zero_()
        # pdb.set_trace()
        det_tic = time.time()
        # rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss = \
        #     fpn(im_data, im_info, gt_boxes, num_boxes)
        ret = fpn(im_data, im_info, gt_boxes, num_boxes)
        rois, cls_prob, bbox_pred = ret[0:3]
        scores = cls_prob.data
        boxes = (rois[:, :, 1:5] / im_scales[0]).data
        if cfg.TEST.BBOX_REG:
            # Apply bounding-box regression deltas
            box_deltas = bbox_pred.data
            if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                # Optionally normalize targets by a precomputed mean and stdev.
                # BUG FIX: the normalization tensors were unconditionally
                # moved to CUDA, crashing CPU-only inference even though
                # --cuda is optional — only move them when CUDA is in use.
                stds = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS)
                means = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                if args.cuda:
                    stds, means = stds.cuda(), means.cuda()
                box_deltas = box_deltas.view(-1, 4) * stds + means
                box_deltas = box_deltas.view(1, -1, 4)
            pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
            pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
        else:
            # Simply repeat the boxes, once for each class.
            # BUG FIX: Tensor.size is a method — `scores.size[1]` raised
            # TypeError; it must be called as `scores.size(1)`.
            pred_boxes = np.tile(boxes, (1, scores.size(1)))
        scores = scores.squeeze()
        pred_boxes = pred_boxes.squeeze()
        # _t['im_detect'].tic()
        det_toc = time.time()
        detect_time = det_toc - det_tic
        misc_tic = time.time()
        if vis:
            im2show = np.copy(im[:, :, ::-1])  # RGB -> BGR for cv2 drawing/saving
        # Per-class NMS (class 0 = background is skipped).
        for j in xrange(1, len(classes)):
            inds = torch.nonzero(scores[:, j] > thresh).view(-1)
            if inds.numel() > 0:
                cls_scores = scores[:, j][inds]
                _, order = torch.sort(cls_scores, 0, True)
                if args.class_agnostic:
                    cls_boxes = pred_boxes[inds, :]
                else:
                    cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
                cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
                cls_dets = cls_dets[order]
                if args.soft_nms:
                    np_dets = cls_dets.cpu().numpy().astype(np.float32)
                    keep = soft_nms(np_dets, method=cfg.TEST.SOFT_NMS_METHOD)  # np_dets will be changed
                    keep = torch.from_numpy(keep).type_as(cls_dets).int()
                    # NOTE(review): `keep` is computed but not applied in the
                    # soft-NMS branch — all boxes are kept with decayed
                    # scores; confirm this matches the intended behavior.
                    cls_dets = torch.from_numpy(np_dets).type_as(cls_dets)
                else:
                    keep = nms(cls_dets, cfg.TEST.NMS)
                    cls_dets = cls_dets[keep.view(-1).long()]
                cls_dets = cls_dets.cpu().numpy()
            else:
                cls_dets = np.array([])
            if vis:
                im2show = vis_detections(im2show, classes[j], cls_dets, thresh=0.5)
        misc_toc = time.time()
        nms_time = misc_toc - misc_tic
        sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s   \r'
                         .format(i + 1, num_images, detect_time, nms_time))
        sys.stdout.flush()
        if vis:
            # cv2.imshow('test', im2show)
            # cv2.waitKey(0)
            result_path = os.path.join(args.result_dir, imglist[i][:-4] + "_det.jpg")
            cv2.imwrite(result_path, im2show)