原理请参考这篇文章
https://www.cnblogs.com/fariver/p/7347197.html
关于每层抽取的特征对应的anchors的种类个数
300x300输入下
fc7层抽取得到的特征
可以看到,后处理后是19x19的grid 每个格子生成24/4=6个anchors对应的location
如上图可以看到类别的预测: fc7对应每个格子 126/6=21,即上面提到的每个anchor都要有一个长度为21的类别向量与之对应
# PriorBox layer on top of fc7 (the 19x19 grid of SSD300).
layer {
name: "fc7_mbox_priorbox"
type: "PriorBox"
# bottom[0]: feature map whose grid defines the anchor centers
bottom: "fc7"
# bottom[1]: input image blob, used to normalize box coordinates
bottom: "data"
top: "fc7_mbox_priorbox"
prior_box_param {
# min_size gives the square prior; max_size adds one extra square
# prior of side sqrt(min_size * max_size)
min_size: 60.0
max_size: 111.0
# with flip: true these expand to ratios {2, 1/2, 3, 1/3}
aspect_ratio: 2.0
aspect_ratio: 3.0
flip: true
# do not clip priors to the [0, 1] image window
clip: false
# encoding variances: 0.1 for center x/y, 0.2 for width/height
variance: 0.10000000149
variance: 0.10000000149
variance: 0.20000000298
variance: 0.20000000298
# stride of one fc7 cell in input pixels (300 / 19 is approx. 16)
step: 16.0
# 0.5 places each anchor center in the middle of its cell
offset: 0.5
}
}
// Generates the prior (anchor) boxes for one feature map.
// bottom[0]: the feature map — its height/width define the anchor grid.
// bottom[1]: the input image blob — its height/width normalize coordinates.
// top[0]:    filled with [xmin, ymin, xmax, ymax] per prior, in [0, 1] units.
// NOTE(review): this excerpt omits the `template <typename Dtype>` line and
// the clip/variance-filling tail present in the upstream caffe source; it
// looks like a truncated quotation — confirm against the full file.
void PriorBoxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int layer_width = bottom[0]->width();
const int layer_height = bottom[0]->height();
int img_width, img_height;
// Image size: use the configured img_h_/img_w_ if set, otherwise read it
// from the data blob (bottom[1]) at runtime.
if (img_h_ == 0 || img_w_ == 0) {
img_width = bottom[1]->width();
img_height = bottom[1]->height();
} else {
img_width = img_w_;
img_height = img_h_;
}
float step_w, step_h;
// Step (stride of one feature-map cell in input pixels): use the configured
// value if set, otherwise derive it as image size / feature-map size.
if (step_w_ == 0 || step_h_ == 0) {
step_w = static_cast<float>(img_width) / layer_width;
step_h = static_cast<float>(img_height) / layer_height;
} else {
step_w = step_w_;
step_h = step_h_;
}
Dtype* top_data = top[0]->mutable_cpu_data();
// Total coordinate count for this map; used by the clipping code that
// follows in the upstream source (unused within this excerpt).
int dim = layer_height * layer_width * num_priors_ * 4;
int idx = 0;
// One pass over every grid cell; each cell emits num_priors_ boxes
// centered at (center_x, center_y).
for (int h = 0; h < layer_height; ++h) {
for (int w = 0; w < layer_width; ++w) {
// offset_ (typically 0.5) places the center in the middle of the cell.
float center_x = (w + offset_) * step_w;
float center_y = (h + offset_) * step_h;
float box_width, box_height;
for (int s = 0; s < min_sizes_.size(); ++s) {
// NOTE(review): float size is truncated to int here (as in upstream
// caffe); fractional min_sizes lose their fractional part.
int min_size_ = min_sizes_[s];
// first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size_;
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;
if (max_sizes_.size() > 0) {
CHECK_EQ(min_sizes_.size(), max_sizes_.size());
int max_size_ = max_sizes_[s];
// second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
box_width = box_height = sqrt(min_size_ * max_size_);
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;
}
// rest of priors
// One prior per non-unit aspect ratio, area min_size^2, with width and
// height scaled by sqrt(ar) and 1/sqrt(ar) respectively.
for (int r = 0; r < aspect_ratios_.size(); ++r) {
float ar = aspect_ratios_[r];
// Ratio 1 was already emitted as the first prior above.
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size_ * sqrt(ar);
box_height = min_size_ / sqrt(ar);
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;
}
}
}
}
结合caffe源码:
// Parse prior_box_param once at setup time and cache every derived value
// (sizes, aspect ratios, variances, image size, step, offset) in members.
void PriorBoxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const PriorBoxParameter& param = this->layer_param_.prior_box_param();

  // Minimum box sizes: at least one is mandatory, all must be positive.
  CHECK_GT(param.min_size_size(), 0) << "must provide min_size.";
  for (int i = 0; i < param.min_size_size(); ++i) {
    min_sizes_.push_back(param.min_size(i));
    CHECK_GT(min_sizes_.back(), 0) << "min_size must be positive.";
  }

  // Aspect ratios: 1.0 is always present; near-duplicates (within 1e-6)
  // are dropped, and flip adds the reciprocal of each new ratio.
  aspect_ratios_.clear();
  aspect_ratios_.push_back(1.);
  flip_ = param.flip();
  for (int i = 0; i < param.aspect_ratio_size(); ++i) {
    const float ratio = param.aspect_ratio(i);
    bool duplicate = false;
    for (int j = 0; j < aspect_ratios_.size(); ++j) {
      if (fabs(ratio - aspect_ratios_[j]) < 1e-6) {
        duplicate = true;
        break;
      }
    }
    if (duplicate) {
      continue;
    }
    aspect_ratios_.push_back(ratio);
    if (flip_) {
      aspect_ratios_.push_back(1./ratio);
    }
  }

  // One prior per (aspect ratio, min_size) pair...
  num_priors_ = aspect_ratios_.size() * min_sizes_.size();
  // ...plus one extra sqrt(min*max) square prior per max_size entry.
  if (param.max_size_size() > 0) {
    CHECK_EQ(param.min_size_size(), param.max_size_size());
    for (int i = 0; i < param.max_size_size(); ++i) {
      max_sizes_.push_back(param.max_size(i));
      CHECK_GT(max_sizes_[i], min_sizes_[i])
          << "max_size must be greater than min_size.";
      num_priors_ += 1;
    }
  }

  clip_ = param.clip();

  // Variances: exactly 4 positive values, one positive value, or the
  // implicit 0.1 default.
  if (param.variance_size() > 1) {
    // Must and only provide 4 variance.
    CHECK_EQ(param.variance_size(), 4);
    for (int i = 0; i < param.variance_size(); ++i) {
      CHECK_GT(param.variance(i), 0);
      variance_.push_back(param.variance(i));
    }
  } else if (param.variance_size() == 1) {
    CHECK_GT(param.variance(0), 0);
    variance_.push_back(param.variance(0));
  } else {
    // Set default to 0.1.
    variance_.push_back(0.1);
  }

  // Image size: explicit h/w, a single square size, or 0 meaning
  // "read it from bottom[1] at forward time".
  if (param.has_img_h() || param.has_img_w()) {
    CHECK(!param.has_img_size())
        << "Either img_size or img_h/img_w should be specified; not both.";
    img_h_ = param.img_h();
    CHECK_GT(img_h_, 0) << "img_h should be larger than 0.";
    img_w_ = param.img_w();
    CHECK_GT(img_w_, 0) << "img_w should be larger than 0.";
  } else if (param.has_img_size()) {
    const int img_size = param.img_size();
    CHECK_GT(img_size, 0) << "img_size should be larger than 0.";
    img_h_ = img_size;
    img_w_ = img_size;
  } else {
    img_h_ = 0;
    img_w_ = 0;
  }

  // Step: same three-way scheme as the image size; 0 means "derive from
  // the image/feature-map size ratio at forward time".
  if (param.has_step_h() || param.has_step_w()) {
    CHECK(!param.has_step())
        << "Either step or step_h/step_w should be specified; not both.";
    step_h_ = param.step_h();
    CHECK_GT(step_h_, 0.) << "step_h should be larger than 0.";
    step_w_ = param.step_w();
    CHECK_GT(step_w_, 0.) << "step_w should be larger than 0.";
  } else if (param.has_step()) {
    const float step = param.step();
    CHECK_GT(step, 0) << "step should be larger than 0.";
    step_h_ = step;
    step_w_ = step;
  } else {
    step_h_ = 0;
    step_w_ = 0;
  }

  offset_ = param.offset();
}
看上图layer的定义可知, aspect_ratio有两个, 程序生成的方式是
1个min_size 方形的anchors
1个maxsize 方形的anchors
根据我们加上去的两个aspect_ratio,经过flip翻倍后共4种比例,基于min_size生成4个anchors
一共是6个,没错了。
同理:
conv8_2_mbox_loc 分别是3x3个格子,每个格子4个候选目标
# PriorBox layer on top of conv8_2 (the 3x3 grid of SSD300).
layer {
name: "conv8_2_mbox_priorbox"
type: "PriorBox"
# bottom[0]: feature map whose grid defines the anchor centers
bottom: "conv8_2"
# bottom[1]: input image blob, used to normalize box coordinates
bottom: "data"
top: "conv8_2_mbox_priorbox"
prior_box_param {
# square priors of size 213 and sqrt(213 * 264)
min_size: 213.0
max_size: 264.0
# with flip: true this expands to ratios {2, 1/2} -> 4 priors per cell
aspect_ratio: 2.0
flip: true
clip: false
# encoding variances: 0.1 for center x/y, 0.2 for width/height
variance: 0.10000000149
variance: 0.10000000149
variance: 0.20000000298
variance: 0.20000000298
# stride of one conv8_2 cell in input pixels
step: 100.0
offset: 0.5
}
}
对于每个格子,这里会有4个不同的框生成.
对于loss函数的分析
# MultiBoxLoss: joint localization + classification loss over all priors.
layer {
name: "mbox_loss"
type: "MultiBoxLoss"
# bottom[0]: concatenated location predictions
bottom: "mbox_loc"
# bottom[1]: concatenated class confidences
bottom: "mbox_conf"
# bottom[2]: concatenated prior boxes
bottom: "mbox_priorbox"
# bottom[3]: ground-truth annotations
bottom: "label"
top: "mbox_loss"
include {
phase: TRAIN
}
# Gradients flow only into the loc/conf predictions, not into the
# priors or the labels.
propagate_down: true
propagate_down: true
propagate_down: false
propagate_down: false
loss_param {
normalization: VALID
}
multibox_loss_param {
loc_loss_type: SMOOTH_L1
conf_loss_type: SOFTMAX
# balances the localization loss against the confidence loss
loc_weight: 1.0
# 20 object classes + 1 background (VOC)
num_classes: 21
share_location: true
match_type: PER_PREDICTION
# IoU above this threshold makes a prior a positive match
overlap_threshold: 0.5
use_prior_for_matching: true
background_label_id: 0
use_difficult_gt: true
# at most 3 negatives per positive (used by MAX_NEGATIVE mining)
neg_pos_ratio: 3.0
neg_overlap: 0.5
code_type: CENTER_SIZE
ignore_cross_boundary_bbox: false
mining_type: MAX_NEGATIVE
}
}
上面是layer的定义,这里主要关注的是这么几个参数:
loc_loss_type: SMOOTH_L1 # 位置回归loss的类型
conf_loss_type: SOFTMAX #类别回归的loss类型
loc_weight: 1.0 #显然需要权重来平衡上面两个Loss的数量级
num_classes: 21
#loc_classes_ = share_location_ ? 1 : num_classes_;
# 意味着是否一个box只对应一个21维的类别向量,
# 在faster-rcnn中相当于false, 另外这个倘若是false则会大大增加预测location的矩阵的大小,直接*class_num个
share_location: true
match_type: PER_PREDICTION
#判别为正样本的IoU阈值,大于该值为正样本
overlap_threshold: 0.5
use_prior_for_matching: true
#指定了背景类的id
background_label_id: 0
c++源码:
bool difficult = static_cast<bool>(gt_data[start_idx + 7]);
if (!use_difficult_gt && difficult) {
// Skip reading difficult ground truth.
continue;
}
use_difficult_gt: true
neg_pos_ratio: 3.0
// Decide whether a prior box may take part in negative mining.
// match_idx == -1 marks a prior that was not matched to any ground truth.
inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
    const float match_overlap, const float neg_overlap) {
  switch (mining_type) {
    case MultiBoxLossParameter_MiningType_MAX_NEGATIVE:
      // Only unmatched priors whose best overlap stays below neg_overlap.
      return match_idx == -1 && match_overlap < neg_overlap;
    case MultiBoxLossParameter_MiningType_HARD_EXAMPLE:
      // Every prior is a candidate; hardness is ranked by loss elsewhere.
      return true;
    default:
      return false;
  }
}
neg_overlap: 0.5
code_type: CENTER_SIZE
#有时候预测的框会超出原图边界(归一化坐标下例如 xmin<0 或 xmax>1.0),设为false则接受(不忽略)这种情况
ignore_cross_boundary_bbox: false
#MAX_NEGATIVE:结合neg_pos_ratio, 使得每幅图里面的负样本数量最多是正样本的3倍(此neg_pos_ratio: 3.0 )
# HARD_EXAMPLE : 除了正样本的数量,剩下的都是负样本的数量
mining_type: MAX_NEGATIVE #HARD_EXAMPLE