This post covers prior_box_layer.
This layer generates prior boxes on a given set of feature maps. SSD's approach is interesting: for an input feature map of size W×H, it generates W×H prior-box centers, spread evenly over the whole image. At each center it can place several prior boxes with different aspect ratios, e.g. [1/3, 1/2, 1, 2, 3], so the total number of prior boxes on one feature map is W×H×length_of_aspect_ratio. For a large feature map such as VGG's conv4_3, this adds up to several thousand prior boxes. Boxes at the border also need some handling to keep them inside the image, but that is a detail.
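To get a feel for the numbers, here is my own back-of-the-envelope check, assuming SSD300's layout (conv4_3 is 38×38 with 4 priors per location, fc7 is 19×19 with 6):

#include <cstdio>

int main() {
  // assumed SSD300 layout: conv4_3 is 38x38 with 4 priors per location,
  // fc7 is 19x19 with 6 priors per location
  std::printf("conv4_3: %d priors\n", 38 * 38 * 4);  // 5776 -- "several thousand"
  std::printf("fc7:     %d priors\n", 19 * 19 * 6);  // 2166
  return 0;
}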
Note that although the prior-box centers sit on the W×H grid, the box sizes are not tied to the grid cells; they are set by hand. In the paper, the prior-box scale grows linearly from 0.2 to 0.9 (relative to the input image) as the feature maps go from the lower layers to the higher ones.
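A minimal sketch of the paper's scale rule, s_k = s_min + (s_max − s_min)(k − 1)/(m − 1); the 0.2/0.9 endpoints, m = 6 feature maps, and the 300-pixel input are SSD300 assumptions, and the released prototxts round these values somewhat differently:

#include <cstdio>

int main() {
  // paper's rule: s_k = s_min + (s_max - s_min) * (k - 1) / (m - 1)
  const float s_min = 0.2f, s_max = 0.9f;  // assumed endpoints from the paper
  const int m = 6, img_size = 300;         // assumed: 6 feature maps, SSD300 input
  for (int k = 1; k <= m; ++k) {
    float s_k = s_min + (s_max - s_min) * (k - 1) / (m - 1);
    std::printf("layer %d: s_k = %.2f -> min_size ~ %.0f px\n", k, s_k, s_k * img_size);
  }
  return 0;
}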
One thing that confused me when I first read SSD is the shape matching: SSD fits the bbox offsets with convolutional layers, so shouldn't the output be a feature map? How can it produce exactly 4 coordinates? The trick is a bit brute-force: to output W×H×length_of_aspect_ratio×4 coordinates, it simply uses a convolutional layer with length_of_aspect_ratio×4 output channels, which yields length_of_aspect_ratio×4 feature maps of size W×H. These are then flattened into a vector of length W×H×length_of_aspect_ratio×4 and fitted with a loss such as SmoothL1, and it works surprisingly well…
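As a hedged illustration of that flattening (the variable names here are mine, not from the SSD code): SSD inserts Permute (NCHW → NHWC) and Flatten layers after each loc/conf convolution, so the channel prior×4+coord at location (h, w) lands at a fixed position in the flattened vector:

#include <cstdio>

int main() {
  const int H = 38, W = 38, num_priors = 4;  // conv4_3-style numbers (assumed)
  const int prior = 2, coord = 1, h = 10, w = 20;
  const int c = prior * 4 + coord;           // channel index in the conv output
  const int src = (c * H + h) * W + w;       // position in the NCHW conv blob (single image)
  const int dst = ((h * W + w) * num_priors + prior) * 4 + coord;  // after Permute + Flatten
  std::printf("NCHW index %d -> flattened index %d\n", src, dst);
  return 0;
}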
Code walkthrough:
#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

#include "caffe/layers/prior_box_layer.hpp"

namespace caffe {

template <typename Dtype>
void PriorBoxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // parameter parsing
  const PriorBoxParameter& prior_box_param =
      this->layer_param_.prior_box_param();
  CHECK_GT(prior_box_param.min_size_size(), 0) << "must provide min_size.";
  for (int i = 0; i < prior_box_param.min_size_size(); ++i) {  // min_size_size() = 1
    min_sizes_.push_back(prior_box_param.min_size(i));
    CHECK_GT(min_sizes_.back(), 0) << "min_size must be positive.";
  }
  aspect_ratios_.clear();
  aspect_ratios_.push_back(1.);  // ratio 1 is always added; the prototxt only lists 2, 3 (or just 2)
  flip_ = prior_box_param.flip();  // defaults to true
  for (int i = 0; i < prior_box_param.aspect_ratio_size(); ++i) {  // aspect_ratio_size = 2
    float ar = prior_box_param.aspect_ratio(i);
    bool already_exist = false;
    for (int j = 0; j < aspect_ratios_.size(); ++j) {
      // check whether this ratio is already stored, so each ratio and its reciprocal appear only once
      if (fabs(ar - aspect_ratios_[j]) < 1e-6) {  // at this point aspect_ratios_ only contains 1
        already_exist = true;
        break;  // leave the inner loop
      }
    }
    if (!already_exist) {
      aspect_ratios_.push_back(ar);
      if (flip_) {  // flipping swaps width and height, i.e. adds the reciprocal ratio
        aspect_ratios_.push_back(1./ar);
      }
    }  // after this loop there are 5 ratios: 1, 2, 1/2, 3, 1/3
  }
  num_priors_ = aspect_ratios_.size() * min_sizes_.size();  // min_sizes_.size() = 1, so 5 * 1 = 5
  if (prior_box_param.max_size_size() > 0) {
    CHECK_EQ(prior_box_param.min_size_size(), prior_box_param.max_size_size());  // one max_size per min_size
    for (int i = 0; i < prior_box_param.max_size_size(); ++i) {  // max_size_size = 1
      max_sizes_.push_back(prior_box_param.max_size(i));
      CHECK_GT(max_sizes_[i], min_sizes_[i])
          << "max_size must be greater than min_size.";
      num_priors_ += 1;  // num_priors_ = 6; important -- without max_size there would be only 5, not the 6 from the paper
    }
  }
  clip_ = prior_box_param.clip();  // true here; the default is false
  if (prior_box_param.variance_size() > 1) {  // variance_size = 4
    // Must and only provide 4 variance.
    CHECK_EQ(prior_box_param.variance_size(), 4);
    for (int i = 0; i < prior_box_param.variance_size(); ++i) {  // variance: 0.1 0.1 0.2 0.2
      CHECK_GT(prior_box_param.variance(i), 0);
      variance_.push_back(prior_box_param.variance(i));
    }
  } else if (prior_box_param.variance_size() == 1) {  // or a single value, e.g. 0.1
    CHECK_GT(prior_box_param.variance(0), 0);
    variance_.push_back(prior_box_param.variance(0));
  } else {
    // Set default to 0.1.
    variance_.push_back(0.1);
  }

  if (prior_box_param.has_img_h() || prior_box_param.has_img_w()) {  // explicit image height/width
    CHECK(!prior_box_param.has_img_size())
        << "Either img_size or img_h/img_w should be specified; not both.";
    img_h_ = prior_box_param.img_h();
    CHECK_GT(img_h_, 0) << "img_h should be larger than 0.";
    img_w_ = prior_box_param.img_w();
    CHECK_GT(img_w_, 0) << "img_w should be larger than 0.";
  } else if (prior_box_param.has_img_size()) {
    const int img_size = prior_box_param.img_size();
    CHECK_GT(img_size, 0) << "img_size should be larger than 0.";
    img_h_ = img_size;
    img_w_ = img_size;
  } else {
    img_h_ = 0;
    img_w_ = 0;
  }

  if (prior_box_param.has_step_h() || prior_box_param.has_step_w()) {  // step / step_h / step_w parameters
    CHECK(!prior_box_param.has_step())
        << "Either step or step_h/step_w should be specified; not both.";
    step_h_ = prior_box_param.step_h();
    CHECK_GT(step_h_, 0.) << "step_h should be larger than 0.";
    step_w_ = prior_box_param.step_w();
    CHECK_GT(step_w_, 0.) << "step_w should be larger than 0.";
  } else if (prior_box_param.has_step()) {
    const float step = prior_box_param.step();
    CHECK_GT(step, 0) << "step should be larger than 0.";
    step_h_ = step;
    step_w_ = step;
  } else {
    step_h_ = 0;
    step_w_ = 0;
  }

  offset_ = prior_box_param.offset();  // offset within a cell, defaults to 0.5
}  // end of LayerSetUp
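To double-check the bookkeeping above, here is a standalone re-derivation of num_priors_ (a hypothetical helper of mine, not part of the layer; it skips the duplicate check for brevity):

#include <cassert>
#include <vector>

int NumPriors(int num_min, const std::vector<float>& ratios, bool flip, int num_max) {
  std::vector<float> ars(1, 1.f);  // ratio 1 is always present
  for (float ar : ratios) {
    ars.push_back(ar);
    if (flip) ars.push_back(1.f / ar);  // flip adds the reciprocal
  }
  return static_cast<int>(ars.size()) * num_min + num_max;  // each max_size adds one prior
}

int main() {
  assert(NumPriors(1, {2.f}, true, 1) == 4);       // conv4_3-style: 1, 2, 1/2 plus the sqrt prior
  assert(NumPriors(1, {2.f, 3.f}, true, 1) == 6);  // fc7-style: 1, 2, 1/2, 3, 1/3 plus the sqrt prior
  return 0;
}

The second case reproduces the 6 priors per location quoted in the paper.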
template <typename Dtype>
void PriorBoxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const int layer_width = bottom[0]->width();  // size of the input feature map
  const int layer_height = bottom[0]->height();
  vector<int> top_shape(3, 1);
  // Since all images in a batch has same height and width, we only need to
  // generate one set of priors which can be shared across all images.
  top_shape[0] = 1;
  // 2 channels. First channel stores the mean of each prior coordinate.
  // Second channel stores the variance of each prior coordinate.
  top_shape[1] = 2;
  top_shape[2] = layer_width * layer_height * num_priors_ * 4;
  // this many outputs are needed for the coordinates, similar to Faster R-CNN;
  // note: if max_size is not set in the prototxt, num_priors_ is one smaller
  CHECK_GT(top_shape[2], 0);
  top[0]->Reshape(top_shape);
  // the mbox_priorbox Concat layer uses axis: 2, i.e. priors from different layers
  // are concatenated along this dimension
}
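As a quick sanity check on that shape, assuming SSD300's conv4_3 numbers (38×38 map, 4 priors per location):

#include <cstdio>

int main() {
  const int layer_w = 38, layer_h = 38, num_priors = 4;  // assumed conv4_3 numbers
  std::printf("top shape: 1 x 2 x %d\n", layer_w * layer_h * num_priors * 4);  // 1 x 2 x 23104
  return 0;
}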
template <typename Dtype>
void PriorBoxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const int layer_width = bottom[0]->width();  // feature map from the layer below
  const int layer_height = bottom[0]->height();
  int img_width, img_height;
  if (img_h_ == 0 || img_w_ == 0) {
    img_width = bottom[1]->width();  // from the data layer, i.e. the original image
    img_height = bottom[1]->height();
  } else {
    img_width = img_w_;  // or an explicitly configured image size
    img_height = img_h_;
  }
  float step_w, step_h;
  if (step_w_ == 0 || step_h_ == 0) {
    // derive the stride, analogous to feat_stride in Faster R-CNN, but handled a bit more
    // carefully here with separate width and height values
    step_w = static_cast<float>(img_width) / layer_width;  // computed in float, unlike Faster R-CNN's blunt integer arithmetic
    step_h = static_cast<float>(img_height) / layer_height;
  } else {
    step_w = step_w_;
    step_h = step_h_;
  }
  Dtype* top_data = top[0]->mutable_cpu_data();
  int dim = layer_height * layer_width * num_priors_ * 4;  // usually w * h * 6 * 4; conv4_3 is the exception (see the architecture diagram in my notes)
  int idx = 0;
  for (int h = 0; h < layer_height; ++h) {  // map every feature-map location back to the image, one by one
    for (int w = 0; w < layer_width; ++w) {
      // as in Faster R-CNN, project the feature-map point back onto the original image;
      // the offset_ of 0.5 plays the same rounding/centering role as in the Faster R-CNN Python code
      float center_x = (w + offset_) * step_w;
      float center_y = (h + offset_) * step_h;
      float box_width, box_height;
      for (int s = 0; s < min_sizes_.size(); ++s) {  // min_sizes_.size() = 1
        int min_size_ = min_sizes_[s];
        // min_size grows from 60 at fc7_mbox_priorbox up to 276 at the last layer,
        // i.e. s_k going from 0.2 to 0.92
        // first prior: aspect_ratio = 1, size = min_size
        box_width = box_height = min_size_;
        // xmin
        top_data[idx++] = (center_x - box_width / 2.) / img_width;
        // ymin
        top_data[idx++] = (center_y - box_height / 2.) / img_height;
        // xmax
        top_data[idx++] = (center_x + box_width / 2.) / img_width;
        // ymax
        top_data[idx++] = (center_y + box_height / 2.) / img_height;

        if (max_sizes_.size() > 0) {
          CHECK_EQ(min_sizes_.size(), max_sizes_.size());
          int max_size_ = max_sizes_[s];
          // second prior: aspect_ratio = 1, size = sqrt(min_size * max_size),
          // matching the paper's choice of s'_k, different at every layer
          box_width = box_height = sqrt(min_size_ * max_size_);
          // xmin
          top_data[idx++] = (center_x - box_width / 2.) / img_width;
          // ymin
          top_data[idx++] = (center_y - box_height / 2.) / img_height;
          // xmax
          top_data[idx++] = (center_x + box_width / 2.) / img_width;
          // ymax
          top_data[idx++] = (center_y + box_height / 2.) / img_height;
        }

        // rest of priors
        for (int r = 0; r < aspect_ratios_.size(); ++r) {  // the remaining aspect ratios
          float ar = aspect_ratios_[r];
          if (fabs(ar - 1.) < 1e-6) {
            continue;  // ratio 1 was already handled above
          }
          box_width = min_size_ * sqrt(ar);
          box_height = min_size_ / sqrt(ar);
          // xmin
          top_data[idx++] = (center_x - box_width / 2.) / img_width;
          // ymin
          top_data[idx++] = (center_y - box_height / 2.) / img_height;
          // xmax
          top_data[idx++] = (center_x + box_width / 2.) / img_width;
          // ymax
          top_data[idx++] = (center_y + box_height / 2.) / img_height;
        }
      }  // end for min_sizes_
    }  // end for w
  }  // end for h
  // at this point all prior boxes have been generated, 6 shapes per location as in the paper;
  // each layer uses its own s_k by setting its own min_size
  // clip the prior's coordinate such that it is within [0, 1]
  if (clip_) {  // clip to [0, 1]
    for (int d = 0; d < dim; ++d) {
      top_data[d] = std::min<Dtype>(std::max<Dtype>(top_data[d], 0.), 1.);
    }
  }
  // set the variance.
  // see https://github.com/weiliu89/caffe/issues/75
  // dividing by the variance amplifies the error between predicted and ground-truth boxes,
  // which enlarges the loss and the gradients and speeds up convergence.
  // also, top_data += top[0]->offset(0, 1) moves the pointer forward, so the variances
  // do not overwrite the coordinates written above.
  // offset(n, c, h, w) computes the linear offset for the given indices, so offset(0, 1)
  // advances to the next channel (for a 4-D tensor)
  top_data += top[0]->offset(0, 1);  // advance to the second (variance) channel
  if (variance_.size() == 1) {
    caffe_set<Dtype>(dim, Dtype(variance_[0]), top_data);  // fill top_data with the constant variance_[0]
  } else {
    int count = 0;
    for (int h = 0; h < layer_height; ++h) {
      for (int w = 0; w < layer_width; ++w) {
        for (int i = 0; i < num_priors_; ++i) {
          for (int j = 0; j < 4; ++j) {
            top_data[count] = variance_[j];
            ++count;
          }
        }
      }
    }
  }
}
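The variances written into the second channel only matter when boxes are encoded and decoded: encoding divides the regression targets by the variance (amplifying the loss, as the comment above notes), and decoding multiplies the predicted offsets by it. As a sketch of the decoding side (my own illustration; in the SSD code the real logic is DecodeBBox in bbox_util.cpp with CENTER_SIZE coding):

#include <cmath>
#include <cstdio>

// prior and out are {xmin, ymin, xmax, ymax}; loc is the predicted {dcx, dcy, dw, dh};
// var holds the 4 variances written by PriorBoxLayer (e.g. 0.1 0.1 0.2 0.2)
void DecodeBox(const float prior[4], const float var[4], const float loc[4], float out[4]) {
  const float pw = prior[2] - prior[0], ph = prior[3] - prior[1];
  const float pcx = (prior[0] + prior[2]) / 2.f, pcy = (prior[1] + prior[3]) / 2.f;
  const float cx = pcx + loc[0] * var[0] * pw;   // variance scales the predicted offsets
  const float cy = pcy + loc[1] * var[1] * ph;
  const float w = pw * std::exp(var[2] * loc[2]);
  const float h = ph * std::exp(var[3] * loc[3]);
  out[0] = cx - w / 2.f; out[1] = cy - h / 2.f;
  out[2] = cx + w / 2.f; out[3] = cy + h / 2.f;
}

int main() {
  const float prior[4] = {0.1f, 0.1f, 0.3f, 0.3f};
  const float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};
  const float loc[4] = {0.5f, 0.5f, 0.2f, 0.2f};
  float out[4];
  DecodeBox(prior, var, loc, out);
  std::printf("decoded: %.3f %.3f %.3f %.3f\n", out[0], out[1], out[2], out[3]);
  return 0;
}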
INSTANTIATE_CLASS(PriorBoxLayer);
REGISTER_LAYER_CLASS(PriorBox);

}  // namespace caffe