The MultiBoxLossLayer code in NVCaffe is honestly not that easy to follow; many of its details are best worked out by reading the code against the SSD paper a few times. Before diving into the source, here is a sample parameter configuration for this layer; the walkthrough below refers back to it.
layer {
name: "mbox_loss"
type: "MultiBoxLoss"
bottom: "mbox_loc"
bottom: "mbox_conf"
bottom: "mbox_priorbox"
bottom: "label"
top: "mbox_loss"
include {
phase: TRAIN
}
propagate_down: true
propagate_down: true
propagate_down: false
propagate_down: false
loss_param {
normalization: VALID
}
multibox_loss_param {
loc_loss_type: SMOOTH_L1 //localization loss type
conf_loss_type: SOFTMAX //confidence loss type
loc_weight: 1
num_classes: 5 //number of classes (background class + object classes)
share_location: true
match_type: PER_PREDICTION
overlap_threshold: 0.5
use_prior_for_matching: true
background_label_id: 0 //background class id, usually 0
use_difficult_gt: false
neg_pos_ratio: 3 //positive:negative sample ratio of 1:3
neg_overlap: 0.5 //IoU threshold for negatives
code_type: CENTER_SIZE
ignore_cross_boundary_bbox: false
mining_type: MAX_NEGATIVE //hard example mining strategy
}
}
LayerSetUp
Every layer calls LayerSetUp to perform its layer-specific setup; here that mainly means reading and processing this layer's parameters and setting the shapes of the output blobs.
template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::LayerSetUp(const vector<Blob*>& bottom,
const vector<Blob*>& top) {
LossLayer<Ftype, Btype>::LayerSetUp(bottom, top);
//If the config does not explicitly set propagate_down, these defaults apply.
if (this->layer_param_.propagate_down_size() == 0) {
this->layer_param_.add_propagate_down(true);
this->layer_param_.add_propagate_down(true);
this->layer_param_.add_propagate_down(false);
this->layer_param_.add_propagate_down(false);
}
const MultiBoxLossParameter& multibox_loss_param =
this->layer_param_.multibox_loss_param();
multibox_loss_param_ = this->layer_param_.multibox_loss_param();
//Number of images, i.e. N in (N,C,H,W)
num_ = bottom[0]->num();
/*bottom[2] is mbox_priorbox, the output of PriorBoxLayer, i.e. the generated anchors.
Its shape is (1, 2, N), where N holds the coordinates of all generated anchors; since
each anchor is described by 4 coordinates, dividing by 4 gives the total anchor count.*/
num_priors_ = bottom[2]->height() / 4;
// Get other parameters.
CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
//Number of classes; note this is the number of object classes plus one background class
num_classes_ = multibox_loss_param.num_classes();
CHECK_GE(num_classes_, 1) << "num_classes should not be less than 1.";
/*What does this mean? It says whether all classes share one set of box locations,
i.e. different classes are distinguished on the same box. It is usually true.*/
share_location_ = multibox_loss_param.share_location();
loc_classes_ = share_location_ ? 1 : num_classes_;
background_label_id_ = multibox_loss_param.background_label_id();
use_difficult_gt_ = multibox_loss_param.use_difficult_gt();
//Hard example mining strategy
mining_type_ = multibox_loss_param.mining_type();
if (multibox_loss_param.has_do_neg_mining()) {
LOG(WARNING) << "do_neg_mining is deprecated, use mining_type instead.";
do_neg_mining_ = multibox_loss_param.do_neg_mining();
CHECK_EQ(do_neg_mining_,
mining_type_ != MultiBoxLossParameter_MiningType_NONE);
}
//If set to NONE, no hard example mining is performed
do_neg_mining_ = mining_type_ != MultiBoxLossParameter_MiningType_NONE;
if (!this->layer_param_.loss_param().has_normalization() &&
this->layer_param_.loss_param().has_normalize()) {
normalization_ = this->layer_param_.loss_param().normalize() ?
LossParameter_NormalizationMode_VALID :
LossParameter_NormalizationMode_BATCH_SIZE;
} else {
normalization_ = this->layer_param_.loss_param().normalization();
}
if (do_neg_mining_) {
CHECK(share_location_)
<< "Currently only support negative mining if share_location is true.";
}
vector<int> loss_shape(1, 1);
// Set up localization loss layer.
loc_weight_ = multibox_loss_param.loc_weight();
loc_loss_type_ = multibox_loss_param.loc_loss_type();
// fake shape.
vector<int> loc_shape(1, 1);
loc_shape.push_back(4);
loc_pred_ = Blob::create<Dtype>();
loc_pred_->Reshape(loc_shape);
loc_gt_ = Blob::create<Dtype>();
loc_gt_->Reshape(loc_shape);
loc_bottom_vec_.push_back(loc_pred_.get());
loc_bottom_vec_.push_back(loc_gt_.get());
loc_loss_ = Blob::create<Dtype>();
loc_loss_->Reshape(loss_shape);
loc_top_vec_.push_back(loc_loss_.get());
if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_L2) {
LayerParameter layer_param;
layer_param.set_name(this->layer_param_.name() + "_l2_loc");
layer_param.set_type("EuclideanLoss");
layer_param.add_loss_weight(loc_weight_);
loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
} else if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_SMOOTH_L1) {
LayerParameter layer_param;
layer_param.set_name(this->layer_param_.name() + "_smooth_L1_loc");
layer_param.set_type("SmoothL1Loss");
layer_param.add_loss_weight(loc_weight_);
loc_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
} else {
LOG(FATAL) << "Unknown localization loss type.";
}
// Set up confidence loss layer.
conf_weight_ = multibox_loss_param.conf_weight();
conf_loss_type_ = multibox_loss_param.conf_loss_type();
conf_pred_ = Blob::create<Dtype>();
conf_gt_ = Blob::create<Dtype>();
conf_loss_ = Blob::create<Dtype>();
conf_loss_->Reshape(loss_shape);
conf_top_vec_.push_back(conf_loss_.get());
if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
CHECK_GE(background_label_id_, 0)
<< "background_label_id should be within [0, num_classes) for Softmax.";
CHECK_LT(background_label_id_, num_classes_)
<< "background_label_id should be within [0, num_classes) for Softmax.";
LayerParameter layer_param;
layer_param.set_name(this->layer_param_.name() + "_softmax_conf");
layer_param.set_type("SoftmaxWithLoss");
layer_param.add_loss_weight(conf_weight_);
// layer_param.add_loss_weight(Dtype(1.));
layer_param.mutable_loss_param()->set_normalization(
LossParameter_NormalizationMode_NONE);
SoftmaxParameter* softmax_param = layer_param.mutable_softmax_param();
softmax_param->set_axis(1);
// Fake reshape.
vector<int> conf_shape(1, 1);
conf_gt_->Reshape(conf_shape);
conf_shape.push_back(num_classes_);
conf_pred_->Reshape(conf_shape);
conf_bottom_vec_.push_back(conf_pred_.get());
conf_bottom_vec_.push_back(conf_gt_.get());
conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
} else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
LayerParameter layer_param;
layer_param.set_name(this->layer_param_.name() + "_logistic_conf");
layer_param.set_type("SigmoidCrossEntropyLoss");
layer_param.add_loss_weight(conf_weight_);
// layer_param.add_loss_weight(Dtype(1.));
// Fake reshape.
vector<int> conf_shape(1, 1);
conf_shape.push_back(num_classes_);
conf_gt_->Reshape(conf_shape);
conf_pred_->Reshape(conf_shape);
conf_bottom_vec_.push_back(conf_pred_.get());
conf_bottom_vec_.push_back(conf_gt_.get());
conf_loss_layer_ = LayerRegistry::CreateLayer(layer_param, this->parent_rank());
conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
} else {
LOG(FATAL) << "Unknown confidence loss type.";
}
}
Reshape
template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
const vector<Blob*>& top) {
LossLayer<Ftype, Btype>::Reshape(bottom, top);
num_ = bottom[0]->num();
num_priors_ = bottom[2]->height() / 4; //number of anchors, as explained above
num_gt_ = bottom[3]->height(); //bottom[3] is the label blob; this is the number of gt boxes
CHECK_EQ(bottom[0]->num(), bottom[1]->num());
CHECK_EQ(num_priors_ * loc_classes_ * 4, bottom[0]->channels())
<< "Number of priors must match number of location predictions.";
CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels())
<< "Number of priors must match number of confidence predictions.";
}
bottom[3] is the label blob, and bottom[3]->height() gives the total number of gt boxes. Why? Take AnnotatedDataLayer as the input layer: its output shape is normally (1, 1, N, 8), where the third dimension N is simply defined to be the number of gt boxes in the batch, and the fourth dimension, 8, holds the annotation fields of each gt box (class, coordinates, and so on).
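To make the layout concrete, here is a hypothetical label blob of shape (1, 1, 3, 8) for a batch of two images holding three gt boxes in total (all values are made up for illustration):
// Each row of 8 floats: item_id, group_label, instance_id,
// xmin, ymin, xmax, ymax, difficult (coordinates normalized to [0,1]).
const float gt_data[3 * 8] = {
0, 1, 0, 0.10f, 0.20f, 0.50f, 0.60f, 0, // image 0, class 1
0, 2, 1, 0.55f, 0.30f, 0.90f, 0.80f, 0, // image 0, class 2
1, 1, 0, 0.05f, 0.05f, 0.40f, 0.45f, 0, // image 1, class 1
};
// Here bottom[3]->height() == 3, so num_gt_ == 3.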
Forward
This is the core of the layer and also its most complex part. The main jobs are: 1) matching priors to gts and splitting them into positives and negatives; 2) hard example mining; 3) computing the losses.
template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Forward_cpu(const vector<Blob*>& bottom,
const vector<Blob*>& top) {
const Dtype* loc_data = bottom[0]->cpu_data<Dtype>();
const Dtype* conf_data = bottom[1]->cpu_data<Dtype>();
const Dtype* prior_data = bottom[2]->cpu_data<Dtype>();
const Dtype* gt_data = bottom[3]->cpu_data<Dtype>();
// refinedet
const Dtype* arm_conf_data = NULL;
const Dtype* arm_loc_data = NULL;
vector<LabelBBox> all_arm_loc_preds;
if (bottom.size() >= 5) {
arm_conf_data = bottom[4]->cpu_data<Dtype>();
}
if (bottom.size() >= 6) {
arm_loc_data = bottom[5]->cpu_data<Dtype>();
GetLocPredictions(arm_loc_data, num_, num_priors_, loc_classes_, share_location_,
&all_arm_loc_preds);
}
// Retrieve all ground truth.
map<int, vector<NormalizedBBox> > all_gt_bboxes;
GetGroundTruth(gt_data, num_classes_, num_gt_, background_label_id_, use_difficult_gt_,
&all_gt_bboxes);
// Retrieve all prior bboxes. It is same within a batch since we assume all
// images in a batch are of same dimension.
vector<NormalizedBBox> prior_bboxes;
vector<vector<float> > prior_variances;
GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);
// Retrieve all predictions.
vector<LabelBBox> all_loc_preds;
GetLocPredictions(loc_data, num_, num_priors_, loc_classes_, share_location_,
&all_loc_preds);
// Find matches between source bboxes and ground truth bboxes.
vector<map<int, vector<float> > > all_match_overlaps;
if (bottom.size() >= 6) {
CasRegFindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances,
multibox_loss_param_, &all_match_overlaps, &all_match_indices_, all_arm_loc_preds);
} else {
FindMatches(all_loc_preds, all_gt_bboxes, prior_bboxes, prior_variances,
multibox_loss_param_, &all_match_overlaps, &all_match_indices_);
}
num_matches_ = 0;
int num_negs = 0;
// Sample hard negative (and positive) examples based on mining type.
MineHardExamples<Dtype>(*bottom[1],
all_loc_preds, all_gt_bboxes, prior_bboxes,
prior_variances, all_match_overlaps, multibox_loss_param_,
&num_matches_, &num_negs, &all_match_indices_, &all_neg_indices_, arm_conf_data, do_neg_mining_);
if (num_matches_ >= 1) {
// Form data to pass on to loc_loss_layer_.
vector<int> loc_shape(2);
loc_shape[0] = 1;
loc_shape[1] = num_matches_ * 4;
loc_pred_->Reshape(loc_shape);
loc_gt_->Reshape(loc_shape);
Dtype* loc_pred_data = loc_pred_->mutable_cpu_data<Dtype>();
Dtype* loc_gt_data = loc_gt_->mutable_cpu_data<Dtype>();
if (bottom.size() >= 6) {
CasRegEncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_,
prior_bboxes, prior_variances, multibox_loss_param_,
loc_pred_data, loc_gt_data, all_arm_loc_preds);
} else {
EncodeLocPrediction(all_loc_preds, all_gt_bboxes, all_match_indices_,
prior_bboxes, prior_variances, multibox_loss_param_,
loc_pred_data, loc_gt_data);
}
loc_loss_layer_->Reshape(loc_bottom_vec_, loc_top_vec_);
loc_loss_layer_->Forward(loc_bottom_vec_, loc_top_vec_);
} else {
loc_loss_->mutable_cpu_data<Dtype>()[0] = 0;
}
// Form data to pass on to conf_loss_layer_.
if (do_neg_mining_) {
num_conf_ = num_matches_ + num_negs;
} else {
num_conf_ = num_ * num_priors_;
}
if (0) {
const Solver* solver = this->parent_solver();
if ((solver && solver->display()) || solver==0) {
LOG(INFO) << cv::format("iter %d, do_neg_mining %d, num_matches %d, num_negs %d, num_conf %d, num %d, num_priors %d\n",
this->iter(), do_neg_mining_, num_matches_, num_negs, num_conf_, num_, num_priors_);
}
}
if (num_conf_ >= 1) {
// Reshape the confidence data.
vector<int> conf_shape;
if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX) {
conf_shape.push_back(num_conf_);
conf_bottom_vec_[1]->Reshape(conf_shape);
conf_shape.push_back(num_classes_);
conf_bottom_vec_[0]->Reshape(conf_shape);
} else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
conf_shape.push_back(1);
conf_shape.push_back(num_conf_);
conf_shape.push_back(num_classes_);
conf_bottom_vec_[0]->Reshape(conf_shape);
conf_bottom_vec_[1]->Reshape(conf_shape);
} else {
LOG(FATAL) << "Unknown confidence loss type.";
}
if (!do_neg_mining_) {
// Consider all scores.
// Share data and diff with bottom[1].
CHECK_EQ(conf_pred_->count(), bottom[1]->count());
conf_pred_->ShareData(*(bottom[1]));
}
Dtype* conf_pred_data = conf_pred_->mutable_cpu_data<Dtype>();
Dtype* conf_gt_data = conf_gt_->mutable_cpu_data<Dtype>();
caffe_set(conf_gt_->count(), Dtype(background_label_id_), conf_gt_data);
EncodeConfPrediction(conf_data, num_, num_priors_, multibox_loss_param_,
all_match_indices_, all_neg_indices_, all_gt_bboxes,
conf_pred_data, conf_gt_data, do_neg_mining_);
conf_loss_layer_->Reshape(conf_bottom_vec_, conf_top_vec_);
conf_loss_layer_->Forward(conf_bottom_vec_, conf_top_vec_);
} else {
conf_loss_->mutable_cpu_data<Dtype>()[0] = 0;
}
top[0]->mutable_cpu_data<Dtype>()[0] = 0;
if (this->layer_param_.propagate_down(0)) {
Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
normalization_, num_, num_priors_, num_matches_);
top[0]->mutable_cpu_data<Dtype>()[0] +=
loc_weight_ * loc_loss_->cpu_data<Dtype>()[0] / normalizer;
}
if (this->layer_param_.propagate_down(1)) {
Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
normalization_, num_, num_priors_, num_matches_);
top[0]->mutable_cpu_data<Dtype>()[0] +=
conf_weight_ * conf_loss_->cpu_data<Dtype>()[0] / normalizer;
}
}
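Both loss terms are divided by the same normalizer, computed by LossLayer::GetNormalizer, which is not listed here. A minimal sketch of its behavior as I read it from the SSD-style Caffe code (treat the branch details as an assumption, not a quotation of the NVCaffe source):
// Sketch (assumption): how the normalizer follows from the mode.
// With normalization: VALID, as in the sample config, both losses are
// averaged over the matched (positive) priors.
float GetNormalizerSketch(LossParameter_NormalizationMode mode,
int num, int num_priors, int num_matches) {
switch (mode) {
case LossParameter_NormalizationMode_FULL:
return num * num_priors; // every prior in the batch
case LossParameter_NormalizationMode_VALID:
return num_matches > 0 ? num_matches : 1; // matched priors only
case LossParameter_NormalizationMode_BATCH_SIZE:
return num; // per image
default: // NONE
return 1;
}
}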
Forward_cpu first calls GetGroundTruth to collect every gt box of the current batch into all_gt_bboxes, a map keyed by image id.
template <typename Dtype>
void GetGroundTruth(const Dtype* gt_data, const int num_classes, const int num_gt,
const int background_label_id, const bool use_difficult_gt,
map<int, vector<NormalizedBBox> >* all_gt_bboxes) {
all_gt_bboxes->clear();
for (int i = 0; i < num_gt; ++i) {
//8->item_id,group_label,instance_id,xmin,ymin,xmax,ymax,difficult
int start_idx = i * 8;
int item_id = gt_data[start_idx];
if (item_id == -1) {
continue;
}
//group_label, i.e. the class label
int label = std::round(gt_data[start_idx + 1]);
if (label <= background_label_id) {
DLOG(WARNING) << "Ignoring background label in the dataset: " << gt_data[start_idx + 1];
continue;
}
if (label >= num_classes) {
DLOG(WARNING) << "Ignoring label >= num_classes in the dataset: " << gt_data[start_idx + 1];
continue;
}
bool difficult = static_cast<bool>(gt_data[start_idx + 7]);
if (!use_difficult_gt && difficult) {
// Skip reading difficult ground truth.
continue;
}
NormalizedBBox bbox;
bbox.set_label(label);
bbox.set_xmin(gt_data[start_idx + 3]);
bbox.set_ymin(gt_data[start_idx + 4]);
bbox.set_xmax(gt_data[start_idx + 5]);
bbox.set_ymax(gt_data[start_idx + 6]);
bbox.set_difficult(difficult);
//area
float bbox_size = BBoxSize(bbox);
bbox.set_size(bbox_size);
(*all_gt_bboxes)[item_id].push_back(bbox);
}
}
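Running GetGroundTruth over the hypothetical gt_data shown earlier (with the sample config's num_classes of 5) would leave two entries in the map:
map<int, vector<NormalizedBBox> > all_gt_bboxes;
GetGroundTruth(gt_data, /*num_classes=*/5, /*num_gt=*/3,
/*background_label_id=*/0, /*use_difficult_gt=*/false, &all_gt_bboxes);
// all_gt_bboxes[0] -> the two boxes (labels 1 and 2) of image 0
// all_gt_bboxes[1] -> the single box (label 1) of image 1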
Next comes GetPriorBBoxes. Despite the name, it does not really "get" prior bboxes, because they already exist inside prior_data, which comes from bottom[2]. The function merely splits prior_data into its prior-box part and its variance part, storing them in prior_bboxes and prior_variances respectively.
template <typename Dtype>
void GetPriorBBoxes(const Dtype* prior_data, const int num_priors,
vector<NormalizedBBox>* prior_bboxes,
vector<vector<float> >* prior_variances) {
prior_bboxes->clear();
prior_variances->clear();
for (int i = 0; i < num_priors; ++i) {
int start_idx = i * 4;
NormalizedBBox bbox;
bbox.set_xmin(prior_data[start_idx]);
bbox.set_ymin(prior_data[start_idx + 1]);
bbox.set_xmax(prior_data[start_idx + 2]);
bbox.set_ymax(prior_data[start_idx + 3]);
float bbox_size = BBoxSize(bbox);
bbox.set_size(bbox_size);
prior_bboxes->push_back(bbox);
}
//prior_data has two parts: first all the coordinates, then all the variances
for (int i = 0; i < num_priors; ++i) {
int start_idx = (num_priors + i) * 4;
vector<float> var;
for (int j = 0; j < 4; ++j) {
var.push_back(prior_data[start_idx + j]);
}
prior_variances->push_back(var);
}
}
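The split relies on the shape of mbox_priorbox, (1, 2, num_priors * 4): channel 0 is the flat list of box corners, channel 1 the matching variances. As a toy illustration with two priors (values made up):
// prior_data for num_priors = 2:
// first num_priors*4 floats: xmin, ymin, xmax, ymax of each prior
// next num_priors*4 floats: the four variances of each prior
const float prior_data[2 * 4 * 2] = {
0.0f, 0.0f, 0.2f, 0.2f, // prior 0 corners
0.1f, 0.1f, 0.4f, 0.4f, // prior 1 corners
0.1f, 0.1f, 0.2f, 0.2f, // prior 0 variances
0.1f, 0.1f, 0.2f, 0.2f, // prior 1 variances
};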
Next, GetLocPredictions reads the predicted boxes into loc_preds. We usually set share_location to true, so all classes share one set of predicted box coordinates.
template <typename Dtype>
void GetLocPredictions(const Dtype* loc_data, const int num,
const int num_preds_per_class, const int num_loc_classes,
const bool share_location, vector<LabelBBox>* loc_preds) {
loc_preds->clear();
if (share_location) {
CHECK_EQ(num_loc_classes, 1);
}
loc_preds->resize(num); //important: this effectively allocates the storage
for (int i = 0; i < num; ++i) {
//typedef map<int, vector<NormalizedBBox> > LabelBBox;
LabelBBox& label_bbox = (*loc_preds)[i];
for (int p = 0; p < num_preds_per_class; ++p) { //share_location=>1
int start_idx = p * num_loc_classes * 4;
for (int c = 0; c < num_loc_classes; ++c) {
//if share_location is true, the label is -1;
//otherwise labels run from 0 up to (num_loc_classes - 1)
int label = share_location ? -1 : c;
if (label_bbox.find(label) == label_bbox.end()) {
label_bbox[label].resize(num_preds_per_class);
}
//label_bbox is a map, so a label of -1 is perfectly fine
label_bbox[label][p].set_xmin(loc_data[start_idx + c * 4]);
label_bbox[label][p].set_ymin(loc_data[start_idx + c * 4 + 1]);
label_bbox[label][p].set_xmax(loc_data[start_idx + c * 4 + 2]);
label_bbox[label][p].set_ymax(loc_data[start_idx + c * 4 + 3]);
}
}
loc_data += num_preds_per_class * num_loc_classes * 4;
}
}
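With share_location == true each image therefore contributes num_priors * 4 contiguous floats. A quick indexing sketch (my own summary of the loop above):
// Offset of box p of image i when share_location == true
// (num_loc_classes == 1, stored under the shared map key -1):
int offset = i * num_preds_per_class * 4 + p * 4;
// loc_data[offset + 0..3] are the xmin/ymin/xmax/ymax written into
// (*loc_preds)[i][-1][p] by GetLocPredictions above.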
The SSD paper says the following about the matching strategy:
During training we must first decide which anchor each ground-truth box (gt) in the image is matched to; the matched anchor is then responsible for predicting that gt. In YOLO, the cell containing the gt's center is located, and the bounding box in that cell with the highest IoU predicts it. SSD works quite differently: its anchor-gt matching follows two principles. First, every gt is matched to the anchor having the highest IoU with it, which guarantees that each gt has at least one matching anchor. An anchor matched to a gt is called a positive sample (strictly it is the prediction box associated with the prior that is positive, but since they correspond one-to-one the terms are used interchangeably). Conversely, an anchor that matches no gt can only be matched to the background and becomes a negative sample. An image contains very few gts but a great many anchors, so under the first principle alone nearly all anchors would be negatives and the positives and negatives would be extremely unbalanced; hence the second principle: any remaining unmatched anchor whose IoU with some gt exceeds a threshold (usually 0.5) is also matched to that gt. This means one gt may be matched by several anchors, i.e. several predicted boxes target the same object, which is fine. The reverse is not allowed: an anchor can match at most one gt, and if several gts exceed the threshold for one anchor, the anchor is matched only to the gt with the highest IoU. This anchor-gt matching is carried out in FindMatches (the per-box assignment itself happens in MatchBBox; a sketch of the two-step logic follows the FindMatches listing below).
void FindMatches(const vector<LabelBBox>& all_loc_preds,
const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
const vector<NormalizedBBox>& prior_bboxes,
const vector<vector<float> >& prior_variances,
const MultiBoxLossParameter& multibox_loss_param,
vector<map<int, vector<float> > >* all_match_overlaps,
vector<map<int, vector<int> > >* all_match_indices) {
// all_match_overlaps->clear();
// all_match_indices->clear();
// Get parameters.
CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
const int num_classes = multibox_loss_param.num_classes();
CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
const bool share_location = multibox_loss_param.share_location();
const int loc_classes = share_location ? 1 : num_classes;
const MatchType match_type = multibox_loss_param.match_type(); //matching type
//IoU threshold used to split positives and negatives
const float overlap_threshold = multibox_loss_param.overlap_threshold();
const bool use_prior_for_matching = multibox_loss_param.use_prior_for_matching();
const int background_label_id = multibox_loss_param.background_label_id();
const CodeType code_type = multibox_loss_param.code_type();
const bool encode_variance_in_target =
multibox_loss_param.encode_variance_in_target();
const bool ignore_cross_boundary_bbox =
multibox_loss_param.ignore_cross_boundary_bbox();
// Find the matches.
int num = all_loc_preds.size(); //number of images
for (int i = 0; i < num; ++i) {
map<int, vector<int> > match_indices;
map<int, vector<float> > match_overlaps;
// Check if there is ground truth for current image.
/*Does the current image have any gt? all_gt_bboxes is a map<int, vector<NormalizedBBox>>
keyed by image id; images without gt were simply skipped when GetGroundTruth
filled all_gt_bboxes.*/
if (all_gt_bboxes.find(i) == all_gt_bboxes.end()) {
// There is no gt for current image. All predictions are negative.
//no gt in this image, so every predicted box is naturally a negative
const vector<NormalizedBBox> gt_bboxes;
vector<int> temp_match_indices;
vector<float> temp_match_overlaps;
const int label = -1;
MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
ignore_cross_boundary_bbox, &temp_match_indices,
&temp_match_overlaps, multibox_loss_param.ignore_difficult_gt());
if (share_location) {
match_indices[label] = temp_match_indices;
match_overlaps[label] = temp_match_overlaps;
}
all_match_indices->push_back(match_indices);
all_match_overlaps->push_back(match_overlaps);
continue;
}
// Find match between predictions and ground truth.
const vector<NormalizedBBox>& gt_bboxes = all_gt_bboxes.find(i)->second;
//whether to use the default (prior) boxes for matching; defaults to true
if (!use_prior_for_matching) {
for (int c = 0; c < loc_classes; ++c) {
int label = share_location ? -1 : c;
if (!share_location && label == background_label_id) {
// Ignore background loc predictions.
continue;
}
// Decode the prediction into bbox first.
vector<NormalizedBBox> loc_bboxes;
bool clip_bbox = false;
DecodeBBoxes(prior_bboxes, prior_variances,
code_type, encode_variance_in_target, clip_bbox,
all_loc_preds[i].find(label)->second, &loc_bboxes);
MatchBBox(gt_bboxes, loc_bboxes, label, match_type,
overlap_threshold, ignore_cross_boundary_bbox,
&match_indices[label], &match_overlaps[label],
multibox_loss_param.ignore_difficult_gt());
}
} else {
// Use prior bboxes to match against all ground truth.
vector<int> temp_match_indices;
vector<float> temp_match_overlaps;
const int label = -1;
//the matching of gts against prior boxes is done inside this function
MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
ignore_cross_boundary_bbox, &temp_match_indices,
&temp_match_overlaps, multibox_loss_param.ignore_difficult_gt());
if (share_location) {
match_indices[label] = temp_match_indices;
match_overlaps[label] = temp_match_overlaps;
} else {
// Get ground truth label for each ground truth bbox.
vector<int> gt_labels;
for (int g = 0; g < gt_bboxes.size(); ++g) {
gt_labels.push_back(gt_bboxes[g].label());
}
// Distribute the matching results to different loc_class.
for (int c = 0; c < loc_classes; ++c) {
if (c == background_label_id) {
// Ignore background loc predictions.
continue;
}
match_indices[c].resize(temp_match_indices.size(), -1);
match_overlaps[c] = temp_match_overlaps;
for (int m = 0; m < temp_match_indices.size(); ++m) {
if (temp_match_indices[m] > -1) {
const int gt_idx = temp_match_indices[m];
CHECK_LT(gt_idx, gt_labels.size());
if (c == gt_labels[gt_idx]) {
match_indices[c][m] = gt_idx;
}
}
}
}
}
}
all_match_indices->push_back(match_indices);
all_match_overlaps->push_back(match_overlaps);
}
}
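MatchBBox itself is not listed here. Below is a minimal, illustrative sketch of the paper's two matching principles, under simplifying assumptions (share_location, a plain IoU(a, b) helper, and gts processed greedily in order, whereas the real implementation picks globally best pairs); it is not the NVCaffe MatchBBox:
// match_indices[p] == index of the matched gt, or -1 for a negative.
void MatchSketch(const vector<NormalizedBBox>& gts,
const vector<NormalizedBBox>& priors,
float overlap_threshold,
vector<int>* match_indices) {
match_indices->assign(priors.size(), -1);
// Principle 1: every gt takes its highest-IoU prior, so each gt
// ends up with at least one positive.
for (int g = 0; g < gts.size(); ++g) {
int best_p = -1;
float best_iou = 0.f;
for (int p = 0; p < priors.size(); ++p) {
if ((*match_indices)[p] != -1) continue; // prior already taken
float iou = IoU(gts[g], priors[p]);
if (iou > best_iou) { best_iou = iou; best_p = p; }
}
if (best_p != -1) (*match_indices)[best_p] = g;
}
// Principle 2: each still-unmatched prior is matched to the gt with
// which its IoU is highest, provided that IoU exceeds the threshold.
for (int p = 0; p < priors.size(); ++p) {
if ((*match_indices)[p] != -1) continue;
int best_g = -1;
float best_iou = overlap_threshold;
for (int g = 0; g < gts.size(); ++g) {
float iou = IoU(gts[g], priors[p]);
if (iou > best_iou) { best_iou = iou; best_g = g; }
}
if (best_g != -1) (*match_indices)[p] = best_g;
}
}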
The MineHardExamples function implements the OHNM (Online Hard Negative Mining) scheme proposed for SSD. The mining_type config offers three strategies: NONE, MAX_NEGATIVE, and HARD_EXAMPLE. NONE disables hard example mining. MAX_NEGATIVE ranks candidates by classification loss only (no localization loss) and, among the negatives, keeps the highest-loss ones up to neg_pos_ratio (here 3x) times the number of positives. HARD_EXAMPLE ranks candidates by the sum of the classification and localization losses.
template <typename Dtype>
void MineHardExamples(const Blob& conf_blob,
const vector<LabelBBox>& all_loc_preds,
const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
const vector<NormalizedBBox>& prior_bboxes,
const vector<vector<float> >& prior_variances,
const vector<map<int, vector<float> > >& all_match_overlaps,
const MultiBoxLossParameter& multibox_loss_param,
int* num_matches, int* num_negs,
vector<map<int, vector<int> > >* all_match_indices,
vector<vector<int> >* all_neg_indices) {
int num = all_loc_preds.size();
// CHECK_EQ(num, all_match_overlaps.size());
// CHECK_EQ(num, all_match_indices->size());
// all_neg_indices->clear();
*num_matches = CountNumMatches(*all_match_indices, num);
*num_negs = 0;
int num_priors = prior_bboxes.size();
CHECK_EQ(num_priors, prior_variances.size());
// Get parameters.
CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
const int num_classes = multibox_loss_param.num_classes();
CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
const int background_label_id = multibox_loss_param.background_label_id();
const bool use_prior_for_nms = multibox_loss_param.use_prior_for_nms();
const ConfLossType conf_loss_type = multibox_loss_param.conf_loss_type();
const MiningType mining_type = multibox_loss_param.mining_type();
//NONE means no hard example mining is done
if (mining_type == MultiBoxLossParameter_MiningType_NONE) {
return;
}
const LocLossType loc_loss_type = multibox_loss_param.loc_loss_type();
//ratio of negatives to positives (3 here means pos:neg = 1:3)
const float neg_pos_ratio = multibox_loss_param.neg_pos_ratio();
//IoU threshold below which a prior may be picked as a negative
const float neg_overlap = multibox_loss_param.neg_overlap();
//box coding type, e.g. CENTER_SIZE = (cx, cy, w, h)
const CodeType code_type = multibox_loss_param.code_type();
const bool encode_variance_in_target =
multibox_loss_param.encode_variance_in_target();
const bool has_nms_param = multibox_loss_param.has_nms_param();
float nms_threshold = 0;
int top_k = -1;
if (has_nms_param) {
nms_threshold = multibox_loss_param.nms_param().nms_threshold();
top_k = multibox_loss_param.nms_param().top_k();
}
const int sample_size = multibox_loss_param.sample_size();
// Compute confidence losses based on matching results.
// The conf loss is computed up front; whatever the mining type, it is always needed.
vector<vector<float> > all_conf_loss;
ComputeConfLossGPU<Dtype>(conf_blob, num, num_priors, num_classes,
background_label_id, conf_loss_type, *all_match_indices, all_gt_bboxes,
&all_conf_loss);
vector<vector<float> > all_loc_loss;
//HARD_EXAMPLE computes both the classification and the localization loss
if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) {
// Compute localization losses based on matching results.
TBlob<Dtype> loc_pred, loc_gt;
if (*num_matches != 0) {
vector<int> loc_shape(2, 1);
loc_shape[1] = *num_matches * 4;
loc_pred.Reshape(loc_shape);
loc_gt.Reshape(loc_shape);
Dtype* loc_pred_data = loc_pred.mutable_cpu_data();
Dtype* loc_gt_data = loc_gt.mutable_cpu_data();
EncodeLocPrediction(all_loc_preds, all_gt_bboxes, *all_match_indices,
prior_bboxes, prior_variances, multibox_loss_param,
loc_pred_data, loc_gt_data);
}
ComputeLocLoss(loc_pred, loc_gt, *all_match_indices, num,
num_priors, loc_loss_type, &all_loc_loss);
} else {
// No localization loss.
//MAX_NEGATIVE: only the classification loss is used
for (int i = 0; i < num; ++i) {
vector<float> loc_loss(num_priors, 0.f); //all zeros
all_loc_loss.push_back(loc_loss);
}
}
for (int i = 0; i < num; ++i) {
map<int, vector<int> >& match_indices = (*all_match_indices)[i];
const map<int, vector<float> >& match_overlaps = all_match_overlaps[i];
// loc + conf loss.
const vector<float>& conf_loss = all_conf_loss[i];
const vector<float>& loc_loss = all_loc_loss[i];
vector<float> loss;
std::transform(conf_loss.begin(), conf_loss.end(), loc_loss.begin(),
std::back_inserter(loss), std::plus<float>());
// Pick negatives or hard examples based on loss.
set<int> sel_indices;
vector<int> neg_indices;
for (map<int, vector<int> >::iterator it = match_indices.begin();
it != match_indices.end(); ++it) {
const int label = it->first;
int num_sel = 0;
// Get potential indices and loss pairs.
vector<pair<float, int> > loss_indices;
for (int m = 0; m < match_indices[label].size(); ++m) {
if (IsEligibleMining(mining_type, match_indices[label][m],
match_overlaps.find(label)->second[m], neg_overlap)) {
loss_indices.push_back(std::make_pair(loss[m], m));
++num_sel;
}
}
//MAX_NEGATIVE: cap the number of selected negatives
if (mining_type == MultiBoxLossParameter_MiningType_MAX_NEGATIVE) {
int num_pos = 0;
for (int m = 0; m < match_indices[label].size(); ++m) {
if (match_indices[label][m] > -1) {
++num_pos;
}
}
//derive the negative count from the positive count and neg_pos_ratio, then take the smaller of it and num_sel
num_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), num_sel);
} else if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) {
CHECK_GT(sample_size, 0);
num_sel = std::min(sample_size, num_sel);
}
// Select samples.
if (has_nms_param && nms_threshold > 0) {
// Do nms before selecting samples.
vector<float> sel_loss;
vector<NormalizedBBox> sel_bboxes;
if (use_prior_for_nms) {
for (int m = 0; m < match_indices[label].size(); ++m) {
if (IsEligibleMining(mining_type, match_indices[label][m],
match_overlaps.find(label)->second[m], neg_overlap)) {
sel_loss.push_back(loss[m]);
sel_bboxes.push_back(prior_bboxes[m]);
}
}
} else {
// Decode the prediction into bbox first.
vector<NormalizedBBox> loc_bboxes;
bool clip_bbox = false;
DecodeBBoxes(prior_bboxes, prior_variances,
code_type, encode_variance_in_target, clip_bbox,
all_loc_preds[i].find(label)->second, &loc_bboxes);
for (int m = 0; m < match_indices[label].size(); ++m) {
if (IsEligibleMining(mining_type, match_indices[label][m],
match_overlaps.find(label)->second[m], neg_overlap)) {
sel_loss.push_back(loss[m]);
sel_bboxes.push_back(loc_bboxes[m]);
}
}
}
// Do non-maximum suppression based on the loss.
vector<int> nms_indices;
ApplyNMS(sel_bboxes, sel_loss, nms_threshold, top_k, &nms_indices);
if (nms_indices.size() < num_sel) {
LOG(INFO) << "not enough sample after nms: " << nms_indices.size();
}
// Pick top example indices after nms.
num_sel = std::min(static_cast<int>(nms_indices.size()), num_sel);
for (int n = 0; n < num_sel; ++n) {
sel_indices.insert(loss_indices[nms_indices[n]].second);
}
} else {
// Pick top example indices based on loss.
std::sort(loss_indices.begin(), loss_indices.end(),
SortScorePairDescend<int>);
for (int n = 0; n < num_sel; ++n) {
sel_indices.insert(loss_indices[n].second);
}
}
// Update the match_indices and select neg_indices.
for (int m = 0; m < match_indices[label].size(); ++m) {
if (match_indices[label][m] > -1) {
if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE &&
sel_indices.find(m) == sel_indices.end()) {
match_indices[label][m] = -1;
*num_matches -= 1;
}
} else if (match_indices[label][m] == -1) {
if (sel_indices.find(m) != sel_indices.end()) {
neg_indices.push_back(m);
*num_negs += 1;
}
}
}
}
all_neg_indices->push_back(neg_indices);
}
}
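As a concrete example of the MAX_NEGATIVE branch: with neg_pos_ratio = 3, an image with 10 positives and 500 eligible negatives keeps only the 30 highest-loss negatives (toy numbers):
int num_pos = 10; // matched priors in this image
int num_sel = 500; // eligible negatives (IoU < neg_overlap)
float neg_pos_ratio = 3.f;
num_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), num_sel);
// num_sel == 30: after sorting loss_indices by loss (descending),
// only the 30 hardest negatives enter sel_indices / neg_indices.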
Backward
template <typename Ftype, typename Btype>
void MultiBoxLossLayer<Ftype, Btype>::Backward_cpu(const vector<Blob*>& top,
const vector<bool>& propagate_down,
const vector<Blob*>& bottom) {
if (propagate_down[2]) {
LOG(FATAL) << this->type()
<< " Layer cannot backpropagate to prior inputs.";
}
if (propagate_down[3]) {
LOG(FATAL) << this->type()
<< " Layer cannot backpropagate to label inputs.";
}
// Back propagate on location prediction.
if (propagate_down[0]) {
Dtype* loc_bottom_diff = bottom[0]->mutable_cpu_diff<Dtype>();
caffe_set(bottom[0]->count(), Dtype(0), loc_bottom_diff);
if (num_matches_ >= 1) {
vector<bool> loc_propagate_down;
// Only back propagate on prediction, not ground truth.
loc_propagate_down.push_back(true);
loc_propagate_down.push_back(false);
loc_loss_layer_->Backward(loc_top_vec_, loc_propagate_down,
loc_bottom_vec_);
// Scale gradient.
Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
normalization_, num_, num_priors_, num_matches_);
Dtype loss_weight = top[0]->cpu_diff<Dtype>()[0] / normalizer;
caffe_scal(loc_pred_->count(), loss_weight, loc_pred_->mutable_cpu_diff<Dtype>());
// Copy gradient back to bottom[0].
const Dtype* loc_pred_diff = loc_pred_->cpu_diff<Dtype>();
int count = 0;
for (int i = 0; i < num_; ++i) {
for (map<int, vector<int> >::iterator it =
all_match_indices_[i].begin();
it != all_match_indices_[i].end(); ++it) {
const int label = share_location_ ? 0 : it->first;
const vector<int>& match_index = it->second;
for (int j = 0; j < match_index.size(); ++j) {
if (match_index[j] <= -1) {
continue;
}
// Copy the diff to the right place.
int start_idx = loc_classes_ * 4 * j + label * 4;
caffe_copy(4, loc_pred_diff + count * 4,
loc_bottom_diff + start_idx);
++count;
}
}
loc_bottom_diff += bottom[0]->offset(1);
}
}
}
// Back propagate on confidence prediction.
if (propagate_down[1]) {
Dtype* conf_bottom_diff = bottom[1]->mutable_cpu_diff<Dtype>();
caffe_set(bottom[1]->count(), Dtype(0), conf_bottom_diff);
if (num_conf_ >= 1) {
vector<bool> conf_propagate_down;
// Only back propagate on prediction, not ground truth.
conf_propagate_down.push_back(true);
conf_propagate_down.push_back(false);
conf_loss_layer_->Backward(conf_top_vec_, conf_propagate_down,
conf_bottom_vec_);
// Scale gradient.
Dtype normalizer = LossLayer<Ftype, Btype>::GetNormalizer(
normalization_, num_, num_priors_, num_matches_);
Dtype loss_weight = top[0]->cpu_diff<Dtype>()[0] / normalizer;
caffe_scal(conf_pred_->count(), loss_weight,
conf_pred_->mutable_cpu_diff<Dtype>());
// Copy gradient back to bottom[1].
const Dtype* conf_pred_diff = conf_pred_->cpu_diff<Dtype>();
if (do_neg_mining_) {
int count = 0;
for (int i = 0; i < num_; ++i) {
// Copy matched (positive) bboxes scores' diff.
const map<int, vector<int> >& match_indices = all_match_indices_[i];
for (map<int, vector<int> >::const_iterator it =
match_indices.begin(); it != match_indices.end(); ++it) {
const vector<int>& match_index = it->second;
CHECK_EQ(match_index.size(), num_priors_);
for (int j = 0; j < num_priors_; ++j) {
if (match_index[j] <= -1) {
continue;
}
// Copy the diff to the right place.
caffe_copy(num_classes_,
conf_pred_diff + count * num_classes_,
conf_bottom_diff + j * num_classes_);
++count;
}
}
// Copy negative bboxes scores' diff.
for (int n = 0; n < all_neg_indices_[i].size(); ++n) {
int j = all_neg_indices_[i][n];
CHECK_LT(j, num_priors_);
caffe_copy(num_classes_,
conf_pred_diff + count * num_classes_,
conf_bottom_diff + j * num_classes_);
++count;
}
conf_bottom_diff += bottom[1]->offset(1);
}
} else {
// The diff is already computed and stored.
//bottom[1]->ShareDiff(*conf_pred_);
caffe_copy(conf_pred_->count(),conf_pred_diff,conf_bottom_diff);
}
}
}
if (0) {
float loss_xy = 0, loss_wh = 0, loss_obj = 0, loss_cls = 0;
Blob* loc = bottom[0];
Blob* conf = bottom[1];
int num = loc->shape()[0];
int nboxes = loc->shape()[1] / 4;
CHECK(nboxes == conf->shape()[1] / num_classes_);
for (int n=0; n<num; n++) {
for (int j=0; j<nboxes; j++) {
const float* p = loc->cpu_diff<float>() + n*nboxes*4 + j*4;
loss_xy += std::abs(p[0]);
loss_xy += std::abs(p[1]);
loss_wh += std::abs(p[2]);
loss_wh += std::abs(p[3]);
}
}
for (int n=0; n<num; n++) {
for (int j=0; j<nboxes; j++) {
const float* p = conf->cpu_diff<float>() + n*nboxes*num_classes_ + j*num_classes_;
loss_obj += std::abs(p[0]);
for (int c=1; c<num_classes_; c++) {
loss_cls += std::abs(p[c]);
}
}
}
const Solver* solver = this->parent_solver();
if ((solver && solver->display()) || solver==0) {
//LOG(INFO) << "the number of pred boxes is " << nboxes;
float loss = loss_xy+loss_wh+loss_obj+loss_cls;
char str[1024];
snprintf(str, 1024, "%s, iter %d, loss %g, loss_xy %g, loss_wh %g, loss_obj %g, loss_cls %g\n",
this->name().c_str(), this->iter(),
loss, loss_xy, loss_wh, loss_obj, loss_cls);
LOG(INFO) << str;
}
}
// After backward, remove match statistics.
all_match_indices_.clear();
all_neg_indices_.clear();
}