SSD源码解读3-MultiBoxLossLayer-CSDN博客

本文链接：https://blog.csdn.net/qianqing13579/article/details/80146425

SSD源码解读系列的第3篇，这篇博客对SSD中的MultiBoxLossLayer代码进行解读，MultiBoxLossLayer实现了priorbox的分类和回归

SSD源码阅读的时候，我对SSD源码创建了QT工程，这样方便阅读，SSD源码的QT工程我上传到CSDN了，该工程用QT可以直接打开的，大家可以直接下载该QT工程阅读，提高阅读效率。
点击下载

MultiBoxLossLayer源码解读

#include <algorithm>
#include <map>
#include <utility>
#include <vector>

#include "caffe/layers/multibox_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe
{

/*
 *
    bottom: "mbox_loc"
    bottom: "mbox_conf"
    bottom: "mbox_priorbox"
    bottom: "label"
    top: "mbox_loss"
 *
 *
 */

template <typename Dtype>
void MultiBoxLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top)
{
  LossLayer<Dtype>::LayerSetUp(bottom, top);
  if (this->layer_param_.propagate_down_size() == 0)
  {
    this->layer_param_.add_propagate_down(true);
    this->layer_param_.add_propagate_down(true);
    this->layer_param_.add_propagate_down(false);
    this->layer_param_.add_propagate_down(false);
  }
  const MultiBoxLossParameter& multibox_loss_param =
      this->layer_param_.multibox_loss_param();
  multibox_loss_param_ = this->layer_param_.multibox_loss_param();

  num_ = bottom[0]->num();
  num_priors_ = bottom[2]->height() / 4; // 每幅图像中priorbox的数量
  // Get other parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  num_classes_ = multibox_loss_param.num_classes(); // 类别数
  CHECK_GE(num_classes_, 1) << "num_classes should not be less than 1.";
  share_location_ = multibox_loss_param.share_location();// true
  loc_classes_ = share_location_ ? 1 : num_classes_; // 1
  background_label_id_ = multibox_loss_param.background_label_id(); // 0
  use_difficult_gt_ = multibox_loss_param.use_difficult_gt(); // true
  mining_type_ = multibox_loss_param.mining_type();// 默认为MAX_NEGATIVE
  if (multibox_loss_param.has_do_neg_mining()) // true
  {
    LOG(WARNING) << "do_neg_mining is deprecated, use mining_type instead.";
    do_neg_mining_ = multibox_loss_param.do_neg_mining();
    CHECK_EQ(do_neg_mining_,
             mining_type_ != MultiBoxLossParameter_MiningType_NONE);
  }
  do_neg_mining_ = mining_type_ != MultiBoxLossParameter_MiningType_NONE; // true

  if (!this->layer_param_.loss_param().has_normalization() &&
      this->layer_param_.loss_param().has_normalize())
  {
    normalization_ = this->layer_param_.loss_param().normalize() ?
                     LossParameter_NormalizationMode_VALID :
                     LossParameter_NormalizationMode_BATCH_SIZE;
  }
  else
  {
    normalization_ = this->layer_param_.loss_param().normalization();
  }

  if (do_neg_mining_) {
    CHECK(share_location_)
        << "Currently only support negative mining if share_location is true.";
  }

  vector<int> loss_shape(1, 1);
  // Set up localization loss layer.
  loc_weight_ = multibox_loss_param.loc_weight(); // 1.0
  loc_loss_type_ = multibox_loss_param.loc_loss_type();// SMOOTH_L1
  // fake shape.
  vector<int> loc_shape(1, 1);
  loc_shape.push_back(4);
  loc_pred_.Reshape(loc_shape);
  loc_gt_.Reshape(loc_shape);
  loc_bottom_vec_.push_back(&loc_pred_);
  loc_bottom_vec_.push_back(&loc_gt_);
  loc_loss_.Reshape(loss_shape);
  loc_top_vec_.push_back(&loc_loss_);
  if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_L2)
  {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_l2_loc");
    layer_param.set_type("EuclideanLoss");
    layer_param.add_loss_weight(loc_weight_);
    loc_loss_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
  } else if (loc_loss_type_ == MultiBoxLossParameter_LocLossType_SMOOTH_L1) {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_smooth_L1_loc");
    layer_param.set_type("SmoothL1Loss");
    layer_param.add_loss_weight(loc_weight_);
    loc_loss_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    loc_loss_layer_->SetUp(loc_bottom_vec_, loc_top_vec_);
  } else {
    LOG(FATAL) << "Unknown localization loss type.";
  }
  // Set up confidence loss layer.
  conf_loss_type_ = multibox_loss_param.conf_loss_type();
  conf_bottom_vec_.push_back(&conf_pred_);
  conf_bottom_vec_.push_back(&conf_gt_);
  conf_loss_.Reshape(loss_shape);
  conf_top_vec_.push_back(&conf_loss_);
  if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX)
  {
    CHECK_GE(background_label_id_, 0)
        << "background_label_id should be within [0, num_classes) for Softmax.";
    CHECK_LT(background_label_id_, num_classes_)
        << "background_label_id should be within [0, num_classes) for Softmax.";
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_softmax_conf");
    layer_param.set_type("SoftmaxWithLoss");
    layer_param.add_loss_weight(Dtype(1.));
    layer_param.mutable_loss_param()->set_normalization(
        LossParameter_NormalizationMode_NONE);
    SoftmaxParameter* softmax_param = layer_param.mutable_softmax_param();
    softmax_param->set_axis(1);
    // Fake reshape.
    vector<int> conf_shape(1, 1);
    conf_gt_.Reshape(conf_shape);
    conf_shape.push_back(num_classes_);
    conf_pred_.Reshape(conf_shape);
    conf_loss_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  }
  else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
    LayerParameter layer_param;
    layer_param.set_name(this->layer_param_.name() + "_logistic_conf");
    layer_param.set_type("SigmoidCrossEntropyLoss");
    layer_param.add_loss_weight(Dtype(1.));
    // Fake reshape.
    vector<int> conf_shape(1, 1);
    conf_shape.push_back(num_classes_);
    conf_gt_.Reshape(conf_shape);
    conf_pred_.Reshape(conf_shape);
    conf_loss_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    conf_loss_layer_->SetUp(conf_bottom_vec_, conf_top_vec_);
  } else {
    LOG(FATAL) << "Unknown confidence loss type.";
  }
}

template <typename Dtype>
void MultiBoxLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top)
{
  LossLayer<Dtype>::Reshape(bottom, top);
  num_ = bottom[0]->num();// 图像的数量
  num_priors_ = bottom[2]->height() / 4; // priorbox的数量
  num_gt_ = bottom[3]->height();
  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
  CHECK_EQ(num_priors_ * loc_classes_ * 4, bottom[0]->channels())
      << "Number of priors must match number of location predictions.";
  CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels())
      << "Number of priors must match number of confidence predictions.";
}

/*
 *
 * Forward_cpu的主要流程：
    FindMatches:确定哪些priorbox是正样本，哪些是负样本，存放在all_match_indices_中
    MineHardExamples：Minig出符合条件的负样本
    计算正样本的定位loss
    计算所有正样本+Mining出来的负样本的分类loss
    最后的loss为定位和分类loss的加权和
 *
 *
 */

template <typename Dtype>
void MultiBoxLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top)
{
  /*
   *
    bottom: "mbox_loc"
    bottom: "mbox_conf"
    bottom: "mbox_priorbox"
    bottom: "label"
    top: "mbox_loss"
   *
   */
  const Dtype* loc_data = bottom[0]->cpu_data(); // mbox_loc,用于定位
  const Dtype* conf_data = bottom[1]->cpu_data(); // mbox_conf，用于分类，表示置信度
  const Dtype* prior_data = bottom[2]->cpu_data();// mbox_priorbox，生成的所有box
  const Dtype* gt_data = bottom[3]->cpu_data(); // label，也就是batchsize图像的坐标

  // 获取batchsize中所有图像的label(目标在图像中的坐标)
  // Retrieve all ground truth.
  map<int, vector<NormalizedBBox> > all_gt_bboxes;
  GetGroundTruth(gt_data, num_gt_, background_label_id_, use_difficult_gt_,
                 &all_gt_bboxes);


  // 获取所有priorbbox的坐标和权重
  // Retrieve all prior bboxes. It is same within a batch since we assume all
  // images in a batch are of same dimension.
  // 一旦所有的priorbox参数设置好，对于每一幅图像来说，所有的default box都是一样的
  vector<NormalizedBBox> prior_bboxes;
  vector<vector<float> > prior_variances;
  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);

  // Retrieve all predictions.
  // 获取每幅图像的所有的位置预测,share_location为true时all_loc_preds中map<>中的first就是-1,all_loc_pred可以等价于vector<vector<NormalizedBBox>> all_loc_preds
  vector<LabelBBox> all_loc_preds; // typedef map<int, vector<NormalizedBBox> > LabelBBox;
  GetLocPredictions(loc_data,
                    num_, // 图像的数量
                    num_priors_, // priorbox的数量(这里num_priors是一幅图像的所有priorbox)
                    loc_classes_,// 1
                    share_location_, // true
                    &all_loc_preds);

  // Find matches between source bboxes and ground truth bboxes.
  /* 计算batchsize中每一幅图像中与每个priorbox的IOU最大( >overlap_threshold，比如0.5)的那个ground truth box的序号(如果没有序号为-1，overlap为0)
   * 这一步就是确定哪些是正样本，哪些是负样本,与mtcnn中生成正负样本的原理是一样的
   */
  vector<map<int, vector<float> > > all_match_overlaps;//
  FindMatches(all_loc_preds,// 所有的位置预测
              all_gt_bboxes,// batchsize中所有图像对应的label
              prior_bboxes,// 所有priorbbox的坐标和权重
              prior_variances,
              multibox_loss_param_,
              &all_match_overlaps,// batchsize中每一幅图像中与每个priorbox的IOU最大( >overlap_threshold，比如0.5)的groundtruth的IOU值,vector<map<int, vector<float> > > all_match_overlaps;，这里的map中first为label,由于share_location,所以first为-1
              &all_match_indices_);// batchsize中每一幅图像中与每个priorbox的IOU最大( >overlap_threshold，比如0.5)的那个ground truth box的序号，vector<map<int, vector<int> > > all_match_indices_;这里的map中first为label,由于share_location,所以first为-1

  num_matches_ = 0;// 所有图像中的正样本
  int num_negs = 0;

  // Sample hard negative (and positive) examples based on mining type.
  /* 做Mining,挑选出难例,由于要计算loss，所以需要知道priorbox的类别，所以先要FindMatches
   * 对每个priorbox进行分类和回归，计算分类loss,定位loss
       * MAX_NEGATIVE只针对负样本选择(如果只对负样本做Minig,则只计算分类loss，不计算定位loss)
       * HARD_EXAMPLE会同时对正和负样本做Mining(也就是会同时计算分类和定位loss)

    * 这里以MAX_NEGATIVE为例，挑选出符合条件的负样本
    *
    * MineHardExamples中ComputeLocLoss，ComputeConfLoss计算loss的时候，batchsize中每一幅图像独立计算
    * 因为最后Mine的时候，会对每一幅图像做Mining
   */
  MineHardExamples(*bottom[1], all_loc_preds, all_gt_bboxes, prior_bboxes,
                   prior_variances, all_match_overlaps, multibox_loss_param_,
                   &num_matches_, &num_negs, &all_match_indices_,
                   &all_neg_indices_);

  // 计算正样本的定位loss(batchsize幅图像中所有的正样本一起计算)
  if (num_matches_ >= 1)
  {
    // Form data to pass on to loc_loss_layer_.
    vector<int> loc_shape(2);
    loc_shape[0] = 1;
    loc_shape[1] = num_matches_ * 4; // 匹配到的数量*4
    loc_pred_.Reshape(loc_shape); // blob which stores the matched location prediction
    loc_gt_.Reshape(loc_shape); // blob which stores the corresponding matched ground truth
    Dtype* loc_pred_data = loc_pred_.mutable_cpu_data();
    Dtype* loc_gt_data = loc_gt_.mutable_cpu_data();


    // MineHardExamples中计算定位Loss的时候也是这么算的
    EncodeLocPrediction(all_loc_preds, // bottom[0]->cpu_data(); mbox_loc,用于定位
                        all_gt_bboxes, // batchsize中所有图像对应的坐标
                        all_match_indices_, // batchsize中每一幅图像的每个priorbox IOU最大的那个ground truth box的序号(如果没有就为-1)
                        prior_bboxes,// 所有priorbbox的坐标和权重
                        prior_variances,
                        multibox_loss_param_,
                        loc_pred_data,// batchsize幅图像中所有正样本priorbox的网络预测值(这里的正样本priorbox就是有匹配的priorbox)
                        loc_gt_data); // batchsize幅图像中所有正样本priorbox的groundtruth，即groundtruth与priorbox坐标的偏移量，也就是目标在priorbox中的坐标

    // 使用Smooth_L1 loss
    // Mining中计算loss的函数ComputeLocLoss也是使用的同样的原理，ComputeLocLoss中负样本的定位Loss为0
    // loc_bottom_vec_.push_back(&loc_pred_);
    // loc_bottom_vec_.push_back(&loc_gt_);
    // loc_top_vec_.push_back(&loc_loss_);
    loc_loss_layer_->Reshape(loc_bottom_vec_, loc_top_vec_);
    loc_loss_layer_->Forward(loc_bottom_vec_, loc_top_vec_);
  }
  else
  {
    loc_loss_.mutable_cpu_data()[0] = 0;
  }

  // 计算所有正样本+Mining出来的负样本的分类loss
  // Form data to pass on to conf_loss_layer_.
  if (do_neg_mining_)
  {
    num_conf_ = num_matches_ + num_negs; // 所有的正样本加上Mining出来的负样本
  }
  else
  {
    num_conf_ = num_ * num_priors_;
  }

  if (num_conf_ >= 1)
  {
    // Reshape the confidence data.
    vector<int> conf_shape;
    if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_SOFTMAX)
    {
      conf_shape.push_back(num_conf_);
      conf_gt_.Reshape(conf_shape);
      conf_shape.push_back(num_classes_);
      conf_pred_.Reshape(conf_shape);
    }
    else if (conf_loss_type_ == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
      conf_shape.push_back(1);
      conf_shape.push_back(num_conf_);
      conf_shape.push_back(num_classes_);
      conf_gt_.Reshape(conf_shape);
      conf_pred_.Reshape(conf_shape);
    } else {
      LOG(FATAL) << "Unknown confidence loss type.";
    }
    if (!do_neg_mining_) {
      // Consider all scores.
      // Share data and diff with bottom[1].
      CHECK_EQ(conf_pred_.count(), bottom[1]->count());
      conf_pred_.ShareData(*(bottom[1]));
    }
    Dtype* conf_pred_data = conf_pred_.mutable_cpu_data();// blob which stores the confidence prediction.
    Dtype* conf_gt_data = conf_gt_.mutable_cpu_data();// blob which stores the corresponding ground truth label.
    caffe_set(conf_gt_.count(), Dtype(background_label_id_), conf_gt_data);


    // 计算分类loss，要知道网络预测值和groundtruth
    EncodeConfPrediction(conf_data, // mbox_conf，用于分类，表示置信度
                         num_,
                         num_priors_,
                         multibox_loss_param_,
                         all_match_indices_,// batchsize中每一幅图像的每个priorbox IOU最大的那个ground truth box的序号(如果没有就为-1)
                         all_neg_indices_, // 只计算hard样本
                         all_gt_bboxes,// batchsize中所有图像对应的坐标
                         conf_pred_data,// 所有正样本+Mining出来的负样本的priorbbox的网络预测值
                         conf_gt_data);// conf_pred_data中所有样本的label

    conf_loss_layer_->Reshape(conf_bottom_vec_, conf_top_vec_);// conf_bottom_vec_.push_back(&conf_pred_);conf_bottom_vec_.push_back(&conf_gt_);
    conf_loss_layer_->Forward(conf_bottom_vec_, conf_top_vec_);// conf_loss_.Reshape(loss_shape);
  }
  else
  {
    conf_loss_.mutable_cpu_data()[0] = 0;
  }

  // 最后的loss为定位和分类loss的加权和
  top[0]->mutable_cpu_data()[0] = 0;
  if (this->layer_param_.propagate_down(0))
  {
    Dtype normalizer = LossLayer<Dtype>::GetNormalizer(normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data()[0] +=loc_weight_ * loc_loss_.cpu_data()[0] / normalizer;
  }

  if (this->layer_param_.propagate_down(1))
  {
    Dtype normalizer = LossLayer<Dtype>::GetNormalizer(normalization_, num_, num_priors_, num_matches_);
    top[0]->mutable_cpu_data()[0] += conf_loss_.cpu_data()[0] / normalizer;
  }
}

template <typename Dtype>
void MultiBoxLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {

  if (propagate_down[2]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to prior inputs.";
  }
  if (propagate_down[3]) {
    LOG(FATAL) << this->type()
        << " Layer cannot backpropagate to label inputs.";
  }

  // Back propagate on location prediction.
  if (propagate_down[0]) {
    Dtype* loc_bottom_diff = bottom[0]->mutable_cpu_diff();
    caffe_set(bottom[0]->count(), Dtype(0), loc_bottom_diff);
    if (num_matches_ >= 1) {
      vector<bool> loc_propagate_down;
      // Only back propagate on prediction, not ground truth.
      loc_propagate_down.push_back(true);
      loc_propagate_down.push_back(false);
      loc_loss_layer_->Backward(loc_top_vec_, loc_propagate_down,
                                loc_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer<Dtype>::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer;
      caffe_scal(loc_pred_.count(), loss_weight, loc_pred_.mutable_cpu_diff());
      // Copy gradient back to bottom[0].
      const Dtype* loc_pred_diff = loc_pred_.cpu_diff();
      int count = 0;
      for (int i = 0; i < num_; ++i) {
        for (map<int, vector<int> >::iterator it =
             all_match_indices_[i].begin();
             it != all_match_indices_[i].end(); ++it) {
          const int label = share_location_ ? 0 : it->first;
          const vector<int>& match_index = it->second;
          for (int j = 0; j < match_index.size(); ++j) {
            if (match_index[j] <= -1) {
              continue;
            }
            // Copy the diff to the right place.
            int start_idx = loc_classes_ * 4 * j + label * 4;
            caffe_copy<Dtype>(4, loc_pred_diff + count * 4,
                              loc_bottom_diff + start_idx);
            ++count;
          }
        }
        loc_bottom_diff += bottom[0]->offset(1);
      }
    }
  }

  // Back propagate on confidence prediction.
  if (propagate_down[1]) {
    Dtype* conf_bottom_diff = bottom[1]->mutable_cpu_diff();
    caffe_set(bottom[1]->count(), Dtype(0), conf_bottom_diff);
    if (num_conf_ >= 1) {
      vector<bool> conf_propagate_down;
      // Only back propagate on prediction, not ground truth.
      conf_propagate_down.push_back(true);
      conf_propagate_down.push_back(false);
      conf_loss_layer_->Backward(conf_top_vec_, conf_propagate_down,
                                 conf_bottom_vec_);
      // Scale gradient.
      Dtype normalizer = LossLayer<Dtype>::GetNormalizer(
          normalization_, num_, num_priors_, num_matches_);
      Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer;
      caffe_scal(conf_pred_.count(), loss_weight,
                 conf_pred_.mutable_cpu_diff());
      // Copy gradient back to bottom[1].
      const Dtype* conf_pred_diff = conf_pred_.cpu_diff();
      if (do_neg_mining_) {
        int count = 0;
        for (int i = 0; i < num_; ++i) {
          // Copy matched (positive) bboxes scores' diff.
          const map<int, vector<int> >& match_indices = all_match_indices_[i];
          for (map<int, vector<int> >::const_iterator it =
               match_indices.begin(); it != match_indices.end(); ++it) {
            const vector<int>& match_index = it->second;
            CHECK_EQ(match_index.size(), num_priors_);
            for (int j = 0; j < num_priors_; ++j) {
              if (match_index[j] <= -1) {
                continue;
              }
              // Copy the diff to the right place.
              caffe_copy<Dtype>(num_classes_,
                                conf_pred_diff + count * num_classes_,
                                conf_bottom_diff + j * num_classes_);
              ++count;
            }
          }
          // Copy negative bboxes scores' diff.
          for (int n = 0; n < all_neg_indices_[i].size(); ++n) {
            int j = all_neg_indices_[i][n];
            CHECK_LT(j, num_priors_);
            caffe_copy<Dtype>(num_classes_,
                              conf_pred_diff + count * num_classes_,
                              conf_bottom_diff + j * num_classes_);
            ++count;
          }
          conf_bottom_diff += bottom[1]->offset(1);
        }
      } else {
        // The diff is already computed and stored.
        bottom[1]->ShareDiff(conf_pred_);
      }
    }
  }

  // After backward, remove match statistics.
  all_match_indices_.clear();
  all_neg_indices_.clear();
}

INSTANTIATE_CLASS(MultiBoxLossLayer);
REGISTER_LAYER_CLASS(MultiBoxLoss);

}  // namespace caffe

其中有几个比较重要的函数FindMatches(),MineHardExamples(),下面对他们详细解读一下

FindMatches

FindMatche函数就是寻找每一幅图像中与每个priorbox匹配的ground truth，这里匹配的意思就是所有IOU>阈值中的最大的ground truth

void FindMatches(const vector<LabelBBox>& all_loc_preds,
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      const vector<NormalizedBBox>& prior_bboxes,
      const vector<vector<float> >& prior_variances,
      const MultiBoxLossParameter& multibox_loss_param,
      vector<map<int, vector<float> > >* all_match_overlaps,
      vector<map<int, vector<int> > >* all_match_indices)
{
  // all_match_overlaps->clear();
  // all_match_indices->clear();
  // Get parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  const int num_classes = multibox_loss_param.num_classes();
  CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
  const bool share_location = multibox_loss_param.share_location();//  true
  const int loc_classes = share_location ? 1 : num_classes; // 类别数
  const MatchType match_type = multibox_loss_param.match_type(); // match_type: PER_PREDICTION,per_prediction
  const float overlap_threshold = multibox_loss_param.overlap_threshold();// 0.5
  const bool use_prior_for_matching =multibox_loss_param.use_prior_for_matching(); // true
  const int background_label_id = multibox_loss_param.background_label_id();// 0
  const CodeType code_type = multibox_loss_param.code_type(); // CENTER_SIZE
  const bool encode_variance_in_target =multibox_loss_param.encode_variance_in_target(); // false
  const bool ignore_cross_boundary_bbox =multibox_loss_param.ignore_cross_boundary_bbox(); // false

  // Find the matches.
  int num = all_loc_preds.size();
  for (int i = 0; i < num; ++i) // 所有图像
  {
    map<int, vector<int> > match_indices;
    map<int, vector<float> > match_overlaps;
    // Check if there is ground truth for current image.
    if (all_gt_bboxes.find(i) == all_gt_bboxes.end())
    {
      // There is no gt for current image. All predictions are negative.
      all_match_indices->push_back(match_indices);
      all_match_overlaps->push_back(match_overlaps);
      continue;
    }
    // Find match between predictions and ground truth.
    const vector<NormalizedBBox>& gt_bboxes = all_gt_bboxes.find(i)->second;
    if (!use_prior_for_matching)
    {
      for (int c = 0; c < loc_classes; ++c)
      {
        int label = share_location ? -1 : c;
        if (!share_location && label == background_label_id)
        {
          // Ignore background loc predictions.
          continue;
        }
        // Decode the prediction into bbox first.
        vector<NormalizedBBox> loc_bboxes;
        bool clip_bbox = false;
        DecodeBBoxes(prior_bboxes, prior_variances,
                     code_type, encode_variance_in_target, clip_bbox,
                     all_loc_preds[i].find(label)->second, &loc_bboxes);
        MatchBBox(gt_bboxes, loc_bboxes, label, match_type,
                  overlap_threshold, ignore_cross_boundary_bbox,
                  &match_indices[label], &match_overlaps[label]);
      }
    }
    else
    {
      // Use prior bboxes to match against all ground truth.
      vector<int> temp_match_indices; // 存放batchsize中每一幅图像的每个priorbox IOU最大的那个ground truth box
      vector<float> temp_match_overlaps; // 存放batchsize中每一幅图像的每个priorbox IOU最大的那个ground truth box的IOU值
      const int label = -1;
      // 对每一幅图像的ground-truth bbox和prior_bboxes 进行匹配
      // 计算每个与priorbbox的IOU最大(>overlap_threshold)的那个ground truth
      MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
                ignore_cross_boundary_bbox, &temp_match_indices,
                &temp_match_overlaps);
      if (share_location)
      {
        match_indices[label] = temp_match_indices;
        match_overlaps[label] = temp_match_overlaps;
      }
      else
      {
        // Get ground truth label for each ground truth bbox.
        vector<int> gt_labels;
        for (int g = 0; g < gt_bboxes.size(); ++g)
        {
          gt_labels.push_back(gt_bboxes[g].label());
        }
        // Distribute the matching results to different loc_class.
        for (int c = 0; c < loc_classes; ++c)
        {
          if (c == background_label_id)
          {
            // Ignore background loc predictions.
            continue;
          }
          match_indices[c].resize(temp_match_indices.size(), -1);
          match_overlaps[c] = temp_match_overlaps;
          for (int m = 0; m < temp_match_indices.size(); ++m)
          {
            if (temp_match_indices[m] > -1)
            {
              const int gt_idx = temp_match_indices[m];
              CHECK_LT(gt_idx, gt_labels.size());
              if (c == gt_labels[gt_idx])
              {
                match_indices[c][m] = gt_idx;
              }
            }
          }
        }
      }
    }
    all_match_indices->push_back(match_indices);
    all_match_overlaps->push_back(match_overlaps);
  }
}

void MatchBBox(const vector<NormalizedBBox>& gt_bboxes,
    const vector<NormalizedBBox>& pred_bboxes, const int label,// const int label = -1;
    const MatchType match_type, const float overlap_threshold,
    const bool ignore_cross_boundary_bbox, // false
    vector<int>* match_indices, vector<float>* match_overlaps)
{
  int num_pred = pred_bboxes.size();
  match_indices->clear();
  match_indices->resize(num_pred, -1);
  match_overlaps->clear();
  match_overlaps->resize(num_pred, 0.);

  int num_gt = 0;
  vector<int> gt_indices;
  if (label == -1)
  {
    // label -1 means comparing against all ground truth.
    num_gt = gt_bboxes.size();
    for (int i = 0; i < num_gt; ++i)
    {
      gt_indices.push_back(i);
    }
  }
  else
  {
    // Count number of ground truth boxes which has the desired label.
    for (int i = 0; i < gt_bboxes.size(); ++i)
    {
      if (gt_bboxes[i].label() == label)
      {
        num_gt++;
        gt_indices.push_back(i);
      }
    }
  }
  if (num_gt == 0)
  {
    return;
  }

  // Store the positive overlap between predictions and ground truth.
  map<int, map<int, float> > overlaps; // 该priorbox与每个ground truth box的IOU

  // 计算与每个priorbbox有交集的最大IOU的ground truth box
  for (int i = 0; i < num_pred; ++i)
  {
    if (ignore_cross_boundary_bbox && IsCrossBoundaryBBox(pred_bboxes[i]))
    {
      (*match_indices)[i] = -2;
      continue;
    }
    for (int j = 0; j < num_gt; ++j)
    {
      float overlap = JaccardOverlap(pred_bboxes[i], gt_bboxes[gt_indices[j]]);

      // 有交集
      if (overlap > 1e-6)
      {
         // 计算最大的IOU
         // 计算与第i个priorbbox的IOU最大的ground truth
        (*match_overlaps)[i] = std::max((*match_overlaps)[i], overlap);
        overlaps[i][j] = overlap; // 保存所有的IOU的值
      }
    }
  }

  // Bipartite matching.
  vector<int> gt_pool;
  for (int i = 0; i < num_gt; ++i)
  {
    gt_pool.push_back(i);
  }
  while (gt_pool.size() > 0)
  {
    // Find the most overlapped gt and cooresponding predictions.
    int max_idx = -1;
    int max_gt_idx = -1;
    float max_overlap = -1;
    for (map<int, map<int, float> >::iterator it = overlaps.begin();it != overlaps.end(); ++it)
    {
      int i = it->first;// priorbox的序号
      if ((*match_indices)[i] != -1)
      {
        // The prediction already has matched ground truth or is ignored.
        continue;
      }

      // 遍历该prior box与所有ground truth box的IOU,找出最大的IOU对应的ground truth
      for (int p = 0; p < gt_pool.size(); ++p)
      {
        int j = gt_pool[p]; // ground truth的序号
        if (it->second.find(j) == it->second.end())
        {
          // No overlap between the i-th prediction and j-th ground truth.
          continue;
        }
        // Find the maximum overlapped pair.
        if (it->second[j] > max_overlap)
        {
          // If the prediction has not been matched to any ground truth,
          // and the overlap is larger than maximum overlap, update.
          max_idx = i;
          max_gt_idx = j;
          max_overlap = it->second[j];
        }
      }
    }
    if (max_idx == -1)
    {
      // Cannot find good match.
      break;
    }
    else
    {
      CHECK_EQ((*match_indices)[max_idx], -1);

      (*match_indices)[max_idx] = gt_indices[max_gt_idx];
      (*match_overlaps)[max_idx] = max_overlap;

      // Erase the ground truth.
      gt_pool.erase(std::find(gt_pool.begin(), gt_pool.end(), max_gt_idx));
    }
  }

  switch (match_type)
  {
    case MultiBoxLossParameter_MatchType_BIPARTITE:
      // Already done.
      break;
    case MultiBoxLossParameter_MatchType_PER_PREDICTION:
      // Get most overlaped for the rest prediction bboxes.
      for (map<int, map<int, float> >::iterator it = overlaps.begin();
           it != overlaps.end(); ++it)
      {
        int i = it->first;
        if ((*match_indices)[i] != -1)
        {
          // The prediction already has matched ground truth or is ignored.
          continue;
        }
        int max_gt_idx = -1;
        float max_overlap = -1;
        for (int j = 0; j < num_gt; ++j)
        {
          if (it->second.find(j) == it->second.end())
          {
            // No overlap between the i-th prediction and j-th ground truth.
            continue;
          }
          // Find the maximum overlapped pair.
          float overlap = it->second[j];
          if (overlap >= overlap_threshold && overlap > max_overlap)
          {
            // If the prediction has not been matched to any ground truth,
            // and the overlap is larger than maximum overlap, update.
            max_gt_idx = j;
            max_overlap = overlap;
          }
        }
        if (max_gt_idx != -1)
        {
          // Found a matched ground truth.
          CHECK_EQ((*match_indices)[i], -1);

          // 寻找到了最大的ground truth,以及最大的IOU
          (*match_indices)[i] = gt_indices[max_gt_idx]; // 与第i个priorbox的IOU最大的是第gt_indices[max_gt_idx]即max_gt_idx个ground truth
          (*match_overlaps)[i] = max_overlap;
        }
      }
      break;
    default:
      LOG(FATAL) << "Unknown matching type.";
      break;
  }

  return;
}

MineHardExamples

MineHardExamples就是做Mining的

template <typename Dtype>
void MineHardExamples(const Blob<Dtype>& conf_blob,
    const vector<LabelBBox>& all_loc_preds,
    const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
    const vector<NormalizedBBox>& prior_bboxes,
    const vector<vector<float> >& prior_variances,
    const vector<map<int, vector<float> > >& all_match_overlaps,
    const MultiBoxLossParameter& multibox_loss_param,
    int* num_matches, int* num_negs,
    vector<map<int, vector<int> > >* all_match_indices,
    vector<vector<int> >* all_neg_indices)
{
  int num = all_loc_preds.size();
  // CHECK_EQ(num, all_match_overlaps.size());
  // CHECK_EQ(num, all_match_indices->size());
  // all_neg_indices->clear();
  *num_matches = CountNumMatches(*all_match_indices, num);// 所有图像中的正样本
  *num_negs = 0;
  int num_priors = prior_bboxes.size();
  CHECK_EQ(num_priors, prior_variances.size());
  // Get parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  const int num_classes = multibox_loss_param.num_classes();
  CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
  const int background_label_id = multibox_loss_param.background_label_id();
  const bool use_prior_for_nms = multibox_loss_param.use_prior_for_nms();
  const ConfLossType conf_loss_type = multibox_loss_param.conf_loss_type();
  const MiningType mining_type = multibox_loss_param.mining_type();
  if (mining_type == MultiBoxLossParameter_MiningType_NONE)
  {
    return;
  }
  const LocLossType loc_loss_type = multibox_loss_param.loc_loss_type();
  const float neg_pos_ratio = multibox_loss_param.neg_pos_ratio();
  const float neg_overlap = multibox_loss_param.neg_overlap();
  const CodeType code_type = multibox_loss_param.code_type();
  const bool encode_variance_in_target =
      multibox_loss_param.encode_variance_in_target();
  const bool has_nms_param = multibox_loss_param.has_nms_param();
  float nms_threshold = 0;
  int top_k = -1;
  if (has_nms_param)
  {
    nms_threshold = multibox_loss_param.nms_param().nms_threshold();
    top_k = multibox_loss_param.nms_param().top_k();
  }
  const int sample_size = multibox_loss_param.sample_size();
  // Compute confidence losses based on matching results.
  vector<vector<float> > all_conf_loss;
#ifdef CPU_ONLY

  /*计算batchsize中每一幅图像每个priorbox的分类loss(softmax loss)，每个priorbox的类别是通过是否有对应的groundtruth来决定的(有就是正样本，没有就是负)
   *
    总结：
    MineHardExamples中ComputeConfLoss，EncodeLocPrediction，ComputeLocLoss基本采用的是相同的思路：

    // 遍历batchsize中的每一幅图像
    for (int i = 0; i < num; ++i)
    {
        // 获取每幅图像中与每个priorbox匹配的那个ground truth的序号(其实就是与每个priorbox的IOU最大的那个ground-truth的序号)
        const vector<int>& match_index;

        // 遍历每一个priorbox
        for (int j = 0; j < match_index.size(); ++j)
        {
            // 负样本
            if (match_index[j] <= -1)
            {
            }
        }

    }

   */
  ComputeConfLoss(conf_blob.cpu_data(),
                  num,
                  num_priors,
                  num_classes,// 21,分类类别数
                  background_label_id,
                  conf_loss_type,
                  *all_match_indices,
                  all_gt_bboxes,
                  &all_conf_loss);
#else
  ComputeConfLossGPU(conf_blob, num, num_priors, num_classes,
      background_label_id, conf_loss_type, *all_match_indices, all_gt_bboxes,
      &all_conf_loss);
#endif
  /* 计算batchsize中每一幅图像的每个priorbox的定位loss(负样本为0)
       * MAX_NEGATIVE只针对负样本选择(如果只对负样本做Minig,则只计算分类loss，不计算定位loss)
       * HARD_EXAMPLE会同时对正和负样本做Mining(也就是会同时计算分类和定位loss)
   */
  vector<vector<float> > all_loc_loss;// 如果是MAX_NEGATIVE，设置为0
  if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE)
  {
    // Compute localization losses based on matching results.
    Blob<Dtype> loc_pred, loc_gt;
    if (*num_matches != 0)
    {
      vector<int> loc_shape(2, 1);
      loc_shape[1] = *num_matches * 4;
      loc_pred.Reshape(loc_shape);
      loc_gt.Reshape(loc_shape);
      Dtype* loc_pred_data = loc_pred.mutable_cpu_data();
      Dtype* loc_gt_data = loc_gt.mutable_cpu_data();

      // 计算batchsize幅图像所有正样本的预测值和ground truth
      EncodeLocPrediction(all_loc_preds,
                          all_gt_bboxes,
                          *all_match_indices,
                          prior_bboxes,
                          prior_variances,
                          multibox_loss_param,
                          loc_pred_data, // batchsize幅图像中所有正样本priorbox的网络预测值(这里的正样本priorbox就是有匹配的priorbox)
                          loc_gt_data); // batchsize幅图像中所有正样本priorbox的groundtruth，即groundtruth与priorbox坐标的偏移量，也就是目标在priorbox中的坐标
    }

    // 计算batchsize中每一幅图像的每个priorbox的定位loss,这个loss就是|网络预测值-正样本priorbox与ground truth的偏移|
    // 对于负样本，该loss为0
    ComputeLocLoss(loc_pred, loc_gt, *all_match_indices, num,
                   num_priors, loc_loss_type, &all_loc_loss);
  }
  else
  {
    // No localization loss.
    for (int i = 0; i < num; ++i) {
      vector<float> loc_loss(num_priors, 0.f);
      all_loc_loss.push_back(loc_loss);
    }
  }

  // 每一幅图像独立计算
  for (int i = 0; i < num; ++i)
  {
    map<int, vector<int> >& match_indices = (*all_match_indices)[i];
    const map<int, vector<float> >& match_overlaps = all_match_overlaps[i];

    // loc + conf loss.
    const vector<float>& conf_loss = all_conf_loss[i];
    const vector<float>& loc_loss = all_loc_loss[i];
    vector<float> loss;
    std::transform(conf_loss.begin(), conf_loss.end(), loc_loss.begin(),
                   std::back_inserter(loss), std::plus<float>());

    // Pick negatives or hard examples based on loss.
    // 基于定位和分类两个loss做mining，这里以MAX_NEGATIVE为例
    set<int> sel_indices;
    vector<int> neg_indices; // 每幅图像选择出来的负样本priorbox的序号
    for (map<int, vector<int> >::iterator it = match_indices.begin();it != match_indices.end(); ++it)
    {
      const int label = it->first;
      int num_sel = 0;
      // Get potential indices and loss pairs.
      // 挑选所有负样本prior_box的loss和该priorbox的序号
      /*这里假设为MAX_NEGATIVE
       * MAX_NEGATIVE只针对负样本选择(如果只对负样本做Minig,则只计算分类loss，不计算定位loss)
       * HARD_EXAMPLE会同时对正和负样本做Mining(也就是会同时计算分类和定位loss)
        // Mining type during training.
        //   NONE : use all negatives.
        //   MAX_NEGATIVE : select negatives based on the score.
        //   HARD_EXAMPLE : select hard examples based on "Training Region-based Object Detectors with Online Hard Example Mining", Shrivastava et.al.
       */
      vector<pair<float, int> > loss_indices;
      for (int m = 0; m < match_indices[label].size(); ++m) // 遍历每一个priorbox
      {
        // 对于MAX_NEGATIVE，判断是否为负样本
        if (IsEligibleMining(mining_type, match_indices[label][m],match_overlaps.find(label)->second[m], neg_overlap)) 
        {
          // 该prior_box的loss和该priorbox的序号
          loss_indices.push_back(std::make_pair(loss[m], m));
          ++num_sel;
        }
      }
      if (mining_type == MultiBoxLossParameter_MiningType_MAX_NEGATIVE)
        {
          // 计算正样本个数
        int num_pos = 0;
        for (int m = 0; m < match_indices[label].size(); ++m) // 判断每个Priorbox的类别
        {
          if (match_indices[label][m] > -1)
          {
            ++num_pos;
          }
        }
        // 计算负样本的个数
        num_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), num_sel);
      } else if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE) {
        CHECK_GT(sample_size, 0);
        num_sel = std::min(sample_size, num_sel);
      }
      // Select samples，这里执行else
      if (has_nms_param && nms_threshold > 0)
      {
        // Do nms before selecting samples.
        vector<float> sel_loss;
        vector<NormalizedBBox> sel_bboxes;
        if (use_prior_for_nms) {
          for (int m = 0; m < match_indices[label].size(); ++m)
          {
            if (IsEligibleMining(mining_type, match_indices[label][m],
                match_overlaps.find(label)->second[m], neg_overlap))
            {
              sel_loss.push_back(loss[m]);
              sel_bboxes.push_back(prior_bboxes[m]);
            }
          }
        }
        else
        {
          // Decode the prediction into bbox first.
          vector<NormalizedBBox> loc_bboxes;
          bool clip_bbox = false;
          DecodeBBoxes(prior_bboxes, prior_variances,
                       code_type, encode_variance_in_target, clip_bbox,
                       all_loc_preds[i].find(label)->second, &loc_bboxes);
          for (int m = 0; m < match_indices[label].size(); ++m)
          {
            if (IsEligibleMining(mining_type, match_indices[label][m],match_overlaps.find(label)->second[m], neg_overlap)) 
            {
              sel_loss.push_back(loss[m]);
              sel_bboxes.push_back(loc_bboxes[m]);
            }
          }
        }
        // Do non-maximum suppression based on the loss.
        vector<int> nms_indices;
        ApplyNMS(sel_bboxes, sel_loss, nms_threshold, top_k, &nms_indices);
        if (nms_indices.size() < num_sel) {
          LOG(INFO) << "not enough sample after nms: " << nms_indices.size();
        }
        // Pick top example indices after nms.
        num_sel = std::min(static_cast<int>(nms_indices.size()), num_sel);
        for (int n = 0; n < num_sel; ++n) {
          sel_indices.insert(loss_indices[nms_indices[n]].second);
        }
      }
      else
      {
         // 对负样本的loss排序
        // Pick top example indices based on loss.
        std::sort(loss_indices.begin(), loss_indices.end(),SortScorePairDescend<int>);

        // 选择符合条件的负样本
        for (int n = 0; n < num_sel; ++n)
        {
          sel_indices.insert(loss_indices[n].second);
        }
      }
      // Update the match_indices and select neg_indices.
      for (int m = 0; m < match_indices[label].size(); ++m)
      {
        if (match_indices[label][m] > -1)
        {
          if (mining_type == MultiBoxLossParameter_MiningType_HARD_EXAMPLE &&sel_indices.find(m) == sel_indices.end())
          {
            match_indices[label][m] = -1;
            *num_matches -= 1;
          }
        } else if (match_indices[label][m] == -1) {
          if (sel_indices.find(m) != sel_indices.end()) {
            neg_indices.push_back(m);// 负样本的priorbox
            *num_negs += 1;
          }
        }
      }
    }
    all_neg_indices->push_back(neg_indices); // 所有的hard sample
  }
}

// 计算所有priorbox的softmax loss，每个priorbox的类别是通过是否有对应的groundtruth来决定的
template <typename Dtype>
void ComputeConfLoss(const Dtype* conf_data, const int num,
      const int num_preds_per_class, // priorbox的数量
                     const int num_classes,// 21分类
      const int background_label_id, const ConfLossType loss_type,
      const vector<map<int, vector<int> > >& all_match_indices,
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      vector<vector<float> >* all_conf_loss)
{
  CHECK_LT(background_label_id, num_classes);

  // CHECK_EQ(num, all_match_indices.size());
  all_conf_loss->clear();
  for (int i = 0; i < num; ++i) // 每幅图像
  {
    vector<float> conf_loss;
    const map<int, vector<int> >& match_indices = all_match_indices[i];
    for (int p = 0; p < num_preds_per_class; ++p) // 每一幅图像所有priorbox
    {
      int start_idx = p * num_classes; // 该priorbox在预测中的起始位置

      // 获取该priorbox的label，priorbox的label通过是否有匹配的ground truth来确定
      int label = background_label_id;
      for (map<int, vector<int> >::const_iterator it =match_indices.begin(); it != match_indices.end(); ++it) 
      {
          // 每幅图像中与每个priorbox匹配的那个ground truth的序号(其实就是与每个priorbox的IOU最大的那个ground-truth的序号)
        const vector<int>& match_index = it->second;
        CHECK_EQ(match_index.size(), num_preds_per_class);
        if (match_index[p] > -1)  // 找到了该priorbox对应的groundtruth
        {
          CHECK(all_gt_bboxes.find(i) != all_gt_bboxes.end());
          const vector<NormalizedBBox>& gt_bboxes =
              all_gt_bboxes.find(i)->second;
          CHECK_LT(match_index[p], gt_bboxes.size());
          label = gt_bboxes[match_index[p]].label(); // 获取该priorbox的类别
          CHECK_GE(label, 0);
          CHECK_NE(label, background_label_id);
          CHECK_LT(label, num_classes);
          // A prior can only be matched to one gt bbox.
          break;
        }
      }

      // 计算softmax loss
      Dtype loss = 0;
      if (loss_type == MultiBoxLossParameter_ConfLossType_SOFTMAX)
      {
        CHECK_GE(label, 0);
        CHECK_LT(label, num_classes);
        // Compute softmax probability.
        // We need to subtract the max to avoid numerical issues.
        Dtype maxval = conf_data[start_idx];
        for (int c = 1; c < num_classes; ++c)
        {
          maxval = std::max<Dtype>(conf_data[start_idx + c], maxval);
        }
        Dtype sum = 0.;
        for (int c = 0; c < num_classes; ++c)
        {
          sum += std::exp(conf_data[start_idx + c] - maxval);
        }
        Dtype prob = std::exp(conf_data[start_idx + label] - maxval) / sum;
        loss = -log(std::max(prob, Dtype(FLT_MIN)));
      } else if (loss_type == MultiBoxLossParameter_ConfLossType_LOGISTIC) {
        int target = 0;
        for (int c = 0; c < num_classes; ++c) {
          if (c == label) {
            target = 1;
          } else {
            target = 0;
          }
          Dtype input = conf_data[start_idx + c];
          loss -= input * (target - (input >= 0)) -
              log(1 + exp(input - 2 * input * (input >= 0)));
        }
      } else {
        LOG(FATAL) << "Unknown conf loss type.";
      }
      conf_loss.push_back(loss);
    }
    conf_data += num_preds_per_class * num_classes;
    all_conf_loss->push_back(conf_loss);
  }
}

template <typename Dtype>
void EncodeLocPrediction(const vector<LabelBBox>& all_loc_preds, // LabelBBox中的first为-1
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      const vector<map<int, vector<int> > >& all_match_indices,
      const vector<NormalizedBBox>& prior_bboxes,
      const vector<vector<float> >& prior_variances,
      const MultiBoxLossParameter& multibox_loss_param,
      Dtype* loc_pred_data, // batchsize幅图像中所有正样本priorbox的网络预测值
        Dtype* loc_gt_data) // batchsize幅图像中所有正样本priorbox的groundtruth，即groundtruth与priorbox坐标的偏移量，也就是目标在priorbox中的坐标
{
  int num = all_loc_preds.size(); // 图片的数量
  // CHECK_EQ(num, all_match_indices.size());
  // Get parameters.
  const CodeType code_type = multibox_loss_param.code_type(); // CENTER_SIZE
  const bool encode_variance_in_target =multibox_loss_param.encode_variance_in_target(); // false
  const bool bp_inside = multibox_loss_param.bp_inside(); // false
  const bool use_prior_for_matching = multibox_loss_param.use_prior_for_matching();// true
  int count = 0; // 有匹配的priorbox的数量，也就是正样本的数量

  // 遍历batchsize中的每一幅图像
  for (int i = 0; i < num; ++i)
  {
    for (map<int, vector<int> >::const_iterator it = all_match_indices[i].begin();it != all_match_indices[i].end(); ++it)
    {
      const int label = it->first;
      const vector<int>& match_index = it->second; // 每幅图像中与每个priorbox匹配的那个ground truth的序号(其实就是与每个priorbox的IOU最大的那个ground-truth的序号)
      CHECK(all_loc_preds[i].find(label) != all_loc_preds[i].end());

      // 该幅图像中所有priorbox的预测坐标
      const vector<NormalizedBBox>& loc_pred =all_loc_preds[i].find(label)->second;

      // 通过match_index遍历每一个priorbox,判断是否是正样本，是的话计算偏移量，并保存该priorbox的预测值
      for (int j = 0; j < match_index.size(); ++j)
      {
        if (match_index[j] <= -1)
        {
          continue;
        }
        // Store encoded ground truth.
        const int gt_idx = match_index[j]; // 该幅图像中第j个priorbox对应的IOU最大的那个groundtruth
        CHECK(all_gt_bboxes.find(i) != all_gt_bboxes.end());
        CHECK_LT(gt_idx, all_gt_bboxes.find(i)->second.size());
        const NormalizedBBox& gt_bbox = all_gt_bboxes.find(i)->second[gt_idx]; // 该幅图像中第gt_idx的ground-truth box
        NormalizedBBox gt_encode;
        CHECK_LT(j, prior_bboxes.size());
        EncodeBBox(prior_bboxes[j],  //
                   prior_variances[j],
                   code_type, // center_size
                   encode_variance_in_target,// false
                   gt_bbox, // 与prior_bboxes[j] IOU最大的那个gt_bbox
                   &gt_encode); // gt_encode就是ground truth与对应的prior_bboxes的偏移量(这个ground truth就是与prior_bboxes的IOU最大的那个)
                                // (由于感受野的不同，感受的区域并不是priorbbox表示的区域，所以要重新计算坐标)
        loc_gt_data[count * 4] = gt_encode.xmin();
        loc_gt_data[count * 4 + 1] = gt_encode.ymin();
        loc_gt_data[count * 4 + 2] = gt_encode.xmax();
        loc_gt_data[count * 4 + 3] = gt_encode.ymax();
        // Store location prediction.
        CHECK_LT(j, loc_pred.size());
        if (bp_inside)
        {
          NormalizedBBox match_bbox = prior_bboxes[j];
          if (!use_prior_for_matching)
          {
            const bool clip_bbox = false;
            DecodeBBox(prior_bboxes[j], prior_variances[j], code_type,
                       encode_variance_in_target, clip_bbox, loc_pred[j],
                       &match_bbox);
          }
          // When a dimension of match_bbox is outside of image region, use
          // gt_encode to simulate zero gradient.
          loc_pred_data[count * 4] =
              (match_bbox.xmin() < 0 || match_bbox.xmin() > 1) ?
              gt_encode.xmin() : loc_pred[j].xmin();
          loc_pred_data[count * 4 + 1] =
              (match_bbox.ymin() < 0 || match_bbox.ymin() > 1) ?
              gt_encode.ymin() : loc_pred[j].ymin();
          loc_pred_data[count * 4 + 2] =
              (match_bbox.xmax() < 0 || match_bbox.xmax() > 1) ?
              gt_encode.xmax() : loc_pred[j].xmax();
          loc_pred_data[count * 4 + 3] =
              (match_bbox.ymax() < 0 || match_bbox.ymax() > 1) ?
              gt_encode.ymax() : loc_pred[j].ymax();
        }
        else
        {
          // 第j个prior bbox的预测值
          loc_pred_data[count * 4] = loc_pred[j].xmin();
          loc_pred_data[count * 4 + 1] = loc_pred[j].ymin();
          loc_pred_data[count * 4 + 2] = loc_pred[j].xmax();
          loc_pred_data[count * 4 + 3] = loc_pred[j].ymax();
        }
        if (encode_variance_in_target)
        {
          for (int k = 0; k < 4; ++k)
          {
            CHECK_GT(prior_variances[j][k], 0);
            loc_pred_data[count * 4 + k] /= prior_variances[j][k];
            loc_gt_data[count * 4 + k] /= prior_variances[j][k];
          }
        }
        ++count;
      }
    }
  }
}


template <typename Dtype>
void ComputeLocLoss(const Blob<Dtype>& loc_pred, const Blob<Dtype>& loc_gt,
      const vector<map<int, vector<int> > >& all_match_indices,
      const int num, const int num_priors, const LocLossType loc_loss_type,
      vector<vector<float> >* all_loc_loss)
{
  int loc_count = loc_pred.count();
  CHECK_EQ(loc_count, loc_gt.count());
  Blob<Dtype> diff;
  const Dtype* diff_data = NULL;
  if (loc_count != 0)
  {
    diff.Reshape(loc_pred.shape());
    caffe_sub(loc_count, loc_pred.cpu_data(), loc_gt.cpu_data(),
              diff.mutable_cpu_data());
    diff_data = diff.cpu_data();
  }
  int count = 0;
  for (int i = 0; i < num; ++i)
  {
    // 计算batchsizse中每幅图像每一个priorbox的定位loss
    vector<float> loc_loss(num_priors, 0.f); // 初始化都为0
    for (map<int, vector<int> >::const_iterator it = all_match_indices[i].begin();it != all_match_indices[i].end(); ++it)
    {
        // 每幅图像中与每个priorbox匹配的那个ground truth的序号(其实就是与每个priorbox的IOU最大的那个ground-truth的序号)
      const vector<int>& match_index = it->second;
      CHECK_EQ(num_priors, match_index.size());

      // 遍历所有的priorbox，计算正样本的定位loss
      // 这个loss其实就是计算了 |ground truth与priorbox之间偏移量- 该priorbox对应的预测值|
      for (int j = 0; j < match_index.size(); ++j)
      {
        if (match_index[j] <= -1)
        {
          continue;
        }

        // 只针对matched priorbox计算loss
        Dtype loss = 0;
        for (int k = 0; k < 4; ++k)
        {
          Dtype val = diff_data[count * 4 + k];
          if (loc_loss_type == MultiBoxLossParameter_LocLossType_SMOOTH_L1)
          {
            Dtype abs_val = fabs(val); // 计算L1距离
            if (abs_val < 1.)
            {
              loss += 0.5 * val * val;
            }
            else
            {
              loss += abs_val - 0.5;
            }
          } else if (loc_loss_type == MultiBoxLossParameter_LocLossType_L2)
          {
            loss += 0.5 * val * val;
          } else {
            LOG(FATAL) << "Unknown loc loss type.";
          }
        }
        loc_loss[j] = loss;
        ++count;
      }
    }
    all_loc_loss->push_back(loc_loss);
  }
}