原理请参考这篇文章
https://www.cnblogs.com/fariver/p/7347197.html
关于每层抽取的特征对应的anchors的种类个数
300x300输入下
fc7层抽取得到的特征
可以看到,后处理后是19x19的grid 每个格子生成24/4=6个anchors对应的location
如上图可以看到类别的预测: fc7对应每个格子 126/6=21,即上面提到的每个anchor都要有一个长度为21的类别向量与之对应
# PriorBox layer on top of fc7 (the 19x19 grid of SSD300).
layer {
name: "fc7_mbox_priorbox"
type: "PriorBox"
# bottom[0]: feature map whose grid defines the anchor centers
bottom: "fc7"
# bottom[1]: input image blob, used to normalize box coordinates
bottom: "data"
top: "fc7_mbox_priorbox"
prior_box_param {
# min_size gives the square prior; max_size adds one extra square
# prior of side sqrt(min_size * max_size)
min_size: 60.0
max_size: 111.0
# with flip: true these expand to ratios {2, 1/2, 3, 1/3}
aspect_ratio: 2.0
aspect_ratio: 3.0
flip: true
# do not clip priors to the [0, 1] image window
clip: false
# encoding variances: 0.1 for center x/y, 0.2 for width/height
variance: 0.10000000149
variance: 0.10000000149
variance: 0.20000000298
variance: 0.20000000298
# stride of one fc7 cell in input pixels (300 / 19 is approx. 16)
step: 16.0
# 0.5 places each anchor center in the middle of its cell
offset: 0.5
}
}
// Generates the prior (anchor) boxes for one feature map.
// bottom[0]: the feature map — its height/width define the anchor grid.
// bottom[1]: the input image blob — its height/width normalize coordinates.
// top[0]:    filled with [xmin, ymin, xmax, ymax] per prior, in [0, 1] units.
// NOTE(review): this excerpt omits the `template <typename Dtype>` line and
// the clip/variance-filling tail present in the upstream caffe source; it
// looks like a truncated quotation — confirm against the full file.
void PriorBoxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int layer_width = bottom[0]->width();
const int layer_height = bottom[0]->height();
int img_width, img_height;
// Image size: use the configured img_h_/img_w_ if set, otherwise read it
// from the data blob (bottom[1]) at runtime.
if (img_h_ == 0 || img_w_ == 0) {
img_width = bottom[1]->width();
img_height = bottom[1]->height();
} else {
img_width = img_w_;
img_height = img_h_;
}
float step_w, step_h;
// Step (stride of one feature-map cell in input pixels): use the configured
// value if set, otherwise derive it as image size / feature-map size.
if (step_w_ == 0 || step_h_ == 0) {
step_w = static_cast<float>(img_width) / layer_width;
step_h = static_cast<float>(img_height) / layer_height;
} else {
step_w = step_w_;
step_h = step_h_;
}
Dtype* top_data = top[0]->mutable_cpu_data();
// Total coordinate count for this map; used by the clipping code that
// follows in the upstream source (unused within this excerpt).
int dim = layer_height * layer_width * num_priors_ * 4;
int idx = 0;
// One pass over every grid cell; each cell emits num_priors_ boxes
// centered at (center_x, center_y).
for (int h = 0; h < layer_height; ++h) {
for (int w = 0; w < layer_width; ++w) {
// offset_ (typically 0.5) places the center in the middle of the cell.
float center_x = (w + offset_) * step_w;
float center_y = (h + offset_) * step_h;
float box_width, box_height;
for (int s = 0; s < min_sizes_.size(); ++s) {
// NOTE(review): float size is truncated to int here (as in upstream
// caffe); fractional min_sizes lose their fractional part.
int min_size_ = min_sizes_[s];
// first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size_;
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;
if (max_sizes_.size() > 0) {
CHECK_EQ(min_sizes_.size(), max_sizes_.size());
int max_size_ = max_sizes_[s];
// second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
box_width = box_height = sqrt(min_size_ * max_size_);
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;
}
// rest of priors
// One prior per non-unit aspect ratio, area min_size^2, with width and
// height scaled by sqrt(ar) and 1/sqrt(ar) respectively.
for (int r = 0; r < aspect_ratios_.size(); ++r) {
float ar = aspect_ratios_[r];
// Ratio 1 was already emitted as the first prior above.
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size_ * sqrt(ar);
box_height = min_size_ / sqrt(ar);
// xmin
top_data[idx++] = (center_x - box_width / 2.) / img_width;
// ymin
top_data[idx++] = (center_y - box_height / 2.) / img_height;
// xmax
top_data[idx++] = (center_x + box_width / 2.) / img_width;
// ymax
top_data[idx++] = (center_y + box_height / 2.) / img_height;
}
}
}
}
结合caffe源码:
// Parse prior_box_param once at setup time and cache every derived value
// (sizes, aspect ratios, variances, image size, step, offset) in members.
void PriorBoxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const PriorBoxParameter& param = this->layer_param_.prior_box_param();

  // Minimum box sizes: at least one is mandatory, all must be positive.
  CHECK_GT(param.min_size_size(), 0) << "must provide min_size.";
  for (int i = 0; i < param.min_size_size(); ++i) {
    min_sizes_.push_back(param.min_size(i));
    CHECK_GT(min_sizes_.back(), 0) << "min_size must be positive.";
  }

  // Aspect ratios: 1.0 is always present; near-duplicates (within 1e-6)
  // are dropped, and flip adds the reciprocal of each new ratio.
  aspect_ratios_.clear();
  aspect_ratios_.push_back(1.);
  flip_ = param.flip();
  for (int i = 0; i < param.aspect_ratio_size(); ++i) {
    const float ratio = param.aspect_ratio(i);
    bool duplicate = false;
    for (int j = 0; j < aspect_ratios_.size(); ++j) {
      if (fabs(ratio - aspect_ratios_[j]) < 1e-6) {
        duplicate = true;
        break;
      }
    }
    if (duplicate) {
      continue;
    }
    aspect_ratios_.push_back(ratio);
    if (flip_) {
      aspect_ratios_.push_back(1./ratio);
    }
  }

  // One prior per (aspect ratio, min_size) pair...
  num_priors_ = aspect_ratios_.size() * min_sizes_.size();
  // ...plus one extra sqrt(min*max) square prior per max_size entry.
  if (param.max_size_size() > 0) {
    CHECK_EQ(param.min_size_size(), param.max_size_size());
    for (int i = 0; i < param.max_size_size(); ++i) {
      max_sizes_.push_back(param.max_size(i));
      CHECK_GT(max_sizes_[i], min_sizes_[i])
          << "max_size must be greater than min_size.";
      num_priors_ += 1;
    }
  }

  clip_ = param.clip();

  // Variances: exactly 4 positive values, one positive value, or the
  // implicit 0.1 default.
  if (param.variance_size() > 1) {
    // Must and only provide 4 variance.
    CHECK_EQ(param.variance_size(), 4);
    for (int i = 0; i < param.variance_size(); ++i) {
      CHECK_GT(param.variance(i), 0);
      variance_.push_back(param.variance(i));
    }
  } else if (param.variance_size() == 1) {
    CHECK_GT(param.variance(0), 0);
    variance_.push_back(param.variance(0));
  } else {
    // Set default to 0.1.
    variance_.push_back(0.1);
  }

  // Image size: explicit h/w, a single square size, or 0 meaning
  // "read it from bottom[1] at forward time".
  if (param.has_img_h() || param.has_img_w()) {
    CHECK(!param.has_img_size())
        << "Either img_size or img_h/img_w should be specified; not both.";
    img_h_ = param.img_h();
    CHECK_GT(img_h_, 0) << "img_h should be larger than 0.";
    img_w_ = param.img_w();
    CHECK_GT(img_w_, 0) << "img_w should be larger than 0.";
  } else if (param.has_img_size()) {
    const int img_size = param.img_size();
    CHECK_GT(img_size, 0) << "img_size should be larger than 0.";
    img_h_ = img_size;
    img_w_ = img_size;
  } else {
    img_h_ = 0;
    img_w_ = 0;
  }

  // Step: same three-way scheme as the image size; 0 means "derive from
  // the image/feature-map size ratio at forward time".
  if (param.has_step_h() || param.has_step_w()) {
    CHECK(!param.has_step())
        << "Either step or step_h/step_w should be specified; not both.";
    step_h_ = param.step_h();
    CHECK_GT(step_h_, 0.) << "step_h should be larger than 0.";
    step_w_ = param.step_w();
    CHECK_GT(step_w_, 0.) << "step_w should be larger than 0.";
  } else if (param.has_step()) {
    const float step = param.step();
    CHECK_GT(step, 0) << "step should be larger than 0.";
    step_h_ = step;
    step_w_ = step;
  } else {
    step_h_ = 0;
    step_w_ = 0;
  }

  offset_ = param.offset();
}
看上图layer的定义可知, aspect_ratio有两个, 程序生成的方式是
1个min_size 方形的anchors
1个maxsize 方形的anchors
根据我们加上去的两个aspect_ratio,经过flip翻倍后共4种比例,基于min_size生成4个anchors
一共是6个,没错了。
同理:
conv8_2_mbox_loc 分别是3x3个格子,每个格子4个候选目标
# PriorBox layer on top of conv8_2 (the 3x3 grid of SSD300).
layer {
name: "conv8_2_mbox_priorbox"
type: "PriorBox"
# bottom[0]: feature map whose grid defines the anchor centers
bottom: "conv8_2"
# bottom[1]: input image blob, used to normalize box coordinates
bottom: "data"
top: "conv8_2_mbox_priorbox"
prior_box_param {
# square priors of size 213 and sqrt(213 * 264)
min_size: 213.0
max_size: 264.0
# with flip: true this expands to ratios {2, 1/2} -> 4 priors per cell
aspect_ratio: 2.0
flip: true
clip: false
# encoding variances: 0.1 for center x/y, 0.2 for width/height
variance: 0.10000000149
variance: 0.10000000149
variance: 0.20000000298
variance: 0.20000000298
# stride of one conv8_2 cell in input pixels
step: 100.0
offset: 0.5
}
}
对于每个格子,这里会有4个不同的框生成.
对于loss函数的分析
# MultiBoxLoss: joint localization + classification loss over all priors.
layer {
name: "mbox_loss"
type: "MultiBoxLoss"
# bottom[0]: concatenated location predictions
bottom: "mbox_loc"
# bottom[1]: concatenated class confidences
bottom: "mbox_conf"
# bottom[2]: concatenated prior boxes
bottom: "mbox_priorbox"
# bottom[3]: ground-truth annotations
bottom: "label"
top: "mbox_loss"
include {
phase: TRAIN
}
# Gradients flow only into the loc/conf predictions, not into the
# priors or the labels.
propagate_down: true
propagate_down: true
propagate_down: false
propagate_down: false
loss_param {
normalization: VALID
}
multibox_loss_param {
loc_loss_type: SMOOTH_L1
conf_loss_type: SOFTMAX
# balances the localization loss against the confidence loss
loc_weight: 1.0
# 20 object classes + 1 background (VOC)
num_classes: 21
share_location: true
match_type: PER_PREDICTION
# IoU above this threshold makes a prior a positive match
overlap_threshold: 0.5
use_prior_for_matching: true
background_label_id: 0
use_difficult_gt: true
# at most 3 negatives per positive (used by MAX_NEGATIVE mining)
neg_pos_ratio: 3.0
neg_overlap: 0.5
code_type: CENTER_SIZE
ignore_cross_boundary_bbox: false
mining_type: MAX_NEGATIVE
}
}
上面是layer的定义,这里主要关注的是这么几个参数:
loc_loss_type: SMOOTH_L1 # 位置回归loss的类型
conf_loss_type: SOFTMAX #类别回归的loss类型
loc_weight: 1.0 #显然需要权重来平衡上面两个Loss的数量级
num_classes: 21
#loc_classes_ = share_location_ ? 1 : num_classes_;
# 意味着是否一个box只对应一个21维的类别向量,
# 在faster-rcnn中相当于false, 另外这个倘若是false则会大大增加预测location的矩阵的大小,直接*class_num个
share_location: true
match_type: PER_PREDICTION
#判别为正样本的IoU阈值,大于该值为正样本
overlap_threshold: 0.5
use_prior_for_matching: true
#指定了背景类的id
background_label_id: 0
c++源码:
bool difficult = static_cast<bool>(gt_data[start_idx + 7]);
if (!use_difficult_gt && difficult) {
// Skip reading difficult ground truth.
continue;
}
use_difficult_gt: true
neg_pos_ratio: 3.0
// Decide whether a prior box may take part in negative mining.
// match_idx == -1 marks a prior that was not matched to any ground truth.
inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
    const float match_overlap, const float neg_overlap) {
  switch (mining_type) {
    case MultiBoxLossParameter_MiningType_MAX_NEGATIVE:
      // Only unmatched priors whose best overlap stays below neg_overlap.
      return match_idx == -1 && match_overlap < neg_overlap;
    case MultiBoxLossParameter_MiningType_HARD_EXAMPLE:
      // Every prior is a candidate; hardness is ranked by loss elsewhere.
      return true;
    default:
      return false;
  }
}
neg_overlap: 0.5
code_type: CENTER_SIZE
#有时候预测的框会超出原图边界(归一化坐标下例如 xmin<0 或 xmax>1.0),设为false则接受(不忽略)这种情况
ignore_cross_boundary_bbox: false
#MAX_NEGATIVE:结合neg_pos_ratio, 使得每幅图里面的负样本数量最多是正样本的3倍(此neg_pos_ratio: 3.0 )
# HARD_EXAMPLE : 除了正样本的数量,剩下的都是负样本的数量
mining_type: MAX_NEGATIVE #HARD_EXAMPLE