caffe中实现SSD准确率评价的方法是TestDetection()函数。mAP指标值是每个类别的Average Precision的平均值。
本文分析DetectionEvaluateLayer中实现评价的过程及其评价指标。需要指出的是,一般的前向过程是不包含DetectionEvaluateLayer层的定义的,只有在训练的评价(TEST)过程中才会使用到该layer。具体使用该层时,需要在prototxt文件中把下述定义写到DetectionOutput的定义之后。
# Evaluation layer definition: takes the "detection_out" blob and the "label"
# blob as inputs and produces the "detection_eval" blob.
layer {
name: "detection_eval"
type: "DetectionEvaluate"
bottom: "detection_out"
bottom: "label"
top: "detection_eval"
# Only instantiated when the net runs in the TEST phase.
include {
phase: TEST
}
detection_evaluate_param {
# Presumably the number of classes including background -- confirm against the dataset.
num_classes: 11
# Presumably the label id reserved for the background class -- confirm.
background_label_id: 0
# Minimum overlap between a detection and a ground-truth box for a match.
overlap_threshold: 0.5
# When false, a detection whose best match is a "difficult" ground truth is not marked as true positive.
evaluate_difficult_gt: false
}
}
在计算Average Precision之前需要先计算出所有预测框与gt_bboxes的匹配。
SSD evaluation layer以detection_output layer的输出([image_id, label, confidence, xmin, ymin, xmax, ymax])作为输入,同时输出 [image_id, label, confidence, true_pos, false_pos]元组。
具体操作为:1)遍历每个类别;2)将该类别的预测框按confidence降序排列;3)对于每个pred_bbox,找出与其有最大iou的gt_bbox;4)如果该gt_bbox之前没有被分配且iou不小于给定的阈值(比如0.5),则将该gt_bbox分配给该pred_bbox,并设置该pred_bbox为true positive;否则设置该pred_bbox为false positive。
//from code:caffe-ssd/src/caffe/layers/detection_evaluate_layer.cpp
//function: DetectionEvaluateLayer<Dtype>::Forward_cpu
// Matches detections against ground-truth boxes, one label at a time, and
// writes one 5-value record per detection into top_data:
//   [image_id, label, score, true_pos flag, false_pos flag].
// image_id, num_det, count_, label_bboxes and top_data are defined outside
// this excerpt.
for (LabelBBox::iterator iit = detections.begin();
iit != detections.end(); ++iit) {
int label = iit->first;
// Detections carrying label -1 are skipped entirely.
if (label == -1) {
continue;
}
vector<NormalizedBBox>& bboxes = iit->second;
if (label_bboxes.find(label) == label_bboxes.end()) {
// No ground truth for current label. All detections become false_pos.
for (int i = 0; i < bboxes.size(); ++i) {
top_data[num_det * 5] = image_id;
top_data[num_det * 5 + 1] = label;
top_data[num_det * 5 + 2] = bboxes[i].score();
top_data[num_det * 5 + 3] = 0;
top_data[num_det * 5 + 4] = 1;
++num_det;
}
} else {
vector<NormalizedBBox>& gt_bboxes = label_bboxes.find(label)->second;
// Scale ground truth if needed.
if (!use_normalized_bbox_) {
CHECK_LT(count_, sizes_.size());
for (int i = 0; i < gt_bboxes.size(); ++i) {
OutputBBox(gt_bboxes[i], sizes_[count_], has_resize_,
resize_param_, &(gt_bboxes[i]));
}
}
// visited[j] records whether gt_bboxes[j] has already been claimed by a
// detection, so each ground truth yields at most one true positive.
vector<bool> visited(gt_bboxes.size(), false);
// Sort detections in descend order based on scores.
std::sort(bboxes.begin(), bboxes.end(), SortBBoxDescend);
for (int i = 0; i < bboxes.size(); ++i) {
top_data[num_det * 5] = image_id;
top_data[num_det * 5 + 1] = label;
top_data[num_det * 5 + 2] = bboxes[i].score();
// The detection gets the same OutputBBox treatment as the ground truth
// above before overlaps are computed.
if (!use_normalized_bbox_) {
OutputBBox(bboxes[i], sizes_[count_], has_resize_,
resize_param_, &(bboxes[i]));
}
// Compare with each ground truth bbox.
float overlap_max = -1;
int jmax = -1;
// Find gt_bboxes[jmax], the ground truth with the largest Jaccard overlap
// with bboxes[i].
for (int j = 0; j < gt_bboxes.size(); ++j) {
float overlap = JaccardOverlap(bboxes[i], gt_bboxes[j],
use_normalized_bbox_);
if (overlap > overlap_max) {
overlap_max = overlap;
jmax = j;
}
}
// Only a detection whose best overlap is at least overlap_threshold_ can be
// a true positive.
if (overlap_max >= overlap_threshold_) {
// NOTE(review): when the matched ground truth is difficult and
// evaluate_difficult_gt_ is false, neither flag is written here; the two
// slots keep whatever top_data held before (its initialization is not
// visible in this excerpt).
if (evaluate_difficult_gt_ ||
(!evaluate_difficult_gt_ && !gt_bboxes[jmax].difficult())) {
if (!visited[jmax]) {
// true positive.
top_data[num_det * 5 + 3] = 1;
top_data[num_det * 5 + 4] = 0;
visited[jmax] = true;
} else {// its best-matching gt_bbox was already claimed by an earlier detection
// false positive (multiple detection).
top_data[num_det * 5 + 3] = 0;
top_data[num_det * 5 + 4] = 1;
}
}
} else {
// false positive.
top_data[num_det * 5 + 3] = 0;// true-positive flag is 0
top_data[num_det * 5 + 4] = 1;// false-positive flag is 1
}
++num_det;
}
}
}
// The brace below closes an enclosing scope opened outside this excerpt.
}
计算mAP的代码位于src/caffe/solver.cpp中的TestDetection函数,该函数以vector<pair<float, int> > label_true_pos、vector<pair<float, int> > label_false_pos作为参数调用src/caffe/util/bbox_util.cpp文件中的ComputeAP函数。ComputeAP计算出所有的precision和recall值。average precision的计算方式有11point(VOC2007 style)、MaxIntegral(VOC2012 or ILSVRC style)、Integral(会比11point计算出来的值略大)。
关于Average_precision的公式,可以参照wiki上的解释:Wikipedia entry for the Average precision
下面介绍下这三种AP的计算方式。首先需要画出P-R曲线(纵轴是P,横轴是R),它应该是条左上到右下的凸线;
如果一共有n个P-R值,Integral的计算方式是将横轴划分为n+1份,然后每份的宽度为recall[i+1]-recall[i],然后ap +=(recall[i+1]-recall[i])*precision[i],i逐渐增加到n。这个其实就是计算P-R曲线与横纵轴围成的面积,即积分;
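MaxIntegral与Integral较相似,区别是MaxIntegral的计算方式是ap +=(recall[i+1]-recall[i])*precision[i+1],i逐渐减少到0,并且这里的precision取的是从右向左遍历过程中累计的最大precision(即recall不小于当前值的所有点中的最大precision);(是否MaxIntegral计算出来的值比Integral大?由于每一段上使用的最大precision都不小于该点本身的precision,MaxIntegral的结果不会小于Integral,具体数值仍可进一步验证)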
11point的计算方式可以参照文章:深度学习-目标检测评估指标P-R曲线、AP、mAP。SSD的实现代码如下:
// The scan below walks i from num-1 downward and stops at the first rec[i]
// that falls below the current threshold, so it relies on rec being
// non-decreasing in index (highest recall at the end); otherwise the result
// is wrong.
// VOC2007 style for computing AP.
// max_precs[j] ends up holding the maximum precision seen among points whose
// recall is at least j/10.
vector<float> max_precs(11, 0.);
int start_idx = num - 1;
for (int j = 10; j >= 0; --j) {
for (int i = start_idx; i >= 0 ; --i) {
if ((*rec)[i] < j / 10.) {
// rec[i] is below this threshold: resume from i for the next (lower)
// threshold, and seed that threshold with the maximum found so far.
start_idx = i;
if (j > 0) {
max_precs[j-1] = max_precs[j];
}
break;
} else {
if (max_precs[j] < (*prec)[i]) {
max_precs[j] = (*prec)[i];
}
}
}
}
// Add the mean of the 11 per-threshold maxima to *ap.
for (int j = 10; j >= 0; --j) {
*ap += max_precs[j] / 11;
}
上述代码感觉还有更优的实现。
其中prec,rec的计算方式如下:
// Builds the precision and recall sequences from the cumulative true-positive
// and false-positive counts. num, num_pos, prec and rec are defined outside
// this excerpt.
const vector<pair<float, int> > tp;// pairs keyed by score; presumably the int is a 0/1 flag rather than an index -- TODO confirm
const vector<pair<float, int> > fp;
// Compute cumsum of tp.
vector<int> tp_cumsum;
CumSum(tp, &tp_cumsum);// per the original note, tp and fp are sorted by score in descending order here (CumSum itself is not shown)
CHECK_EQ(tp_cumsum.size(), num);
// Compute cumsum of fp.
vector<int> fp_cumsum;
CumSum(fp, &fp_cumsum);
CHECK_EQ(fp_cumsum.size(), num);
// Compute precision.
for (int i = 0; i < num; ++i) {
// Note: prec is not necessarily in descending order.
prec->push_back(static_cast<float>(tp_cumsum[i]) /
(tp_cumsum[i] + fp_cumsum[i]));
}
// Compute recall.
for (int i = 0; i < num; ++i) {
CHECK_LE(tp_cumsum[i], num_pos);
rec->push_back(static_cast<float>(tp_cumsum[i]) / num_pos);// assuming tp_cumsum is a running total of non-negative values, rec is non-decreasing in i (ascending, not descending)
}