voc数据集的map计算方式

最新推荐文章于 2024-03-17 10:47:40 发布

pursuit_zhangyu

最新推荐文章于 2024-03-17 10:47:40 发布

阅读量4k

点赞数 2

分类专栏：目标检测

本文链接：https://blog.csdn.net/pursuit_zhangyu/article/details/113741660

版权

目标检测专栏收录该内容

17 篇文章

订阅专栏

VOC数据集map的计算方式

描述

voc数据集map的计算方式包括两个

Pascal Voc 2007 ，11点法求AP。
voc 2010-2012 ，计算平滑后的曲线与recall轴围成的面积。

代码解析

检测出来的bbox包含score和bbox，按照score降序排序，所以每添加一个样本，就代表阈值降低一点（真实情况下score降低，iou不一定降低）。这样就是可以有很多种阈值，每个阈值情况下计算一个prec和recall。

d:对模型检测到的bbox循环：
j:对该bbox对应的图像(i)中所有的gt循环：
如果bb和bbgt有重叠：
计算ov=重叠部分面积/联合的面积，并记录ovmax,jmax
如果ovmax大于阈值：
如果i图像的第jmax个gt的diff（是否为diffcilut样本）不为0
如果i图像的第jmax个gt的det为0，则tp(d)=1，标记为true positive
如果i图像的第jmax个gt的det不为0，则fp(d)=1，标记为false positive（mutlti detection)
否则，
fp(d)=1，标记为false positive

具体代码如下：（voc 2010-2012 map的计算方式，voc07的算法没用就删除了）

import xml.etree.ElementTree as ET
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt

def draw_pr(rec, prec, classname):
    plt.figure()
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.title('PR cruve')
    
    plt.plot(rec.tolist(), prec.tolist())
    plt.savefig('{}_pr.png'.format(classname))


def parse_rec(filename):
    """ Parse a PASCAL VOC xml file """
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        obj_struct = {}
        obj_struct['name'] = obj.find('name').text#name节点存的是class,图像类型名称
        obj_struct['pose'] = obj.find('pose').text#默认为Unspecified
        obj_struct['truncated'] = int(obj.find('truncated').text)#默认为0
        obj_struct['difficult'] = int(obj.find('difficult').text)#默认为0
        bbox = obj.find('bndbox')#获取尺寸位置信息
        obj_struct['bbox'] = [int(bbox.find('xmin').text),
                              int(bbox.find('ymin').text),
                              int(bbox.find('xmax').text),
                              int(bbox.find('ymax').text)]
        objects.append(obj_struct)

    return objects

def voc_ap(rec, prec):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.
    If use_07_metric is true, uses the
    VOC 07 11 point method (default:False).
    """

    # correct AP calculation
    # first append sentinel values at the end
    # 将recall和precision补全，主要用于积分计算，保证recall的域为[0,1]
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # 滤除fp增加条件下导致的pre减小的无效值
    # compute the precision envelope
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    # 滤除总检测样本数增加导致计算的recall的未增加的量
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # 通过积分计算precision对recall的平均数
    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap


def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.25):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                [ovthresh],
                                [use_07_metric])
    Top level function that does the PASCAL VOC evaluation.
    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name (duh)
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)
    """
    # assumes detections are in detpath.format(classname)
    # assumes annotations are in annopath.format(imagename)
    # assumes imagesetfile is a text file with each line an image name
    # cachedir caches the annotations in a pickle file

#################################################################################################
##### 第一步：获取所有的GT标签信息，存入字典recs中或文件annots.pkl中，便于使用 #####################
#################################################################################################
    # 标签信息都是GT的信息
    # 提取annotations标签文件缓存路径
    # 如果没有缓存文件，就读取信息并创建一个二进制缓存文件annots.pkl
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, 'annots.pkl')
    # 从图像名称文件中读取图像名称，存入imagenames列表中
    with open(imagesetfile, 'r') as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]

    # 根据imagenames列表存储或读取标签信息
    if not os.path.isfile(cachefile):
        # 载入标签文件，recs这个字典中，存储了验证集所有的GT信息
        recs = {}
        for i, imagename in enumerate(imagenames):
            #解析标签xml文件，annopath为/{}.xml文件，加format表示为{}中赋值
            #imagename来源于从imagesetfile中提取，循环采集所有的的信息
            recs[imagename] = parse_rec(annopath.format(imagename))
            #解析标签文件图像进度条
            if i % 100 == 0:
                print('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
        # 将读取标签内容存入缓存文件annots.pkl，这是个数据流二进制文件
        print('Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            pickle.dump(recs, f)#使用了pickle.dump，存入后保存成二进制文件
    else:
        # 有标签缓存文件，直接读取,recs中存的是GT标签信息
        with open(cachefile, 'rb') as f:
            recs = pickle.load(f)

#################################################################################################
##### 第二步：从字典recs中提取当前类型的GT标签信息，存入字典class_recs中，key为图片名imagename #####
#################################################################################################
    # 针对某个class名称的result文件提取对应的每个图片文件中GT的信息，存入R
    # bbox中保存该类型GT所有的box信息，difficult、det、npos等都从R中提取
    # 提取完毕针对每个图片生成一个字典，存入class_recs
    # 这里相当于根据class对图片中新不同类型目标进行归纳，每个类型计算一个AP
    class_recs = {}
    npos = 0
    
    # 上篇文章中说了当result文件名前面含有comp4_det_test_时的2种方法，这里还有个更简单的，即将classname后加上[15:]
    # 表示读取第15位开始到结束的内容，这是第3种方法
    for imagename in imagenames:
        #R中为所有图片中，类型匹配上的GT信息
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        #bbox中存储了该文件中该类型的所有box信息
        bbox = np.array([x['bbox'] for x in R])
        #difficult转化成bool型变量，其中xml文件中difficult的含义，表示目标检测的难度，如果为1的话，表示难检测出来。模型检测不出来，也不会把它当做漏检测
        difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
        #该图片中，没有匹配到当前类型det=[],匹配到1个，det=[False]，匹配到多个det=[False, ...]
        #det将是和difficult不同的地方，当不是difficult的时候，det也是false，这是一个区别
        det = [False] * len(R)
        #利用difficult进行计数，这里所有的值都是difficult，如果不是difficult就累加，~是取反。其实就是统计xml文件中<difficult>0</difficult>的个数，表示gt的个数，便于求recall。
        npos = npos + sum(~difficult)
        #class_recs是一个字典，第一层key为文件名，一个文件名对应的子字典中，存储了key对应的图片文件中所有的该类型的box、difficult、det信息
        #这些box、difficult、det信息可以包含多个GT的内容
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}
        
#################################################################################################
##### 第三步：从当前class的result文件中读取结果，并将结果按照confidence从大到小排序 ################
#####        排序后的结果存在BB和image_ids中                                      ################
#################################################################################################

    # 读取当前class的result文件内容，这里要去result文件以class命名
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        lines = f.readlines()
    
    # 删除result文件中的''，对于非voc数据集，有的就没有这些内容
    splitlines = [x.strip().split(' ') for x in lines]
    # 将每个结果条目中第一个数据，就是图像id,这个image_ids是文件名
    image_ids = [x[0] for x in splitlines]
    # 提取每个结果的置信度，存入confidence
    confidence = np.array([float(x[1]) for x in splitlines])
    # 提取每个结果的结果，存入BB
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
    
    # 对confidence从大到小排序，获取id
    sorted_ind = np.argsort(-confidence)
    # 获得排序值，这个值后来没有再用过
    sorted_scores = np.sort(-confidence)
    # 按confidence排序对BB进行排序
    BB = BB[sorted_ind, :]
    # 对相应的图像的id进行排序，其实每个图像对应一个id，即对应一个目标，当一个图中识别两个相同的GT,是可以重复的
    # 这样image_ids中，不同位置就会有重复的内容
    image_ids = [image_ids[x] for x in sorted_ind]


#################################################################################################
##### 第四步：对比GT参数和result，计算出IOU，在fp和tp相应位置标记1 #################################
#################################################################################################

    # go down dets and mark TPs and FPs
    nd = len(image_ids)#图像id的长度
    tp = np.zeros(nd)#设置TP初始值
    fp = np.zeros(nd)#设置FP初始值

    #对一个result文件中所有目标进行遍历，每个图片都进行循环，有可能下次还会遇到这个图片，如果
    for d in range(nd):
        #提取排序好的GT参数值，里面可以有多个目标，当image_ids[d1]和image_ids[d2]相同时，两个R内容相同，且都可能存了多个目标信息
        R = class_recs[image_ids[d]]
        #将BB中confidence第d大的BB内容提取到bb中，这是result中的信息，只可能包含一个目标
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        #BBGT就是当前confidence从大到小排序条件下，第d个GT中bbox中的信息
        BBGT = R['bbox'].astype(float)

        #当BBGT中有信息，就是没有虚警目标，计算IOU
        #当一个图片里有多个相同目标，选择其中最大IOU，GT和检测结果不重合的IOU=0
        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            #大于0就输出正常值，小于等于0就输出0
            iw = np.maximum(ixmax - ixmin + 1., 0.)
            ih = np.maximum(iymax - iymin + 1., 0.)
            inters = iw * ih

            # union
            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

            overlaps = inters / uni#计算交并比，就是IOU
            ovmax = np.max(overlaps)#选出最大交并比，当有
            jmax = np.argmax(overlaps)#求出两个最大交并比的值的序号

        #当高于阈值，对应图像fp = 1
        #ovmax > ovthresh的情况肯定不存在虚警，ovmax原始值为-inf，则没有目标肯定不可能进入if下面的任务
        if ovmax > ovthresh:
            #如果不存在difficult，初始状态，difficult和det都是False
            #找到jamx后，第一任务是确定一个tp，第二任务就是将R['det'][jmax]置为1，表示为已检测，下次再遇到就认为是fp
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    tp[d] = 1.
                    R['det'][jmax] = 1 #标记为已检测
                else:
                    fp[d] = 1.#一个目标被检测两次
        else:
            fp[d] = 1.

#################################################################################################
##### 第五步：计算ap,rec，prec ###################################################################
#################################################################################################
##difficult用于标记真值个数，prec是precision，rec是recall
    # compute precision recall
    fp = np.cumsum(fp)#采用cumsum计算结果是一种积分形式的累加序列，假设fp=[0,1,1],那么np.cumsum(fp)为[0,1,2]。
    tp = np.cumsum(tp)
    #print("float(npos):", float(npos))
    rec = tp / float(npos)#npos表示gt的个数
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)#maximum这一大串表示防止分母为0
    ap = voc_ap(rec, prec)

    return rec, prec, ap


if __name__  == "__main__":
    detpath = "results/detect_result{}.txt"   #darknet valid输出的检测框的txt文件
    annopath = "Annotations/{}.xml"           #检测图片的标签文件
    imagesetfile = '/test.txt'  #测试图片imagename，如下所示
    # 003505
    # 004343
    classnames = ["person", "car"]       #类别
    cachedir = '.'
    ap_dict = {}      #保存每个类别的ap值
    for classname in classnames:
        if os.path.exists("annots.pkl"):
            os.remove("annots.pkl")
        rec, prec, ap = voc_eval(detpath,annopath,imagesetfile,classname,cachedir,ovthresh=0.5)
        ap_dict[classname] = ap
        #画出每个类别的pr曲线
        draw_pr(rec, prec, classname)

    print("ap_dict\n", ap_dict)

代码里面也画了每个类别的pr曲线

疑点解释

1、为什么map的计算没有设置score_threshold？

https://www.zhihu.com/question/299799471/answer/1088201230

计算mAP的时候，Score Threshold不算是一个可以调节的超参数.。当然，根据mAP的定义，可以把所有YOLOv3模型的Score Threshold设为0，可以得到完整的PR曲线。如果设置Score Threshold，pr曲线从横坐标直接截断了，如下图Score Threshold设置为0.3。

根据代码会带来两个影响：1）因为误检测少了，那么fp的数量降低，precision = tp/(tp+fp)就会变高；2）也会使得检测出来变少了，导致tp的数量变少，recall的值变小。似乎会设置score thread对map的计算影响很小，但是代码里面有一行代码，在不设置Score Threshold的情况下也减少了fp，使得fp对map的计算影响变小了，如果设置Score Threshold会减少map