mAP(@R)计算代码

Notes

多模态检索中常用几种评价指标:

mAP 是多模态检索常用指标,也有用 $mAP@R$ 的。这里备份一份计算 mAP 或 mAP@R 的 Python 代码,主要抄自 CVPR 2017 DCMH 作者释出的代码。
特别地,当用 hash 码 / Hamming 距离检索时,有必要使用 tie-aware 的检索指标,参见:numpy和pytorch的argsort结果不同

About the Denominator

当计算 mAP@k 时,分母究竟是「整个检索序列的相似样本数」,还是「检索序列前 k 个结果中的相似样本数」,有意见分歧。[4] 中的代码是后者,而 [6] 中的公式是前者,相关讨论可见 [7, 8, 9]。
本文的版本直到目前(2020.9.4)都是后者,即还是与 [4] 保持一致。

Code

multiple $R$

  • multi_mAP,支持单个或多个 position thresholds,传 int 或 int tuple/list。
# import copy
# import numpy as np
# from util import *  # `euclidean` 放在这里面

def mAP(Dist, S, k=-1):
    """Compute mAP@k for a single cut-off `k`.

    Single-`k` reference implementation working directly on a distance
    matrix; mainly useful for cross-checking `multi_mAP`.

    The per-query AP is the mean of precision@rank over the relevant items
    found inside the top-`k` results, i.e. the denominator is the number of
    relevant items *within the top-k*, not in the whole ranking.

    Args:
        Dist: [n, m] array, distance between query i and database item j
            (smaller means closer).
        S: [n, m] array; after casting to int32, entries equal to 1 mark
            relevant (query, item) pairs.
        k: rank cut-off. A negative value, or one larger than `m`, means
            "use the whole ranking" (mAP@all).

    Returns:
        Sum of the per-query AP@k divided by `n`. Queries without any
        relevant item in their top-`k` contribute 0 but still count in the
        denominator. `k == 0` therefore yields 0.0 instead of NaN.
    """
    n, m = Dist.shape
    if (k < 0) or (k > m):
        k = m
    Gnd = S.astype(np.int32)
    Rank = np.argsort(Dist)  # per-row ascending distance

    AP = 0.0
    for it in range(n):
        # relevance flags re-ordered by rank and truncated at k
        hits = Gnd[it][Rank[it][:k]]
        # 1-based ranks at which a relevant item shows up
        pos = np.flatnonzero(hits == 1) + 1.0
        if pos.size == 0:
            # nothing relevant in the top-k: AP is 0; also avoids the
            # mean of an empty array (NaN) that would poison the sum
            continue
        # rel_cnt[j]: number of relevant items seen up to the j-th hit
        rel_cnt = np.arange(pos.size) + 1.0
        AP += np.mean(rel_cnt / pos)

    return AP / n


def multi_mAP(Dist, Sim, k=-1):
    """Compute mAP@k for one or several cut-offs in a single pass.

    The ranking of every query is computed once and the running precision
    sum is extended from one cut-off to the next, so evaluating several
    cut-offs costs about the same as evaluating the largest one.

    The per-query AP@k is the mean of precision@rank over the relevant items
    found inside the top-`k` results (denominator = number of relevant items
    within the top-k).

    Args:
        Dist: [n, m] array, distance between query i and database item j
            (smaller means closer).
        Sim: [n, m] array; entries > 0 mark relevant (query, item) pairs.
        k: an int, or a list/tuple of ints. Each value that is negative or
            larger than `m` is replaced by `m` (mAP@all). The caller's
            sequence is never modified.

    Returns:
        A float32 scalar when a single cut-off is given, otherwise a float32
        array of shape [len(k)]. NOTE: the entries follow the cut-offs in
        ASCENDING order (after clamping), not the order they were passed in.

    Raises:
        AssertionError: if any cut-off is 0.
    """
    if isinstance(k, (int, np.integer)):
        k = [k]
    n, m = Dist.shape
    # Build a fresh, clamped, ascending list; works for tuples as well
    # and leaves the caller's object untouched.
    k = sorted(m if (_k < 0) or (_k > m) else _k for _k in k)
    assert k[0] != 0, "`@0` is meaningless and disallowed for efficiency"
    Gnd = (Sim > 0).astype(np.int32)
    Rank = np.argsort(Dist, axis=-1)  # per-row ascending distance

    AP = np.zeros([len(k)], dtype=np.float32)
    for it in range(n):
        # 1-based ranks of the relevant items over the whole ranking
        pos = np.flatnonzero(Gnd[it][Rank[it]]) + 1.0
        n_pos = pos.shape[0]
        if 0 == n_pos:
            continue
        # p_list[j]: precision at the rank of the j-th relevant item
        p_list = (np.arange(n_pos) + 1.0) / pos

        # Cut-offs are ascending, so the hits consumed for a smaller
        # cut-off are reused (not recomputed) for the larger ones.
        _cnt, _p_sum = 0, 0.0
        for kid, _k in enumerate(k):
            while (_cnt < n_pos) and (pos[_cnt] <= _k):
                _p_sum += p_list[_cnt]
                _cnt += 1
            # Every cut-off that has at least one hit inside it gets its
            # AP, even when no NEW hit appeared since the previous
            # cut-off or when all hits were already consumed.
            if _cnt > 0:
                AP[kid] += _p_sum / _cnt

    _mAP = AP / n
    if 1 == _mAP.shape[0]:
        _mAP = _mAP[0]
    return _mAP


if __name__ == "__main__":
    # Cross-check: the single-k routine called once per cut-off versus the
    # multi-k routine called once with the whole list, on random data.
    print("对拍。结论:一致")
    n_query, n_db = 5, 20
    feat_dim, n_class = 3, 7
    query_feat = np.random.randn(n_query, feat_dim)
    db_feat = np.random.randn(n_db, feat_dim)
    query_label = np.random.randint(0, 2, size=(n_query, n_class))
    db_label = np.random.randint(0, 2, size=(n_db, n_class))
    dist = euclidean(query_feat, db_feat)
    sim = sim_mat(query_label, db_label)
    # cut-offs: 1 followed by every multiple of 5 up to the database size
    cutoffs = [1] + list(range(5, n_db + 1, 5))
    print("k_list:", cutoffs)
    # single-k version, evaluated once per cut-off
    single_k_result = []
    for cutoff in cutoffs:
        single_k_result.append(mAP(dist, sim, k=cutoff))
    # multi-k version, all cut-offs in one call
    multi_k_result = multi_mAP(dist, sim, cutoffs)
    print("mAP 1:", single_k_result)
    print("mAP 2:", multi_k_result)

fast version

  • 原来那份代码的calc_mAP有些可以预处理的地方,改掉后可以提速
  • 距离计算可改用 scipy.spatial.distance.cdist 这个库,如 cosine 距离 cdist(qF, rF, 'cosine')、Hamming 距离 cdist(qF, rF, 'hamming')
  • (2020.3.13 Updates)还是用 scipy.spatial.distance.cdist 算距离了…在一个 6w+ 的 nuswide(的子集?) 上,test set 2w7,retrieval set 3w5,被 killed 了…换成自己写的 hamming 距离之后又没事
  • 另外,集合太大的话考虑分批算吧(这里加了个函数内的分批,也可以写成在函数外分批生成 test set 的特征传进来。但 retrieval set 好像不能分批来)
import numpy as np
from sklearn.preprocessing import normalize


def cos(A, B=None):
    """Pairwise cosine similarity between the rows of `A` and the rows of `B`.

    Rows are l2-normalized first. When `B` is omitted (or is the very same
    object as `A`), `A` is compared against itself and is normalized only
    once.
    """
    unit_a = normalize(A, norm='l2', axis=1)
    compare_with_self = (B is None) or (B is A)
    unit_b = unit_a if compare_with_self else normalize(B, norm='l2', axis=1)
    return np.dot(unit_a, unit_b.T)


def hamming(A, B=None):
    """Pairwise Hamming distance between the rows of `A` and the rows of `B`.

    A, B: [None, bit] code matrices with elements in {-1, 1}; `B` defaults
    to `A`. For such codes the inner product equals
    `bit - 2 * (number of differing bits)`, hence the formula below.
    """
    other = A if B is None else B
    n_bit = A.shape[1]
    inner = A.dot(other.T)
    return (n_bit - inner) // 2


def euclidean(A, B=None, sqrt=False):
    """Pairwise (squared) Euclidean distance between rows of `A` and `B`.

    Uses the expansion ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2.

    Args:
        A: [n, d] array.
        B: [m, d] array; defaults to `A` (distances of `A` against itself).
        sqrt: if True return the Euclidean distance, otherwise (default)
            the squared distance.

    Returns:
        [n, m] array of distances.
    """
    if B is None:
        # must be resolved before touching `B.T`
        B = A
    aTb = np.dot(A, B.T)
    if B is A:
        # self-distance: squared norms are already on the diagonal, which
        # also keeps the diagonal of the result exactly zero
        aTa = np.diag(aTb)
        bTb = aTa
    else:
        # row-wise squared norms without building full Gram matrices
        aTa = np.sum(A * A, axis=1)
        bTb = np.sum(B * B, axis=1)
    D = aTa[:, np.newaxis] - 2.0 * aTb + bTb[np.newaxis, :]
    # round-off can push a true zero slightly below zero, which would
    # turn into NaN under the square root
    D = np.maximum(D, 0.0)
    if sqrt:
        D = np.sqrt(D)
    return D


def sim_mat(label, label_2=None, sparse=False):
    """Build the pairwise similarity (relevance) matrix from labels.

    Args:
        label: labels of the first set. With `sparse=False` an [n, c]
            multi-hot matrix; with `sparse=True` an [n] vector of class ids.
        label_2: labels of the second set, same format; defaults to `label`.
        sparse: if True two samples are similar when their class ids are
            equal; otherwise when they share at least one label (inner
            product > 0).

    Returns:
        [n, m] matrix of 0/1 values, cast to the dtype of `label`.
    """
    if label_2 is None:
        label_2 = label
    if sparse:
        # broadcast [n, 1] against [1, m] to compare every pair of ids
        S = label[:, np.newaxis] == label_2[np.newaxis, :]
    else:
        S = np.dot(label, label_2.T) > 0
    return S.astype(label.dtype)


def calc_mAP(qF, rF, qL, rL, what=0, k=-1, sparse=False):
    """Calculate mAP(@k) for retrieval.

    The similarity matrix and the full ranking are computed once for all
    queries; only the per-query AP accumulation is done in a Python loop.
    The per-query AP is the mean of precision@rank over the relevant items
    found inside the top-`k` results.

    Args:
        qF: query feature/hash matrix, one row per query.
        rF: retrieval feature/hash matrix, one row per database item.
        qL: query label matrix.
        rL: retrieval label matrix.
        what: distance used for ranking, {0: cos, 1: hamming, 2: euclidean}.
        k: mAP@k; a negative value (default `-1`) or a value larger than
            the database size means mAP@ALL.
        sparse: forwarded to `sim_mat` to select how labels are compared.

    Returns:
        Sum of the per-query AP divided by the number of queries. Queries
        without any relevant item in their top-`k` contribute 0.

    Raises:
        ValueError: if `what` is not one of 0, 1, 2.
    """
    n_query = qF.shape[0]
    n_retrieval = rF.shape[0]
    if k < 0 or k > n_retrieval:
        k = n_retrieval
    # `np.int` was only an alias of the builtin `int` and no longer exists
    # in recent NumPy releases; the builtin gives the same dtype.
    Gnd = sim_mat(qL, rL, sparse).astype(int)
    if what == 0:
        Dist = 1 - cos(qF, rF)
    elif what == 1:
        Dist = hamming(qF, rF)
    elif what == 2:
        Dist = euclidean(qF, rF)
    else:
        raise ValueError(
            "`what` must be 0 (cos), 1 (hamming) or 2 (euclidean), "
            "got {!r}".format(what))
    Rank = np.argsort(Dist)  # per-row ascending distance

    AP = 0.0
    for it in range(n_query):
        gnd = Gnd[it]
        if np.sum(gnd) == 0:
            # no relevant item at all for this query
            continue
        gnd = gnd[Rank[it][:k]]
        if np.sum(gnd) == 0:
            # no relevant item inside the top-k: avoid dividing by zero
            continue
        # 1-based ranks at which a relevant item shows up
        pos = np.flatnonzero(gnd == 1) + 1.0
        # rel_cnt[j]: number of relevant items seen up to the j-th hit
        rel_cnt = np.arange(pos.shape[0]) + 1.0
        AP += np.mean(rel_cnt / pos)

    mAP = AP / n_query
    return mAP

slow version

import numpy as np
from sklearn.preprocessing import normalize

# Cosine similarity
def cos_sim(f1, f2):
    """Rescaled pairwise cosine similarity between rows of `f1` and `f2`.

    Rows are l2-normalized, then the raw cosine `c` is mapped to
    `0.5 + 0.5 * c` before being returned.
    """
    unit_1 = normalize(f1, norm='l2', axis=1)
    unit_2 = normalize(f2, norm='l2', axis=1)
    raw_cosine = np.dot(unit_1, unit_2.T)
    return 0.5 + 0.5 * raw_cosine


# Cosine distance
def cos_dis(f1, f2):
    """Cosine distance, defined as one minus `cos_sim(f1, f2)`."""
    similarity = cos_sim(f1, f2)
    return 1. - similarity


# Hamming distance
def hamming_dis(B1, B2):
    """Hamming distance between the code(s) `B1` and every row of `B2`.

    Computed as half of (code length - inner product), where the code
    length is taken from the second axis of `B2`.
    """
    n_bit = B2.shape[1]
    inner = np.dot(B1, B2.transpose())
    return 0.5 * (n_bit - inner)


# mAP(@k)
def calc_mAP(qF, rF, qL, rL, what=0, k=-1):
    """Calculate mAP(@k), handling one query at a time.

    Args:
        qF: query feature/hash matrix
        rF: retrieval feature/hash matrix
        qL: query label matrix
        rL: retrieval label matrix
        what: {0: feature (ranked by cosine distance),
               anything else: hash code (ranked by Hamming distance)}
        k: mAP@k, default `-1` means mAP@ALL; a value larger than the
            number of retrieval items is treated the same way
    """
    num_query = qF.shape[0]
    num_retrieval = rF.shape[0]
    # default: mAP@all
    top_k = num_retrieval if (k == -1 or k > num_retrieval) else k

    ap_total = 0.0
    for qi in range(num_query):
        # ground truth, 1 vs all: relevant when at least one label is shared
        relevant = (np.dot(qL[qi, :], rL.transpose()) > 0).astype(np.float32)
        if np.sum(relevant) == 0:
            continue

        if what == 0:
            # continuous features: cosine distance, 1 vs all
            query_row = np.expand_dims(qF[qi], axis=0)
            dist = cos_dis(query_row, rF).reshape(-1)
        else:
            # hash codes: Hamming distance
            dist = hamming_dis(qF[qi, :], rF)

        # ascending distance, keep the first top_k positions
        order = np.argsort(dist)[:top_k]

        hits = relevant[order]
        if np.sum(hits) == 0:
            # nothing relevant in the top-k: avoid dividing by zero
            continue
        # 1-based ranks at which a relevant item shows up
        hit_ranks = np.asarray(np.where(hits == 1.)) + 1.0
        # hit_counts[j]: number of relevant items seen up to the j-th hit
        hit_counts = np.arange(hit_ranks.shape[-1]) + 1.0
        ap_total += np.mean(hit_counts / hit_ranks)

    return ap_total / num_query

Sample

  • 其作者释出的代码中自带一份样例,可同他的代码对拍检验mAP@all有没有写错,详见引用[4]
# Toy fixture for cross-checking an mAP implementation against a reference
# one. Presumably one row per sample, matching how the mAP routines index
# their inputs.

# Query hash codes: 4 codes of 4 bits, entries in {-1, 1}.
qB = np.array([[1, -1, 1, 1],
               [-1, -1, -1, 1],
               [1, 1, -1, 1],
               [1, 1, 1, -1]])
# Retrieval (database) hash codes: 6 codes of 4 bits, entries in {-1, 1}.
rB = np.array([[1, -1, 1, -1],
               [-1, -1, 1, -1],
               [-1, -1, 1, -1],
               [1, 1, -1, -1],
               [-1, 1, -1, -1],
               [1, 1, -1, 1]])
# Query labels: 4 rows of 0/1 flags over 4 classes.
query_L = np.array([[0, 1, 0, 0],
                    [1, 1, 0, 0],
                    [1, 0, 0, 1],
                    [0, 1, 0, 1]])
# Retrieval labels: 6 rows of 0/1 flags over 4 classes.
retrieval_L = np.array([[1, 0, 0, 1],
                        [1, 1, 0, 0],
                        [0, 1, 1, 0],
                        [0, 0, 1, 0],
                        [1, 0, 0, 0],
                        [0, 0, 1, 0]])

References

  1. 多标签图像分类任务的评价方法-mAP
  2. 目标检测中的mAP是什么含义?
  3. IJCAI 2015: Quantized Correlation Hashing for Fast Cross-Modal Search -> 文中有mAP@R计算公式
  4. DCMH-CVPR2017/DCMH_tensorflow/DCMH_tensorflow/utils/calc_hammingranking.py
  5. Distance computations (scipy.spatial.distance)
  6. Computing Information Retrieval Performance Measures Efficiently in the Presence of Tied Scores
  7. MAP@k computation
  8. Some problem about calculating MAP #2
  9. About DCH evaluation metrics #14
  10. iTomxy/ml-template/evaluate/_mAP.py
  • 3
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值