Video Retrieval: Metric Computation and Analysis

The metrics are computed and analyzed on the MSVD validation set.

sim_matrix = _run_on_single_gpu(model, batch_list_t, batch_list_v, batch_sequence_output_list, batch_visual_output_list)
# after concatenation, sim_matrix has shape (sentence_num, video_num)
sim_matrix = np.concatenate(tuple(sim_matrix), axis=0)
sim_matrix.shape
(4290, 100)

This produces the similarity between every sentence and every video, stored in a single matrix.
Next, the matrix is reshaped so that the individual retrieval metrics can be computed.
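As a rough sketch of the concatenation step above (toy shapes only; assuming, as the snippet suggests, that _run_on_single_gpu returns one text-batch x video similarity block per batch), np.concatenate simply stacks those blocks row-wise:

import numpy as np

toy_blocks = [np.random.rand(64, 100) for _ in range(3)]   # e.g. 3 text batches of 64 captions vs. 100 videos
np.concatenate(tuple(toy_blocks), axis=0).shape            # -> (192, 100)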

logger.info("before reshape, sim matrix size: {} x {}".format(sim_matrix.shape[0], sim_matrix.shape[1]))
# output
11/29/2023 00:34:43 - INFO -   before reshape, sim matrix size: 4290 x 100
# 
cut_off_points2len_ = [itm + 1 for itm in cut_off_points_]
# cut_off_points_ stores, for each of the 100 videos, the index of its last caption (captions are 0-indexed);
# cut_off_points2len_ shifts each index by one, turning them into cumulative caption counts (exclusive end offsets)
cut_off_points_.__len__()
100
cut_off_points_
[39, 73, 118, 160, 212, 247, 303, 352,...,4289]
#
max_length = max([e_-s_ for s_, e_ in zip([0]+cut_off_points2len_[:-1], cut_off_points2len_)])
#
[0]+cut_off_points2len_[:-1]
[0, 40, 74, 119, 161, 213, 248, 304, 353, 384, 427, 473,...,4246]
cut_off_points2len_
[40, 74, 119, 161, 213, 248, 304, 353, 384, 427, 473, 514,...,4290]
This computes the number of captions belonging to each of the 100 videos:
[e_-s_ for s_, e_ in zip([0]+cut_off_points2len_[:-1], cut_off_points2len_)]
[40, 34, 45, 42, 52, 35, 56, 49, 31, 43, 46, 41, 35, ...,44]
max_length is the largest number of captions any single video has.
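A minimal sketch with a made-up 3-video example of how the cut-off points turn into per-video caption counts and max_length:

toy_cut_off_points_ = [2, 6, 9]                         # last caption index of each video (0-based)
toy_len_ = [p + 1 for p in toy_cut_off_points_]         # exclusive end offsets: [3, 7, 10]
toy_starts = [0] + toy_len_[:-1]                        # start offsets: [0, 3, 7]
[e_ - s_ for s_, e_ in zip(toy_starts, toy_len_)]       # captions per video: [3, 4, 3]
max([e_ - s_ for s_, e_ in zip(toy_starts, toy_len_)])  # max_length: 4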
#
sim_matrix_new = []
# pad each video's caption-similarity block with -inf rows so every block has shape [max_length, 100]
for s_, e_ in zip([0] + cut_off_points2len_[:-1], cut_off_points2len_):
    sim_matrix_new.append(np.concatenate((sim_matrix[s_:e_],
                                          np.full((max_length-e_+s_, sim_matrix.shape[1]), -np.inf)), axis=0))
# sim_matrix_new is now a list of 100 arrays, each of shape [max_length, 100]
# stack them along a new leading dimension
sim_matrix = np.stack(tuple(sim_matrix_new), axis=0)
logger.info("after reshape, sim matrix size: {} x {} x {}".
                    format(sim_matrix.shape[0], sim_matrix.shape[1], sim_matrix.shape[2]))
# output
11/29/2023 14:59:44 - INFO -   after reshape, sim matrix size: 100 x 62 x 100
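A toy illustration (made-up numbers) of the -inf padding applied to a single video that has only 2 captions when max_length is 3:

import numpy as np

toy_block = np.array([[0.9, 0.1], [0.7, 0.3]])                            # 2 captions x 2 candidate videos
toy_pad = np.full((3 - toy_block.shape[0], toy_block.shape[1]), -np.inf)  # 1 padding row
np.concatenate((toy_block, toy_pad), axis=0)
# array([[ 0.9,  0.1],
#        [ 0.7,  0.3],
#        [-inf, -inf]])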
# compute text-to-video retrieval metrics
tv_metrics = tensor_text_to_video_metrics(sim_matrix)
def tensor_text_to_video_metrics(sim_tensor, top_k = [1,5,10]):
    # convert a numpy.ndarray to a torch.Tensor
    if not torch.is_tensor(sim_tensor):
      sim_tensor = torch.tensor(sim_tensor)
      
    # Permute sim_tensor so it represents a sequence of text-video similarity matrices.
    # Then obtain the double argsort to position the rank on the diagonal
    stacked_sim_matrices = sim_tensor.permute(1, 0, 2)
    #
    stacked_sim_matrices.shape
    torch.Size([62, 100, 100])
    stacked_sim_matrices[0][0]
    tensor([27.3189, 11.9506, 14.7600, 15.3767, 12.4263, 14.1658, 13.1632, 15.3417,
        12.9654,  8.3538, 14.1689,  9.2711, 12.5321, 13.9367, 10.4555, 10.6652,
        11.1756, 16.8021, 13.0225, 10.1496, 20.7404, 15.3992, 17.0187,  4.7106,
        16.0491, 16.8424, 17.6171, 10.1525, 12.2927, 11.0196, 16.0360, 15.5589,
        17.0313, 20.0688, 19.5513, 14.1102, 11.8426, 14.3085, 12.0711, 14.0612,
        17.7800, 16.1889, 20.4792, 15.3155, 19.9463, 20.9955, 14.0207, 11.0278,
        12.2528,  6.4979, 11.2442, 15.0072, 13.1231, 16.8470, 10.8007, 10.5979,
        12.5088, 11.1976, 14.0388,  6.7474, 15.2539, 14.6888, 10.5917, 11.7881,
        19.4673,  9.0201, 11.6811, 15.0995, 10.4226,  9.4162, 21.0467, 17.2501,
        21.9660, 17.7090,  8.3185, 12.3387, 11.1219, 11.0675, 18.9486, 16.8323,
        11.8102, 17.1494,  7.7283, 17.8285, 18.1599, 20.6162, 20.8480, 16.7183,
        12.4198, 10.4714, 11.7942, 15.2633, 11.4040, 16.9508, 13.4711, 12.6439,
        12.0092, 15.9236, 17.4429, 10.1693], dtype=torch.float64)
    #
    first_argsort = torch.argsort(stacked_sim_matrices, dim = -1, descending= True)
    # 
    first_argsort[0][0]
    tensor([ 0, 72, 70, 45, 86, 20, 85, 42, 33, 44, 34, 64, 78, 84, 83, 40, 73, 26,
        98, 71, 81, 32, 22, 93, 53, 25, 79, 17, 87, 41, 24, 30, 97, 31, 21,  3,
         7, 43, 91, 60, 67, 51,  2, 61, 37, 10,  5, 35, 39, 58, 46, 13, 94,  6,
        52, 18,  8, 95, 12, 56,  4, 88, 75, 28, 48, 38, 96,  1, 36, 80, 90, 63,
        66, 92, 50, 57, 16, 76, 77, 47, 29, 54, 15, 55, 62, 89, 14, 68, 99, 27,
        19, 69, 11, 65,  9, 74, 82, 59, 49, 23])
    # first_argsort holds, for each row, the column indices sorted by descending similarity: here 72 means the second-highest similarity sits at index 72 of the original row
    # 
    second_argsort = torch.argsort(first_argsort, dim = -1, descending= False)
    # argsorting first_argsort in ascending order inverts the permutation: second_argsort[i][j] is the rank (0 = highest) of element j in the original similarity row
    second_argsort[0][0]
    tensor([ 0, 67, 42, 35, 60, 46, 53, 36, 56, 94, 45, 92, 58, 51, 86, 82, 76, 27,
        55, 90,  5, 34, 22, 99, 30, 25, 17, 89, 63, 80, 31, 33, 21,  8, 10, 47,
        68, 44, 65, 48, 15, 29,  7, 37,  9,  3, 50, 79, 64, 98, 74, 41, 54, 24,
        81, 83, 59, 75, 49, 97, 39, 43, 84, 71, 11, 93, 72, 40, 87, 91,  2, 19,
         1, 16, 95, 62, 77, 78, 12, 26, 69, 20, 96, 14, 13,  6,  4, 28, 61, 85,
        70, 38, 73, 23, 52, 57, 66, 32, 18, 88])
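    # A toy illustration of the double-argsort trick on made-up values:
    #   row = torch.tensor([0.2, 0.9, 0.1, 0.5])
    #   torch.argsort(row, descending=True)                 # -> [1, 3, 0, 2]
    #   torch.argsort(torch.argsort(row, descending=True))  # -> [2, 0, 3, 1]
    # i.e. the inverse permutation: entry j is the 0-based rank of row[j] (0.9 ranks 0th, 0.5 ranks 1st, ...)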
    # Extracts ranks i.e diagonals
    ranks = torch.flatten(torch.diagonal(second_argsort, dim1 = 1, dim2 = 2))
    # first, inspect torch.diagonal(second_argsort, dim1 = 1, dim2 = 2)
    torch.diagonal(second_argsort, dim1 = 1, dim2 = 2)
    tensor([[ 0,  0,  1,  ...,  0,  3,  2],
        [ 0,  0,  1,  ...,  0,  1,  0],
        [ 0, 49,  1,  ...,  0,  3,  0],
        ...,
        [88, 74, 73,  ..., 28, 27, 26],
        [88, 74, 73,  ..., 28, 27, 26],
        [88, 74, 73,  ..., 28, 27, 26]])
    # the diagonal entries are the ranks of each caption's ground-truth video (entry [c, v] = rank of video v for the c-th caption of video v)
    torch.diagonal(second_argsort, dim1 = 1, dim2 = 2).shape
    torch.Size([62, 100])
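    # A toy illustration of torch.diagonal with dim1=1, dim2=2 on made-up values:
    #   torch.diagonal(torch.arange(8).reshape(2, 2, 2), dim1=1, dim2=2)
    #   # -> tensor([[0, 3],
    #   #            [4, 7]])
    # i.e. for each of the two stacked 2x2 matrices, the [i, i] entries are taken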
    # the diagonal is then flattened into a 1-D tensor of ranks
    ranks.shape
    torch.Size([6200])
    #     
    # Now we need to extract valid ranks, as some belong to inf padding values
    permuted_original_data = torch.flatten(torch.diagonal(sim_tensor, dim1 = 0, dim2 = 2))
    # permuted_original_data holds the original similarity values aligned with the entries of ranks
    sim_tensor.shape
    torch.Size([100, 62, 100])
    torch.diagonal(sim_tensor, dim1 = 0, dim2 = 2).shape
    torch.Size([62, 100])
    permuted_original_data.shape
    torch.Size([6200])
    permuted_original_data
    tensor([27.3189, 28.0503, 22.9410,  ...,    -inf,    -inf,    -inf],
       dtype=torch.float64)
    mask = ~ torch.logical_or(torch.isinf(permuted_original_data), torch.isnan(permuted_original_data))
    # mask
    tensor([ True,  True,  True,  ..., False, False, False])
    # keep only the valid (non-padding) ranks
    valid_ranks = ranks[mask]
    valid_ranks
    tensor([0, 0, 1,  ..., 0, 0, 0])
    valid_ranks.shape
    torch.Size([4290])
    # A quick dimension check validates our results, there may be other correctness tests pending
    # Such as dot product localization, but that is for another time.
    #assert int(valid_ranks.shape[0]) ==  sum([len(text_dict[k]) for k in text_dict])
    if not torch.is_tensor(valid_ranks):
      valid_ranks = torch.tensor(valid_ranks)
    # recall at k: R@1, R@5, R@10
    results = {f"R{k}": float(torch.sum(valid_ranks < k) * 100 / len(valid_ranks)) for k in top_k}
    # Median rank (MedianR): the median of all ranks; it shows where the correct video typically sits
    results["MedianR"] = float(torch.median(valid_ranks + 1))
    # Mean rank (MeanR): the average rank, an overall measure of ranking performance
    results["MeanR"] = float(np.mean(valid_ranks.numpy() + 1))
    # Rank standard deviation (Std_Rank): measures how consistent the ranking results are; smaller means more consistent
    results["Std_Rank"] = float(np.std(valid_ranks.numpy() + 1))
    results['MR'] = results["MedianR"]
    return results
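A minimal usage sketch of tensor_text_to_video_metrics with made-up numbers (2 videos, at most 2 captions each, the missing caption padded with -inf), illustrating the expected (video, max_length, video) input layout:

toy_sim = np.array([[[0.9, 0.1],             # video 0, caption 0: similarity to videos 0 and 1
                     [0.7, 0.3]],            # video 0, caption 1
                    [[0.2, 0.8],             # video 1, caption 0
                     [-np.inf, -np.inf]]])   # padding row for the missing second caption
tensor_text_to_video_metrics(toy_sim)
# every real caption ranks its own video first, so R1 = R5 = R10 = 100.0 and MeanR = MedianR = 1.0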
# compute video-to-text retrieval metrics
vt_metrics = compute_metrics(tensor_video_to_text_sim(sim_matrix))
def tensor_video_to_text_sim(sim_tensor):
    if not torch.is_tensor(sim_tensor):
      sim_tensor = torch.tensor(sim_tensor)
    # Code to avoid nans
    sim_tensor[sim_tensor != sim_tensor] = float('-inf')
    # Forms a similarity matrix for use with rank at k
    # for each caption group, take the maximum similarity over its captions (the -inf padding rows never win)
    values, _ = torch.max(sim_tensor, dim=1, keepdim=True)
    # squeeze and transpose: the result has shape (100, 100), rows = videos, columns = caption groups
    return torch.squeeze(values).T
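A small sketch of tensor_video_to_text_sim with made-up values (2 caption groups, 2 videos, one NaN padding row in the second group):

toy = torch.tensor([[[0.9, 0.1],
                     [0.7, 0.3]],
                    [[0.2, 0.8],
                     [float('nan'), float('nan')]]])
tensor_video_to_text_sim(toy)
# -> tensor([[0.9, 0.2],
#            [0.3, 0.8]])   rows = videos, columns = caption groups (values shown rounded)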
def compute_metrics(x):
    # x: the input (100, 100) video-to-text similarity matrix
    tensor([[31.3992, 22.5782, 20.7784,  ..., 21.1828, 20.2004, 18.9807],
        [21.7923, 29.4904, 19.4861,  ..., 18.5809, 19.6253, 15.3143],
        [21.1787, 23.9858, 27.9316,  ..., 22.8542, 19.0947, 18.0181],
        ...,
        [20.6959, 19.4620, 18.9654,  ..., 26.8898, 18.0134, 16.5276],
        [22.3443, 19.8543, 21.3892,  ..., 20.2485, 24.6035, 18.5602],
        [18.8025, 17.3398, 18.7633,  ..., 15.8109, 16.2060, 28.3321]],
       dtype=torch.float64)
    # sort each row of -x in ascending order, i.e. each row of x in descending order
    sx = np.sort(-x, axis=1)
    # sx
    array([[-31.39920616, -27.66517639, -26.41374969, ..., -15.53889465,
        -14.74133682, -13.20669365],
       [-29.49041939, -23.12905502, -22.62351799, ..., -14.17019653,
        -13.0715313 , -12.63034534],
       [-27.93160629, -24.15420914, -24.01354027, ..., -15.53527451,
        -14.91190338, -13.8048563 ],
       ...,
       [-26.88976288, -24.45761299, -24.3878746 , ..., -13.69131374,
        -12.05869675, -11.71543217],
       [-27.90517235, -24.98619461, -24.88581085, ..., -16.37366295,
        -15.42610073, -14.36475182],
       [-28.33205795, -26.38947678, -25.50229645, ..., -12.77716923,
        -12.7628994 , -12.21085739]])
    sx.shape
    (100, 100)
    # the diagonal of -x: for each video, the (negated) similarity to its own caption group
    d = np.diag(-x)
    # np.diag(-x)
    array([-31.39920616, -29.49041939, -27.93160629, -28.35176849,
       -30.22580528, -29.49812126, -31.84345627, -26.48898125,
       -31.19614029, -32.50800705, -26.63794136, -33.53875732,
       -29.70412445, -29.99581909, -30.4330349 , -27.77479935,
       -30.91006851, -28.78204346, -29.88134956, -26.90159988,
       -27.77323151, -24.84206009, -26.65229988, -29.98462296,
       -27.21224785, -28.58442307, -29.22354317, -31.91251755,
       -27.74791145, -27.34547806, -27.2964592 , -29.30712318,
       -27.25808716, -30.44463348, -30.50127983, -29.57052231,
       -28.83847618, -30.84763336, -26.25956535, -27.45178604,
       -28.43003845, -30.1237545 , -29.49394798, -31.24961853,
       -27.51448059, -28.08011246, -29.11924934, -28.89242554,
       -30.01185608, -29.20703316, -33.81283188, -32.47666168,
       -28.57559013, -32.03510666, -26.41053772, -31.41538811,
       -31.59647369, -30.60676193, -32.23682404, -28.99020386,
       -27.32891655, -25.65361595, -27.20375061, -27.37989044,
       -26.2093544 , -30.18797684, -24.17000771, -31.20343018,
       -27.10717583, -29.43353271, -27.55513763, -30.85765457,
       -27.35139656, -28.78152657, -30.12675095, -29.8391819 ,
       -33.15964127, -27.80212402, -29.50061607, -25.8354454 ,
       -24.66520309, -29.17455101, -27.82661057, -25.51939201,
       -30.82431221, -28.19944   , -28.81469727, -31.60061264,
       -28.77458572, -31.28947067, -31.12048149, -28.49939156,
       -28.01663208, -26.56383896, -27.14196777, -33.90385056,
       -30.66762352, -26.88976288, -24.60349464, -28.33205795])
    # add a new axis so d broadcasts against sx
    d = d[:, np.newaxis]
    # d.shape
    (100, 1)
    # ind = sx - d: subtracting each row's own (negated) similarity leaves a zero at the position of the ground-truth match in the sorted row
    ind = sx - d
    # ind
    array([[ 0.        ,  3.73402977,  4.98545647, ..., 15.86031151,
        16.65786934, 18.19251251],
       [ 0.        ,  6.36136436,  6.8669014 , ..., 15.32022285,
        16.41888809, 16.86007404],
       [ 0.        ,  3.77739716,  3.91806602, ..., 12.39633179,
        13.01970291, 14.12674999],
       ...,
       [ 0.        ,  2.43214989,  2.50188828, ..., 13.19844913,
        14.83106613, 15.17433071],
       [-3.3016777 , -0.38269997, -0.28231621, ...,  8.2298317 ,
         9.17739391, 10.23874283],
       [ 0.        ,  1.94258118,  2.82976151, ..., 15.55488873,
        15.56915855, 16.12120056]])
    # locate the zeros: the column index of each zero is the 0-based rank of the correct match
    ind = np.where(ind == 0)
    # ind[1]
    array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       9, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0])
    # ind[0]
    array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])
    # keep only the column indices: ind[1][i] is the 0-based rank of video i's ground-truth caption group
    ind = ind[1]
    metrics = {}
    # recall at k: R@1, R@5, R@10
    metrics['R1'] = float(np.sum(ind == 0)) * 100 / len(ind)
    metrics['R5'] = float(np.sum(ind < 5)) * 100 / len(ind)
    metrics['R10'] = float(np.sum(ind < 10)) * 100 / len(ind)
    metrics['MR'] = np.median(ind) + 1
    metrics["MedianR"] = metrics['MR']
    metrics["MeanR"] = np.mean(ind) + 1
    metrics["cols"] = [int(i) for i in list(ind)]
    return metrics
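A quick sanity check of compute_metrics on a made-up 3 x 3 similarity matrix (rows = videos, columns = caption groups):

toy_x = np.array([[0.9, 0.2, 0.1],
                  [0.4, 0.3, 0.2],   # the correct column (index 1) only ranks second here
                  [0.1, 0.2, 0.7]])
compute_metrics(toy_x)
# -> R1 ≈ 66.7, R5 = R10 = 100.0, MR = MedianR = 1.0, MeanR ≈ 1.33, cols = [0, 1, 0]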