Understanding the X3D code: test(cfg)

test(cfg)

def test(cfg):
    # some settings
    ......
    # Build the video model and print model statistics.
    model = build_model(cfg)    # build the model object
    ......
    cu.load_test_checkpoint(cfg, model) # load the model parameters
    # Create video testing loaders.
    test_loader = loader.construct_loader(cfg, "test")
    ......

For the build_model and cu.load_test_checkpoint functions, see the demo code walkthrough.

loader.construct_loader(cfg, "test") is defined in slowfast/datasets/loader.py:

construct_loader(cfg, split, is_precise_bn=False)

def construct_loader(cfg, split, is_precise_bn=False):
    """Constructs the data loader for the given dataset."""
    assert split in ["train", "val", "test"]
    if split in ["train"]:
        ......
        shuffle, drop_last = True, True
    elif split in ["val"]:
        ......
        shuffle, drop_last = False, False
    elif split in ["test"]:
        dataset_name = cfg.TEST.DATASET
        batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS))
        shuffle = False
        drop_last = False

    # Construct the dataset
    dataset = build_dataset(dataset_name, cfg, split)   # main attributes: a path list (each path repeated num_clips times),
                                                        # a label list (likewise repeated; both of length num_clips*n),
                                                        # an index list (one run of 0~num_clips-1 per video, repeated for the n videos),
                                                        # and a video-meta dict keyed 0 ~ num_clips*n-1, values empty for now
    
    # if...else...: configure the loader according to the settings
    loader = torch.utils.data.DataLoader(dataset, ......)
    return loader

The build_dataset function returns DATASET_REGISTRY.get(name)(cfg, split). Like build_model, it picks the dataset class by name, constructs an instance, and returns it. The dataset classes are defined in the corresponding .py files under slowfast/datasets; the class names include Ava, Charades, Kinetics, and Ssv2.
The resulting dataset object is then used to construct the loader.
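
To make the name-based lookup concrete, here is a minimal sketch of the registry pattern (the real DATASET_REGISTRY comes from fvcore's Registry class; details such as key casing are simplified here):

class Registry:
    """Minimal sketch of a class registry; the real one lives in fvcore."""
    def __init__(self):
        self._classes = {}

    def register(self):
        def deco(cls):
            self._classes[cls.__name__] = cls   # register under the class name
            return cls
        return deco

    def get(self, name):
        return self._classes[name]

DATASET_REGISTRY = Registry()

def build_dataset(dataset_name, cfg, split):
    # Capitalize to match the registered class name, e.g. "kinetics" -> "Kinetics".
    name = dataset_name.capitalize()
    return DATASET_REGISTRY.get(name)(cfg, split)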

Now let's look at the Kinetics class.

Kinetics.__init__(self, cfg, mode, num_retries=10)
@DATASET_REGISTRY.register()
class Kinetics(torch.utils.data.Dataset):
	"""
    Kinetics video loader. Construct the Kinetics video loader, then sample
    clips from the videos. For training and validation, a single clip is
    randomly sampled from every video with random cropping, scaling, and
    flipping. For testing, multiple clips are uniformly sampled from every
    video with uniform cropping. For uniform cropping, we take the left, center,
    and right crop if the width is larger than height, or take top, center, and
    bottom crop if the height is larger than the width.
    """
    def __init__(self, cfg, mode, num_retries=10):
        assert mode in [....
        self.mode = mode
        self.cfg = cfg

        self._video_meta = {}
        self._num_retries = num_retries
        # For training or validation mode, one single clip is sampled from every
        # video. For testing, NUM_ENSEMBLE_VIEWS clips are sampled from every
        # video. For every clip, NUM_SPATIAL_CROPS is cropped spatially from
        # the frames.
        if self.mode in ["train", "val"]:
            self._num_clips = 1
        elif self.mode in ["test"]: # 测试时 要采样多个clips 并将每个clip从空间上分割为3块
            self._num_clips = (
                cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS
            )

        logger.info("Constructing Kinetics {}...".format(mode))
        self._construct_loader()

    def _construct_loader(self):
        """
        Construct the video loader.
        """
        path_to_file = os.path.join(
            self.cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(self.mode)
        )
        assert g_pathmgr.exists(path_to_file), "{} dir not found".format(
            path_to_file
        )

        self._path_to_videos = []
        self._labels = []
        self._spatial_temporal_idx = []
        with g_pathmgr.open(path_to_file, "r") as f:
            for clip_idx, path_label in enumerate(f.read().splitlines()):   # split by line and process line by line
                assert (
                    len(path_label.split(self.cfg.DATA.PATH_LABEL_SEPARATOR))   # the per-line separator defaults to a space
                    == 2
                )
                path, label = path_label.split(
                    self.cfg.DATA.PATH_LABEL_SEPARATOR
                )
                for idx in range(self._num_clips):
                    self._path_to_videos.append(    # append the video path num_clips times
                        os.path.join(self.cfg.DATA.PATH_PREFIX, path)
                    )
                    self._labels.append(int(label)) # append the label num_clips times
                    self._spatial_temporal_idx.append(idx) # append 0 ~ self._num_clips-1 for each video
                    self._video_meta[clip_idx * self._num_clips + idx] = {}
        assert (
            len(self._path_to_videos) > 0
        ), "Failed to load Kinetics split {} from {}".format(
            self._split_idx, path_to_file
        )
        logger.info(
            "Constructing kinetics dataloader (size: {}) from {}".format(
                len(self._path_to_videos), path_to_file
            )
        )

The main data members of the Kinetics class are the video-meta dict, the path list, the label list, and the index list.
For each source video, NUM_ENSEMBLE_VIEWS (10) clips are sampled uniformly in time, and each clip is spatially cropped into NUM_SPATIAL_CROPS (3) views, so there are self._num_clips = cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS (30) sub-clips per video.
The video's path and label are therefore repeated self._num_clips times in the path and label lists. The index list holds 0 ~ num_clips-1 per video, where each run of 3 consecutive indices corresponds to the left/center/right crops of one clip. The video-meta dict is keyed 0 ~ num_clips*n-1, with values empty for now; n is the number of source test videos.
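
A toy example (hypothetical file names and labels) makes the bookkeeping concrete:

# Toy example: 2 source videos, NUM_ENSEMBLE_VIEWS=10, NUM_SPATIAL_CROPS=3.
num_clips = 10 * 3
paths, labels, spatial_temporal_idx = [], [], []
for clip_idx, (path, label) in enumerate([("a.mp4", 0), ("b.mp4", 1)]):
    for idx in range(num_clips):
        paths.append(path)                  # each path appears 30 times
        labels.append(label)                # each label appears 30 times
        spatial_temporal_idx.append(idx)    # 0..29, restarting for every video
        # the video-meta dict key would be clip_idx * num_clips + idx (0..59)

assert len(paths) == len(labels) == len(spatial_temporal_idx) == 60
# __getitem__ later recovers, for a dataset index i:
#   temporal index = spatial_temporal_idx[i] // 3   (which of the 10 clips)
#   spatial index  = spatial_temporal_idx[i] % 3    (left/center/right crop)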

test(cfg)

Back to test(cfg).

def test(cfg):
    ......
    # Create video testing loaders.
    test_loader = loader.construct_loader(cfg, "test")
    ......
	# Create meters for multi-view testing.
    test_meter = TestMeter(
        test_loader.dataset.num_videos
        // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
        cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
        cfg.MODEL.NUM_CLASSES,
        len(test_loader),   # number of batches = ceil(total clip count / batch_size)
        cfg.DATA.MULTI_LABEL,
        cfg.DATA.ENSEMBLE_METHOD,
    )
    ...... # tensorboard-related setup
    
    # Perform multi-view test on the entire dataset.
    test_meter = perform_test(test_loader, model, test_meter, cfg, writer)
    ......

A TestMeter object is constructed to record each source video's predictions and label, and to ensemble the results across that video's clips.
Its main data members are several timers plus a few tensors: video_preds (n, num_classes), video_labels (n,), and clip_count (n,), which tracks how many clips have been accumulated per video, as well as the list topk_accs and the dict stats. Here n is the total number of source test videos.
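
The accumulation step can be sketched as follows (a simplified version assuming the default "sum" ensemble method; the real TestMeter.update_stats additionally checks label consistency and supports "max"):

def update_stats_sketch(video_preds, video_labels, clip_count, preds, labels, clip_ids, num_clips):
    # preds: (batch, num_classes) clip scores; clip_ids: global clip indices from the dataset.
    for ind in range(preds.shape[0]):
        vid_id = int(clip_ids[ind]) // num_clips   # map the clip back to its source video
        video_labels[vid_id] = labels[ind]         # all clips of a video share one label
        video_preds[vid_id] += preds[ind]          # "sum" ensembling of clip scores
        clip_count[vid_id] += 1                    # later checked against num_clips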

Now let's look at the perform_test function.

perform_test(test_loader, model, test_meter, cfg, writer=None)


@torch.no_grad()
def perform_test(test_loader, model, test_meter, cfg, writer=None):
    # Enable eval mode.
    model.eval()
    test_meter.iter_tic()   # start iter_timer and data_timer

    for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader):
    # fetch batch_size frame sequences (each temporally cropped, sampled, and spatially cropped),
    # together with the corresponding batch_size labels and video_idx
        ......

This for loop invokes the __getitem__(self, index) method of the dataset object from which test_loader was constructed.
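
Conceptually, each loader item is one tuple from the dataset, which the DataLoader then collates into batch tensors (a hypothetical single-sample illustration):

frames, label, index, meta = test_loader.dataset[0]   # one sample, before collation
# frames: the decoded, sampled, and cropped clip; label: int class id;
# index: dataset index in 0 ~ num_clips*n-1; meta: the (empty) video-meta entry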

Kinetics.__getitem__(self, index)

  • Compute the temporal index (0~9) and the spatial index (0~2, corresponding to left/center/right or top/center/bottom)
  • Set min_scale, max_scale, crop_size, and the sampling rate
  • Fetch the video: video_container = container.get_video_container(…); on failure, retry or read a different video
  • frames = decoder.decode(container, …)
    • frames, fps, decode_all_video = pyav_decode(container, …)
      • Read the video's fps, frames_length (total frame count), and duration from the container
      • Start/end frames of the clip_idx-th clip: start_idx, end_idx = get_start_end_idx(frames_length, sampling_rate * num_frames / target_fps * fps, clip_idx, num_clips), where sampling_rate * num_frames / target_fps * fps (sampling rate × number of sampled frames / target fps × source fps) is clip_size, the number of frames the clip spans (see the sketch after this list).
        • delta = max(video_size - clip_size, 0)
        • start_idx = delta * clip_idx / num_clips
        • end_idx = start_idx + clip_size - 1
        • This looks a bit odd, because (1) the number of sampled frames and the sampling rate are fixed, independent of each clip's length, and (2) start_idx does not start at video_size / num_clips * clip_idx.
      • Convert the start/end frames into start/end timestamps
      • video_frames, max_pts = pyav_decode_stream(…): extract the frames between the start and end timestamps; the frame count is sampling_rate * num_frames / target_fps * fps
    • Compute clip_sz = sampling_rate * num_frames / target_fps * fps
    • Compute the start/end frames of the extracted clip, i.e. 0 and clip_sz-1
    • Sample num_frames frames from it and return
  • On sampling failure, retry or read a different video
  • Normalize the frames, permute their dimensions, and spatially sample them: frames = utils.spatial_sampling(…), also sketched below:
    • Fix the short side to crop_size (356) and scale the long side proportionally (632)
    • According to the spatial index, crop a crop_size*crop_size clip from the left/center/right or top/center/bottom
  • Get the label and transform frames into the required form
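
To ground the indexing above, here is a simplified sketch of get_start_end_idx, modeled on slowfast/datasets/decoder.py (the newer use_offset option is omitted):

import random

def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
    delta = max(video_size - clip_size, 0)
    if clip_idx == -1:
        start_idx = random.uniform(0, delta)        # train/val: random temporal crop
    else:
        start_idx = delta * clip_idx / num_clips    # test: uniformly spaced starts over [0, delta)
    end_idx = start_idx + clip_size - 1
    return start_idx, end_idx

# e.g. video_size=300 frames, clip_size=64, num_clips=10:
# delta=236, start_idx = 0, 23.6, 47.2, ..., 212.4, so the last clip ends at
# frame 275.4 rather than at the end of the video, which is the oddity noted above.

The left/center/right (or top/center/bottom) cropping can be sketched in the same spirit (simplified from the uniform_crop transform; frames is assumed to be a tensor of shape (num_frames, channels, height, width)):

import math

def uniform_crop_sketch(frames, size, spatial_idx):
    assert spatial_idx in (0, 1, 2)
    h, w = frames.shape[2], frames.shape[3]
    y = int(math.ceil((h - size) / 2))      # default: center crop
    x = int(math.ceil((w - size) / 2))
    if h > w:       # portrait: slide vertically (top/center/bottom)
        if spatial_idx == 0:
            y = 0
        elif spatial_idx == 2:
            y = h - size
    else:           # landscape: slide horizontally (left/center/right)
        if spatial_idx == 0:
            x = 0
        elif spatial_idx == 2:
            x = w - size
    return frames[:, :, y:y + size, x:x + size]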

perform_test(test_loader, model, test_meter, cfg, writer=None)

Back to perform_test.


@torch.no_grad()
def perform_test(test_loader, model, test_meter, cfg, writer=None):
    # Enable eval mode.
    model.eval()
    test_meter.iter_tic()   # start iter_timer and data_timer

    for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader):
    # fetch batch_size frame sequences (each temporally cropped, sampled, and spatially cropped),
    # together with the corresponding batch_size labels and video_idx
        ......
        # Transfer the data to the current GPU device.
        ......
        test_meter.data_toc()   # stop data_timer, start net_timer
        ......
        preds = model(inputs)   # per-class predictions for batch_size clips
        ......
        test_meter.iter_toc()   # stop both iter_timer and net_timer
        # Update and log stats.
        test_meter.update_stats(    # record each source video's label and accumulate the clip predictions onto it
            preds.detach(), labels.detach(), video_idx.detach()
        )
        test_meter.log_iter_stats(cur_iter)

        test_meter.iter_tic()   # start timing the next iteration
    ......
    test_meter.finalize_metrics()   # compute accuracy
    return test_meter
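
Once every clip has been accumulated, top-k accuracy over the ensembled video-level scores can be derived roughly as follows (a minimal sketch, not the verbatim TestMeter code, which also verifies clip_count and handles the multi-label case):

import torch

def finalize_metrics_sketch(video_preds, video_labels, ks=(1, 5)):
    # video_preds: (n, num_classes) accumulated clip scores; video_labels: (n,)
    _, topk_idx = video_preds.topk(max(ks), dim=1)      # (n, max_k) predicted class ids
    correct = topk_idx == video_labels.view(-1, 1)      # broadcast compare with the labels
    return {
        "top{}_acc".format(k): correct[:, :k].any(dim=1).float().mean().item() * 100.0
        for k in ks
    }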