测试分布式代码正确性的技巧

鱼儿会飞吗

已于 2024-08-01 09:54:11 修改

阅读量515

点赞数 12

文章标签：分布式

于 2024-08-01 09:25:31 首次发布

本文链接：https://blog.csdn.net/qq_34425255/article/details/140838540

版权

其实可以先用小批量样本测试一下代码的正确性，不一定非得把 56880 个视频样本一股脑放到八张卡上跑，否则仅仅为了验证代码正确性就太烧钱了。

举个例子

运行下面的代码 distribute_custom_2d_skeleton.py 时：

# Copyright (c) OpenMMLab. All rights reserved.
# coding=utf-8
# coding=gbk
import argparse
import os
import os.path as osp
import pyskl
from mmdet.apis import inference_detector, init_detector
from mmpose.apis import inference_top_down_pose_model, init_pose_model
import decord
import mmcv
import numpy as np
import torch
import torch.distributed as dist
from tqdm import tqdm
import cv2
from pyskl.smp import mrlines


def extract_frame(video_path):
    """Decode every frame of a video into a list of numpy arrays.

    Args:
        video_path: Path handed to ``decord.VideoReader``.

    Returns:
        list: ``asnumpy()`` of each frame, in the order the reader yields them.
    """
    reader = decord.VideoReader(video_path)
    frames = []
    for frame in reader:
        frames.append(frame.asnumpy())
    return frames


def detection_inference(model, frames):
    """Run the detector on each frame and collect the raw outputs.

    Args:
        model: Detector; ``.cuda()`` is called on it before inference.
        frames: Iterable of frames, passed one at a time to
            ``inference_detector``.

    Returns:
        list: One ``inference_detector`` result per frame, in input order.
    """
    gpu_model = model.cuda()
    return [inference_detector(gpu_model, frame) for frame in frames]


def pose_inference(model, frames, det_results):
    """Estimate keypoints for every detected person in every frame.

    Args:
        model: Top-down pose model; ``.cuda()`` is called on it first.
        frames: Sequence of frames.
        det_results: Per-frame person boxes; must have the same length as
            ``frames``.

    Returns:
        np.ndarray: float32 array of shape
        ``(num_person, total_frames, 17, 3)``, where ``num_person`` is the
        largest per-frame box count. Entries for which no pose was produced
        stay zero. For an empty clip the result has shape ``(0, 0, 17, 3)``.

    Raises:
        AssertionError: If ``frames`` and ``det_results`` differ in length.
    """
    model = model.cuda()
    assert len(frames) == len(det_results)
    total_frames = len(frames)
    # default=0 keeps a zero-frame clip from raising ValueError in max().
    num_person = max((len(x) for x in det_results), default=0)
    kp = np.zeros((num_person, total_frames, 17, 3), dtype=np.float32)

    for i, (f, d) in enumerate(zip(frames, det_results)):
        # The pose API expects one dict per person with the box under 'bbox'.
        d = [dict(bbox=x) for x in list(d)]
        pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0]
        for j, item in enumerate(pose):
            # Person slot j, frame i.
            kp[j, i] = item['keypoints']
    return kp


# Parent directory of the ``pyskl`` package directory; the demo configs below
# are resolved relative to it.
pyskl_root = osp.dirname(pyskl.__path__[0])
# Default person-detector config and checkpoint (the checkpoint is a URL).
default_det_config = f'{pyskl_root}/demo/faster_rcnn_r50_fpn_1x_coco-person.py'
default_det_ckpt = (
    'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/'
    'faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth')
# Default pose-estimator config and checkpoint.
default_pose_config = f'{pyskl_root}/demo/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py'
# NOTE: hard-coded absolute local path; override with --pose-ckpt on other machines.
default_pose_ckpt = (
    '/root/pyskl/demo/litehrnet18_coco_256x192-6bace359_20211230.pth')


def parse_args():
    """Parse the command-line options of the skeleton-extraction script.

    Returns:
        argparse.Namespace: Parsed options. Dashes in option names become
        underscores (``--det-config`` is read as ``args.det_config``).
    """
    # (flag, add_argument keyword arguments), in the order they are registered.
    option_table = (
        ('--det-config', dict(
            default=default_det_config,
            help='human detection config file path (from mmdet)')),
        ('--det-ckpt', dict(
            default=default_det_ckpt,
            help='human detection checkpoint file/url')),
        ('--pose-config', dict(type=str, default=default_pose_config)),
        ('--pose-ckpt', dict(type=str, default=default_pose_ckpt)),
        ('--det-score-thr', dict(type=float, default=0.7)),
        ('--det-area-thr', dict(type=float, default=1300)),
        ('--video-list', dict(
            default=r'/root/pyskl/until/no_miss.list',
            type=str,
            help='the list of source videos')),
        ('--out', dict(
            default=r'/root/pyskl/until/no_miss_ntu_train.pkl',
            type=str,
            help='output pickle name')),
        ('--tmpdir', dict(type=str, default='tmp')),
        ('--local_rank', dict(type=int, default=0)),
        ('--num-gpus', dict(
            type=int, default=1, help='number of GPUs to use')),
    )

    cli = argparse.ArgumentParser(
        description='Generate 2D pose annotations for a custom video dataset')
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()


def main():
    """Extract 2D skeletons for every listed video, sharded across processes.

    Each process handles a round-robin slice of the video list, writes its
    annotations to ``<tmpdir>/part_<rank>.pkl``, and rank 0 then merges the
    parts back into list order and dumps them to ``--out``.
    """
    args = parse_args()
    assert args.out.endswith('.pkl')

    # Join the NCCL process group; the world size is taken from --num-gpus.
    dist.init_process_group(backend='nccl', world_size=args.num_gpus)
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    # Bind this process to the GPU index given by --local_rank.
    torch.cuda.set_device(args.local_rank)

    # Each list line is split on whitespace: a video path, optionally
    # followed by an integer label.
    lines = mrlines(args.video_list)
    lines = [x.split() for x in lines]

    # Only the first line is inspected to decide which of the two formats
    # the whole list uses.
    assert len(lines[0]) in [1, 2]
    if len(lines[0]) == 1:
        annos = [dict(frame_dir=osp.basename(x[0]).split('.')[0], filename=x[0]) for x in lines]
    else:
        annos = [dict(frame_dir=osp.basename(x[0]).split('.')[0], filename=x[0], label=int(x[1])) for x in lines]

    # Round-robin shard: rank r takes items r, r + world_size, ...
    my_part = annos[rank::world_size]

    # Rank 0 creates the temp directory; all ranks synchronise afterwards.
    if rank == 0:
        os.makedirs(args.tmpdir, exist_ok=True)
    dist.barrier()

    det_model = init_detector(args.det_config, args.det_ckpt, 'cuda')
    assert det_model.CLASSES[0] == 'person', 'A detector trained on COCO is required'
    pose_model = init_pose_model(args.pose_config, args.pose_ckpt, 'cuda')

    for anno in tqdm(my_part):
        frames = extract_frame(anno['filename'])
        det_results = detection_inference(det_model, frames)
        # Keep only the first entry of each per-frame detector output
        # (presumably the 'person' class, given the CLASSES[0] check above).
        det_results = [x[0] for x in det_results]
        for i, res in enumerate(det_results):
            # Column 4 is compared against the score threshold.
            res = res[res[:, 4] >= args.det_score_thr]
            # Area from columns 0-3: (col3 - col1) * (col2 - col0).
            box_areas = (res[:, 3] - res[:, 1]) * (res[:, 2] - res[:, 0])
            assert np.all(box_areas >= 0)
            res = res[box_areas >= args.det_area_thr]
            det_results[i] = res

        pose_results = pose_inference(pose_model, frames, det_results)
        shape = frames[0].shape[:2]
        anno['img_shape'] = anno['original_shape'] = shape
        anno['total_frames'] = len(frames)
        anno['num_person_raw'] = pose_results.shape[0]
        # The first two values of the last axis are stored as coordinates,
        # the third as the score; both are down-cast to float16.
        anno['keypoint'] = pose_results[..., :2].astype(np.float16)
        anno['keypoint_score'] = pose_results[..., 2].astype(np.float16)
        anno.pop('filename')

    mmcv.dump(my_part, osp.join(args.tmpdir, f'part_{rank}.pkl'))
    # NOTE(review): ranks that finish early block here until the slowest rank
    # arrives; presumably this is where a collective timeout would surface if
    # the gap exceeds the process group's timeout -- confirm.
    dist.barrier()

    if rank == 0:
        parts = [mmcv.load(osp.join(args.tmpdir, f'part_{i}.pkl')) for i in range(world_size)]
        # Pad the shorter shards with None so that every part has the same
        # length and zip() below does not drop the tail of the longer ones.
        rem = len(annos) % world_size
        if rem:
            for i in range(rem, world_size):
                parts[i].append(None)

        # Interleave the shards to restore the original list order, then cut
        # off the None padding.
        ordered_results = []
        for res in zip(*parts):
            ordered_results.extend(list(res))
        ordered_results = ordered_results[:len(annos)]
        mmcv.dump(ordered_results, args.out)
    dist.destroy_process_group()


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

使用 8 张卡的分布式代码把 56880 个视频样本生成 train.pkl 时，程序运行到最后，日志里居然报错了（从下面的日志看，是 NCCL 集合通信等待超过默认的 1800000 ms，即 30 分钟，触发了超时）：

[E ProcessGroupNCCL.cpp:719] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=3, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1809474 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:406] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. To avoid this inconsistency, we are taking the entire process down.
terminate called after throwing an instance of 'std::runtime_error'
  what():  [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=3, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1809474 milliseconds before timing out.
/root/miniconda3/envs/pyskl/lib/python3.7/site-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects `--local_rank` argument to be set, please
change it to read from `os.environ['LOCAL_RANK']` instead. See 
https://pytorch.org/docs/stable/distributed.html#launch-utility for 
further instructions

  FutureWarning,
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1324 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1328 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 1321) of binary: /root/miniconda3/envs/pyskl/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/envs/pyskl/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/root/miniconda3/envs/pyskl/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/root/miniconda3/envs/pyskl/lib/python3.7/site-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/root/miniconda3/envs/pyskl/lib/python3.7/site-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/root/miniconda3/envs/pyskl/lib/python3.7/site-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/root/miniconda3/envs/pyskl/lib/python3.7/site-packages/torch/distributed/run.py", line 718, in run
    )(*cmd_args)
  File "/root/miniconda3/envs/pyskl/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/envs/pyskl/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 247, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
=====================================================
distribute_custom_2d_skeleton.py FAILED
-----------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
-----------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-07-29_18:53:20
  host      : autodl-container-b07c47a3b9-531cc242
  rank      : 0 (local_rank: 0)
  exitcode  : -6 (pid: 1321)
  error_file: <N/A>
  traceback : Signal 6 (SIGABRT) received by PID 1321
=====================================================

这个时候修改了代码，想验证一下错误是否已经解决，难道还要无脑地用 8 张卡把全部样本再跑一遍吗？一方面耗时太长，另一方面 8 张卡也太贵。

所以不妨小批量视频样本测试一下修改之后的代码的正确性

比如80个视频样本，采用8张卡

修改代码如下（相比上面的代码，这里把 --video-list 和 --out 的默认路径换成了小批量样本对应的文件）：

# Copyright (c) OpenMMLab. All rights reserved.
# coding=utf-8
# coding=gbk
import argparse
import os
import os.path as osp
import pyskl
from mmdet.apis import inference_detector, init_detector
from mmpose.apis import inference_top_down_pose_model, init_pose_model
import decord
import mmcv
import numpy as np
import torch
import torch.distributed as dist
from tqdm import tqdm
import cv2
from pyskl.smp import mrlines


def extract_frame(video_path):
    """Decode every frame of a video into a list of numpy arrays.

    Args:
        video_path: Path handed to ``decord.VideoReader``.

    Returns:
        list: ``asnumpy()`` of each frame, in the order the reader yields them.
    """
    reader = decord.VideoReader(video_path)
    frames = []
    for frame in reader:
        frames.append(frame.asnumpy())
    return frames


def detection_inference(model, frames):
    """Run the detector on each frame and collect the raw outputs.

    Args:
        model: Detector; ``.cuda()`` is called on it before inference.
        frames: Iterable of frames, passed one at a time to
            ``inference_detector``.

    Returns:
        list: One ``inference_detector`` result per frame, in input order.
    """
    gpu_model = model.cuda()
    return [inference_detector(gpu_model, frame) for frame in frames]


def pose_inference(model, frames, det_results):
    """Estimate keypoints for every detected person in every frame.

    Args:
        model: Top-down pose model; ``.cuda()`` is called on it first.
        frames: Sequence of frames.
        det_results: Per-frame person boxes; must have the same length as
            ``frames``.

    Returns:
        np.ndarray: float32 array of shape
        ``(num_person, total_frames, 17, 3)``, where ``num_person`` is the
        largest per-frame box count. Entries for which no pose was produced
        stay zero. For an empty clip the result has shape ``(0, 0, 17, 3)``.

    Raises:
        AssertionError: If ``frames`` and ``det_results`` differ in length.
    """
    model = model.cuda()
    assert len(frames) == len(det_results)
    total_frames = len(frames)
    # default=0 keeps a zero-frame clip from raising ValueError in max().
    num_person = max((len(x) for x in det_results), default=0)
    kp = np.zeros((num_person, total_frames, 17, 3), dtype=np.float32)

    for i, (f, d) in enumerate(zip(frames, det_results)):
        # The pose API expects one dict per person with the box under 'bbox'.
        d = [dict(bbox=x) for x in list(d)]
        pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0]
        for j, item in enumerate(pose):
            # Person slot j, frame i.
            kp[j, i] = item['keypoints']
    return kp


# Parent directory of the ``pyskl`` package directory; the demo configs below
# are resolved relative to it.
pyskl_root = osp.dirname(pyskl.__path__[0])
# Default person-detector config and checkpoint (the checkpoint is a URL).
default_det_config = f'{pyskl_root}/demo/faster_rcnn_r50_fpn_1x_coco-person.py'
default_det_ckpt = (
    'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/'
    'faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth')
# Default pose-estimator config and checkpoint.
default_pose_config = f'{pyskl_root}/demo/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py'
# NOTE: hard-coded absolute local path; override with --pose-ckpt on other machines.
default_pose_ckpt = (
    '/root/pyskl/demo/litehrnet18_coco_256x192-6bace359_20211230.pth')


def parse_args():
    """Parse the command-line options of the skeleton-extraction script.

    Returns:
        argparse.Namespace: Parsed options. Dashes in option names become
        underscores (``--det-config`` is read as ``args.det_config``).
    """
    # (flag, add_argument keyword arguments), in the order they are registered.
    option_table = (
        ('--det-config', dict(
            default=default_det_config,
            help='human detection config file path (from mmdet)')),
        ('--det-ckpt', dict(
            default=default_det_ckpt,
            help='human detection checkpoint file/url')),
        ('--pose-config', dict(type=str, default=default_pose_config)),
        ('--pose-ckpt', dict(type=str, default=default_pose_ckpt)),
        ('--det-score-thr', dict(type=float, default=0.7)),
        ('--det-area-thr', dict(type=float, default=1300)),
        ('--video-list', dict(
            default=r'/root/pyskl/until/Modify/modify.list',
            type=str,
            help='the list of source videos')),
        ('--out', dict(
            default=r'/root/pyskl/until/Modify/modify_train.pkl',
            type=str,
            help='output pickle name')),
        ('--tmpdir', dict(type=str, default='tmp')),
        ('--local_rank', dict(type=int, default=0)),
        ('--num-gpus', dict(
            type=int, default=1, help='number of GPUs to use')),
    )

    cli = argparse.ArgumentParser(
        description='Generate 2D pose annotations for a custom video dataset')
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()


def main():
    """Extract 2D skeletons for every listed video, sharded across processes.

    Each process handles a round-robin slice of the video list, writes its
    annotations to ``<tmpdir>/part_<rank>.pkl``, and rank 0 then merges the
    parts back into list order and dumps them to ``--out``.

    The process group is created with an explicit, long collective timeout so
    that ranks which finish early can wait at the final barrier for slower
    ranks without the NCCL watchdog aborting the job.
    """
    import datetime  # function-scope import: only used for the group timeout

    args = parse_args()
    assert args.out.endswith('.pkl')

    # torchrun exports LOCAL_RANK instead of passing --local_rank; prefer the
    # environment variable and fall back to the flag for older launchers.
    local_rank = int(os.environ.get('LOCAL_RANK', args.local_rank))
    torch.cuda.set_device(local_rank)

    # The default collective timeout is 30 minutes. Shards of a large video
    # list can finish much further apart than that, so the final barrier
    # needs more headroom. The trade-off is that a real hang is detected later.
    dist.init_process_group(
        backend='nccl',
        world_size=args.num_gpus,
        timeout=datetime.timedelta(hours=24))
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Each list line is split on whitespace: a video path, optionally
    # followed by an integer label. Only the first line decides the format.
    lines = mrlines(args.video_list)
    lines = [x.split() for x in lines]

    assert len(lines[0]) in [1, 2]
    if len(lines[0]) == 1:
        annos = [dict(frame_dir=osp.basename(x[0]).split('.')[0], filename=x[0]) for x in lines]
    else:
        annos = [dict(frame_dir=osp.basename(x[0]).split('.')[0], filename=x[0], label=int(x[1])) for x in lines]

    # Round-robin shard: rank r takes items r, r + world_size, ...
    my_part = annos[rank::world_size]

    # Rank 0 creates the temp directory; all ranks synchronise afterwards.
    if rank == 0:
        os.makedirs(args.tmpdir, exist_ok=True)
    dist.barrier()

    det_model = init_detector(args.det_config, args.det_ckpt, 'cuda')
    assert det_model.CLASSES[0] == 'person', 'A detector trained on COCO is required'
    pose_model = init_pose_model(args.pose_config, args.pose_ckpt, 'cuda')

    for anno in tqdm(my_part):
        frames = extract_frame(anno['filename'])
        det_results = detection_inference(det_model, frames)
        # Keep only the first entry of each per-frame detector output.
        det_results = [x[0] for x in det_results]
        for i, res in enumerate(det_results):
            # Column 4 is compared against the score threshold.
            res = res[res[:, 4] >= args.det_score_thr]
            # Area from columns 0-3: (col3 - col1) * (col2 - col0).
            box_areas = (res[:, 3] - res[:, 1]) * (res[:, 2] - res[:, 0])
            assert np.all(box_areas >= 0)
            res = res[box_areas >= args.det_area_thr]
            det_results[i] = res

        pose_results = pose_inference(pose_model, frames, det_results)
        shape = frames[0].shape[:2]
        anno['img_shape'] = anno['original_shape'] = shape
        anno['total_frames'] = len(frames)
        anno['num_person_raw'] = pose_results.shape[0]
        # The first two values of the last axis are stored as coordinates,
        # the third as the score; both are down-cast to float16.
        anno['keypoint'] = pose_results[..., :2].astype(np.float16)
        anno['keypoint_score'] = pose_results[..., 2].astype(np.float16)
        anno.pop('filename')

    mmcv.dump(my_part, osp.join(args.tmpdir, f'part_{rank}.pkl'))
    # Early finishers wait here for the slowest rank; the extended group
    # timeout above covers this wait.
    dist.barrier()

    if rank == 0:
        parts = [mmcv.load(osp.join(args.tmpdir, f'part_{i}.pkl')) for i in range(world_size)]
        # Pad the shorter shards with None so that every part has the same
        # length and zip() below does not drop the tail of the longer ones.
        rem = len(annos) % world_size
        if rem:
            for i in range(rem, world_size):
                parts[i].append(None)

        # Interleave the shards to restore the original list order, then cut
        # off the None padding.
        ordered_results = []
        for res in zip(*parts):
            ordered_results.extend(list(res))
        ordered_results = ordered_results[:len(annos)]
        mmcv.dump(ordered_results, args.out)
    dist.destroy_process_group()


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

修改代码之后，用 80 个视频样本做小批量测试，发现不再报错了

而且正确生成了train.pkl

https://github.com/kennymckormick/pyskl/blob/main/tools/data/custom_2d_skeleton.py

鱼儿会飞吗

关注

12
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
测试分布式代码正确性的技巧

这个时候通过修改代码，想看看自己是否解决了错误，难道还无脑8张卡跑一遍吗？其实可以小批量样本测试一下代码正确性，不一定非得56880个视频样本一股脑八张卡，不然为了测试代码正确性太烧钱了。发现使用分布式8张卡的代码将56880个视频样本生成train.pkl的时候，发现日志文件运行到了最后居然报错了。运行下面代码distribute_custom_2d_skeleton.py的时候。所以不妨小批量视频样本测试一下修改之后的代码的正确性。比如80个视频样本，采用8张卡。
复制链接

扫一扫