In PyTorch distributed training, if you want to send the data held by one process to all the other processes, you can use the dist.broadcast operation. Here is an example.
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
def setup():
    # Read the environment variables set by the launcher
    rank = int(os.environ['RANK'])
    local_rank = int(os.environ['LOCAL_RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    dist.init_process_group(backend='nccl', init_method='env://')
    torch.cuda.set_device(local_rank)
    print(f"Rank {rank}/{world_size} is initialized.")
def cleanup():
    dist.destroy_process_group()
def broadcast_tensor():
    rank = int(os.environ['RANK'])
    local_rank = int(os.environ['LOCAL_RANK'])
    # Initialize the tensor on the source process (rank 0 here)
    if rank == 0:
        tensor = torch.tensor([1, 2, 3, 4], dtype=torch.float32).cuda(local_rank)
    else:
        # The other ranks must pre-allocate a receive tensor here, otherwise the
        # broadcast call below will fail, and its size must match the tensor on rank 0!
        tensor = torch.empty(4, dtype=torch.float32).cuda(local_rank)
    print(f"Before broadcast - Rank {rank} has tensor {tensor}")
    # Every process calls the broadcast collective
    dist.broadcast(tensor, src=0)
    print(f"After broadcast - Rank {rank} has tensor {tensor}")
def main():
    setup()
    broadcast_tensor()
    cleanup()
if __name__ == "__main__":
    main()
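A side note on the caveat in broadcast_tensor: every rank has to pre-allocate a tensor whose size and dtype match the source tensor before calling dist.broadcast. When the receiving ranks do not know the shape ahead of time, a common workaround is to broadcast the size first and then the data. The helper below is a minimal sketch of that pattern; the function name broadcast_unknown_shape and the 1-D float32 case are illustrative assumptions, not part of the example above.

def broadcast_unknown_shape(src_tensor=None, src=0):
    # On the source rank, src_tensor holds the data; on the other ranks it is None.
    rank = dist.get_rank()
    local_rank = int(os.environ['LOCAL_RANK'])
    # Step 1: broadcast the number of elements so receivers can allocate a buffer.
    if rank == src:
        numel = torch.tensor([src_tensor.numel()], dtype=torch.int64).cuda(local_rank)
    else:
        numel = torch.empty(1, dtype=torch.int64).cuda(local_rank)
    dist.broadcast(numel, src=src)
    # Step 2: broadcast the actual data into a buffer of that size.
    if rank == src:
        data = src_tensor.cuda(local_rank)
    else:
        data = torch.empty(int(numel.item()), dtype=torch.float32).cuda(local_rank)
    dist.broadcast(data, src=src)
    return data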
Run:
python -m torch.distributed.launch --nproc_per_node=4 x.py
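On recent PyTorch releases, torch.distributed.launch is deprecated in favor of torchrun, which sets the RANK, LOCAL_RANK, and WORLD_SIZE environment variables that this script reads, so the equivalent command would be:
torchrun --nproc_per_node=4 x.py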
Output: