Preface
Project: https://github.com/OpenDriveLab/PersFormer_3DLane/tree/main
Note: the following is the main content of this article; only the OpenLane dataset is used.
I. Setting Up the Environment
The experiment runs on an AutoDL server with the following environment:
4*GPU-RTX3080
PyTorch 1.11.0
Python 3.8 (Ubuntu 20.04)
CUDA 11.3
Install by following the official documentation:
pip3 install -r requirements.txt
cd models/nms/
python setup.py install
cd ../ops/
bash make.sh
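Optionally, a quick sanity check (not part of the official instructions) that PyTorch sees CUDA and all four GPUs after the build:
python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.device_count())"
On this machine the expected output is roughly "1.11.0 11.3 4".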
II. Preparing the Dataset
Prepare the OpenLane dataset as required by the official instructions.
The overall directory layout should follow the structure shown in the official documentation.
1. After preparing the dataset, update the dataset path in persformer_openlane.py, which is located in the autodl-tmp/PersFormer_3DLane/config folder.
After the modification it looks roughly like this:
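(The exact variable names come from the config file in your checkout; the paths below are only placeholders for an AutoDL layout, not the actual values from the repo.)

# config/persformer_openlane.py -- placeholder paths, adjust to wherever the data actually lives
args.dataset_name = 'openlane'
args.dataset_dir = '/root/autodl-tmp/openlane/images/'      # raw camera images
args.data_dir = '/root/autodl-tmp/openlane/lane3d_1000/'    # 3D lane annotation json files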
2. Manually download the pretrained backbone weights from https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth
Once the download finishes, upload the file to autodl-tmp, then copy it into the torch hub cache:
mkdir -p ~/.cache/torch/hub/checkpoints/
cp tf_efficientnet_b7_ns-1dbc32de.pth ~/.cache/torch/hub/checkpoints/
Note: run the cp command from the autodl-tmp directory so that the relative path to the .pth file resolves.
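To confirm the cached file is intact and will be picked up instead of triggering a fresh download, a minimal check (assuming the path created above):

import os
import torch

ckpt = os.path.expanduser('~/.cache/torch/hub/checkpoints/tf_efficientnet_b7_ns-1dbc32de.pth')
state_dict = torch.load(ckpt, map_location='cpu')   # a truncated download would fail to load here
print(len(state_dict), 'tensors in checkpoint')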
3. Since this setup runs distributed training across the four RTX 3080 cards, ddp.py needs to be modified.
The original lines are kept commented out; the revised file is shown below.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import os
import subprocess
import numpy as np
import random
def setup_dist_launch(args):
    # torch.distributed.launch passes the local rank of this process on the command line
    args.proc_id = args.local_rank
    world_size = int(os.getenv('WORLD_SIZE', 1)) * args.nodes
    print("proc_id: " + str(args.proc_id))
    print("world size: " + str(world_size))
    print("local_rank: " + str(args.local_rank))
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['RANK'] = str(args.proc_id)
    os.environ['LOCAL_RANK'] = str(args.local_rank)

def setup_slurm(args):
    # rank/address setup when the job is launched through SLURM instead of torch.distributed.launch
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    args.proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    local_rank = args.proc_id % num_gpus
    args.local_rank = local_rank
    print("proc_id: " + str(args.proc_id))
    print("world size: " + str(ntasks))
    print("local_rank: " + str(local_rank))
    addr = subprocess.getoutput(
        f'scontrol show hostname {node_list} | head -n1')
    os.environ['MASTER_PORT'] = str(args.port)
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(args.proc_id)
    os.environ['LOCAL_RANK'] = str(local_rank)
# Previous version (kept commented out for reference):
# def setup_distributed(args):
#     if not dist.is_initialized():  # newly added
#         dist.init_process_group(backend='nccl')  # newly added
#     print(f"Rank: {dist.get_rank()}, World size: {dist.get_world_size()}")  # newly added
#     args.gpu = args.local_rank
#     torch.cuda.set_device(args.gpu)
#     dist.init_process_group(backend='nccl')
#     args.world_size = dist.get_world_size()
#     torch.set_printoptions(precision=10)

def setup_distributed(args):
    # initialize the process group exactly once, before binding this process to its GPU
    if not dist.is_initialized():
        dist.init_process_group(backend='nccl')
    print(f"Rank: {dist.get_rank()}, World size: {dist.get_world_size()}")
    args.gpu = args.local_rank
    torch.cuda.set_device(args.gpu)
    args.world_size = dist.get_world_size()
    torch.set_printoptions(precision=10)
# Previous version (kept commented out for reference):
# def ddp_init(args):
#     setup_distributed(args)  # newly added
#     args.proc_id, args.gpu, args.world_size = 0, 0, 1
#     if args.use_slurm == True:
#         setup_slurm(args)
#     else:
#         setup_dist_launch(args)
#     if 'WORLD_SIZE' in os.environ:
#         args.distributed = int(os.environ['WORLD_SIZE']) >= 1
#     if args.distributed:
#         setup_distributed(args)
#     # deterministic
#     torch.backends.cudnn.benchmark = False
#     torch.backends.cudnn.deterministic = True
#     torch.manual_seed(args.proc_id)
#     np.random.seed(args.proc_id)
#     random.seed(args.proc_id)

def ddp_init(args):
    args.proc_id, args.gpu, args.world_size = 0, 0, 1
    # choose the rank/env setup depending on the launcher (SLURM vs. torch.distributed.launch)
    if args.use_slurm == True:
        setup_slurm(args)
    else:
        setup_dist_launch(args)
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) >= 1
    if args.distributed:
        setup_distributed(args)
    # deterministic
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(args.proc_id)
    np.random.seed(args.proc_id)
    random.seed(args.proc_id)
def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]

def reduce_tensor(tensor, world_size):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= world_size
    return rt

def reduce_tensors(*tensors, world_size):
    return [reduce_tensor(tensor, world_size) for tensor in tensors]
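For context, torch.distributed.launch (used in step 5 below) starts one process per GPU and passes --local_rank=<index> to each of them, which is where args.local_rank in setup_dist_launch comes from. The training script's argument parser therefore has to accept it; a minimal sketch of the flags that ddp.py reads (the defaults shown are made up, not taken from the repo):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0)   # injected by torch.distributed.launch
parser.add_argument('--nodes', type=int, default=1)        # read by setup_dist_launch
parser.add_argument('--use_slurm', action='store_true')    # switch to the SLURM setup path
parser.add_argument('--port', type=int, default=29500)     # MASTER_PORT used in setup_slurm
args = parser.parse_args()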
4. Edit .bashrc
vim ~/.bashrc
Add the following lines; make sure the CUDA version matches your install:
export LIBRARY_PATH=/usr/local/cuda-11.3/lib64/:$LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64/:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda-11.3/bin/:$PATH
export CUDA_HOME=/usr/local/cuda-11.3/
Then press Esc and type :wq to save and quit.
Finally, reload the file:
source ~/.bashrc
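To verify that the variables took effect:
nvcc -V            # should report CUDA 11.3
echo $CUDA_HOME    # should print /usr/local/cuda-11.3/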
5. Run training
python -m torch.distributed.launch --nproc_per_node 4 main_persformer.py --mod=PersFormer --batch_size=2
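With --nproc_per_node 4, the --batch_size=2 setting is per GPU, so the effective global batch size is 8 (assuming the usual DistributedSampler behavior). While training runs, GPU utilization can be monitored from a second terminal:
watch -n 1 nvidia-smi   # all four RTX 3080 cards should show active python processes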
Summary
Note: a quick summary of this run:
First epoch training time: 3:13:38 in total, 19,726 images trained.
First epoch test time: 21:59 in total, 4,998 images tested.
First epoch training time: 3:16:38 in total.
First epoch test time: 23:38 in total.