使用函数 `load_yaml_conf` 从 YAML 文件中加载配置信息,并将结果存储在变量 `yaml_conf` 中。
yaml_conf = load_yaml_conf(yaml_file)
检查配置中是否有 `use_container` 键。如果有,根据其值决定使用 Docker 还是 Kubernetes 容器:取值为 "docker" 时记录容器类型并读取 `ports` 配置;取值为 "k8s" 时调用 `submit_to_k8s` 函数并直接返回;其他取值则打印错误信息并退出。如果没有该键,则将 `use_container` 设为 "default"。
if 'use_container' in yaml_conf:
if yaml_conf['use_container'] == "docker":
use_container = "docker"
ports = yaml_conf['ports']
elif yaml_conf['use_container'] == "k8s":
submit_to_k8s(yaml_conf)
return
else:
print(f'Error: unknown use_container:{yaml_conf["use_container"]}, the supported options are ["docker", "k8s"].')
exit(1)
else:
use_container = "default"
从配置中获取 Parameter Server 的 IP,并初始化用于存放 Worker IP 和各 Worker GPU 列表的两个空列表。
ps_ip = yaml_conf['ps_ip']
worker_ips, total_gpus = [], []
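如果配置中有 `worker_ips`,先将其中各项用 "=" 拼接成 `executor_configs`(否则为空字符串);然后遍历这些条目(每项形如 `ip:gpu_list`),按冒号拆分,把 IP 存入 `worker_ips`,把解析后的 GPU 列表存入 `total_gpus`,使每个 IP 与其 GPU 列表一一对应。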
executor_configs = "=".join(yaml_conf['worker_ips']) if 'worker_ips' in yaml_conf else ''
if 'worker_ips' in yaml_conf:
for ip_gpu in yaml_conf['worker_ips']:
ip, gpu_list = ip_gpu.strip().split(':')
worker_ips.append(ip)
total_gpus.append(eval(gpu_list))
生成当前时间的时间戳(格式为 `月日_时分秒`),并初始化运行中的机器集合 `running_vms` 和默认作业名称。
time_stamp = datetime.datetime.fromtimestamp(
time.time()).strftime('%m%d_%H%M%S')
running_vms = set()
job_name = 'fedscale_job'
设置默认的日志路径,并从配置中读取 SSH 用户名:用户名非空时生成 `用户名@` 形式的前缀,否则为空字符串。
log_path = './logs'
submit_user = f"{yaml_conf['auth']['ssh_user']}@" if len(yaml_conf['auth']['ssh_user']) else ""
构建作业配置信息:先写入时间戳和 Parameter Server 的 IP,再用配置文件中 `job_conf` 的各项逐一更新。
job_conf = {'time_stamp': time_stamp,
'ps_ip': ps_ip,
}
for conf in yaml_conf['job_conf']:
job_conf.update(conf)
初始化配置脚本和设置命令。如果配置中存在设置命令,则用 ` && ` 将它们依次拼接为一个字符串。
conf_script = ''
setup_cmd = ''
if yaml_conf['setup_commands'] is not None:
setup_cmd += (yaml_conf['setup_commands'][0] + ' && ')
for item in yaml_conf['setup_commands'][1:]:
setup_cmd += (item + ' && ')
构建命令后缀字符串,并将作业配置逐项拼接为 `--名称 值` 形式的命令行参数,同时处理作业名与日志路径。
cmd_sufix = f" "
for conf_name in job_conf:
conf_script = conf_script + f' --{conf_name} {job_conf[conf_name]}'
if conf_name == "job_name":
job_name = job_conf[conf_name]
if conf_name == "log_path":
log_path = os.path.join(
job_conf[conf_name], 'log', job_name, time_stamp)
对所有 Worker 的 GPU 列表求和,计算所有 GPU 上的总进程数。
total_gpu_processes = sum([sum(x) for x in total_gpus])