1.docker info
docker stats --format "table {{.Container}}\t{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}"
2.docker环境打包
docker commit jx_4 jx:bert
3.Horovod分布式训练环境搭建
docker pull horovod/horovod:0.18.1-tf1.14.0-torch1.2.0-mxnet1.5.0-py3.6
nvidia-docker run -it -d -p 6007:6006 --memory 200gb -v /data/jx:/root/jx --name jx_4 horovod/horovod:0.18.1-tf1.14.0-torch1.2.0-mxnet1.5.0-py3.6 bash
docker exec -it jx_4 bash
horovodrun -np 4 -H localhost:4 python3 run_tsc_hvd_cls.py --do_train=True \
  --do_export=True --num_train_epochs=30 --dev_dataset="dev" |& grep -v "Read -1"
4.docker镜像save与load
docker save aigroup:ngc-1.0.0 -o ./aigroup_ngc.tar
docker load -i aigroup_ngc.tar
5.docker容器创建
docker run --gpus all -d -p 6008:6006 -p 1229:22 -p 6767:8888 --memory 200gb -v /data/jx:/root/jx -v /data/aigroup:/root/aigroup -v /etc/localtime:/etc/localtime -e ROOT_PASS="sl123456" -e NOTEBOOK_PASS="sl123456" -e NOTEBOOK_USER="jx" --name jx_ngc aigroup:ngc-1.0.0