1 gpu的设备识别检测
yum install pciutils
lspci | grep NVIDIA
00:03.0 3D controller: NVIDIA Corporation Device 1b38 (rev a1)
2 安装显卡驱动
wget http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-repo-rhel7-8.0.61-1.x86_64.rpm
rpm -Uvh cuda-repo-rhel7-8.0.61-1.x86_64.rpm
依赖安装
wget http://vault.centos.org/7.0.1406/updates/x86_64/Packages/kernel-devel-3.10.0-123.4.4.el7.x86_64.rpm
wget http://vault.centos.org/7.0.1406/updates/x86_64/Packages/kernel-headers-3.10.0-123.4.4.el7.x86_64.rpm
rpm -Uvh kernel-devel-3.10.0-123.4.4.el7.x86_64.rpm
rpm -Uvh kernel-headers-3.10.0-123.4.4.el7.x86_64.rpm
yum install cuda-8-0
3 查看gpu使用情况
nvidia-smi
Mon Nov 27 11:01:51 2017
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P40 Off | 00000000:00:03.0 Off | 0 |
| N/A 23C P0 45W / 250W | 0MiB / 22912MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
4.测试GPU是否安装成功
cd /usr/local/cuda/bin
sh cuda-install-samples-8.0.sh ~/cuda-test/
cd ~/cuda-test/NVIDIA_CUDA-8.0_Samples
make
./bin/x86_64/linux/release/deviceQuery    # 获取设备状态
./bin/x86_64/linux/release/bandwidthTest  # 测试设备带宽
5.配置环境变量
PATH=$PATH:$HOME/bin:/usr/local/cuda/bin
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/
CUDA_HOME=/usr/local/cuda
export PATH
export LD_LIBRARY_PATH
export CUDA_HOME
6.安装cudnn
下载对应的:
http://www.nvidia.cn/Download/index.aspx?lang=cn
7.安装docker
yum update
curl -sSL https://get.docker.com/ | sh
报错:
Delta RPMs disabled because /usr/bin/applydeltarpm not installed.
查找包含的包:
yum provides '*/applydeltarpm'
安装:
yum install deltarpm
yum install -y nvidia-docker
docker tensorflow image安装
nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu
修改默认的存储位置
1.修改配置文件
/usr/lib/systemd/system/docker.service
/usr/lib/systemd/system/nvidia-docker.service
[root@10-23-14-125 data]# cat /usr/lib/systemd/system/docker.service
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
[Service]
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
#ExecStart=/usr/bin/dockerd
#graph 指定存储位置,registry-mirror 加速镜像
ExecStart=/usr/bin/dockerd --graph /data/docker-data/docker --registry-mirror=https://u1qbyfsc.mirror.aliyuncs.com
ExecReload=/bin/kill -s HUP $MAINPID
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target
[root@10-23-14-125 data]# cat /usr/lib/systemd/system/nvidia-docker.service
[Unit]
Description=NVIDIA Docker plugin
Documentation=https://github.com/NVIDIA/nvidia-docker/wiki
After=local-fs.target network.target
Wants=docker.service
[Service]
Environment="SOCK_DIR=/data/docker-data/nvidia-docker"
Environment="SPEC_FILE=/etc/docker/plugins/nvidia-docker.spec"
#User=nvidia-docker
User=root
PermissionsStartOnly=true
Restart=on-failure
RestartSec=1
TimeoutStartSec=0
TimeoutStopSec=20
ExecStart=/usr/bin/nvidia-docker-plugin -s $SOCK_DIR
ExecStartPost=/bin/sh -c '/bin/mkdir -p $( dirname $SPEC_FILE )'
ExecStartPost=/bin/sh -c '/bin/echo unix://$SOCK_DIR/nvidia-docker.sock > $SPEC_FILE'
ExecStopPost=/bin/rm -f $SPEC_FILE
[Install]
WantedBy=multi-user.target
2.数据盘建立软链接
ln -s /data/docker /var/lib/docker
3 注意启动需要加 --privileged=true
docker run -it -d --privileged=true xcartensorflow/xcartensorflow:new-gpu /bin/bash
--privileged=true 加这个参数,否则nvidia设备不会挂载上
查看设备是否正确挂载
ll /dev/nv*
crw-rw-rw- 1 root root 195, 0 12月 18 17:59 /dev/nvidia0
crw-rw-rw- 1 root root 195, 255 12月 18 17:59 /dev/nvidiactl
crw-rw-rw- 1 root root 247, 0 12月 18 17:59 /dev/nvidia-uvm
crw-rw-rw- 1 root root 247, 1 12月 18 17:59 /dev/nvidia-uvm-tools
crw------- 1 root root 10, 144 12月 18 17:59 /dev/nvram
参考:
https://hub.docker.com/r/tensorflow/tensorflow/
https://aur.archlinux.org/packages/nvidia-docker/