“深度学习GPU环境准备工作。”
# 查看当前GPU型号以及驱动推荐
> sudo apt install ubuntu-drivers-common
> ubuntu-drivers devices
== /sys/devices/pci0000:00/0000:00:1e.0 ==
modalias : pci:v000010DEd00000
vendor : NVIDIA Corporation
model : GK210GL [Tesla K80]
driver : nvidia-driver-430 - distro non-free recommended
driver : nvidia-driver-390 - distro non-free
driver : xserver-xorg-video-nouveau - distro free builtin
# 显卡为 Tesla K80,推荐的驱动版本是 430,可以直接自动安装;也可以到 NVIDIA 官网手动下载安装,但会麻烦一些(https://www.nvidia.cn/Download/index.aspx?lang=cn)
> sudo ubuntu-drivers autoinstall
# 这里要重启机器
> reboot
> nvidia-smi
Tue Aug 20 06:17:02 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.48 Driver Version: 410.48 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla K80 Off | 00000000:00:1E.0 Off | 0 |
| N/A 52C P0 70W / 149W | 1183MiB / 11441MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 12632 C python3 1170MiB |
+-----------------------------------------------------------------------------+
# 下面安装cuda, cudnn
# 由于tensorflow当前版本不支持cuda10.1, 故从网站下载cuda10.0(https://developer.nvidia.com/cuda-10.0-download-archive)
> chmod 777 cuda_10.0.130_410.48_linux.run cuda_10.0.130.1_linux.run
> sudo ./cuda_10.0.130_410.48_linux.run
# 安装程序的交互选项中,不需要安装 OpenGL 库和 Samples
# 下面这个文件是 CUDA 10.0 的补丁包,安装完主程序后再安装
> sudo ./cuda_10.0.130.1_linux.run
# cudnn-10.0-linux-x64-v7.6.2.24.tgz 官网下载需要注册账户(https://developer.nvidia.com/rdp/cudnn-download)
> tar xvf cudnn-10.0-linux-x64-v7.6.2.24.tgz
> sudo cp cuda/include/cudnn.h /usr/local/cuda/include/
> sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64/
> sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
# 配置环境变量
> vim ~/.bashrc
export PATH=/usr/local/cuda-10.0/bin${PATH:+:$PATH}
export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
> source ~/.bashrc
# 测试
# 遇到的问题:运行 nvidia-smi 报错 "Driver/library version mismatch";解决方法参考(https://blog.csdn.net/Adam_liu94/article/details/79644282)
> nvidia-smi
Failed to initialize NVML: Driver/library version mismatch
# 简单来看,就两步
# unload nvidia kernel mod
# reload nvidia kernel mod
# 执行起来就是
> sudo rmmod nvidia
rmmod: ERROR: Module nvidia is in use by: nvidia_modeset
> lsmod | grep nvidia
nvidia_drm 45056 0
nvidia_modeset 1110016 1 nvidia_drm
nvidia 18792448 1 nvidia_modeset
drm_kms_helper 167936 1 nvidia_drm
drm 393216 3 drm_kms_helper,nvidia_drm
i2c_core 73728 3 drm_kms_helper,nvidia,drm
ipmi_msghandler 53248 2 ipmi_devintf,nvidia
> sudo lsof -n -w /dev/nvidia*
> sudo rmmod nvidia_drm && sudo rmmod nvidia_modeset
> sudo rmmod nvidia
> sudo nvidia-smi
原文链接
转载来源:(SuperX315)微信公众号,欢迎大家扫码关注.