Centos7.9版本docker与cuda安装

操作系统初始化设置

操作系统:推荐centos 7.4版本以上

修改ip地址

[root@localhost ~]# vim /etc/sysconfig/network-scripts/ifcfg-ens192 
TYPE=Ethernet
BOOTPROTO=static
DEVICE=ens192
ONBOOT=yes
IPADDR=192.168.0.200
NETMASK=255.255.255.0
GATEWAY=192.168.0.254
DNS1=223.5.5.5

修改主机名

[root@lx-test ~]# vim /etc/hostname 
lx-test

禁用不常用服务

systemctl disable postfix.service
systemctl disable firewalld.service
systemctl disable NetworkManager.service    
systemctl disable kdump.service
systemctl disable cups.path
systemctl disable cups.socket
systemctl disable cups.service 
systemctl disable rpcbind.socket 
systemctl disable bluetooth.service 
systemctl disable abrt-ccpp.service                           
systemctl disable abrt-oops.service                           
systemctl disable abrt-vmcore.service                         
systemctl disable abrt-xorg.service                           
systemctl disable abrtd.service                               
systemctl disable accounts-daemon.service                     
systemctl disable atd.service                                 
systemctl disable avahi-daemon.service                        
systemctl disable chronyd.service                             
systemctl disable dbus-org.freedesktop.Avahi.service          
systemctl disable dbus-org.freedesktop.ModemManager1.service  
systemctl disable display-manager.service                     
systemctl disable dmraid-activation.service                   
systemctl disable gdm.service                                 
systemctl disable hypervkvpd.service                          
systemctl disable hypervvssd.service                          
systemctl disable iscsi.service                               
systemctl disable ksm.service                                 
systemctl disable ksmtuned.service                            
systemctl disable libstoragemgmt.service                      
systemctl disable libvirtd.service                            
systemctl disable mdmonitor.service                           
systemctl disable ModemManager.service                        
systemctl disable multipathd.service                          
systemctl disable packagekit-offline-update.service           
systemctl disable rngd.service                                
systemctl disable rtkit-daemon.service                        
systemctl disable smartd.service                              
systemctl disable spice-vdagentd.service                      
systemctl disable sysstat.service                                                        
systemctl disable avahi-daemon.socket                         
systemctl disable iscsid.socket                               
systemctl disable iscsiuio.socket  
# RHEL 8.4 额外添加  https://docs.rancher.cn/docs/rke/os/_index
systemctl disable nm-cloud-setup.service nm-cloud-setup.timer

关闭防火墙&selinux

(firewalld 上面不常用服务已禁用),禁用selinux
对于有特殊要求的环境,根据实际配置

#禁用 firewalld 服务
systemctl stop firewalld
systemctl disable firewalld

#禁用 SELinux
setenforce 0
sed -i 's/^SELINUX=enforcing$/SELINUX=disabled/' /etc/selinux/config

禁用swap

[root@lx-test ~]# vim /etc/fstab  #注释swap所在行
#/dev/mapper/centos-swap swap                    swap    defaults        0 0

添加yum仓库 (无外网可忽略此步骤)

备份现有repo文件

[root@lx-test yum.repos.d]# tar -czvf CentOS-repo.tar.gz CentOS-* --remove-files
CentOS-Base.repo
CentOS-CR.repo
CentOS-Debuginfo.repo
CentOS-fasttrack.repo
CentOS-Media.repo
CentOS-Sources.repo
CentOS-Vault.repo
CentOS-x86_64-kernel.repo

常用yum仓库

包含centos base源、docker-ce源、epel源、kubernetes 源

vim /etc/yum.repos.d/yum-out-all.repo

#============ centos ==============
[base]
name=CentOS-$releasever - Base - mirrors.aliyun.com
failovermethod=priority
baseurl=http://mirrors.aliyun.com/centos/$releasever/os/$basearch/
gpgcheck=0
gpgkey=http://mirrors.aliyun.com/centos/RPM-GPG-KEY-CentOS-7
 
#released updates 
[updates]
name=CentOS-$releasever - Updates - mirrors.aliyun.com
failovermethod=priority
baseurl=http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/
gpgcheck=0
gpgkey=http://mirrors.aliyun.com/centos/RPM-GPG-KEY-CentOS-7
 
#additional packages that may be useful
[extras]
name=CentOS-$releasever - Extras - mirrors.aliyun.com
failovermethod=priority
baseurl=http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/
gpgcheck=0
gpgkey=http://mirrors.aliyun.com/centos/RPM-GPG-KEY-CentOS-7
 
#additional packages that extend functionality of existing packages
[centosplus]
name=CentOS-$releasever - Plus - mirrors.aliyun.com
failovermethod=priority
baseurl=http://mirrors.aliyun.com/centos/$releasever/centosplus/$basearch/
gpgcheck=0
enabled=1
gpgkey=http://mirrors.aliyun.com/centos/RPM-GPG-KEY-CentOS-7

#============ docker-ce  ==============
[docker-ce-stable]
name=Docker CE Stable - $basearch
baseurl=https://mirrors.tuna.tsinghua.edu.cn/docker-ce/linux/centos/$releasever/$basearch/stable
enabled=1
gpgcheck=0
gpgkey=https://mirrors.tuna.tsinghua.edu.cn/docker-ce/linux/centos/gpg

#============ epel ==============
[epel]
name=Extra Packages for Enterprise Linux 7 - $basearch
baseurl=https://mirrors.tuna.tsinghua.edu.cn/epel/7/$basearch
#mirrorlist=https://mirrors.fedoraproject.org/metalink?repo=epel-7&arch=$basearch
failovermethod=priority
enabled=1
gpgcheck=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7

#============ kubernetes  ==============
[kubernetes]
name=kubernetes
baseurl=https://mirrors.tuna.tsinghua.edu.cn/kubernetes/yum/repos/kubernetes-el7-$basearch
enabled=1
gpgcheck=0

centos8 yum源更换 (可选操作)

修改所有的CentOS文件内容
sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*
sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*

更新yum源为阿里镜像
wget -O /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-vault-8.5.2111.repo
yum clean all
yum makecache

#yum安装测试是否可以yum安装 (可选)
yum install wget -y

安装常用工具

yum install vim wget git iotop net-tools telnet ntpdate lsof sysstat psmisc iftop nmon unzip -y

时间同步

2种方式二选一

(ntp方式)

[root@lx-test ~]# yum install ntp -y  #安装ntp服务
[root@lx-test ~]# ntpdate ntp.aliyun.com  #手动同步时间
22 Apr 17:32:58 ntpdate[9380]: adjust time server 203.107.6.88 offset -0.000697 sec

[root@lx-test ~]# vim /etc/ntp.conf
#修改国内常用时间服务器
server ntp.aliyun.com iburst

[root@lx-test ~]# systemctl enable --now  ntpd
[root@lx-test ~]# systemctl status ntpd

[root@lx-test ~]# ntpq -p

在这里插入图片描述

(Chrony方式)

[root@lx-test ~]# yum install chrony -y

[root@lx-test ~]# vim /etc/chrony.conf
#注释原有server,添加国内常用时间服务器
server ntp.aliyun.com iburst
server ntp.tencent.com iburst
server ntp.ntsc.ac.cn iburst

[root@lx-test ~]# systemctl  enable --now chronyd
[root@lx-test ~]# systemctl status chronyd
#手动同步时间
[root@lx-test ~]#  chronyc -a  makestep
200 OK
[root@lx-test ~]# chronyc  sources -v

在这里插入图片描述

节点OS调优

内核调优

执行以下命令(注意:net.bridge.* 参数依赖 br_netfilter 内核模块,需先执行 modprobe br_netfilter 加载该模块,否则 sysctl -p 会提示找不到对应参数)

echo "
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-iptables=1
net.ipv4.ip_forward=1
net.ipv4.conf.all.forwarding=1
net.ipv4.neigh.default.gc_thresh1=4096
net.ipv4.neigh.default.gc_thresh2=6144
net.ipv4.neigh.default.gc_thresh3=8192
net.ipv4.neigh.default.gc_interval=60
net.ipv4.neigh.default.gc_stale_time=120

# 参考 https://github.com/prometheus/node_exporter#disabled-by-default
kernel.perf_event_paranoid=-1

#sysctls for k8s node config
net.ipv4.tcp_slow_start_after_idle=0
net.core.rmem_max=16777216
fs.inotify.max_user_watches=524288
kernel.softlockup_all_cpu_backtrace=1

kernel.softlockup_panic=0

kernel.watchdog_thresh=30
fs.file-max=2097152
fs.inotify.max_user_instances=8192
fs.inotify.max_queued_events=16384
vm.max_map_count=262144
fs.may_detach_mounts=1
net.core.netdev_max_backlog=16384
net.ipv4.tcp_wmem=4096 12582912 16777216
net.core.wmem_max=16777216
net.core.somaxconn=32768
net.ipv4.ip_forward=1
net.ipv4.tcp_max_syn_backlog=8096
net.ipv4.tcp_rmem=4096 12582912 16777216

net.ipv6.conf.all.disable_ipv6=1
net.ipv6.conf.default.disable_ipv6=1
net.ipv6.conf.lo.disable_ipv6=1

kernel.yama.ptrace_scope=0
vm.swappiness=0

# 可以控制core文件的文件名中是否添加pid作为扩展。
kernel.core_uses_pid=1

# Do not accept source routing
net.ipv4.conf.default.accept_source_route=0
net.ipv4.conf.all.accept_source_route=0

# Promote secondary addresses when the primary address is removed
net.ipv4.conf.default.promote_secondaries=1
net.ipv4.conf.all.promote_secondaries=1

# Enable hard and soft link protection
fs.protected_hardlinks=1
fs.protected_symlinks=1

# 源路由验证
# see details in https://help.aliyun.com/knowledge_detail/39428.html
net.ipv4.conf.all.rp_filter=0
net.ipv4.conf.default.rp_filter=0
net.ipv4.conf.default.arp_announce = 2
net.ipv4.conf.lo.arp_announce=2
net.ipv4.conf.all.arp_announce=2

# see details in https://help.aliyun.com/knowledge_detail/41334.html
net.ipv4.tcp_max_tw_buckets=5000
net.ipv4.tcp_syncookies=1
net.ipv4.tcp_fin_timeout=30
net.ipv4.tcp_synack_retries=2
kernel.sysrq=1

#redis优化
vm.overcommit_memory = 1

" > /etc/sysctl.conf


#执行sysctl  -p 生效
sysctl  -p 

nproc资源限制

修改添加普通用户资源限制

cat > /etc/security/limits.d/20-nproc.conf <<EOF
# Default limit for number of user's processes to prevent
# accidental fork bombs.
# See rhbz #432903 for reasoning.

*          soft    nproc        65536
*          hard    nproc        65536
*          soft    nofile       65536
*          hard    nofile       65536
root       soft    nproc     unlimited
EOF

在这里插入图片描述

Docker安装

卸载docker

若docker版本非docker-ce版本,需卸载安装docker-ce版本

sudo yum -y remove docker \
              docker-client \
              docker-client-latest \
              docker-common \
              docker-latest \
              docker-latest-logrotate \
              docker-logrotate \
              docker-selinux \
              docker-engine-selinux \
              docker-engine \
              container* \
              docker*
rm -rf /var/lib/docker/ /etc/docker/

安装docker

yum install docker-ce  -y 

二进制安装方式(可选)
针对离线环境无法访问yum仓库,可二进制方式安装

1.下载docker二进制安装包
下载地址>> https://download.docker.com/linux/static/stable/x86_64/

此次下载为: docker-20.10.16.tgz
2. 解压安装包
tar zxf docker-20.10.16.tgz

3. 复制二进制文件到/usr/local/bin目录下
cp docker/*  /usr/local/bin

4. 编写docker启动脚本
vim /usr/lib/systemd/system/docker.service
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
[Service]
OOMScoreAdjust=-1000
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
ExecStart=/usr/local/bin/dockerd
ExecReload=/bin/kill -s HUP $MAINPID
#ubuntu系统注释下面一行
ExecStartPost=/usr/sbin/iptables -P FORWARD ACCEPT
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
 
[Install]
WantedBy=multi-user.target

5. 启动docker并设为开机自启
systemctl daemon-reload
systemctl enable --now docker

docker.service 配置调优

对于 CentOS 系统,docker.service 默认位于/usr/lib/systemd/system/docker.service;对于 Ubuntu 系统,docker.service 默认位于/lib/systemd/system/docker.service。编辑docker.service,添加以下参数。

  • 防止 docker 服务 OOM: OOMScoreAdjust=-1000 (配置文件可能已包含,更改值)
  • 开启 iptables 转发链:

ExecStartPost=/usr/sbin/iptables -P FORWARD ACCEPT

在这里插入图片描述

docker默认root目录

方式一:修改docker.service

如需切换docker默认数据存放目录(默认路径:/var/lib/docker),示例为切换为/data/docker目录,(已启动的docker服务需停止:systemctl stop docker,systemctl stop docker.socket) 编辑docker配置文件,增加 --data-root=/data/docker参数,添加后无需再在daemon.json中配置

vim /usr/lib/systemd/system/docker.service
ExecStart=/usr/bin/dockerd -H fd:// --containerd=/run/containerd/containerd.sock --data-root=/data/docker

在这里插入图片描述

方式二:修改daemon.json配置(推荐方式)

docker 参数调优

指定存储驱动、镜像加速,日志大小及限制配置

mkdir -p  /etc/docker  #已启动过docker服务会自动生成,无需手动创建
cat > /etc/docker/daemon.json <<EOF
{
    "data-root": "/data/docker",
    "log-driver": "json-file",
    "log-opts": { "max-size": "100m", "max-file": "3"},
    "max-concurrent-downloads": 10,
    "max-concurrent-uploads": 10,
    "registry-mirrors": ["https://3284ug2c.mirror.aliyuncs.com"],
    "storage-driver": "overlay2",
    "exec-opts": ["native.cgroupdriver=systemd"]
}
EOF
systemctl daemon-reload && systemctl restart docker && systemctl enable docker

GPU驱动安装

cuda安装

centos7 cuda和runtime安装

(安装cuda可以一并安装nvidia驱动,也可单独下载nvidia驱动安装,cuda可安装多个版本)

查询显卡(可选)

验证系统是否已识别NVIDIA显卡

lspci |grep -i nvidia (若没有lspci命令,执行安装:yum install pciutils -y)

对于能直接显示的显卡:Tesla T4卡

在这里插入图片描述

对于不能直接显示型号的GPU显卡,如图:25b6卡,查询结果为A2/A16卡

查询网址:http://pci-ids.ucw.cz/mods/PC/10de?action=help?help=pci

在这里插入图片描述

在这里插入图片描述

cuda支持列表与GPU 的计算能力

cuda与Driver兼容性列表:https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html

算力查询地址:CUDA GPU | NVIDIA Developer (https://developer.nvidia.com/cuda-gpus)

安装所需的软件

yum install gcc-c++ kernel-devel-$(uname -r) kernel-headers-$(uname -r)  -y

下载cuda

驱动下载官网地址:https://www.nvidia.cn/Download/index.aspx?lang=cn (可选下载)

cuda官网地址:https://developer.nvidia.com/cuda-toolkit-archive

当前下载为官网11.6.0版本,可根据需求安装相应版本

wget https://developer.download.nvidia.com/compute/cuda/11.6.0/local_installers/cuda_11.6.0_510.39.01_linux.run

禁用默认nouveau驱动

编辑dist-blacklist.conf文件在最后添加以下内容

vim /usr/lib/modprobe.d/dist-blacklist.conf

#######禁用默认nouveau###
blacklist nouveau
options nouveau modeset=0
#######################

备份原始内核

mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak

重新生成

dracut /boot/initramfs-$(uname -r).img $(uname -r)

重启操作系统,(确保rpm -qa |grep kernel启动内核和上面安装的kernel-devel和kernel-headers版本号一致)

reboot

查看是否已禁用默认nouveau,无输出表示已禁用

lsmod | grep nouveau

查看系统运行级别

runlevel (若为5图形化模式,请切换至3多用户模式)

systemctl isolate runlevel3.target  #切换到3多用户

systemctl set-default multi-user.target  #runlevel3级别 (可选操作:设置系统开机默认启动级别为多用户模式)

安装cuda

开始安装

sudo sh cuda_11.6.0_510.39.01_linux.run

输入accept后可按空格取消示例demo,向下移动箭头选中Install安装(一般是取消后三项中的“X”)

##############
CUDA Installer                                                               
- [X] Driver                                                                 
    [X] 510.39.01                                                           
+ [X] CUDA Toolkit 11.6                                                      
[ ] CUDA Samples 11.6                                                      
[ ] CUDA Demo Suite 11.6                                                   
[ ] CUDA Documentation 11.6                                                
Options                                                                    
 Install
###################

安装完成

在这里插入图片描述

添加配置变量

根据实际安装版本修改

vim /etc/profile 配置文件末尾添加

# cuda 默认安装路径为/usr/local/cuda-x版本号
export  PATH=/usr/local/cuda-11.6/bin:$PATH
export  LD_LIBRARY_PATH=/usr/local/cuda-11.6/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}

配置生效

source /etc/profile

查看nvidia显卡信息

nvidia-smi 

在这里插入图片描述

查看cuda版本

# nvcc  --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_Mar__8_18:18:20_PST_2022
Cuda compilation tools, release 11.6, V11.6.124
Build cuda_11.6.r11.6/compiler.31057947_0

启用GPU持久模式

提高GPU加载速度

nvidia-smi -pm 1 (也可使用nvidia-persistenced --persistence-mode 命令)仅当前生效

# nvidia-smi  -pm 1
Enabled persistence mode for GPU 00000000:2F:00.0.
All done.

再次查看:Persistence-M 已从Off状态变成On

在这里插入图片描述

配置开机自启持久化模式

cat > /etc/systemd/system/multi-user.target.wants/nvidia-pm.service <<EOF
[Unit]
Description=Set NVIDIA Persistence Mode to Enable
Wants=syslog.target

[Service]
Type=forking
ExecStart=/usr/bin/nvidia-smi -pm 1

[Install]
WantedBy=multi-user.target
EOF

runtime安装

nvidia-container-runtime 下载

nvidia-container-runtime 地址:https://github.com/NVIDIA/nvidia-container-runtime/

nvidia-container-runtime.repo yum源

vim /etc/yum.repos.d/nvidia-container-runtime.repo 

[libnvidia-container]
name=libnvidia-container
baseurl=https://nvidia.github.io/libnvidia-container/stable/centos7/$basearch
gpgcheck=0
enabled=1

安装nvidia-container-runtime

yum install nvidia-container-runtime -y

离线安装(离线环境提前下载的包)

上传runtime目录汇总rpm包

rpm -ivh docker-runtime/*.rpm

配置nvidia-runtime

vim /etc/docker/daemon.json 增添参数

{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    },

完整daemon.json文件内容 建议复制粘贴至GPU服务器执行,一步到位!

#GPU服务器
mkdir -p  /etc/docker
cat > /etc/docker/daemon.json <<EOF
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    },
     "exec-opts": ["native.cgroupdriver=systemd"],
    "log-driver": "json-file",
    "log-opts": { "max-size": "100m", "max-file": "3"},
    "max-concurrent-downloads": 10,
    "max-concurrent-uploads": 10,
    "registry-mirrors": ["https://3284ug2c.mirror.aliyuncs.com"],
    "storage-driver": "overlay2",
    "storage-opts": [
    "overlay2.override_kernel_check=true"
    ]
}
EOF
#重载配置、重启docker服务并设置开机自启动
systemctl daemon-reload && systemctl restart docker && systemctl enable docker

在这里插入图片描述

重载docker 已执行可忽略

systemctl daemon-reload
systemctl restart docker 

查看docker runtime

查看Default Runtime: 为nvidia表示切换成功

docker info

在这里插入图片描述

  • 3
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值