k8s集群纳管GPU节点
k8s版本: v1.26.9
gpu节点操作系统: Ubuntu 20.04
gpu节点containerd版本: 1.7.2
# Install the NVIDIA 550.54.15 driver from NVIDIA's local-repo .deb package.
# NOTE(review): the local-repo package usually prints a
# `cp /var/nvidia-driver-local-repo-*/nvidia-driver-local-*-keyring.gpg /usr/share/keyrings/`
# command after it is installed; run it if `apt update` reports the repo as unsigned — confirm.
wget https://cn.download.nvidia.cn/tesla/550.54.15/nvidia-driver-local-repo-ubuntu2004-550.54.15_1.0-1_amd64.deb
apt install -y ./nvidia-driver-local-repo-ubuntu2004-550.54.15_1.0-1_amd64.deb
apt update
apt install -y nvidia-driver-550
# Refresh the package index once more, then install the containerd runtime.
apt update
apt install -y containerd
# Add NVIDIA's libnvidia-container apt repository (its GPG key is stored in a
# dedicated keyring and referenced via signed-by), then install the
# NVIDIA Container Toolkit.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
# Configure containerd to use the NVIDIA runtime. The flag must be written as
# --runtime=containerd with no spaces around "=", otherwise the value is not
# bound to the flag.
sudo nvidia-ctk runtime configure --runtime=containerd
# Restart containerd so it picks up the updated configuration.
sudo systemctl restart containerd
# Create a new bootstrap token and print the matching `kubeadm join` command
# (presumably run on an existing control-plane node — confirm).
kubeadm token create --print-join-command
# Join the GPU node to the cluster. Replace the xxx placeholders with the
# API server address, token and CA certificate hash printed by the command above.
kubeadm join xxx:6443 --token xxx.xxx --discovery-token-ca-cert-hash sha256:xxx
# Add the NVIDIA GPU Operator chart repository and refresh the local chart index.
helm repo add nvidia https://nvidia.github.io/gpu-operator \
  && helm repo update
# Install the GPU Operator into its own namespace. Every continued line needs a
# trailing backslash, and --set values must not have a space after "=".
# The node-feature-discovery image is pulled from the k8s.dockerproxy.com mirror.
helm install --wait --generate-name \
  -n gpu-operator \
  --create-namespace \
  nvidia/gpu-operator \
  --set node-feature-discovery.image.repository=k8s.dockerproxy.com/nfd/node-feature-discovery
启动测试 Pod:设置环境变量 NVIDIA_DISABLE_REQUIRE=true 可以跳过镜像对 CUDA 版本的要求检查,让基于较高 CUDA 版本的镜像也能运行在仅支持较低 CUDA 版本的节点上。
# Test Pod that requests one NVIDIA GPU and then idles.
apiVersion: v1
kind: Pod
metadata:
  name: test
  labels:
    app: test
spec:
  containers:
    - name: test
      # The image reference must not contain spaces.
      image: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
      imagePullPolicy: IfNotPresent
      env:
        # Quoted so the value is a string, not a YAML boolean.
        - name: NVIDIA_DISABLE_REQUIRE
          value: "true"
      # Keep the container alive indefinitely.
      command: ['sh', '-c', 'tail -f /dev/null']
      resources:
        limits:
          nvidia.com/gpu: 1