背景
实验室共用一台服务器,常常不知道到底是谁占用了GPU。如果通过传统方式查询,需要先输入“nvidia-smi”查到进程的pid,然后通过pid查询对应的docker的scope(其中包含容器ID),再通过容器ID查到对应的docker名字,才能知道到底是谁在用对应的GPU,例如:
(base) server05@ps:~$ cat /proc/1877323/cgroup
0::/system.slice/docker-96b512ca1352e0df092a644b33ea640723befaf7cd980507a35f35e335740089.scope
(base) server05@ps:~$ docker inspect --format '{{.Name}}' "96b512ca1352e0df092a644b33ea640723befaf7cd980507a35f35e335740089"|sed 's/^\///'
docker_name
新的改变
现在只需要创建一个.py文件,把下面的代码粘贴进去,然后直接用python运行该脚本即可(脚本依赖psutil,需要先安装)
脚本分享如下
import subprocess
import xml.etree.ElementTree as ET
import psutil
import re
def get_docker_name_by_pid(pid):
    """Return the name of the Docker container that owns process *pid*.

    The container id is extracted from ``/proc/<pid>/cgroup`` and resolved
    to a container name with ``docker inspect``.

    Args:
        pid: Process id as seen from the host.

    Returns:
        The container name without its leading ``/``. ``None`` when no
        cgroup line references a Docker container. When the cgroup file
        cannot be read or ``docker inspect`` fails, the text of the raised
        exception is returned instead (best-effort behaviour that existing
        callers print as-is).
    """
    # Recognise both cgroup layouts Docker produces:
    #   systemd driver : .../docker-<id>.scope
    #   cgroupfs driver: .../docker/<id>
    container_pattern = re.compile(
        r"docker-([a-f0-9]+)\.scope|/docker/([a-f0-9]+)"
    )
    try:
        container_id = None
        with open(f"/proc/{pid}/cgroup") as cgroup_file:
            for line in cgroup_file:
                matched = container_pattern.search(line)
                if matched:
                    container_id = matched.group(1) or matched.group(2)
                    break
        if container_id is None:
            return None
        # Argument list instead of a shell pipeline: nothing is interpreted
        # by a shell, and a failing `docker inspect` raises instead of being
        # hidden behind the exit status of a trailing `sed`.
        output = subprocess.check_output(
            ["docker", "inspect", "--format", "{{.Name}}", container_id]
        )
        docker_name = output.decode('utf-8').strip()
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        return str(e)
    # `docker inspect` reports names as "/name"; drop that single slash.
    if docker_name.startswith("/"):
        docker_name = docker_name[1:]
    return docker_name
def get_gpu_info():
    """Print which process and Docker container is using each GPU.

    Runs ``nvidia-smi -q -x``, parses the XML report, groups the GPUs by
    the PID of the processes listed on them and prints one line per PID::

        PID: <pid>, GPUs: GPU <n>[, GPU <m>...], Docker: <container name>

    The Docker field is whatever ``get_docker_name_by_pid`` returns, or
    ``"N/A"`` when the PID is not numeric or psutil reports the process as
    gone, inaccessible or a zombie.

    Returns:
        None. Any failure (``nvidia-smi`` missing, malformed XML, ...) is
        reported as ``Error: <message>`` on stdout instead of being raised.
    """
    try:
        # Fixed command, so pass an argument list and skip the shell.
        raw_report = subprocess.check_output(["nvidia-smi", "-q", "-x"])
        root = ET.fromstring(raw_report.decode('utf-8'))

        # pid -> {"gpus": [labels...], "docker": container name}
        pid_info_dict = {}
        for gpu in root.findall('.//gpu'):
            gpu_id = gpu.findtext('minor_number', default="N/A")
            gpu_label = f"GPU {gpu_id}"
            for process in gpu.findall('.//process_info'):
                pid = process.findtext('pid')
                if not pid:
                    # Entry without a PID: nothing to attribute it to.
                    continue

                known = pid_info_dict.get(pid)
                if known is not None:
                    # Container already resolved for this PID; avoid
                    # spawning another `docker inspect`.
                    known["gpus"].append(gpu_label)
                    continue

                try:
                    # Constructing the Process raises if the PID has
                    # vanished since nvidia-smi took its snapshot.
                    psutil.Process(int(pid))
                    docker_name = get_docker_name_by_pid(int(pid))
                except (ValueError, psutil.NoSuchProcess,
                        psutil.AccessDenied, psutil.ZombieProcess):
                    docker_name = "N/A"

                pid_info_dict[pid] = {"gpus": [gpu_label], "docker": docker_name}

        for pid, info in pid_info_dict.items():
            gpus = ', '.join(info["gpus"])
            docker_info = info["docker"]
            print(f"PID: {pid}, GPUs: {gpus}, Docker: {docker_info}")
    except Exception as e:
        # Top-level boundary of the script: report and return, never crash.
        print(f"Error: {e}")
# Script entry point: print the GPU / PID / Docker report when this file is
# executed directly (nothing runs on import).
if __name__ == "__main__":
    get_gpu_info()
参考文献:
[1]: https://blog.csdn.net/qq_50757624/article/details/128156608