GPU 排队程序:检测显卡是否空闲,空闲时自动运行指定命令。
本程序的特点是:只要检测到部分显卡空闲,就可以仅使用这些空闲显卡直接运行程序,更加实用。
测试所用 GPU 为 3090,不同型号的显卡可能略有差别。
import os
import sys
import time
from IPython import embed
# Device-selection prefix listing all eight GPUs.
# NOTE(review): no function visible in this file reads this module-level
# value; narrow_setup builds its own device string from the idle GPUs.
CUDA_cmd = 'CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7'
# Shell command that narrow_setup hands to os.system once GPUs are free.
# The value here looks like a placeholder — presumably replaced by the real job.
cmd = 'echo hello world'
# Default number of GPUs inspected by gpu_info and narrow_setup.
NUM_GPU = 8
def gpu_info(num_gpu=NUM_GPU):
    """Return the power draw and used memory of the first ``num_gpu`` GPUs.

    The values come from ``nvidia-smi --query-gpu`` in CSV form rather than
    from slicing the human-readable table, so the parsing does not depend on
    column widths or on the table layout of a particular driver version.

    Args:
        num_gpu: Number of GPUs to report, starting at GPU index 0.

    Returns:
        A ``(power, memory)`` tuple of two lists with ``num_gpu`` ints each:
        ``power[i]`` is the power draw of GPU ``i`` in watts (fraction
        truncated) and ``memory[i]`` is its used memory in MiB.

    Raises:
        RuntimeError: If nvidia-smi reports fewer than ``num_gpu`` GPUs.
        OSError, subprocess.CalledProcessError: If nvidia-smi cannot be run.
        ValueError: If a reported field is not numeric.
    """
    # Local import so the block brings its own dependency into scope.
    import subprocess

    output = subprocess.check_output(
        [
            'nvidia-smi',
            '--query-gpu=power.draw,memory.used',
            '--format=csv,noheader,nounits',
        ],
        universal_newlines=True,
    )
    rows = [line for line in output.splitlines() if line.strip()]
    if len(rows) < num_gpu:
        raise RuntimeError(
            'nvidia-smi reported %d GPU(s), but %d were requested'
            % (len(rows), num_gpu)
        )

    power = []
    memory = []
    for row in rows[:num_gpu]:
        power_field, memory_field = (field.strip() for field in row.split(','))
        # power.draw is reported with decimals (e.g. "20.45"); keep whole watts
        # so callers continue to receive ints.
        power.append(int(float(power_field)))
        memory.append(int(memory_field))
    return power, memory
def _gpu_is_idle(power, memory, max_power=30, max_memory=1000):
    """Return True when a GPU draws < ``max_power`` W and uses < ``max_memory`` MiB."""
    return memory < max_memory and power < max_power


def _wait_for_idle_gpus(interval, num_gpu, min_idle_gpus):
    """Poll the GPUs until at least ``min_idle_gpus`` are idle; return their indices."""
    k = 0
    while True:
        available_gpu = []
        gpu_power_list, gpu_memory_list = gpu_info(num_gpu)
        readings = zip(gpu_power_list, gpu_memory_list)
        for j, (power, memory) in enumerate(readings):
            if _gpu_is_idle(power, memory):
                print('gpu %d is empty' % j)
                available_gpu.append(j)
            # Redraw a one-line status for GPU j; '\r' overwrites the
            # previous status line instead of scrolling the terminal.
            k = k % 5
            symbol = 'monitoring: ' + '>' * k + ' ' * (10 - k - 1) + '|'
            gpu_power_str = f'gpu {j} power:{power} W |'
            gpu_memory_str = f'gpu {j} memory:{memory} MiB |'
            sys.stdout.write('\r' + gpu_memory_str + ' ' + gpu_power_str + ' ' + symbol)
            sys.stdout.flush()
            time.sleep(interval)
            k += 1
        if len(available_gpu) >= min_idle_gpus:
            return available_gpu


def narrow_setup(interval=2, num_gpu=NUM_GPU):
    """Wait until at least two GPUs are idle, then run ``cmd`` on them.

    A GPU counts as idle when it uses less than 1000 MiB of memory and draws
    less than 30 W. The idle GPUs are exported as ``CUDA_VISIBLE_DEVICES`` in
    the same shell that runs the module-level ``cmd``.

    Fixes relative to the previous version:
      * ``num_gpu`` is now forwarded to ``gpu_info`` instead of being ignored.
      * The device list and ``cmd`` run in ONE shell. Two separate
        ``os.system`` calls start two shells, so the variable set by the
        first call never reached the command.
      * When enough GPUs are already idle on the first check, ``cmd`` is
        launched instead of returning without running anything.

    Args:
        interval: Seconds to pause after printing each GPU's status line
            while waiting; one full polling pass therefore takes about
            ``num_gpu * interval`` seconds.
        num_gpu: Number of GPUs (indices ``0 .. num_gpu - 1``) to inspect.
    """
    min_idle_gpus = 2

    # check if there is empty gpu initially
    gpu_power_list, gpu_memory_list = gpu_info(num_gpu)
    empty_gpu = []
    for i, (power, memory) in enumerate(zip(gpu_power_list, gpu_memory_list)):
        if _gpu_is_idle(power, memory):
            print('gpu %d is empty' % i)
            empty_gpu.append(i)

    if len(empty_gpu) >= min_idle_gpus:
        print('gpu', empty_gpu, 'is empty, ready to run')
        available_gpu = empty_gpu
    else:
        print('no enough empty gpus now')
        available_gpu = _wait_for_idle_gpus(interval, num_gpu, min_idle_gpus)

    cuda_env = generate_cuda_visible_devices_string(available_gpu)
    print('\n' + cmd)
    # `export` makes the variable visible to every command in `cmd`, even if
    # `cmd` is a compound command such as "cd x && python train.py".
    os.system('export ' + cuda_env + ' && ' + cmd)
def generate_cuda_visible_devices_string(int_list):
    """Build a ``CUDA_VISIBLE_DEVICES=...`` assignment for the given GPU indices.

    Args:
        int_list: Iterable of GPU indices; each element is converted with
            ``str`` and the results are joined with commas.

    Returns:
        The string ``'CUDA_VISIBLE_DEVICES='`` followed by the comma-separated
        indices, e.g. ``'CUDA_VISIBLE_DEVICES=0,3'``. An empty iterable gives
        ``'CUDA_VISIBLE_DEVICES='``.
    """
    device_ids = ','.join(str(index) for index in int_list)
    return 'CUDA_VISIBLE_DEVICES=' + device_ids
if __name__ == '__main__':
    # Script entry point: wait for idle GPUs using the default settings.
    narrow_setup()
    # Debug aid: call gpu_info() here instead to inspect the raw readings.
    # gpu_info()
或者使用下面的 bash 脚本运行:
#!/bin/bash
# Poll gpustat until the target GPU is free, then start the training script.
#
# NOTE(review): field 11 of a gpustat GPU row is taken to be the used memory;
# the field position depends on how many words the GPU name has, so confirm
# it against the gpustat output on your machine.

mem_threshold=100   # a GPU counts as free when its value is below this
target_gpu=0        # index of the GPU you want to use

while true
do
    # Take ONE gpustat snapshot per iteration so that all readings are
    # consistent with each other. Rows 2-9 are the first eight GPU rows
    # (row 1 is skipped, presumably the gpustat header line).
    mapfile -t stat_arr < <(gpustat | awk 'NR >= 2 && NR <= 9 {print $11}')
    echo 'GPU显存占用情况:' "${stat_arr[@]}"

    gpu_available=0
    gpu_available_index_arr=()
    # Collect the number of free GPUs and their indices.
    for i in "${!stat_arr[@]}"
    do
        # Ignore rows whose field is not a plain integer, so that a short or
        # malformed row neither breaks the comparison nor shifts the indices.
        if [[ ${stat_arr[$i]} =~ ^[0-9]+$ ]] && [ "${stat_arr[$i]}" -lt "$mem_threshold" ]
        then
            gpu_available=$((gpu_available + 1))
            gpu_available_index_arr+=("$i")
        fi
    done
    echo "-可用GPU数:${gpu_available}, 第${gpu_available_index_arr[*]}块GPU可用"

    sleep 10
    # The decision below uses the snapshot taken before the 10-second pause.
    target_mem=${stat_arr[$target_gpu]:-}
    if [[ $target_mem =~ ^[0-9]+$ ]] && [ "$target_mem" -lt "$mem_threshold" ]
    then
        echo 'start running my code...'
        bash scripts/train_gpus.sh
        break
    fi
    sleep 30
done