说明
为了提高GPU的利用率,该脚本会定期检测GPU显存,一旦显存占用被释放、空闲显存足够时就自动运行下一个程序,真的牛!感谢mz大佬的分享,我只是一个无情的搬运工😀
import os
import time
import pynvml
pynvml.nvmlInit() # Initialize NVML
def watch_GPU(GPU_free=0.):
    """Print a status report for every GPU and total up the free memory.

    For each device reported by NVML this prints the device name, its
    total / free / used memory in GiB, the GPU temperature, the fan speed
    and the power state.

    Args:
        GPU_free: Starting value, in MiB, to which the free memory of every
            device is added. Defaults to ``0.``.

    Returns:
        ``GPU_free`` plus the free memory of all devices combined, in MiB.
        Note that this is a sum over devices, not a per-device figure.
    """
    # NVML reports memory sizes in bytes.
    bytes_per_mib = 1024 ** 2
    bytes_per_gib = 1024 ** 3

    # Device overview
    device_count = pynvml.nvmlDeviceGetCount()
    print('显卡数量:', device_count)

    for index in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)

        # Older pynvml releases return the name as bytes; decode it so the
        # report does not show a b'...' literal.
        name = pynvml.nvmlDeviceGetName(handle)
        if isinstance(name, bytes):
            name = name.decode('utf-8', errors='replace')
        print('GPU %d is :%s' % (index, name))

        # Memory usage. Everything labelled "G" is divided by 1024 ** 3 so
        # the figures are true GiB (the mix of 1024 and 1000 overstated them).
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print("GPU %d Memory Total: %.4f G" % (index, memory.total / bytes_per_gib))
        GPU_free += memory.free / bytes_per_mib
        print("GPU %d Memory Free: %.4f G" % (index, memory.free / bytes_per_gib))
        print("GPU %d Memory Used: %.4f G" % (index, memory.used / bytes_per_gib))

        # Temperature of the GPU die
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        print("Temperature is %.1f C" % (temperature))

        # Fan speed. Passively cooled boards have no fan and NVML raises for
        # them; report that instead of aborting the whole watcher.
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError as err:
            fan_speed = 'N/A (%s)' % err
        print("Fan speed is ", fan_speed)

        # Power state
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        print("Power ststus", power_state)

    return GPU_free
def work_v2():
    """Run the queued training jobs one after another.

    Each job launches ``tools/train_net.py`` with one of the VeRi config
    files and blocks until that process finishes. A job that cannot be
    started, or that exits with a non-zero status, is reported on stdout and
    the queue moves on to the next job.
    """
    # Imported here so the function does not rely on a file-level import.
    import subprocess

    config_files = (
        "./configs/VeRi/sbs_R50-ibn.yml",
        "./configs/VeRi/bagtricks.yml",
        "./configs/VeRi/MGN.yml",
        "./configs/VeRi/AGW.yml",
    )

    for config_file in config_files:
        # An argument list avoids going through a shell.
        command = ["python3", "tools/train_net.py", "--config-file", config_file]
        try:
            return_code = subprocess.run(command).returncode
        except OSError as err:
            # e.g. python3 is not on PATH; keep going with the other jobs.
            print('Could not start "%s": %s' % (' '.join(command), err))
            continue
        if return_code != 0:
            print('"%s" exited with status %d' % (' '.join(command), return_code))
# Combined free GPU memory (the value returned by watch_GPU) that must be
# exceeded before the jobs are started.
MIN_FREE_MIB = 5300
# Seconds to wait between two GPU checks.
POLL_INTERVAL_SECONDS = 900

try:
    while True:
        if watch_GPU() > MIN_FREE_MIB:
            print(' ****** GPU is satisfied for our work. Get ready to start! ******')
            work_v2()
            # Stop the script once the jobs have run. SystemExit is raised
            # directly because exit() only exists when the site module is
            # loaded.
            raise SystemExit
        print('waiting for next time to detect ……')
        time.sleep(POLL_INTERVAL_SECONDS)
finally:
    # Release NVML however the loop ends: jobs finished, a job raised, or
    # the wait was interrupted (e.g. Ctrl-C).
    pynvml.nvmlShutdown()
安装pynvml包时,请使用 conda install pynvml 进行安装,
因为使用 pip install pynvml 的话,会报“没有匹配的版本”的错误。