在云服务器上跑代码时，常常担心代码已经跑完了，云平台的服务器却还一直开着计费，尤其是晚上离开实验室回去睡觉、无人看管的时候。
这时就需要一个脚本定时监控云端服务器上的代码是否仍在运行；如果训练已经结束或中途退出，就让服务器自动关机。
以下为 Python 代码实现：
import subprocess
import re
import time
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import os

# pycuda is only used for the informational GPU summary printed at startup;
# the monitoring loop itself relies on nvidia-smi. Guard the import so the
# watchdog still runs on machines where pycuda is not installed.
try:
    import pycuda.driver as cuda
    import pycuda.autoinit  # noqa: F401  (imported for its initialization side effect)
except ImportError:
    cuda = None

# Utilization (percent) below which a GPU is considered idle.
LOW_UTILIZATION_THRESHOLD = 50
# Number of consecutive low-utilization checks that triggers the shutdown.
MAX_LOW_UTILIZATION_CHECKS = 3
# Seconds between two utilization checks (every half minute).
CHECK_INTERVAL_SECONDS = 30
# Seconds to wait for nvidia-smi before treating the query as failed.
NVIDIA_SMI_TIMEOUT_SECONDS = 10

# Counter of consecutive low-utilization checks.
low_utilization_count = 0


def parse_gpu_utilization(output):
    """Parse the text printed by the nvidia-smi utilization query.

    Args:
        output: Decoded command output, one integer percentage per line.

    Returns:
        A list with one integer per non-blank line (empty if there are none).

    Raises:
        ValueError: If a non-blank line is not an integer (e.g. ``[N/A]``).
    """
    return [int(line) for line in output.splitlines() if line.strip()]


def get_gpu_utilization():
    """Query the current utilization of every GPU through nvidia-smi.

    Returns:
        A non-empty list of integer percentages (one per GPU), or ``None``
        when the query fails for any reason: nvidia-smi is missing, exits
        with an error, times out, or prints something that cannot be parsed.
    """
    try:
        output = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'],
            timeout=NVIDIA_SMI_TIMEOUT_SECONDS,
        )
        gpu_utilization = parse_gpu_utilization(output.decode('utf-8'))
    except (subprocess.SubprocessError, OSError, ValueError):
        # SubprocessError: non-zero exit status or timeout.
        # OSError: the nvidia-smi executable could not be started.
        # ValueError: undecodable or non-numeric output.
        return None
    # An empty result carries no information; report it as a failed query so
    # callers can safely index into the returned list.
    return gpu_utilization or None


def get_gpu_info():
    """Print ID, name, total memory (MB) and clock rate (MHz) of each GPU.

    Does nothing except print a notice when pycuda could not be imported.
    """
    if cuda is None:
        print("pycuda is not available; skipping GPU info.")
        return

    for i in range(cuda.Device.count()):
        device = cuda.Device(i)
        gpu_info = {
            "Device ID": i,
            "Name": device.name(),
            "Total Memory": device.total_memory() / (1024 ** 2),  # Convert to MB
            "Clock Rate": device.get_attribute(cuda.device_attribute.CLOCK_RATE) / 1000,  # Convert to MHz
        }
        print("GPU信息:")
        for key, value in gpu_info.items():
            print(f"{key}: {value}")
        print()


def shutdown():
    """Power off the machine using the command suited to the current OS.

    ``os.system`` reports failure through its return value rather than by
    raising, so the command is selected up front instead of relying on an
    exception to fall back from the Windows command to ``poweroff``.
    """
    if os.name == 'nt':
        command = 'shutdown /s /t 1'  # Windows shutdown command
    else:
        command = 'poweroff'  # POSIX shutdown command
    status = os.system(command)
    if status != 0:
        print(f"Shutdown command {command!r} failed with status {status}.")


def monitor():
    """Poll GPU utilization and shut the machine down once it stays low.

    Every ``CHECK_INTERVAL_SECONDS`` seconds the utilization is queried. A
    check counts as "low" when at least one GPU is below
    ``LOW_UTILIZATION_THRESHOLD`` percent. After
    ``MAX_LOW_UTILIZATION_CHECKS`` consecutive low checks the machine is shut
    down and the loop ends. Failed queries are reported and leave the counter
    unchanged.
    """
    global low_utilization_count
    while True:
        gpu_utilization = get_gpu_utilization()
        if gpu_utilization is None:
            print("Failed to retrieve GPU utilization.")
            time.sleep(CHECK_INTERVAL_SECONDS)
            continue

        # NOTE(review): a single idle GPU marks the whole check as low, even
        # if other GPUs are busy -- confirm this is intended on multi-GPU hosts.
        if any(util < LOW_UTILIZATION_THRESHOLD for util in gpu_utilization):
            low_utilization_count += 1
        else:
            low_utilization_count = 0

        if low_utilization_count >= MAX_LOW_UTILIZATION_CHECKS:
            # The message text mirrors the default threshold and check count.
            message = "连续三次检测到 GPU 占用率低于 50%,将执行关机操作。"
            print(message)
            time.sleep(3)
            shutdown()
            break

        # Only reached with a valid reading, so indexing is safe here.
        print(fr'gpu使用率为:{gpu_utilization[0]}%')
        time.sleep(CHECK_INTERVAL_SECONDS)  # Check once every half minute


def main():
    """Print the GPU summary once, then start the monitoring loop."""
    get_gpu_info()
    monitor()


if __name__ == "__main__":
    main()