我们在模型训练过程中,有时会把 CPU、GPU 资源跑满,导致训练进程中断,非常痛苦。因此我们需要一个工具来监测当前进程的 CPU、GPU 占用情况,并用日志记录下来。
实现的效果如下
话不多说,直接上代码:
import logging
import time
from datetime import datetime as dt
from threading import Thread

import psutil
import torch
# Where resource-usage samples are appended.
log_file_path = '/home/cyy/内存占用情况/xx.log'

# One record per line, each prefixed with a timestamp.
logging.basicConfig(
    filename=log_file_path,
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

# Wall-clock start of the run; used later to report total run time.
start_time = dt.now()
def log_process_usage(pid):
    """Periodically log CPU, RAM and GPU memory usage of process *pid*.

    Intended to run forever on a daemon thread: takes an initial sample
    after a 30 s warm-up (so the GPU has started being used), then one
    sample every 10 minutes. Records go to the module-level logging config.

    Args:
        pid: OS process id to monitor (e.g. ``psutil.Process().pid``).
    """
    process = psutil.Process(pid)

    def _sample():
        """Return (cpu %, resident MB, gpu allocated MB, gpu reserved MB)."""
        cpu_usage = process.cpu_percent(interval=1)
        memory_usage = process.memory_info().rss / (1024 ** 2)  # bytes -> MB
        if torch.cuda.is_available():
            # NOTE(review): samples device 0 only — confirm for multi-GPU runs.
            gpu_memory_allocated = torch.cuda.memory_allocated(0) / (1024 ** 2)
            gpu_memory_reserved = torch.cuda.memory_reserved(0) / (1024 ** 2)
        else:
            gpu_memory_allocated = 0
            gpu_memory_reserved = 0
        return cpu_usage, memory_usage, gpu_memory_allocated, gpu_memory_reserved

    # Delay the first sample so the GPU is actually in use by then.
    time.sleep(30)
    cpu, mem, alloc, reserved = _sample()
    # Bug fix: the original logged the hard-coded text "1.11MBMB" instead of
    # the measured memory usage.
    logging.info(
        f"Initial CPU Usage: {cpu}%, Memory Usage: {mem:.2f}MB, "
        f"GPU Memory Allocated: {alloc}MB, GPU Memory Reserved: {reserved}MB"
    )

    while True:
        # Record a sample every 10 minutes.
        time.sleep(600)
        cpu, mem, alloc, reserved = _sample()
        logging.info(
            f"CPU Usage: {cpu}%, Memory Usage: {mem:.2f}MB, "
            f"GPU Memory Allocated: {alloc}MB, GPU Memory Reserved: {reserved}MB"
        )
对于进程运行时间,采用下面的方式进行记录:
# Report how long the whole run took, in fractional hours.
end_time = dt.now()
total_seconds = (end_time - start_time).total_seconds()
total_hours = total_seconds / 3600
logging.info(f"Total run time: {total_hours:.6f} hours")
最后,在主函数(训练入口)中启动监控线程:
# Launch the sampler on a daemon thread so it never blocks interpreter exit.
monitor_thread = Thread(
    target=log_process_usage,
    args=(psutil.Process().pid,),
    daemon=True,
)
monitor_thread.start()