import copy
import functools
import os
import threading
import time

import pynvml
def monitor_gpu_usage(func):
    """Decorator that reports this process's GPU memory usage around *func*.

    Before the wrapped function runs, the current per-GPU memory usage of
    this process is printed. While the function executes, a background
    thread samples usage roughly every 0.1 s; when it returns, the average
    usage attributable to the function (pre-run baseline subtracted) and
    the elapsed wall time are printed.

    Requires the ``pynvml`` package and a working NVIDIA driver.

    :param func: function to monitor
    :return: wrapped function with identical signature and return value
    """

    @functools.wraps(func)  # preserve func's name/docstring on the wrapper
    def wrapper(*args, **kwargs):
        pynvml.nvmlInit()
        current_pid = os.getpid()

        def get_gpu_usage() -> dict:
            """Return ``{gpu_index: [bytes used by this process, total GiB]}``.

            Every visible GPU gets an entry; GPUs this process does not
            touch report 0 bytes used.
            """
            used = {}
            gpu_count = pynvml.nvmlDeviceGetCount()
            for i in range(gpu_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                info = pynvml.nvmlDeviceGetMemoryInfo(handle)  # sizes in bytes
                total = info.total
                processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for proc in processes:
                    if proc.pid == current_pid:
                        # NOTE(review): usedGpuMemory can be None when the
                        # driver cannot attribute memory (e.g. WSL) — treat
                        # that as 0 rather than crashing the arithmetic below.
                        used[i] = [proc.usedGpuMemory or 0, total / (1024 ** 3)]
                if not used.get(i):
                    used[i] = [0, total / (1024 ** 3)]
            return used

        def print_usage(used_gpu: list, base: dict = None):
            """Print per-GPU usage averaged over the samples in *used_gpu*.

            :param used_gpu: list of snapshots from :func:`get_gpu_usage`
            :param base: snapshot taken before the call; its per-GPU usage
                is subtracted so only the function's own footprint shows
            """
            if not used_gpu:
                # Function finished before the first sample was taken.
                print(' 0')
                return
            used_gpu_avg = copy.deepcopy(used_gpu[0])
            for i in used_gpu_avg:
                used_gpu_avg[i][0] = 0
            if base is None or len(base) == 0:
                base = copy.deepcopy(used_gpu_avg)  # zeroed baseline
            # Accumulate usage above the baseline across all samples.
            for item in used_gpu:
                if len(item) > 0:
                    for i, _ in item.items():
                        if item[i][0] >= base[i][0]:
                            used_gpu_avg[i][0] += item[i][0] - base[i][0]
            # Average over samples and convert bytes -> GiB.
            for i in used_gpu_avg:
                used_gpu_avg[i][0] = used_gpu_avg[i][0] / len(used_gpu) / (1024 ** 3)
            no_use = True
            for i, (used_gb, total_gb) in used_gpu_avg.items():
                if used_gb > 0:
                    no_use = False
                    # ':.4%' renders the ratio as a true percentage; the
                    # original printed the raw fraction labeled '%'.
                    print(f"\nGPU {i} used {used_gb:.4f} GB of {total_gb:.4f} GB "
                          f"as {used_gb / total_gb:.4%}", end="")
            if no_use:
                print(' 0')
            else:
                print()

        # Baseline: memory this process already holds before func runs.
        baseline = get_gpu_usage()
        print(f"Process {current_pid} Function {func.__name__} before run with GPU usage :", end="")
        print_usage([baseline])

        usages = []
        done = threading.Event()

        def monitor_usage():
            """Sample GPU usage roughly every 0.1 s until signalled to stop."""
            while not done.is_set():
                time.sleep(0.005)
                usages.append(get_gpu_usage())
                time.sleep(0.094)

        monitor_thread = threading.Thread(target=monitor_usage)
        monitor_thread.start()
        start_time = time.time()
        try:
            result = func(*args, **kwargs)
        finally:
            end_time = time.time()
            # Always stop the (non-daemon) monitor thread, even on error —
            # otherwise an exception in func would hang the process forever.
            done.set()
            monitor_thread.join()
            pynvml.nvmlShutdown()
        total_time = end_time - start_time
        print(f"Process {current_pid} Function {func.__name__} ran for {total_time:.4f} "
              f"seconds with average GPU usage :", end="")
        print_usage(usages, baseline)
        return result

    return wrapper
if __name__ == '__main__':
    # Demo: measure this process's GPU usage for a small torch workload.
    # (The redundant self-import of monitor_gpu_usage was removed — the
    # decorator is defined above in this file.)
    import multiprocessing

    import torch

    @monitor_gpu_usage
    def gpu_task(gpu_id):
        """Run a small elementwise workload on the given GPU."""
        torch.cuda.set_device(gpu_id)
        # Allocate a large random matrix on the selected device.
        a = torch.randn([10000, 10000]).cuda()
        # Do some compute so the memory stays resident for a while.
        for _ in range(10):
            a = (a * a).sum()

    # Single-process usage — uncomment to run sequentially:
    # gpu_task(0)  # run on GPU 0
    # gpu_task(1)  # run on GPU 1
    # Example output:
    #   Process 3115658 Function gpu_task before run with GPU usage : 0
    #   Process 3115658 Function gpu_task ran for 2.0419 seconds with average GPU usage :
    #   GPU 0 used 0.2639 GB of 24.0000 GB ...

    # Multi-process demo: one process per GPU.
    # NOTE(review): relies on the 'fork' start method — under 'spawn',
    # gpu_task (defined inside this guard) cannot be pickled; verify on
    # non-Linux platforms.
    p1 = multiprocessing.Process(target=gpu_task, args=(0,))  # GPU 0
    p2 = multiprocessing.Process(target=gpu_task, args=(1,))  # GPU 1
    p1.start()
    p2.start()
    p1.join()
    p2.join()