实时监控 Python 进程中某个函数使用 NVIDIA GPU 情况的 Python 代码

本文链接：https://blog.csdn.net/qq_33843237/article/details/139033196
实时监控 Python 进程中某个函数使用 NVIDIA GPU 情况的 Python 代码

代码

import copy
import functools
import os
import threading
import time

import pynvml


def monitor_gpu_usage(func):
    """
    Decorator that reports how much GPU memory the current process uses while ``func`` runs.

    What a decorated call does:

    1. Initialises NVML, samples the GPU memory already held by this process on
       every GPU NVML reports, and prints it (the "before run" baseline).
    2. Starts a background thread that samples the per-GPU memory of this
       process roughly every 0.1 seconds while ``func`` executes.
    3. When ``func`` finishes, stops the sampler and shuts NVML down again. If
       ``func`` returned normally, prints the run time and, for every GPU, the
       average memory used above the baseline. If ``func`` raised, the
       exception propagates unchanged after the clean-up.

    Usage::

        @monitor_gpu_usage
        def gpu_task(gpu_id):
            torch.cuda.set_device(gpu_id)
            a = torch.randn([10000, 10000]).cuda()
            for _ in range(10):
                a = (a * a).sum()

    Note: the decorator relies on the ``pynvml`` package, which must be installed.

    :param func: the function to decorate
    :return: a wrapper that calls ``func`` with the same arguments and returns its result
    """

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the decorated function
    def wrapper(*args, **kwargs):
        current_pid = os.getpid()

        def get_gpu_usage():
            """Return ``{gpu_index: [bytes used by this process, total GiB]}`` for every GPU."""
            usage = {}
            for index in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                total_gb = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 ** 3)
                # Default to "nothing used"; overwritten below if this process is listed.
                usage[index] = [0, total_gb]
                for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                    if proc.pid == current_pid:
                        # usedGpuMemory can be None when the driver cannot report it;
                        # treat that as 0 so the arithmetic below stays valid.
                        usage[index] = [proc.usedGpuMemory or 0, total_gb]
            return usage

        def print_usage(used_gpu: list, base: dict = None):
            """
            Print the average per-GPU memory use of the given samples.

            :param used_gpu: list of samples as returned by ``get_gpu_usage``
            :param base: usage measured before the call; subtracted from every
                sample. When missing or empty, a baseline of zero is used.
            :return: None
            """
            if not used_gpu:
                # No samples at all: nothing to average.
                print(' 0')
                return

            # Accumulator with the same keys/totals as a sample, usage reset to 0.
            averages = copy.deepcopy(used_gpu[0])
            for index in averages:
                averages[index][0] = 0

            if not base:
                base = copy.deepcopy(averages)

            # Sum up the usage above the baseline; samples below it contribute 0.
            for sample in used_gpu:
                for index, (used_bytes, _total_gb) in sample.items():
                    base_bytes = base[index][0]
                    if used_bytes >= base_bytes:
                        averages[index][0] += used_bytes - base_bytes

            # Mean over all samples, converted from bytes to GiB.
            for index in averages:
                averages[index][0] = averages[index][0] / len(used_gpu) / (1024 ** 3)

            no_use = True
            for index, (used_gb, total_gb) in averages.items():
                if used_gb > 0:
                    no_use = False
                    # Multiply by 100 so the number really is a percentage.
                    print(
                        f"\nGPU {index} used {used_gb:.4f} GB of {total_gb:.4f} GB "
                        f"as {used_gb / total_gb * 100:.4f}%", end="")
            if no_use:
                print(' 0')
            else:
                print()

        pynvml.nvmlInit()
        try:
            # Baseline: memory this process already held before func started.
            baseline = get_gpu_usage()
            print(f"Process {current_pid} Function {func.__name__} before run with GPU usage :", end="")
            print_usage([baseline])

            # Samples collected while func is running.
            usages = []
            stop_event = threading.Event()

            def monitor_usage():
                # Short delay, sample, then wait out the rest of the ~0.1 s period.
                # Event.wait (instead of sleep) lets the thread stop promptly.
                while not stop_event.is_set():
                    stop_event.wait(0.005)
                    usages.append(get_gpu_usage())
                    stop_event.wait(0.094)

            monitor_thread = threading.Thread(target=monitor_usage)
            monitor_thread.start()

            start_time = time.time()
            try:
                result = func(*args, **kwargs)
            finally:
                # Always stop the sampler, even when func raises; otherwise the
                # non-daemon thread would keep the process alive forever.
                end_time = time.time()
                stop_event.set()
                monitor_thread.join()
            total_time = end_time - start_time

            if not usages:
                # func finished before the sampler took its first sample.
                usages.append(get_gpu_usage())

            print(
                f"Process {current_pid} Function {func.__name__} ran for {total_time:.4f} seconds with average GPU usage :",
                end="")
            print_usage(usages, baseline)
            return result
        finally:
            # Balance the nvmlInit() above.
            pynvml.nvmlShutdown()

    return wrapper


if __name__ == '__main__':
    # Demo: measure the GPU memory use of one function with the decorator.
    import multiprocessing

    import torch
    # NOTE(review): this re-imports the decorator from a module named
    # ``get_function_gpu_usage`` - presumably this very file; confirm the
    # filename matches before running the demo.
    from get_function_gpu_usage import monitor_gpu_usage


    @monitor_gpu_usage
    def gpu_task(gpu_id):
        """Put a large random matrix on GPU ``gpu_id`` and run a few reductions on it."""
        # Select the device to work on.
        torch.cuda.set_device(gpu_id)

        # Create a large random matrix and move it to the GPU.
        tensor = torch.randn([10000, 10000]).cuda()

        # Do some computation.
        for _ in range(10):
            tensor = (tensor * tensor).sum()


    # Variant 1: call the function directly, one GPU after the other.
    #
    # Example output (illustrative; actual numbers depend on the machine):
    #   Process 3115658 Function gpu_task before run with GPU usage : 0
    #   Process 3115658 Function gpu_task ran for 2.0419 seconds with average GPU usage :
    #   GPU 0 used 0.2639 GB of 24.0000 GB as 0.0110%
    #   Process 3115658 Function gpu_task before run with GPU usage :
    #   GPU 0 used 1.0938 GB of 24.0000 GB as 0.0456%
    #   Process 3115658 Function gpu_task ran for 1.3439 seconds with average GPU usage :
    #   GPU 1 used 0.1611 GB of 24.0000 GB as 0.0067%
    #
    # gpu_task(0)  # run on GPU 0
    # gpu_task(1)  # run on GPU 1

    # Variant 2: one process per GPU.
    #
    # Example output (illustrative; actual numbers depend on the machine):
    #   Process 3120099 Function gpu_task before run with GPU usage : 0
    #   Process 3120100 Function gpu_task before run with GPU usage : 0
    #   Process 3120099 Function gpu_task ran for 1.8187 seconds with average GPU usage :
    #   GPU 0 used 0.2004 GB of 24.0000 GB as 0.0084%
    #   Process 3120100 Function gpu_task ran for 1.9047 seconds with average GPU usage :
    #   GPU 1 used 0.2070 GB of 24.0000 GB as 0.0086%
    workers = [
        multiprocessing.Process(target=gpu_task, args=(device,))
        for device in (0, 1)  # GPU 0 and GPU 1
    ]

    # Start every process first ...
    for worker in workers:
        worker.start()

    # ... then wait for all of them to finish.
    for worker in workers:
        worker.join()