GPU显存，计算量，计算密度，访存量等推理速度

齐名南

已于 2024-03-18 09:05:10 修改

阅读量2.0k

点赞数

分类专栏： gpu 文章标签：计算机视觉深度学习人工智能

于 2022-02-08 19:52:42 首次发布

本文链接：https://blog.csdn.net/qq_51609636/article/details/122830157

版权

gpu 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

释放pytorch占用的gpu显存_Pytorch释放显存占用方式

计算复杂度

# Minimal thop usage: profile one forward pass of a model on a dummy input.
from thop import profile
# NOTE(review): `resnet50` and `torch` are not imported in this snippet;
# presumably `resnet50` comes from torchvision.models - confirm before running.
model = resnet50()
# Random dummy input of shape (1, 3, 224, 224).
input = torch.randn(1, 3, 224, 224) 
# The two results of profile() are unpacked as (macs, params).
macs, params = profile(model, inputs=(input, ))

MACs 把一次乘法和一次加法合并算作一次运算，因此实际的 FLOPs 约为 MACs 的 1～2 倍，一般取 2 倍（FLOPs ≈ 2 × MACs）。

# Profile a network restored from a checkpoint file with thop.
# Imports come first: the original called torch.load() before `import torch`
# and imported `profile` twice.
import torch
from thop import profile

filepath = './weightsresnet18/last224stand_sit_resnet18.pth'
# NOTE: torch.load unpickles arbitrary Python objects; only load checkpoints
# that come from a trusted source.
checkpoint = torch.load(filepath)
model = checkpoint['model']  # extract the network structure
model.load_state_dict(checkpoint['model_state_dict'])  # load the network weights
model.eval()

# Dummy input created on the CUDA device, as in the original snippet.
input = torch.randn(1, 3, 224, 224).cuda()
flops, params = profile(model, inputs=(input, ))
# First value is scaled by 1e9, second by 1e6.
print(flops / 1e9, params / 1e6)

下面的代码修改了 thop 的 profile 函数，在统计计算量的同时增加了对模型访存量的统计：

# Replace the profile function (which counts the amount of computation) with the version below.
def profile_macs(model: nn.Module, inputs, custom_ops=None, verbose=True):
    """Profile ``model`` on ``inputs`` and return op, parameter and memory-access totals.

    Temporary counter buffers (``total_ops``, ``total_params``,
    ``total_kernel_macs``, ``total_output_macs``) are registered on every
    sub-module, counting hooks are attached to the module types that have a
    rule, one forward pass is run under ``torch.no_grad()``, and the per-module
    counters are summed over the module tree.

    Args:
        model: network to profile. It is put in eval mode for the forward pass
            and restored to its previous training state afterwards.
        inputs: tuple of positional arguments; the model is called as
            ``model(*inputs)``.
        custom_ops: optional mapping ``{module type: hook}``. Entries take
            precedence over the ``register_hooks`` table.
        verbose: when true, print which rule is used for each module type and
            warn about module types that have no rule.

    Returns:
        ``(total_ops, total_params, total_kernel_macs, total_output_macs)``.
    """
    handler_collection = {}
    types_collection = set()
    if custom_ops is None:
        custom_ops = {}

    counter_names = ("total_ops", "total_params", "total_kernel_macs", "total_output_macs")

    def add_hooks(m: nn.Module):
        # A module reachable through several parents is visited more than once
        # by ``apply``; hooking it again would leave hooks that are never removed.
        if m in handler_collection:
            return

        for counter in counter_names:
            m.register_buffer(counter, torch.zeros(1, dtype=torch.float64))

        m_type = type(m)

        fn = None
        if m_type in custom_ops:  # if defined both op maps, use custom_ops to overwrite.
            fn = custom_ops[m_type]
            if m_type not in types_collection and verbose:
                print("[INFO] Customize rule %s() %s." % (fn.__qualname__, m_type))
        elif m_type in register_hooks:
            fn = register_hooks[m_type]
            if m_type not in types_collection and verbose:
                print("[INFO] Register %s() for %s." % (fn.__qualname__, m_type))
        else:
            if m_type not in types_collection and verbose:
                prRed("[WARN] Cannot find rule for %s. Treat it as zero Macs and zero Params." % m_type)

        if fn is not None:
            handler_collection[m] = (m.register_forward_hook(fn), m.register_forward_hook(count_parameters))
        types_collection.add(m_type)

    def dfs_count_macs(module: nn.Module, prefix="\t"):
        """Sum the counters below ``module``.

        Returns ``(ops, params, kernel_macs, output_macs, per_child_dict)``.
        Children that own hooks (and are not plain containers) contribute their
        own counters; all other children are traversed recursively.
        """
        total_ops, total_params, total_kernel_macs, total_output_macs = module.total_ops.item(), 0, 0, 0
        ret_dict = {}
        for n, m in module.named_children():
            next_dict = {}
            if m in handler_collection and not isinstance(m, (nn.Sequential, nn.ModuleList)):
                m_ops = m.total_ops.item()
                m_params = m.total_params.item()
                m_kernel_macs = m.total_kernel_macs.item()
                m_output_macs = m.total_output_macs.item()
            else:
                m_ops, m_params, m_kernel_macs, m_output_macs, next_dict = dfs_count_macs(m, prefix=prefix + "\t")
            ret_dict[n] = (m_ops, m_params, m_kernel_macs, m_output_macs, next_dict)
            total_ops += m_ops
            total_params += m_params
            total_kernel_macs += m_kernel_macs
            total_output_macs += m_output_macs

        return total_ops, total_params, total_kernel_macs, total_output_macs, ret_dict

    prev_training_status = model.training

    model.eval()
    try:
        model.apply(add_hooks)

        with torch.no_grad():
            model(*inputs)

        total_ops, total_params, total_kernel_macs, total_output_macs, _ = dfs_count_macs(model)
    finally:
        # Reset the model to its original status even if the forward pass failed.
        model.train(prev_training_status)
        for op_handler, params_handler in handler_collection.values():
            op_handler.remove()
            params_handler.remove()
        # The buffers were registered on *every* module, so remove them from
        # every module; otherwise modules without a rule keep them and they
        # show up in the model's state_dict.
        for m in model.modules():
            for counter in counter_names:
                m._buffers.pop(counter, None)

    return total_ops, total_params, total_kernel_macs, total_output_macs

# The per-operator hooks compute the memory-access counts as follows.
import argparse
import logging

import torch
import torch.nn as nn
from torch.nn.modules.conv import _ConvNd

multiply_adds = 1


def count_parameters(m, x, y):
    """Forward hook: store the number of parameter elements of ``m`` in ``m.total_params``.

    ``x`` and ``y`` (the hook's input and output) are not used. The count
    covers everything yielded by ``m.parameters()`` and overwrites the
    previous value instead of accumulating.
    """
    element_count = sum(p.numel() for p in m.parameters())
    m.total_params[0] = element_count


def zero_ops(m, x, y):
    """Forward hook for modules treated as free: add zero to ``m.total_ops``."""
    nothing = torch.DoubleTensor([0])
    m.total_ops += nothing


def count_convNd(m: _ConvNd, x: (torch.Tensor,), y: torch.Tensor):
    """Forward hook for convolutions: accumulate ops and memory-access counts.

    Adds to the buffers ``m.total_ops``, ``m.total_kernel_macs`` and
    ``m.total_output_macs``.

    Args:
        m: the convolution module (its ``weight``, ``bias``, ``in_channels``
            and ``groups`` are read).
        x: tuple of hook inputs (unused).
        y: the layer output; its element count drives the op and output counts.
    """
    # Elements in one spatial kernel window (Kw x Kh ...), taken from the
    # weight shape without allocating a tensor.
    kernel_ops = m.weight.shape[2:].numel()
    bias_ops = 1 if m.bias is not None else 0

    out_elements = y.nelement()  # N x Cout x H x W

    # N x Cout x H x W x (Cin / groups x Kw x Kh + bias)
    total_ops = out_elements * (m.in_channels // m.groups * kernel_ops + bias_ops)
    m.total_ops += torch.DoubleTensor([int(total_ops)])

    # Kernel memory access = number of weight elements actually stored,
    # K^2 * (C_in / groups) * C_out. The previous K^2 * C_in * C_out formula
    # overcounted grouped / depthwise convolutions by a factor of ``groups``.
    total_kernel_macs = m.weight.numel()
    m.total_kernel_macs += torch.DoubleTensor([int(total_kernel_macs)])

    # Output memory access: every output element is written once.
    m.total_output_macs += torch.DoubleTensor([int(out_elements)])


def count_convNd_ver2(m: _ConvNd, x: (torch.Tensor,), y: torch.Tensor):
    """Alternative convolution hook: ops = (N x H x W) x (weight + bias elements).

    Adds the result to ``m.total_ops`` only.

    Args:
        m: the convolution module (its ``weight`` and ``bias`` are read).
        x: tuple of hook inputs (unused).
        y: the layer output.
    """
    # N x H x W (exclude Cout), computed from the shape alone. The previous
    # code allocated a tensor of that size just to call ``numel()`` on it.
    out_shape = tuple(y.size())
    output_size = torch.Size(out_shape[:1] + out_shape[2:]).numel()

    # Cout x Cin x Kw x Kh
    kernel_ops = m.weight.nelement()
    if m.bias is not None:
        # Cout x 1
        kernel_ops += m.bias.nelement()

    # N x H x W x Cout x (Cin x Kw x Kh + bias)
    m.total_ops += torch.DoubleTensor([int(output_size * kernel_ops)])


def count_bn(m, x, y):
    """Forward hook for batch normalization: accumulate ops and output memory access.

    Adds ``2 * x[0].numel()`` to ``m.total_ops`` and ``y.nelement()`` to
    ``m.total_output_macs``.

    Args:
        m: the normalization module.
        x: tuple of hook inputs; only the first entry is used.
        y: the layer output.
    """
    nelements = x[0].numel()

    # subtract, divide, gamma, beta -> counted as 2 ops per element.
    # The same estimate is used whether or not the module is in training mode.
    # Previously ``total_ops`` was only assigned in eval mode, so calling the
    # hook on a module in training mode raised UnboundLocalError. The cost of
    # computing batch statistics in training mode is not modeled.
    total_ops = 2 * nelements

    m.total_ops += torch.DoubleTensor([int(total_ops)])

    total_output_macs = y.nelement()
    m.total_output_macs += torch.DoubleTensor([int(total_output_macs)])


def count_relu(m, x, y):
    """Forward hook for ReLU-like layers.

    Adds one op per element of the first input to ``m.total_ops`` and the
    number of output elements to ``m.total_output_macs``.
    """
    input_elements = x[0].numel()
    m.total_ops += torch.DoubleTensor([int(input_elements)])

    output_elements = y.nelement()
    m.total_output_macs += torch.DoubleTensor([int(output_elements)])

def count_softmax(m, x, y):
    """Forward hook for softmax layers: accumulate ops and output memory access.

    Per softmax vector of ``nfeatures`` entries the estimate is ``nfeatures``
    exps, ``nfeatures - 1`` adds and ``nfeatures`` divisions.

    The reduced dimension is taken from ``m.dim`` so inputs of any rank are
    supported; the previous version unpacked ``x.size()`` into exactly two
    values and failed on anything but 2-D input. When the module has no
    ``dim`` (or it is ``None``), dimension 0 is used for 0/1/3-D input and
    dimension 1 otherwise.

    Args:
        m: the softmax module.
        x: tuple of hook inputs; only the first entry is used.
        y: the layer output.
    """
    x = x[0]

    dim = getattr(m, "dim", None)
    if dim is None:
        dim = 0 if x.dim() in (0, 1, 3) else 1

    nfeatures = x.size(dim) if x.dim() > 0 else 1
    # Number of independent softmax vectors in the input.
    batch_size = x.numel() // nfeatures if nfeatures else 0

    total_exp = nfeatures
    total_add = nfeatures - 1
    total_div = nfeatures
    total_ops = batch_size * (total_exp + total_add + total_div)

    m.total_ops += torch.DoubleTensor([int(total_ops)])

    total_output_macs = y.nelement()
    m.total_output_macs += torch.DoubleTensor([int(total_output_macs)])

def count_avgpool(m, x, y):
    """Forward hook for average pooling.

    Counts one op per output element (the kernel size is not taken into
    account) and one output memory access per output element.
    """
    ops_per_output = 1
    pooled_ops = ops_per_output * y.numel()
    m.total_ops += torch.DoubleTensor([int(pooled_ops)])

    written = y.nelement()
    m.total_output_macs += torch.DoubleTensor([int(written)])

def count_adap_avgpool(m, x, y):
    """Forward hook for adaptive average pooling.

    Per output element the estimate is ``prod(in_size // out_size)`` adds plus
    one division. The output size is read from ``y`` rather than from
    ``m.output_size``, so an ``output_size`` containing ``None`` no longer
    crashes the hook.

    Assumes batched ``(N, C, *spatial)`` tensors, as the original did by
    reading ``shape[2:]`` - TODO confirm for unbatched input.

    Args:
        m: the pooling module.
        x: tuple of hook inputs; only the first entry is used.
        y: the layer output.
    """
    num_elements = y.numel()

    total_ops = 0
    if num_elements:
        # Elements averaged per output value: product of the per-dimension
        # integer ratios between input and output spatial sizes.
        total_add = 1
        for in_size, out_size in zip(x[0].shape[2:], y.shape[2:]):
            total_add *= in_size // out_size
        total_div = 1
        kernel_ops = total_add + total_div
        total_ops = kernel_ops * num_elements

    m.total_ops += torch.DoubleTensor([int(total_ops)])

    total_output_macs = y.nelement()
    m.total_output_macs += torch.DoubleTensor([int(total_output_macs)])

# TODO: verify the accuracy
def count_upsample(m, x, y):
    """Forward hook for upsampling: accumulate ops and output memory access.

    Ops per output element depend on ``m.mode``; ``nearest`` and modes without
    an estimate count as zero ops (the latter with a warning). The output
    memory access is recorded for every mode, because the output tensor is
    written regardless of how many ops are counted. Previously it was skipped
    for ``nearest`` and for unsupported modes.

    Args:
        m: the upsampling module (its ``mode`` is read).
        x: tuple of hook inputs (unused).
        y: the layer output.
    """
    ops_per_output = {
        "nearest": 0,
        "linear": 5,  # 2 muls + 3 add
        # https://en.wikipedia.org/wiki/Bilinear_interpolation
        "bilinear": 11,  # 6 muls + 5 adds
        # https://en.wikipedia.org/wiki/Bicubic_interpolation
        # Product matrix [4x4] x [4x4] x [4x4]:
        #   solve A: 224 (128 muls + 96 adds)
        #   solve p: 35  (16 muls + 12 adds + 4 muls + 3 adds)
        "bicubic": 224 + 35,
        # "trilinear" is deliberately not listed. The original carried an
        # unreachable estimate of (13 * 2 + 5) ops per element (2 bilinear +
        # 1 linear), which is still unverified, so the mode keeps being
        # treated as a zero op.
    }

    if m.mode not in ops_per_output:
        logging.warning("mode %s is not implemented yet, take it a zero op" % m.mode)

    out_elements = y.nelement()
    total_ops = out_elements * ops_per_output.get(m.mode, 0)
    m.total_ops += torch.DoubleTensor([int(total_ops)])

    m.total_output_macs += torch.DoubleTensor([int(out_elements)])


# nn.Linear
def count_linear(m, x, y):
    """Forward hook for ``nn.Linear``: accumulate ops and memory-access counts.

    Adds to ``m.total_ops``, ``m.total_output_macs`` and ``m.total_kernel_macs``.

    Args:
        m: the linear module (its ``in_features`` and ``weight`` are read).
        x: tuple of hook inputs (unused).
        y: the layer output.
    """
    # One multiply(-add) per input feature for every output element.
    total_mul = m.in_features
    num_elements = y.numel()
    total_ops = total_mul * num_elements

    m.total_ops += torch.DoubleTensor([int(total_ops)])

    total_output_macs = y.nelement()
    m.total_output_macs += torch.DoubleTensor([int(total_output_macs)])

    # Kernel memory access = number of weight elements (in x out), independent
    # of the batch size, matching how the convolution hook counts its kernel.
    # The previous ``in_features * y.numel()`` was only right for one output row.
    total_kernel_macs = m.weight.numel()
    m.total_kernel_macs += torch.DoubleTensor([int(total_kernel_macs)])

# Example: profile a project model with the memory-access-aware profiler.
from models.CBN_model import CBRES2 as md
from thop import profile_macs

model = md()
input = torch.randn(1, 16, 448, 448)
flops, params, kmacs, omacs = profile_macs(model, inputs=(input, ))

# kmacs is the kernel (weight) memory access and omacs is the output memory
# access, both counted in elements. Multiply by the element size to get bytes:
# a float32 element is 4 bytes (not 8, as the original comment claimed).
# NOTE(review): assumes weights and activations use the same dtype as `input`.
bytes_per_element = input.element_size()

# The op count is doubled (2 x MACs); values are scaled by 1e9, 1e6 and 1e6.
print(flops / 1e9 * 2, params / 1e6, (kmacs / 1e6 + omacs / 1e6) * bytes_per_element)

https://blog.51cto.com/u_16213599/8039726

6种方法计算神经网络参数量Params、计算量FLOPs、Macs简单代码_计算网络的param和glops的方法-CSDN博客

深度学习模型大小与模型推理速度的探讨 - 知乎

【原创】如何解决python进程被kill掉后GPU显存不释放的问题_jzrita的博客-CSDN博客_python 释放gpu内存

释放pytorch占用的gpu显存_Pytorch释放显存占用方式_weixin_39929595的博客-CSDN博客

神经网络计算量、带宽、计算强度、访存比 - 知乎

【深度笔记】模型理论性能计算调研_计算访存比-CSDN博客

Yolov5 计算访存量MAC与计算量FLOPS

齐名南

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
GPU显存，计算量，计算密度，访存量等推理速度

释放pytorch占用的gpu显存_Pytorch释放显存占用方式释放pytorch占用的gpu显存_Pytorch释放显存占用方式_weixin_39929595的博客-CSDN博客
复制链接

扫一扫