A Brief Look at Deep Learning: How to Calculate the Memory Usage of a Model and Its Intermediate Variables
pytorch: torchstat

from torchstat import stat
import torchvision.models as models

# Print per-layer parameter counts, memory usage and FLOPs for AlexNet
model = models.alexnet()
stat(model, (3, 224, 224))
pytorch: ptflops

import torch
import torchvision.models as models
from ptflops import get_model_complexity_info

with torch.cuda.device(0):
    net = models.densenet161()
    # Returns the computational complexity and parameter count as formatted strings
    flops, params = get_model_complexity_info(net, (3, 224, 224), as_strings=True,
                                              print_per_layer_stat=True)
    print('{:<30} {:<8}'.format('Computational complexity: ', flops))
    print('{:<30} {:<8}'.format('Number of parameters: ', params))
FLOPS (all uppercase) is short for floating point operations per second, i.e. the number of floating-point operations performed per second. It measures computation speed and is a hardware-performance metric.
FLOPs (lowercase s, marking the plural) is short for floating point operations, i.e. the total number of floating-point operations. It measures computational cost and is used to gauge the complexity of an algorithm or model.
For a standard convolution layer: FLOPs = 2·H·W·(C_in·K² + 1)·C_out, where H and W are the output feature-map height and width, K is the kernel size, C_in and C_out are the input and output channel counts, and the +1 accounts for the bias.
For a depthwise separable convolution: FLOPs = 2·H·W·C_in·(K² + C_out).
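The two formulas are easy to turn into a quick estimator. A minimal sketch follows; the helper names conv_flops and dw_separable_flops are illustrative and not part of any library.

```python
def conv_flops(h_out, w_out, c_in, c_out, k):
    # Standard convolution: 2*H*W*(C_in*K^2 + 1)*C_out (the +1 is the bias)
    return 2 * h_out * w_out * (c_in * k ** 2 + 1) * c_out

def dw_separable_flops(h_out, w_out, c_in, c_out, k):
    # Depthwise separable convolution: 2*H*W*C_in*(K^2 + C_out)
    return 2 * h_out * w_out * c_in * (k ** 2 + c_out)

# Example: a 3x3 kernel, 64 -> 128 channels, on a 56x56 output feature map
print(conv_flops(56, 56, 64, 128, 3))          # 463224832, ~0.46 GFLOPs
print(dw_separable_flops(56, 56, 64, 128, 3))  # 54992896,  ~0.05 GFLOPs
```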
There are many ways to visualize a model: for Caffe there is netscope, while the most convenient tool is netron, which supports almost every format; its one drawback is that it cannot print key information such as output shapes and computational cost. ONNX works well as an intermediate format (see its tutorials), and for model conversion there are also Microsoft's mmdnn and Baidu's X2Paddle.
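To inspect a PyTorch model in netron, one common route is to export it to ONNX first. A minimal sketch, using AlexNet and an arbitrary example file name:

```python
import torch
import torchvision.models as models

# Export the model to ONNX so that netron can open it;
# "alexnet.onnx" is just an example file name.
model = models.alexnet().eval()
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, dummy_input, "alexnet.onnx",
                  input_names=["input"], output_names=["output"])
```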
- V1 uses 1×1 convolutions for dimensionality reduction and GAP (global average pooling) instead of Flatten.
- V2 replaces each 5×5 convolution with two stacked 3×3 convolutions (see the sketch after this list).
- V3 replaces an N×N convolution with a cascade of N×1 and 1×N convolutions.
- Xception replaces ordinary convolutions with (depthwise) separable convolutions.
- Equal input and output channel counts minimize memory access cost (MAC); use 1×1 convolutions to balance the input and output channel sizes.
- Excessive group convolution increases MAC; use group convolution carefully and watch the number of groups.
- Network fragmentation reduces parallelism; avoid fragmenting the network.
- Element-wise operations cannot be ignored and should be reduced: operations such as ReLU and Add have small FLOPs but relatively large MAC.
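A quick check of the V2 trick: two stacked 3×3 convolutions cover the same 5×5 receptive field with fewer parameters (and correspondingly fewer FLOPs). A minimal sketch with an arbitrarily chosen channel count:

```python
import torch.nn as nn

c = 64  # example channel count, chosen only for illustration

# One 5x5 convolution vs. two stacked 3x3 convolutions (same receptive field)
conv5   = nn.Conv2d(c, c, kernel_size=5, padding=2, bias=False)
conv3x2 = nn.Sequential(
    nn.Conv2d(c, c, kernel_size=3, padding=1, bias=False),
    nn.Conv2d(c, c, kernel_size=3, padding=1, bias=False),
)

def count(m):
    return sum(p.numel() for p in m.parameters())

print(count(conv5))    # 64*64*5*5   = 102400
print(count(conv3x2))  # 2*64*64*3*3 = 73728, ~28% fewer parameters
```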
caffe:

import sys

import caffe


def get_flops(deploy_file, show_detail=True,
              flop_layers=['Convolution', 'DepthwiseConvolution', 'InnerProduct']):
    net = caffe.Net(deploy_file, caffe.TEST)
    params = 0
    flops = 0
    infos = []
    print(deploy_file)
    maxnamelen = 8
    for name, layer in net.params.items():
        layer_type = net.layer_dict[name].type
        if layer_type in flop_layers:
            maxnamelen = len(name) if len(name) > maxnamelen else maxnamelen
            # Weight count only; uncomment to include the bias term as well
            param = layer[0].count  # + layer[1].count
            # bm = net.blobs[net.bottom_names[name][0]]
            bt = net.blobs[net.top_names[name][0]]
            # One multiply-accumulate per weight per output spatial position
            flop = param * bt.width * bt.height
            if show_detail:
                info = {}
                info['name'] = name
                info['filter_shape'] = layer[0].data.shape
                info['out_shape'] = bt.data.shape
                info['params'] = param
                info['flops'] = flop
                infos.append(info)
            params += param
            flops += flop
    if show_detail:
        print('layer name'.ljust(maxnamelen + 1), 'Filter Shape'.ljust(16),
              'Output Size'.ljust(16), 'Params'.ljust(8), 'Flops'.ljust(12), "Ratio")
        for info in infos:
            ratio = round(info['flops'] * 100.0 / flops, 3)
            print(info['name'].ljust(maxnamelen + 1), str(info['filter_shape']).ljust(16),
                  str(info['out_shape']).ljust(16), str(info['params']).ljust(8),
                  str(info['flops']).ljust(12), str(ratio))
    print('Layers num: ' + str(len(net.params.items())))
    print("Total number of parameters: ", params)
    print("Total number of FLOPs: ", flops)
    return params, flops


if __name__ == '__main__':
    deploy_file = "deploy.prototxt"
    if len(sys.argv) > 1:
        deploy_file = sys.argv[1]
    get_flops(deploy_file)
Output:
deploy.prototxt
layer name  Filter Shape      Output Size       Params    Flops       Ratio
conv1       (96, 3, 11, 11)   (1, 96, 55, 55)   34848     105415200   14.552
conv2       (256, 48, 5, 5)   (1, 256, 27, 27)  307200    223948800   30.915
conv3       (384, 256, 3, 3)  (1, 384, 13, 13)  884736    149520384   20.64
conv4       (384, 192, 3, 3)  (1, 384, 13, 13)  663552    112140288   15.48
conv5       (256, 192, 3, 3)  (1, 256, 13, 13)  442368    74760192    10.32
fc6         (4096, 9216)      (1, 4096)         37748736  37748736    5.211
fc7         (4096, 4096)      (1, 4096)         16777216  16777216    2.316
fc8         (1000, 4096)      (1, 1000)         4096000   4096000     0.565
Layers num: 8
Total number of parameters: 60954656
Total number of FLOPs: 724406816
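Note that the script counts one multiply-accumulate per weight per output position (param * width * height), so its totals are MAC counts rather than FLOPs in the 2-ops-per-MAC convention used by the formulas earlier; multiply by two to compare, as sketched below.

```python
macs = 724406816   # total reported by the script above for AlexNet
flops = 2 * macs   # ~1.45 GFLOPs in the 2-ops-per-MAC convention
print(flops / 1e9)
```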