1、Parameters 和 Flops计算
使用 torchstat 库。部分环境已自带;如未安装,请先执行下面的 pip 命令。
pip install torchstat
from torchvision.models import resnet18
from torchstat import stat

# Print torchstat's per-layer report (params, memory, MAdd, Flops, memory
# read/write) for a ResNet-18 fed one 3x224x224 input.
input_shape = (3, 224, 224)
model = resnet18()
stat(model, input_shape)
[MAdd]: AdaptiveAvgPool2d is not supported!
[Flops]: AdaptiveAvgPool2d is not supported!
[Memory]: AdaptiveAvgPool2d is not supported!
module name input shape output shape params memory(MB) MAdd Flops MemRead(B) MemWrite(B) duration[%] MemR+W(B)
0 conv1 3 224 224 64 112 112 9408.0 3.06 235,225,088.0 118,013,952.0 639744.0 3211264.0 12.48% 3851008.0
1 bn1 64 112 112 64 112 112 128.0 3.06 3,211,264.0 1,605,632.0 3211776.0 3211264.0 3.12% 6423040.0
2 relu 64 112 112 64 112 112 0.0 3.06 802,816.0 802,816.0 3211264.0 3211264.0 3.12% 6422528.0
3 maxpool 64 112 112 64 56 56 0.0 0.77 1,605,632.0 802,816.0 3211264.0 802816.0 15.62% 4014080.0
4 layer1.0.conv1 64 56 56 64 56 56 36864.0 0.77 231,010,304.0 115,605,504.0 950272.0 802816.0 3.12% 1753088.0
5 layer1.0.bn1 64 56 56 64 56 56 128.0 0.77 802,816.0 401,408.0 803328.0 802816.0 0.00% 1606144.0
6 layer1.0.relu 64 56 56 64 56 56 0.0 0.77 200,704.0 200,704.0 802816.0 802816.0 0.00% 1605632.0
7 layer1.0.conv2 64 56 56 64 56 56 36864.0 0.77 231,010,304.0 115,605,504.0 950272.0 802816.0 3.12% 1753088.0
8 layer1.0.bn2 64 56 56 64 56 56 128.0 0.77 802,816.0 401,408.0 803328.0 802816.0 0.00% 1606144.0
9 layer1.1.conv1 64 56 56 64 56 56 36864.0 0.77 231,010,304.0 115,605,504.0 950272.0 802816.0 3.12% 1753088.0
10 layer1.1.bn1 64 56 56 64 56 56 128.0 0.77 802,816.0 401,408.0 803328.0 802816.0 3.16% 1606144.0
11 layer1.1.relu 64 56 56 64 56 56 0.0 0.77 200,704.0 200,704.0 802816.0 802816.0 0.00% 1605632.0
12 layer1.1.conv2 64 56 56 64 56 56 36864.0 0.77 231,010,304.0 115,605,504.0 950272.0 802816.0 3.12% 1753088.0
13 layer1.1.bn2 64 56 56 64 56 56 128.0 0.77 802,816.0 401,408.0 803328.0 802816.0 0.00% 1606144.0
14 layer2.0.conv1 64 56 56 128 28 28 73728.0 0.38 115,505,152.0 57,802,752.0 1097728.0 401408.0 0.00% 1499136.0
15 layer2.0.bn1 128 28 28 128 28 28 256.0 0.38 401,408.0 200,704.0 402432.0 401408.0 0.00% 803840.0
16 layer2.0.relu 128 28 28 128 28 28 0.0 0.38 100,352.0 100,352.0 401408.0 401408.0 0.00% 802816.0
17 layer2.0.conv2 128 28 28 128 28 28 147456.0 0.38 231,110,656.0 115,605,504.0 991232.0 401408.0 3.12% 1392640.0
18 layer2.0.bn2 128 28 28 128 28 28 256.0 0.38 401,408.0 200,704.0 402432.0 401408.0 0.00% 803840.0
19 layer2.0.downsample.0 64 56 56 128 28 28 8192.0 0.38 12,744,704.0 6,422,528.0 835584.0 401408.0 3.12% 1236992.0
20 layer2.0.downsample.1 128 28 28 128 28 28 256.0 0.38 401,408.0 200,704.0 402432.0 401408.0 0.00% 803840.0
21 layer2.1.conv1 128 28 28 128 28 28 147456.0 0.38 231,110,656.0 115,605,504.0 991232.0 401408.0 3.12% 1392640.0
22 layer2.1.bn1 128 28 28 128 28 28 256.0 0.38 401,408.0 200,704.0 402432.0 401408.0 0.00% 803840.0
23 layer2.1.relu 128 28 28 128 28 28 0.0 0.38 100,352.0 100,352.0 401408.0 401408.0 0.00% 802816.0
24 layer2.1.conv2 128 28 28 128 28 28 147456.0 0.38 231,110,656.0 115,605,504.0 991232.0 401408.0 3.12% 1392640.0
25 layer2.1.bn2 128 28 28 128 28 28 256.0 0.38 401,408.0 200,704.0 402432.0 401408.0 0.00% 803840.0
26 layer3.0.conv1 128 28 28 256 14 14 294912.0 0.19 115,555,328.0 57,802,752.0 1581056.0 200704.0 3.12% 1781760.0
27 layer3.0.bn1 256 14 14 256 14 14 512.0 0.19 200,704.0 100,352.0 202752.0 200704.0 0.00% 403456.0
28 layer3.0.relu 256 14 14 256 14 14 0.0 0.19 50,176.0 50,176.0 200704.0 200704.0 0.00% 401408.0
29 layer3.0.conv2 256 14 14 256 14 14 589824.0 0.19 231,160,832.0 115,605,504.0 2560000.0 200704.0 3.12% 2760704.0
30 layer3.0.bn2 256 14 14 256 14 14 512.0 0.19 200,704.0 100,352.0 202752.0 200704.0 0.00% 403456.0
31 layer3.0.downsample.0 128 28 28 256 14 14 32768.0 0.19 12,794,880.0 6,422,528.0 532480.0 200704.0 0.00% 733184.0
32 layer3.0.downsample.1 256 14 14 256 14 14 512.0 0.19 200,704.0 100,352.0 202752.0 200704.0 0.00% 403456.0
33 layer3.1.conv1 256 14 14 256 14 14 589824.0 0.19 231,160,832.0 115,605,504.0 2560000.0 200704.0 3.12% 2760704.0
34 layer3.1.bn1 256 14 14 256 14 14 512.0 0.19 200,704.0 100,352.0 202752.0 200704.0 0.00% 403456.0
35 layer3.1.relu 256 14 14 256 14 14 0.0 0.19 50,176.0 50,176.0 200704.0 200704.0 0.00% 401408.0
36 layer3.1.conv2 256 14 14 256 14 14 589824.0 0.19 231,160,832.0 115,605,504.0 2560000.0 200704.0 3.13% 2760704.0
37 layer3.1.bn2 256 14 14 256 14 14 512.0 0.19 200,704.0 100,352.0 202752.0 200704.0 0.00% 403456.0
38 layer4.0.conv1 256 14 14 512 7 7 1179648.0 0.10 115,580,416.0 57,802,752.0 4919296.0 100352.0 3.13% 5019648.0
39 layer4.0.bn1 512 7 7 512 7 7 1024.0 0.10 100,352.0 50,176.0 104448.0 100352.0 0.00% 204800.0
40 layer4.0.relu 512 7 7 512 7 7 0.0 0.10 25,088.0 25,088.0 100352.0 100352.0 0.00% 200704.0
41 layer4.0.conv2 512 7 7 512 7 7 2359296.0 0.10 231,185,920.0 115,605,504.0 9537536.0 100352.0 6.25% 9637888.0
42 layer4.0.bn2 512 7 7 512 7 7 1024.0 0.10 100,352.0 50,176.0 104448.0 100352.0 0.00% 204800.0
43 layer4.0.downsample.0 256 14 14 512 7 7 131072.0 0.10 12,819,968.0 6,422,528.0 724992.0 100352.0 3.12% 825344.0
44 layer4.0.downsample.1 512 7 7 512 7 7 1024.0 0.10 100,352.0 50,176.0 104448.0 100352.0 0.00% 204800.0
45 layer4.1.conv1 512 7 7 512 7 7 2359296.0 0.10 231,185,920.0 115,605,504.0 9537536.0 100352.0 6.25% 9637888.0
46 layer4.1.bn1 512 7 7 512 7 7 1024.0 0.10 100,352.0 50,176.0 104448.0 100352.0 0.00% 204800.0
47 layer4.1.relu 512 7 7 512 7 7 0.0 0.10 25,088.0 25,088.0 100352.0 100352.0 0.00% 200704.0
48 layer4.1.conv2 512 7 7 512 7 7 2359296.0 0.10 231,185,920.0 115,605,504.0 9537536.0 100352.0 6.25% 9637888.0
49 layer4.1.bn2 512 7 7 512 7 7 1024.0 0.10 100,352.0 50,176.0 104448.0 100352.0 0.00% 204800.0
50 avgpool 512 7 7 512 1 1 0.0 0.00 0.0 0.0 0.0 0.0 0.00% 0.0
51 fc 512 1000 513000.0 0.00 1,023,000.0 512,000.0 2054048.0 4000.0 0.00% 2058048.0
total 11689512.0 25.65 3,638,757,912.0 1,821,399,040.0 2054048.0 4000.0 100.00% 101756992.0
=================================================================================================================================================================
Total params: 11,689,512
-----------------------------------------------------------------------------------------------------------------------------------------------------------------
Total memory: 25.65MB
Total MAdd: 3.64GMAdd
Total Flops: 1.82GFlops
Total MemR+W: 97.04MB
2、模型推理速度计算
需要克服 GPU 异步执行和 GPU 预热两个问题。下面的例子使用 EfficientNet-b0。在进行任何时间测量之前,我们先通过网络运行一些虚拟示例来进行“GPU 预热”,这将自动初始化 GPU 并防止它在我们测量时间时进入省电模式。接下来,我们使用 torch.cuda.Event 来测量 GPU 上的时间。在这里使用 torch.cuda.synchronize() 至关重要:这行代码执行主机和设备(即 CPU 和 GPU)之间的同步,因此只有在 GPU 上运行的进程完成后才会进行时间记录。这克服了异步执行的问题。
mean_syn 表示检测一张图片的平均耗时(单位:毫秒);mean_fps 表示一秒内检测图片的数量。
# Latency benchmark: mean and standard deviation of the GPU time taken by one
# forward pass on a single 3x224x224 input, plus the derived frames per second.
import numpy as np
import torch

# NOTE(review): EfficientNet is presumably the class from the third-party
# `efficientnet_pytorch` package (from efficientnet_pytorch import EfficientNet)
# -- confirm and import it before running this snippet.
model = EfficientNet.from_pretrained("efficientnet-b0")
device = torch.device("cuda")
model.to(device)
# Put the module in evaluation mode so layers such as dropout and batch norm
# use their inference behaviour while being timed.
model.eval()

dummy_input = torch.randn(1, 3, 224, 224, dtype=torch.float).to(device)
starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)
repetitions = 300
timings = np.zeros((repetitions, 1))

with torch.no_grad():
    # GPU warm-up: untimed passes, run without gradient tracking just like
    # the measured passes below.
    for _ in range(10):
        _ = model(dummy_input)

    # Measure performance.
    for rep in range(repetitions):
        starter.record()
        _ = model(dummy_input)
        ender.record()
        # Wait for the GPU to finish before reading the events; kernel
        # launches are asynchronous with respect to the host.
        torch.cuda.synchronize()
        # Event.elapsed_time() reports milliseconds.
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time

mean_syn = np.sum(timings) / repetitions  # mean latency per pass, in ms
std_syn = np.std(timings)                 # standard deviation, in ms
mean_fps = 1000. / mean_syn               # passes per second (1000 ms / mean)
print(f' * Mean@1 {mean_syn:.3f}ms Std@5 {std_syn:.3f}ms FPS@1 {mean_fps:.2f}')
print(mean_syn)
3、模型吞吐量计算
神经网络的吞吐量定义为网络在单位时间内(例如,一秒)可以处理的最大输入实例数。与涉及单个实例处理的延迟不同,为了实现最大吞吐量,我们希望并行处理尽可能多的实例。有效的并行性显然依赖于数据、模型和设备。因此,为了正确测量吞吐量,我们执行以下两个步骤:(1)我们估计允许最大并行度的最佳批量大小;(2)给定这个最佳批量大小,我们测量网络在一秒钟内可以处理的实例数
要找到最佳批量大小,一个好的经验法则是达到 GPU 对给定数据类型的内存限制。这个大小当然取决于硬件类型和网络的大小。找到这个最大批量大小的最快方法是执行二分搜索。当时间不重要时,简单的顺序搜索就足够了。为此,我们使用 for 循环将批量大小每次增加 1,直到出现运行时错误为止,这确定了 GPU 可以处理的最大批量大小,用于我们的神经网络模型及其处理的输入数据。
在找到最佳批量大小后,我们计算实际吞吐量。为此,我们希望处理多个批次(100 个批次就足够了),然后使用以下公式:
(批次数 × 批次大小)/(以秒为单位的总时间)
这个公式给出了我们的网络可以在一秒钟内处理的示例数量。下面的代码提供了一种执行上述计算的简单方法(给定最佳批量大小)
# Throughput benchmark: number of input instances processed per second when
# running batches of `optimal_batch_size`.
import torch

# NOTE(review): EfficientNet is presumably the class from the third-party
# `efficientnet_pytorch` package (from efficientnet_pytorch import EfficientNet)
# -- confirm and import it before running this snippet.
# `optimal_batch_size` must be defined beforehand (the largest batch size the
# GPU can handle, found by the search described in the text above).
model = EfficientNet.from_pretrained('efficientnet-b0')
device = torch.device("cuda")
model.to(device)
# Put the module in evaluation mode so layers such as dropout and batch norm
# use their inference behaviour while being timed.
model.eval()

dummy_input = torch.randn(optimal_batch_size, 3, 224, 224, dtype=torch.float).to(device)
repetitions = 100
total_time = 0
# One pair of timing events, re-recorded on every iteration.
starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)

with torch.no_grad():
    # GPU warm-up: untimed passes so one-time start-up work is not counted
    # in total_time.
    for _ in range(10):
        _ = model(dummy_input)

    for rep in range(repetitions):
        starter.record()
        _ = model(dummy_input)
        ender.record()
        # Wait for the GPU to finish before reading the events.
        torch.cuda.synchronize()
        # elapsed_time() reports milliseconds; convert to seconds.
        curr_time = starter.elapsed_time(ender) / 1000
        total_time += curr_time

# (number of batches x batch size) / total time in seconds
Throughput = (repetitions * optimal_batch_size) / total_time
print('Final Throughput:', Throughput)