Nsight Compute 是怎么计算Roofline的呢
在用 Roofline 模型分析 PyTorch 和 Triton 算子时,发现 Nsight Compute 中的 Peak Work 跟峰值算力对不上,这里进一步分析。
1.参考链接
- Metrics smsp__sass_thread_inst_executed_op
- sm__sass_thread_inst_executed_op_ffma_
- 使用Nsight Compute构建roofline model
- NsightComputeCli
- Roofline_model
- 用Roofline模型去分析pytorch和Triton算子
- H800基础能力测试
- nvtx-include
2.小结
- 理论算力: 3584 × 1.85 × 2 = 13.26 TFLOPS
- 硬件的理论算力密度: 36.87
- 该测例pytorch测出的实际算力:4147.84 GFLOPS【是峰值算力的:31.2%】; 测出的带宽: 6.07GB/s .(当黑盒处理,统计周期里包括了计算和IO,所以并不准确。注: fvcore 把一次乘加(MAC)计为 1 FLOP,若按通常的 1 MAC = 2 FLOPs 折算,约为 8295.68 GFLOPS,即峰值算力的 62.5%)
- 该测例的算力密度:682.66 (>36.87) 是计算瓶颈
- 按Nsight Compute的算法 PeakWork(FFMA): 9.46 TFLOPS (跟具体的规模无关)
- 按Nsight Compute的算法 PeakTraffic: 349.92 GB/s
- 按Nsight Compute的算法 AchievedWork: 6.02 TFLOPS 是峰值算力的: 63%
- 按Nsight Compute的算法 AchievedTraffic: 42.84 Gbyte/second
- sgemm single kernels: 9936.39 GFLOPS
- sgemm N=10 without streams: 10260.6 GFLOPS
- sgemm N=10 with stream: 10339.9 GFLOPS
- sgemm N=10 batched: 8482.82 GFLOPS
- 根据该内核的占用情况,理论上每个调度程序可以发出 4.00 个 warp,低于硬件最大值 12。该内核的理论占用率 (33.3%) 受到所需寄存器数量的限制。
3.Nsight Compute 是怎么计算Roofline的呢
公式:C:\Program Files\NVIDIA Corporation\Nsight Compute 2024.1.1\sections\SpeedOfLight_HierarchicalSingleRooflineChart.section
内容如下(实际比这个多):
MetricDefinitions {
MetricDefinitions {
Name: "derived__sm__sass_thread_inst_executed_op_ffma_pred_on_x2"
Expression: "sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained * 2"
}
MetricDefinitions {
Name: "derived__smsp__sass_thread_inst_executed_op_ffma_pred_on_x2"
Expression: "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed * 2"
}
}
Rooflines {
PeakWork {
ValueCyclesPerSecondExpression {
ValuePerCycleMetrics {
Label: "Theoretical Predicated-On FFMA Operations"
Name: "derived__sm__sass_thread_inst_executed_op_ffma_pred_on_x2"
}
CyclesPerSecondMetric {
Label: "SM Frequency"
Name: "sm__cycles_elapsed.avg.per_second"
}
}
}
PeakTraffic {
ValueCyclesPerSecondExpression {
ValuePerCycleMetrics {
Label: "Theoretical DRAM Bytes Accessible"
Name: "dram__bytes.sum.peak_sustained"
}
CyclesPerSecondMetric {
Label: "DRAM Frequency"
Name: "dram__cycles_elapsed.avg.per_second"
}
}
}
Options {
Label: "DRAM Roofline"
}
AchievedValues {
AchievedWork {
ValueCyclesPerSecondExpression {
ValuePerCycleMetrics {
Label: "Predicated-On FFMA Operations Per Cycle"
Name: "derived__smsp__sass_thread_inst_executed_op_ffma_pred_on_x2"
}
CyclesPerSecondMetric {
Label: "SM Frequency"
Name: "smsp__cycles_elapsed.avg.per_second"
}
}
}
AchievedTraffic {
Metric {
Label: "DRAM Bandwidth"
Name: "dram__bytes.sum.per_second"
Filter {
MaxArch: CC_70
}
}
}
}
}
4.生成测试程序
tee Theoretical_FLOPS.py <<-'EOF'
import sys
import torch
import torch.nn as nn
import math
import torch
import torch.nn as nn
from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
import numpy as np
import os
# Test model: a single bias-free Linear layer, so each forward pass is one
# (batch x in_features) @ (in_features x out_features) matrix multiplication.
class SimpleModel(nn.Module):
    """Single ``nn.Linear`` layer without bias, used as an FP32 GEMM workload."""

    def __init__(self, input_features, output_features):
        """Create the layer via ``skip_init`` so no weight initialisation runs.

        Args:
            input_features: Size of the input (reduction) dimension.
            output_features: Size of the output dimension.
        """
        super().__init__()
        self.fc1 = torch.nn.utils.skip_init(
            nn.Linear, input_features, output_features, bias=False
        )

    def forward(self, x):
        """Return ``fc1(x)``."""
        return self.fc1(x)


# The problem size N is the only command-line argument; fail with a usage
# message instead of a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: python Theoretical_FLOPS.py <N>")

# Square problem: batch size, input features and output features are all N.
input_features = int(sys.argv[1])
output_features = input_features
batch_size = input_features

model = SimpleModel(input_features, output_features).cuda()
input_data = torch.ones(batch_size, input_features).cuda()
test_count = 10

# Static FLOP count and memory traffic for `test_count` forward passes.
# NOTE: fvcore counts one fused multiply-add as ONE flop, so this total is
# N^3 per forward pass, i.e. half of the conventional 2*N^3. Every FLOP-based
# figure printed below inherits that factor of 2.
flops = FlopCountAnalysis(model, input_data).total() * test_count
activations = ActivationCountAnalysis(model, input_data).total() + input_data.numel()
print("activations:", activations)

# Number of parameters (the weight matrix).
params = sum(p.numel() for p in model.parameters())

# Memory traffic is modelled as (activations + params) * 4 bytes, assuming
# float32 everywhere and that every element is moved exactly once per pass.
activation_memory_access = activations * 4
params_memory_access = params * 4
memory_access = (activation_memory_access + params_memory_access) * test_count

# Warm-up pass so one-time setup cost stays out of the timed region.
output = model(input_data)
torch.cuda.synchronize()

# Time `test_count` forward passes with CUDA events.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for _ in range(test_count):
    output = model(input_data)
end_event.record()
torch.cuda.synchronize()
total_cuda_time = start_event.elapsed_time(end_event) / 1000  # ms -> s

# FLOPs -> GFLOP, bytes -> GB.
flops_gflop = flops / 1e9
memory_access_gb = memory_access / 1e9

# Measured bandwidth in GB/s (modelled traffic divided by measured time).
bandwidth_measured = memory_access_gb / total_cuda_time
# Arithmetic intensity in GFLOP/GB: a static property of the algorithm.
arithmetic_intensity_measured = flops_gflop / memory_access_gb
# Achieved GFLOPS, expressed as intensity * bandwidth (= flops_gflop / time).
flops_measured = arithmetic_intensity_measured * bandwidth_measured

# Peak FP32 performance and memory bandwidth assumed for the RTX 3060.
peak_performance = 13.275136 * 1e3  # GFLOPS
memory_bandwidth = 360.0  # GB/s

# The first line prints the hardware ridge point (peak GFLOPS / peak GB/s).
print("arithmetic_intensity:", peak_performance / memory_bandwidth)
print("flops_measured:", flops_measured, flops_measured / peak_performance)
print("bandwidth_measured:", bandwidth_measured)
print("total_cuda_time:", total_cuda_time)
print("arithmetic_intensity_measured:", arithmetic_intensity_measured)

# Only this NVTX range is meant to be profiled: the ncu command lines select
# it with --nvtx-include "kernel_prof/".
import nvtx

with nvtx.annotate("kernel_prof", color="blue"):
    output = model(input_data)
torch.cuda.synchronize()
EOF
5.测试规模为8192时的性能
/usr/local/cuda/bin/ncu --nvtx --nvtx-include "kernel_prof/" --metrics sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained,smsp__cycles_elapsed.avg.per_second,smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed,sm__cycles_elapsed.avg.per_second,dram__bytes.sum.peak_sustained,dram__bytes.sum.per_second,dram__cycles_elapsed.avg.per_second python Theoretical_FLOPS.py 8192
输出:
activations: 134217728
arithmetic_intensity: 36.87537777777778
flops_measured: 4147.841730822858 0.3124519199519205
bandwidth_measured: 6.075940035385045
total_cuda_time: 1.325402099609375
arithmetic_intensity_measured: 682.6666666666667
==PROF== Profiling "ampere_sgemm_128x128_tn" - 0: 0%....50%.
...100% - 3 passes
==PROF== Disconnected from process 266138
[266138] python3.10@127.0.0.1
ampere_sgemm_128x128_tn (64, 64, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
NVTX Push/Pop Stack for Thread 266138:
<default domain>
<0,kernel_prof>
RGB: 0xff
REGISTERED: kernel_prof
Section: Command line profiler metrics
--------------------------------------------------------------------- ------------- ------------
Metric Name Metric Unit Metric Value
--------------------------------------------------------------------- ------------- ------------
dram__bytes.sum.peak_sustained byte/cycle 48
dram__bytes.sum.per_second Gbyte/second 42.84
dram__cycles_elapsed.avg.per_second cycle/nsecond 7.29
sm__cycles_elapsed.avg.per_second cycle/nsecond 1.32
sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained inst/cycle 3,584
smsp__cycles_elapsed.avg.per_second cycle/nsecond 1.32
smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed inst/cycle 2,282.58
--------------------------------------------------------------------- ------------- ------------
6.计算Roofline
# 峰值性能与带宽
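PeakWork=sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained * 2 * sm__cycles_elapsed.avg.per_second = 3584 *2 * 1.32 inst/nsecond = 9.46 TFLOPS
# 注: ncu 默认会把 GPU 时钟锁定在基础频率(--clock-control base),所以这里的 SM 频率是 1.32 GHz,而不是 deviceQuery 报告的最高频率 1.852 GHz;若按 1.852 GHz 计算,3584 * 2 * 1.852 = 13.27 TFLOPS,与理论算力一致。这就是 PeakWork 与峰值算力对不上的原因。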
PeakTraffic=dram__bytes.sum.peak_sustained * dram__cycles_elapsed.avg.per_second = 48 * 7.29 byte/nsecond = 48 * 7.29 GB/s = 349.92 GB/s
# 实测性能与带宽
AchievedWork=smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed *2 * smsp__cycles_elapsed.avg.per_second = 2282.58 * 2 * 1.32 inst/nsecond = 6.02 TFLOPS
AchievedTraffic=dram__bytes.sum.per_second = 42.84 Gbyte/second
7.指标解释
与 sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained 相比,smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed
最主要的区别在于衡量的是每个时钟周期内的执行事件,而不是持续峰值。这两个指标从不同的角度描述了 GPU 在执行特定类型操作(如 FMA)时的性能:
可以逐一解释如下:
smsp: 表示该指标是在 SM 子分区(SM Sub-Partition)层面上测量的。在 NVIDIA 架构中,每个 SM(Streaming Multiprocessor,流多处理器)被划分为若干个子分区(处理块),每个子分区有自己的 warp 调度器和执行单元;前缀为 sm 的指标则是在整个 SM 层面上统计的。
sass: 代表 Shader Assembly,是 NVIDIA GPU 的底层指令集,指的是直接在硬件上执行的指令。
thread_inst_executed: 这表示执行在 GPU 线程中的指令的数量。
op_ffma: 表示融合乘加(Fused Multiply-Add)操作,这是一种同时执行乘法和加法的算数操作,对于浮点运算非常常见和重要。
pred_on: 这意味着这些统计数据仅包括那些在谓词(条件)为真时执行的指令。
sum: 指在一定的采集时间窗或一系列样本中,这一指标的累积总和。
per_cycle_elapsed: 这表示指标是以每个 GPU 时钟周期为单位来计测的。它提供了在每个时钟周期内执行的 FMA 操作的平均次数,通常用于衡量单位时间内的执行效率。
peak_sustained: 表示该计数器每个时钟周期理论上可持续达到的峰值速率。它是硬件属性,不是在观测期间实测到的值,因此与具体内核和问题规模无关。
平均 vs 峰值: per_cycle_elapsed 是实测的平均速率(计数器总和除以经过的时钟周期数),而 peak_sustained 是硬件每周期可持续达到的理论上限。
实际效率: per_cycle_elapsed 反映的是整个内核执行期间的平均执行效率,而不是某一瞬间的值;把它与 peak_sustained 相除,就得到内核对该硬件单元的利用率。
总的来说,smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed 更适合用来评估和优化 GPU 代码在单个时钟周期内的效率,而 sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained 更适合于评估在密集运算期间GPU的最大处理能力。两者结合使用可以提供一个更全面的 GPU 性能分析。
8.测试规模为1024时的性能
/usr/local/cuda/bin/ncu --nvtx --nvtx-include "kernel_prof/" --metrics sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained,smsp__cycles_elapsed.avg.per_second,smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed,sm__cycles_elapsed.avg.per_second,dram__bytes.sum.peak_sustained,dram__bytes.sum.per_second,dram__cycles_elapsed.avg.per_second python Theoretical_FLOPS.py 1024
输出
activations: 2097152
arithmetic_intensity: 36.87537777777778
flops_measured: 1470.6536158405618 0.1107825649274374
bandwidth_measured: 17.23422206063158
total_cuda_time: 0.007301119804382325
arithmetic_intensity_measured: 85.33333333333334
==PROF== Profiling "ampere_sgemm_128x64_tn" - 0: 0%....50%....100% - 3 passes
==PROF== Disconnected from process 267209
[267209] python3.10@127.0.0.1
ampere_sgemm_128x64_tn (8, 16, 3)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
NVTX Push/Pop Stack for Thread 267209:
<default domain>
<0,kernel_prof>
RGB: 0xff
REGISTERED: kernel_prof
Section: Command line profiler metrics
--------------------------------------------------------------------- ------------- ------------
Metric Name Metric Unit Metric Value
--------------------------------------------------------------------- ------------- ------------
dram__bytes.sum.peak_sustained byte/cycle 48
dram__bytes.sum.per_second Gbyte/second 100.81
dram__cycles_elapsed.avg.per_second cycle/nsecond 7.29
sm__cycles_elapsed.avg.per_second cycle/nsecond 1.32
sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained inst/cycle 3,584
smsp__cycles_elapsed.avg.per_second cycle/nsecond 1.32
smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed inst/cycle 2,057.05
--------------------------------------------------------------------- ------------- ------------
9.测试规模为128时的性能
/usr/local/cuda/bin/ncu --nvtx --nvtx-include "kernel_prof/" --metrics sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained,smsp__cycles_elapsed.avg.per_second,smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed,sm__cycles_elapsed.avg.per_second,dram__bytes.sum.peak_sustained,dram__bytes.sum.per_second,dram__cycles_elapsed.avg.per_second python Theoretical_FLOPS.py 128
输出
activations: 32768
arithmetic_intensity: 36.87537777777778
flops_measured: 3.9713012060185076 0.0002991533349276804
bandwidth_measured: 0.37230948806423503
total_cuda_time: 0.005280767917633057
arithmetic_intensity_measured: 10.666666666666668
==PROF== Profiling "ampere_sgemm_32x32_sliced1x4_tn" - 0: 0%....50%....100% - 3 passes
==PROF== Disconnected from process 267388
[267388] python3.10@127.0.0.1
ampere_sgemm_32x32_sliced1x4_tn (4, 4, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
NVTX Push/Pop Stack for Thread 267388:
<default domain>
<0,kernel_prof>
RGB: 0xff
REGISTERED: kernel_prof
Section: Command line profiler metrics
--------------------------------------------------------------------- ------------- ------------
Metric Name Metric Unit Metric Value
--------------------------------------------------------------------- ------------- ------------
dram__bytes.sum.peak_sustained byte/cycle 48
dram__bytes.sum.per_second Gbyte/second 17.21
dram__cycles_elapsed.avg.per_second cycle/nsecond 7.24
sm__cycles_elapsed.avg.per_second cycle/nsecond 1.31
sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained inst/cycle 3,584
smsp__cycles_elapsed.avg.per_second cycle/nsecond 1.31
smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed inst/cycle 185.00
--------------------------------------------------------------------- ------------- ------------
通过不同规模的测试发现,dram__bytes.sum.peak_sustained和sm__sass_thread_inst_executed_op_ffma_pred_on.sum.peak_sustained不随规模变化
10.RTX 3060基础能力测试
git clone https://www.github.com/nvidia/cuda-samples
cd cuda-samples/Samples/1_Utilities/deviceQuery
make clean && make
./deviceQuery
cd ../bandwidthTest/
make clean && make
./bandwidthTest
cd ../../4_CUDA_Libraries/batchCUBLAS/
make clean && make
./batchCUBLAS -m8192 -n8192 -k8192 --device=0
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "NVIDIA GeForce RTX 3060"
CUDA Driver Version / Runtime Version 12.2 / 12.1
CUDA Capability Major/Minor version number: 8.6
Total amount of global memory: 12044 MBytes (12629377024 bytes)
(028) Multiprocessors, (128) CUDA Cores/MP: 3584 CUDA Cores
GPU Max Clock rate: 1852 MHz (1.85 GHz)
Memory Clock rate: 7501 Mhz
Memory Bus Width: 192-bit
L2 Cache Size: 2359296 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total shared memory per multiprocessor: 102400 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device supports Managed Memory: Yes
Device supports Compute Preemption: Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 3 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 12.2, CUDA Runtime Version = 12.1, NumDevs = 1
Result = PASS
Running on...
Device 0: NVIDIA GeForce RTX 3060
Quick Mode
Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 12.0
Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 13.2
Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 326.3
Result = PASS
gpuDeviceInit() CUDA Device [0]: "Ampere
==== Running single kernels ====
Testing sgemm
#### args: ta=0 tb=0 m=8192 n=8192 k=8192 alpha = (0xbf800000, -1) beta= (0x40000000, 2)
#### args: lda=8192 ldb=8192 ldc=8192
^^^^ elapsed = 0.11065507 sec GFLOPS=9936.39
@@@@ sgemm test OK
==== Running N=10 without streams ====
Testing sgemm
#### args: ta=0 tb=0 m=8192 n=8192 k=8192 alpha = (0xbf800000, -1) beta= (0x00000000, 0)
#### args: lda=8192 ldb=8192 ldc=8192
^^^^ elapsed = 1.07158208 sec GFLOPS=10260.6
@@@@ sgemm test OK
==== Running N=10 with streams ====
Testing sgemm
#### args: ta=0 tb=0 m=8192 n=8192 k=8192 alpha = (0xbf800000, -1) beta= (0x00000000, 0)
#### args: lda=8192 ldb=8192 ldc=8192
^^^^ elapsed = 1.06336808 sec GFLOPS=10339.9
@@@@ sgemm test OK
==== Running N=10 batched ====
Testing sgemm
#### args: ta=0 tb=0 m=8192 n=8192 k=8192 alpha = (0x40000000, 2) beta= (0x40000000, 2)
#### args: lda=8192 ldb=8192 ldc=8192
^^^^ elapsed = 1.29616284 sec GFLOPS=8482.82
@@@@ sgemm test OK
FP32 理论算力
(028) Multiprocessors, (128) CUDA Cores/MP: 3584 CUDA Cores
GPU Max Clock rate: 1852 MHz (1.85 GHz)
3584*1.85*2=13.26TFLOPS
11.sm__inst_executed.avg.pct_of_peak_sustained_active
/usr/local/cuda/bin/ncu --nvtx --nvtx-include "kernel_prof/" --metrics sm__inst_executed.avg.pct_of_peak_sustained_active python Theoretical_FLOPS.py 8192
输出
ampere_sgemm_128x128_tn (64, 64, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
NVTX Push/Pop Stack for Thread 270100:
<default domain>
<0,kernel_prof>
RGB: 0xff
REGISTERED: kernel_prof
Section: Command line profiler metrics
-------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
-------------------------------------------------- ----------- ------------
sm__inst_executed.avg.pct_of_peak_sustained_active % 73.47
-------------------------------------------------- ----------- ------------
12.全面分析
/usr/local/cuda/bin/ncu --nvtx --nvtx-include "kernel_prof/" -f --set full --export roofline_report python Theoretical_FLOPS.py 8192
根据该内核的占用情况,理论上每个调度程序可以发出 4.00 个 warp,低于硬件最大值 12。该内核的理论占用率 (33.3%) 受到所需寄存器数量的限制。