PGO的使用
让我们举个例子来说明
用例
1 int XX = 0;
2 int YY = 0;
3
4 int main(int argc, char* argv[]) {
5 if (argc > 1) {
6 XX += 2; // 分支1
7 YY *= XX;
8 } else {
9 XX -= 1; // 分支2
10 YY -= XX;
11 }
12 return 0;
13 }
编译 生成可执行文件
$clang -O2 -fprofile-generate t.c
采集数据 1 (train) 只跑1次 分支1
生成 文件 default_15853201382406637699_0.profraw
$./a.out 2 // 注意这里只跑了一次带参数的case
生成profdata格式的文件
$llvm-profdata merge -output=code.profdata.1vs0 default_15853201382406637699_0.profraw
$file code.profdata.1vs0
code.profdata.1vs0: LLVM indexed profile data, version 8
采集数据 2 (train) 跑2次 分支1 + 跑4次 分支2
删除旧的profraw数据
$rm default_15853201382406637699_0.profraw
重新生成 文件 default_15853201382406637699_0.profraw
$./a.out 2; ./a.out 2; ./a.out; ./a.out;./a.out; ./a.out
生成profdata格式的文件
$llvm-profdata merge -output=code.profdata.2vs4 default_15853201382406637699_0.profraw
$file code.profdata.2vs4
code.profdata.2vs4: LLVM indexed profile data, version 8
好了,现在我们有两个profdata文件了(code.profdata.1vs0 + code.profdata.2vs4)
让我们来对比一下它们对PGO优化的影响。
PGO编译 (二次编译)
$clang -O2 t.c -fprofile-use=code.profdata.1vs0 -S -o t1vs0.s
$clang -O2 t.c -fprofile-use=code.profdata.2vs4 -S -o t2vs4.s
对比生成的汇编我们发现
在 t1vs0.s 中的分支1和分支2 在 t2vs4.s中调换了位置:
这主要是因为生成 t1vs0.s 时所用的PGO数据(code.profdata.1vs0)显示分支1是热分支,所以优先把分支1作为fallthrough分支;而 t2vs4.s 的情况正好相反(分支2更热)。
IR中是如何体现PGO数据的
$clang -O2 t.c -fprofile-use=code.profdata.1vs0 -S -emit-llvm -o t1vs0.ll
$clang -O2 t.c -fprofile-use=code.profdata.2vs4 -S -emit-llvm -o t2vs4.ll
我们先列出其中一个IR文件的完整内容 (t1vs0.ll):
; ModuleID = 't.c'
source_filename = "t.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@XX = dso_local local_unnamed_addr global i32 0, align 4
@YY = dso_local local_unnamed_addr global i32 0, align 4
; Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn uwtable
define dso_local i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #0 !prof !34 {
entry:
%cmp = icmp sgt i32 %argc, 1
%0 = load i32, ptr @XX, align 4, !tbaa !35
br i1 %cmp, label %if.then, label %if.else, !prof !39 ; PGO MD数据!
if.then: ; preds = %entry
%add = add nsw i32 %0, 2
store i32 %add, ptr @XX, align 4, !tbaa !35
%1 = load i32, ptr @YY, align 4, !tbaa !35
%mul = mul nsw i32 %1, %add
br label %if.end
if.else: ; preds = %entry
%sub = add nsw i32 %0, -1
store i32 %sub, ptr @XX, align 4, !tbaa !35
%2 = load i32, ptr @YY, align 4, !tbaa !35
%sub1 = sub nsw i32 %2, %sub
br label %if.end
if.end: ; preds = %if.else, %if.then
%storemerge = phi i32 [ %sub1, %if.else ], [ %mul, %if.then ]
store i32 %storemerge, ptr @YY, align 4, !tbaa !35
ret i32 0
}
attributes #0 = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!33}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 1, !"ProfileSummary", !5}
!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
!6 = !{!"ProfileFormat", !"InstrProf"}
!7 = !{!"TotalCount", i64 1}
!8 = !{!"MaxCount", i64 1}
!9 = !{!"MaxInternalCount", i64 0}
!10 = !{!"MaxFunctionCount", i64 1}
!11 = !{!"NumCounts", i64 2}
!12 = !{!"NumFunctions", i64 1}
!13 = !{!"IsPartialProfile", i64 0}
!14 = !{!"PartialProfileRatio", double 0.000000e+00}
!15 = !{!"DetailedSummary", !16}
!16 = !{!17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32}
!17 = !{i32 10000, i64 0, i32 0}
!18 = !{i32 100000, i64 0, i32 0}
!19 = !{i32 200000, i64 0, i32 0}
!20 = !{i32 300000, i64 0, i32 0}
!21 = !{i32 400000, i64 0, i32 0}
!22 = !{i32 500000, i64 0, i32 0}
!23 = !{i32 600000, i64 0, i32 0}
!24 = !{i32 700000, i64 0, i32 0}
!25 = !{i32 800000, i64 0, i32 0}
!26 = !{i32 900000, i64 0, i32 0}
!27 = !{i32 950000, i64 0, i32 0}
!28 = !{i32 990000, i64 0, i32 0}
!29 = !{i32 999000, i64 0, i32 0}
!30 = !{i32 999900, i64 0, i32 0}
!31 = !{i32 999990, i64 0, i32 0}
!32 = !{i32 999999, i64 0, i32 0}
!33 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git cb33ef7ca71d864b5fb8efbe59d77e895ba6e9a0)"}
!34 = !{!"function_entry_count", i64 1}
!35 = !{!36, !36, i64 0}
!36 = !{!"int", !37, i64 0}
!37 = !{!"omnipotent char", !38, i64 0}
!38 = !{!"Simple C/C++ TBAA"}
!39 = !{!"branch_weights", i32 1, i32 0} ; 最大差异:t2vs4.ll 中为 !{!"branch_weights", i32 2, i32 4}
再来对比差异
可见它们最大的不同就是IR “br i1 %cmp, label %if.then, label %if.else” 所带的 “!prof !39” PGO MD数据不同,这导致了最终layout的区别。
LLVM是如何使用 PGO Metadata的
我们看到例子中的PGO对br指令的标记以 !39 = !{!"branch_weights", i32 2, i32 4} 的形式出现(以 t2vs4.ll 为例)。
通过分析,我们发现 分支1和2的对调是在 Branch Probability Basic Block Placement (block-placement)这个pass中发生的。那么它一定读取了 !39 = !{!"branch_weights", ...} 数据。
在block-placement中,一般将 最大概率边 作为fallthrough分支(可以参考一个比较直观的函数MachineBlockPlacement::selectBestSuccessor)
// lib/CodeGen/MachineBlockPlacement.cpp
1580 MachineBlockPlacement::selectBestSuccessor( ) {
...
1618 for (MachineBasicBlock *Succ : Successors) {
// 通过获取边的概率来选择最佳后继BB(fallthrough)
1619 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1620 BranchProbability SuccProb = getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1640 if (BestSucc.BB && BestProb >= SuccProb) { continue;}
...
1646 BestSucc.BB = Succ;
1647 BestProb = SuccProb;
...}
PGO下分支(边)的概率(BranchProbability)是如何计算的?
PGO下分支的BranchProbability 是 在函数 BranchProbabilityInfo::calcMetadataWeights中计算出来的:
// lib/Analysis/BranchProbabilityInfo.cpp
379 bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
// 找到分支跳转 br i1 %cmp, label %if.then, label %if.else, !prof !39
380 const Instruction *TI = BB->getTerminator();
// 获取!39 = !{!"branch_weights", i32 2, i32 4}
386 MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
// 将branch_weights信息提取到数组 Weights[2] = {2,4}
406 extractBranchWeights(*TI, Weights);
// 计算总 weight = 2+4 = 6
407 for (I < Weights.size()) { WeightSum += Weights[I]; ...}
// 计算分支概率(数组)BP
443 for (I < Weights.size()) { BP[I] = BranchProbability(Weights[I], WeightSum);}
// 为该BB的每条后继边 赋 上对应的分支概率。
513 setEdgeProbability(BB, BP);
}