LLVM 如何利用 Profile Guided Optimization (PGO)信息

最新推荐文章于 2024-08-03 10:17:49 发布

翔哥@LLVM

最新推荐文章于 2024-08-03 10:17:49 发布

阅读量1.5k

点赞数 5

分类专栏： LLVM 文章标签：开发语言

本文链接：https://blog.csdn.net/zhangxiang0503/article/details/127650332

版权

LLVM 专栏收录该内容

11 篇文章 1 订阅

订阅专栏

PGO的使用

让我们举个例子来说明
用例

  1 int XX = 0;
  2 int YY = 0;
  3
  4 int main(int argc, char* argv[]) {
  5   if (argc > 1) {
  6     XX += 2;    // 分支1
  7     YY *= XX;
  8   } else {
  9     XX -= 1;    // 分支2
 10     YY -= XX;
 11   }
 12   return 0;
 13 }

编译生成执行文件

$clang -O2 -fprofile-generate t.c

采集数据 1 （train）只跑1次分支1

生成 文件 default_15853201382406637699_0.profraw
$./a.out 2  // 注意这里只跑了一次带参数的case

生成profdata格式的文件
$llvm-profdata merge -output=code.profdata.1vs0 default_15853201382406637699_0.profraw

$file code.profdata.1vs0
code.profdata.1vs0: LLVM indexed profile data, version 8

采集数据 2 （train）跑2次分支1 + 跑4次分支2

删除旧的profraw数据
$rm default_15853201382406637699_0.profraw

重新生成 文件 default_15853201382406637699_0.profraw
$./a.out 2; ./a.out 2; ./a.out; ./a.out; ./a.out; ./a.out

生成profdata格式的文件
$llvm-profdata merge -output=code.profdata.2vs4 default_15853201382406637699_0.profraw

$file code.profdata.2vs4
code.profdata.2vs4: LLVM indexed profile data, version 8

好了，现在我们有两个profdata文件了（code.profdata.1vs0 + code.profdata.2vs4）
让我们来对比一下他们对PGO优化的影响。

PGO编译 （二次编译）

$clang -O2 t.c -fprofile-use=code.profdata.1vs0 -S -o t1vs0.s
$clang -O2 t.c -fprofile-use=code.profdata.2vs4 -S -o t2vs4.s

对比生成的汇编我们发现
在 t1vs0.s 中的分支1和分支2 在 t2vs4.s中调换了位置：
t1vs0.s vs t2vs4.s
这主要是因为t1vs0.s中的pgo数据显示分支1是热分支，所以优先fallthrough分支1，而t2vs4.s的正好相反。

IR中是如何体现PGO数据的

$clang -O2 t.c -fprofile-use=code.profdata.1vs0 -S -emit-llvm -o t1vs0.ll
$clang -O2 t.c -fprofile-use=code.profdata.2vs4 -S -emit-llvm -o t2vs4.ll

我们先列出其中一个IR文件的完整内容（t1vs0.ll）：

; ModuleID = 't.c'
source_filename = "t.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@XX = dso_local local_unnamed_addr global i32 0, align 4
@YY = dso_local local_unnamed_addr global i32 0, align 4

; Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn uwtable
define dso_local i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #0 !prof !34 {
entry:
  %cmp = icmp sgt i32 %argc, 1
  %0 = load i32, ptr @XX, align 4, !tbaa !35
  br i1 %cmp, label %if.then, label %if.else, !prof !39  ; PGO MD数据!

if.then:                                          ; preds = %entry
  %add = add nsw i32 %0, 2
  store i32 %add, ptr @XX, align 4, !tbaa !35
  %1 = load i32, ptr @YY, align 4, !tbaa !35
  %mul = mul nsw i32 %1, %add
  br label %if.end

if.else:                                          ; preds = %entry
  %sub = add nsw i32 %0, -1
  store i32 %sub, ptr @XX, align 4, !tbaa !35
  %2 = load i32, ptr @YY, align 4, !tbaa !35
  %sub1 = sub nsw i32 %2, %sub
  br label %if.end

if.end:                                           ; preds = %if.else, %if.then
  %storemerge = phi i32 [ %sub1, %if.else ], [ %mul, %if.then ]
  store i32 %storemerge, ptr @YY, align 4, !tbaa !35
  ret i32 0
}

attributes #0 = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!33}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 1, !"ProfileSummary", !5}
!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
!6 = !{!"ProfileFormat", !"InstrProf"}
!7 = !{!"TotalCount", i64 1}
!8 = !{!"MaxCount", i64 1}
!9 = !{!"MaxInternalCount", i64 0}
!10 = !{!"MaxFunctionCount", i64 1}
!11 = !{!"NumCounts", i64 2}
!12 = !{!"NumFunctions", i64 1}
!13 = !{!"IsPartialProfile", i64 0}
!14 = !{!"PartialProfileRatio", double 0.000000e+00}
!15 = !{!"DetailedSummary", !16}
!16 = !{!17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32}
!17 = !{i32 10000, i64 0, i32 0}
!18 = !{i32 100000, i64 0, i32 0}
!19 = !{i32 200000, i64 0, i32 0}
!20 = !{i32 300000, i64 0, i32 0}
!21 = !{i32 400000, i64 0, i32 0}
!22 = !{i32 500000, i64 0, i32 0}
!23 = !{i32 600000, i64 0, i32 0}
!24 = !{i32 700000, i64 0, i32 0}
!25 = !{i32 800000, i64 0, i32 0}
!26 = !{i32 900000, i64 0, i32 0}
!27 = !{i32 950000, i64 0, i32 0}
!28 = !{i32 990000, i64 0, i32 0}
!29 = !{i32 999000, i64 0, i32 0}
!30 = !{i32 999900, i64 0, i32 0}
!31 = !{i32 999990, i64 0, i32 0}
!32 = !{i32 999999, i64 0, i32 0}
!33 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git cb33ef7ca71d864b5fb8efbe59d77e895ba6e9a0)"}
!34 = !{!"function_entry_count", i64 1}
!35 = !{!36, !36, i64 0}
!36 = !{!"int", !37, i64 0}
!37 = !{!"omnipotent char", !38, i64 0}
!38 = !{!"Simple C/C++ TBAA"}
!39 = !{!"branch_weights", i32 1, i32 0}  ; 最大差异：t2vs4.ll 中为 {... i32 2, i32 4}

再来对比差异
t1vs0.ll vs t2vs4.ll:
可见他们最大的不同就是在IR “br i1 %cmp, label %if.then, label %if.else” 中所带的 “!prof !39” PGO MD数据不同。这影响了最终的layout区别。

LLVM是如何使用 PGO Metadata的
我们看到例子中PGO对br指令的标记（以 t2vs4.ll 为例）以 !39 = !{!"branch_weights", i32 2, i32 4} 的形式出现。
通过分析，我们发现分支1和分支2的对调是在 Branch Probability Basic Block Placement (block-placement) 这个pass中发生的。那么它一定读取了 !39 = !{!"branch_weights", …} 数据。

在block-placement中，一般将最大概率边作为fallthrough分支（可以参考一个比较直观的函数MachineBlockPlacement::selectBestSuccessor）

// lib/CodeGen/MachineBlockPlacement.cpp
1580 MachineBlockPlacement::selectBestSuccessor( ) {
       ...
1618   for (MachineBasicBlock *Succ : Successors) {
         // 通过获取边的概率来选择最佳后继BB（fallthrough）
1619     auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1620     BranchProbability SuccProb = getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1640     if (BestSucc.BB && BestProb >= SuccProb) { continue;}
         ...
1646     BestSucc.BB = Succ;
1647     BestProb = SuccProb;
 ...}

PGO下分支（边）的概率（BranchProbability）是如何计算的？
PGO下分支的BranchProbability 是在函数 BranchProbabilityInfo::calcMetadataWeights中计算出来的：

 // lib/Analysis/BranchProbabilityInfo.cpp
 379 bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
       // 找到分支跳转 br i1 %cmp, label %if.then, label %if.else, !prof !39
 380   const Instruction *TI = BB->getTerminator();
       // 获取!39 = !{!"branch_weights", i32 2, i32 4}
 386   MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
       // 将 branch_weights 信息提取到数组 Weights[2] = {2,4}
 406   extractBranchWeights(*TI, Weights);
       // 计算总 weight = 2+4 = 6
 407   for (I < Weights.size()) { WeightSum += Weights[I]; ...}
      // 计算分支概率(数组)BP
 443   for (I < Weights.size()) { BP[I] = BranchProbability(Weights[I], WeightSum);}
      // 为该BB的每条后继边赋上对应的分支概率。
 513   setEdgeProbability(BB, BP);
 }