MLIR学习--使用Polygeist对C代码进行Tiling优化,并用MLIR Pass替换内层循环为外部API
一.背景
- 1.使用Polygeist对C语言实现的矩阵乘,进行Tiling优化
- 2.编写一个MLIR的Pass对最里面的三层for循环进行替换,替换为一个外部API的调用
- 3.准备一个测试程序并实现该API,最后一起编译成一个可执行程序
- 4.备注:本实例只演示流程,并不能得到正确的计算结果(替换后的外部函数调用没有传入任何参数)
二.参考链接
三.操作步骤
1.创建容器
# Create (or reuse) the working directory that gets mounted into the container.
mkdir -p cgeist_demo
cd cgeist_demo
# Remove any container left over from a previous run; a missing container is not an error.
docker rm -f cgeist_demo 2>/dev/null || true
# Start a detached container with the current directory mounted at /home.
# "$PWD" is quoted so that paths containing spaces are passed as one argument.
docker run --gpus all --shm-size=32g -id -e NVIDIA_VISIBLE_DEVICES=all --privileged \
-v "$PWD":/home -w /home \
--name cgeist_demo --hostname=cgeist_demo ghcr.io/intel/llvm/ubuntu2204_build /bin/bash
# Open an interactive shell inside the container.
docker exec -ti cgeist_demo bash
2.编译polygeist
# Clone Polygeist with its submodules (the cmake step below expects
# llvm-project inside the checkout).
cd /home
git clone --recursive https://github.com/llvm/Polygeist.git
cd Polygeist
mkdir build
cd build
# Configure LLVM with clang, mlir and polly enabled and Polygeist attached as
# an external project; X86 and NVPTX targets, assertions on, Release build.
cmake -G Ninja ../llvm-project/llvm \
-DLLVM_ENABLE_PROJECTS="clang;mlir;polly" \
-DLLVM_EXTERNAL_PROJECTS="polygeist" \
-DLLVM_EXTERNAL_POLYGEIST_SOURCE_DIR=.. \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DCMAKE_BUILD_TYPE=Release
# Build everything, then run the check-polygeist-opt and check-cgeist targets.
ninja
ninja check-polygeist-opt && ninja check-cgeist
3.准备矩阵乘C代码
tee matmul.c<<-'EOF'
#define N 200
#define M 300
#define K 400
#define DATA_TYPE float
/*
 * Accumulate the matrix product A*B into C:
 *   C[i][j] += A[i][k] * B[k][j]   for every i < N, j < M, k < K.
 * A is N x K, B is K x M and C is N x M. C is added to, not overwritten,
 * so the caller decides its initial contents.
 */
void matmul(DATA_TYPE A[N][K], DATA_TYPE B[K][M], DATA_TYPE C[N][M]) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < M; j++) {
      for (int k = 0; k < K; k++) {
        C[i][j] += A[i][k] * B[k][j];
      }
    }
  }
}
EOF
4.使用Polygeist将C代码转换为MLIR(Affine Dialect)
# Translate the matmul function of matmul.c to MLIR with cgeist (raising scf
# loops to affine), pipe it through mlir-opt's canonicalize and cse passes,
# and store the result in output.mlir.
./bin/cgeist matmul.c -function=matmul -S --raise-scf-to-affine --polyhedral-opt | ./bin/mlir-opt --canonicalize --cse > output.mlir
# Print the generated MLIR.
cat output.mlir
输出
module attributes {...} {
func.func @matmul(%arg0: memref<?x400xf32>, %arg1: memref<?x300xf32>, %arg2: memref<?x300xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
affine.for %arg3 = 0 to 200 {
affine.for %arg4 = 0 to 300 {
affine.for %arg5 = 0 to 400 {
%0 = affine.load %arg0[%arg3, %arg5] : memref<?x400xf32>
%1 = affine.load %arg1[%arg5, %arg4] : memref<?x300xf32>
%2 = arith.mulf %0, %1 : f32
%3 = affine.load %arg2[%arg3, %arg4] : memref<?x300xf32>
%4 = arith.addf %3, %2 : f32
affine.store %4, %arg2[%arg3, %arg4] : memref<?x300xf32>
}
}
}
return
}
}
5.使用mlir-opt对MLIR进行Tiling优化
# Tile the three affine loops with a tile size of 32 each and write tiled.mlir.
./bin/mlir-opt --affine-loop-tile="tile-sizes=32,32,32" output.mlir -o tiled.mlir
# Print the tiled MLIR.
cat tiled.mlir
输出
#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 + 32, 200)>
#map2 = affine_map<(d0) -> (d0 + 32, 300)>
#map3 = affine_map<(d0) -> (d0 + 32, 400)>
module attributes {...} {
func.func @matmul(%arg0: memref<?x400xf32>, %arg1: memref<?x300xf32>, %arg2: memref<?x300xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
affine.for %arg3 = 0 to 200 step 32 {
affine.for %arg4 = 0 to 300 step 32 {
affine.for %arg5 = 0 to 400 step 32 {
affine.for %arg6 = #map(%arg3) to min #map1(%arg3) {
affine.for %arg7 = #map(%arg4) to min #map2(%arg4) {
affine.for %arg8 = #map(%arg5) to min #map3(%arg5) {
%0 = affine.load %arg0[%arg6, %arg8] : memref<?x400xf32>
%1 = affine.load %arg1[%arg8, %arg7] : memref<?x300xf32>
%2 = arith.mulf %0, %1 : f32
%3 = affine.load %arg2[%arg6, %arg7] : memref<?x300xf32>
%4 = arith.addf %3, %2 : f32
affine.store %4, %arg2[%arg6, %arg7] : memref<?x300xf32>
}
}
}
}
}
}
return
}
}
6.准备MLIR Pass,用来对最里面的三层affine.for进行替换
tee MyPass.cpp<<-'EOF'
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/InitAllDialects.h"
#include "mlir/InitAllPasses.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Tools/Plugins/DialectPlugin.h"
#include "mlir/InitAllPasses.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/IR/Builders.h"
#include "llvm/IR/Function.h"
#include "mlir/Dialect/Func/IR/FuncOps.h.inc"
#include <stdio.h>
using namespace mlir;
namespace {
/// Function pass that replaces every innermost three-deep `affine.for` nest
/// with a single call to an external function named `external_api_call`.
///
/// A nest qualifies when an `affine.for` has no `affine.for` directly in its
/// body and both its parent and its grandparent operations are `affine.for`.
/// The call is inserted immediately before the outermost loop of the nest
/// (the grandparent), which is then erased together with everything inside
/// it. `external_api_call` takes no arguments and returns nothing; a private
/// declaration is appended to the enclosing module when none exists yet.
///
/// NOTE(review): the declaration is added to the parent module from a pass
/// anchored on func.func. MLIR's pass rules restrict a pass to operations
/// nested under the one it runs on, so this is only safe while functions are
/// not processed concurrently — consider anchoring the pass on builtin.module.
struct ReplaceTripleAffineForPass
    : public PassWrapper<ReplaceTripleAffineForPass, OperationPass<func::FuncOp>> {
  void runOnOperation() override {
    func::FuncOp func = getOperation();
    OpBuilder builder(&getContext());

    // Declaration of the external function. Resolved lazily and cached so the
    // module is only touched when a nest is actually replaced, and the symbol
    // table is built at most once per function instead of once per match.
    func::FuncOp externalFunc;
    auto getOrDeclareExternalFunc = [&](Location loc) -> func::FuncOp {
      if (externalFunc)
        return externalFunc;
      ModuleOp module = func->getParentOfType<ModuleOp>();
      SymbolTable symbolTable(module);
      externalFunc = symbolTable.lookup<func::FuncOp>("external_api_call");
      if (!externalFunc) {
        // Not declared yet: add `func.func private @external_api_call()`.
        auto funcType =
            builder.getFunctionType(/*inputs=*/TypeRange{}, /*results=*/TypeRange{});
        externalFunc = func::FuncOp::create(loc, "external_api_call", funcType);
        externalFunc.setPrivate();
        module.push_back(externalFunc);
      }
      return externalFunc;
    };

    // Outermost loops of the matched nests. Erasure is deferred until the
    // walk is over so the traversal never visits freed operations.
    SmallVector<Operation *, 4> loopsToErase;

    func.walk([&](mlir::affine::AffineForOp forOp) {
      // Only innermost loops (no affine.for directly in the body) start a match.
      bool hasNestedLoop =
          llvm::any_of(forOp.getBody()->getOperations(), [](Operation &op) {
            return isa<mlir::affine::AffineForOp>(op);
          });
      if (hasNestedLoop)
        return;

      // The two enclosing operations must both be affine.for.
      auto parentForOp = dyn_cast<mlir::affine::AffineForOp>(forOp->getParentOp());
      if (!parentForOp)
        return;
      auto grandParentForOp =
          dyn_cast<mlir::affine::AffineForOp>(parentForOp->getParentOp());
      if (!grandParentForOp)
        return;

      Operation *nestRoot = grandParentForOp.getOperation();

      // A loop that contains several qualifying innermost loops must be
      // queued (and replaced by a call) only once; erasing it twice would
      // touch freed memory.
      if (llvm::is_contained(loopsToErase, nestRoot))
        return;

      // A loop whose results are still used cannot be erased safely.
      if (!nestRoot->use_empty())
        return;

      loopsToErase.push_back(nestRoot);

      // Emit the replacement call right before the nest that will be removed.
      builder.setInsertionPoint(nestRoot);
      builder.create<func::CallOp>(forOp.getLoc(),
                                   getOrDeclareExternalFunc(forOp.getLoc()),
                                   ValueRange{});
    });

    // A queued loop nested inside another queued loop is destroyed together
    // with that ancestor, so only the outermost queued loops are erased.
    // The selection is computed before anything is erased.
    SmallVector<Operation *, 4> rootsToErase;
    for (Operation *loopOp : loopsToErase) {
      bool coveredByAncestor = llvm::any_of(loopsToErase, [&](Operation *other) {
        return other->isProperAncestor(loopOp);
      });
      if (!coveredByAncestor)
        rootsToErase.push_back(loopOp);
    }
    for (Operation *loopOp : rootsToErase)
      loopOp->erase();
  }

  /// Name used to refer to the pass on the command line / in pass pipelines.
  mlir::StringRef getArgument() const final { return "replace-triple-affine-for"; }

  /// One-line description of the pass.
  mlir::StringRef getDescription() const final {
    return "Replace the innermost three nested affine.for loops with an external function call.";
  }
};
} // namespace
// Declare and define an explicit TypeID for the pass class.
MLIR_DECLARE_EXPLICIT_TYPE_ID(ReplaceTripleAffineForPass)
MLIR_DEFINE_EXPLICIT_TYPE_ID(ReplaceTripleAffineForPass)

/// Entry point exported by the plugin library. Returns the plugin API
/// version, the plugin name, the LLVM version string and a callback that
/// registers ReplaceTripleAffineForPass.
extern "C" LLVM_ATTRIBUTE_WEAK PassPluginLibraryInfo mlirGetPassPluginInfo() {
  auto registerPasses = []() {
    mlir::PassRegistration<ReplaceTripleAffineForPass>();
  };
  return {MLIR_PLUGIN_API_VERSION, "replace-triple-affine-for",
          LLVM_VERSION_STRING, registerPasses};
}
EOF
7.编译MLIR Pass动态库
# Build MyPass.cpp into the shared library libMyPass.so with the clang++ from
# the Polygeist build tree, using the LLVM/MLIR source headers, the generated
# MLIR headers under build/tools/mlir/include, and the libraries in build/lib.
# The flags printed by llvm-config are appended to the command line.
/home/Polygeist/build/bin/clang++ -ggdb -std=c++17 -shared -fPIC MyPass.cpp -o libMyPass.so \
-I /home/Polygeist/llvm-project/llvm/include/ \
-I /home/Polygeist/llvm-project/mlir/include/ \
-I /home/Polygeist/build/tools/mlir/include \
-L /home/Polygeist/build/lib \
-lMLIRFuncDialect \
-lMLIRIR \
-lMLIRPass \
-lMLIRTransforms \
-lLLVMCore \
-lLLVMSupport \
`/home/Polygeist/build/bin/llvm-config --cxxflags --ldflags --system-libs --libs core` \
-pthread
附:使用DRR(声明式重写规则)的方式尚未走通,尝试过程暂记录于此
tee MyCustomPatterns.td <<-'EOF'
// DRR experiment: match an AffineForOp and hand it to a native C++ rewrite
// helper. NOTE(review): the surrounding notes say this attempt does not work
// yet; it is kept for reference only.
include "mlir/Dialect/Affine/IR/AffineOps.td"
include "mlir/IR/PatternBase.td"
include "mlir/Dialect/Func/IR/FuncOps.td"
include "mlir/IR/BuiltinAttributes.td"
// Declare the custom rewrite function (expands to a call to the C++ function
// named applyAffineForLoopRewrite).
def ApplyAffineForLoopRewrite : NativeCodeCall<"applyAffineForLoopRewrite">;
// Define the pattern: the source side matches a single AffineForOp.
def ReplaceNestedLoopsWithCall : Pattern<
(AffineForOp $outerForOp),
// Result side: call the custom rewrite function; it must be wrapped in [].
[(ApplyAffineForLoopRewrite $outerForOp)]
>;
EOF
# Generate the C++ rewriters for MyCustomPatterns.td into MyCustomPatterns.inc.
./bin/mlir-tblgen -gen-rewriters -I /home/Polygeist/llvm-project/mlir/include/ MyCustomPatterns.td -o MyCustomPatterns.inc
8.使用mlir-opt加载上面的libMyPass.so,对MLIR进行重写
# Load libMyPass.so into mlir-opt and run replace-triple-affine-for on every
# func.func of tiled.mlir. The result is written to output.mlir, overwriting
# the file produced by the earlier cgeist step.
./bin/mlir-opt --load-pass-plugin=./libMyPass.so \
--pass-pipeline="builtin.module(func.func(replace-triple-affine-for))" ./tiled.mlir -o output.mlir
# Print the rewritten MLIR.
cat output.mlir
输出
module attributes {...} {
func.func @matmul(%arg0: memref<?x400xf32>, %arg1: memref<?x300xf32>, %arg2: memref<?x300xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
affine.for %arg3 = 0 to 200 step 32 {
affine.for %arg4 = 0 to 300 step 32 {
affine.for %arg5 = 0 to 400 step 32 {
func.call @external_api_call() : () -> ()
}
}
}
return
}
func.func private @external_api_call()
}
9.将MLIR转换为LLVM IR
# Lower the remaining affine, arith, scf and cf operations to the LLVM
# dialect and reconcile leftover unrealized casts; the result is fused.mlir.
./bin/mlir-opt --lower-affine --convert-arith-to-llvm --convert-scf-to-cf \
--convert-cf-to-llvm -convert-to-llvm \
-reconcile-unrealized-casts output.mlir -o fused.mlir
# Translate the LLVM-dialect module to textual LLVM IR.
./bin/mlir-translate --mlir-to-llvmir fused.mlir -o output.ll
# Print the generated LLVM IR.
cat output.ll
输出
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
declare ptr @malloc(i64)
declare void @free(ptr)
define void @matmul(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, ptr %14, ptr %15, i64 %16, i64 %17, i64 %18, i64 %19, i64 %20) {
br label %22
22: ; preds = %37, %21
%23 = phi i64 [ %38, %37 ], [ 0, %21 ]
%24 = icmp slt i64 %23, 200
br i1 %24, label %25, label %39
25: ; preds = %22
br label %26
26: ; preds = %35, %25
%27 = phi i64 [ %36, %35 ], [ 0, %25 ]
%28 = icmp slt i64 %27, 300
br i1 %28, label %29, label %37
29: ; preds = %26
br label %30
30: ; preds = %33, %29
%31 = phi i64 [ %34, %33 ], [ 0, %29 ]
%32 = icmp slt i64 %31, 400
br i1 %32, label %33, label %35
33: ; preds = %30
call void @external_api_call()
%34 = add i64 %31, 32
br label %30
35: ; preds = %30
%36 = add i64 %27, 32
br label %26
37: ; preds = %26
%38 = add i64 %23, 32
br label %22
39: ; preds = %22
ret void
}
declare void @external_api_call()
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"Debug Info Version", i32 3}
10.准备测试程序,实现external_api_call
tee main.c<<-'EOF'
#include <stdio.h>
#define N 200
#define M 300
#define K 400
#define DATA_TYPE float
extern void matmul(DATA_TYPE A[N][K], DATA_TYPE B[K][M], DATA_TYPE C[N][M]);
/* Stand-in for the external API: it only reports that it has been called. */
void external_api_call()
{
    fputs("call external_api_call\n", stdout);
}
/*
 * Test driver: allocates the three matrices and calls matmul once.
 * The matrices total about 1 MB. Static storage keeps them off the stack and
 * guarantees they are zero-initialised instead of holding indeterminate
 * values.
 */
int main(void)
{
    static DATA_TYPE A[N][K];
    static DATA_TYPE B[K][M];
    static DATA_TYPE C[N][M];
    matmul(A, B, C);
    return 0;
}
EOF
11.编译并执行
# Compile the driver together with the generated LLVM IR into the demo binary.
./bin/clang main.c output.ll -o demo
# Run it; external_api_call prints one line per invocation.
./demo
输出
call external_api_call
call external_api_call
....