MLIR学习--使用Polygeist对C代码进行Tiling优化,并用MLIR Pass替换内层循环为外部API-CSDN博客

本文链接：https://blog.csdn.net/m0_61864577/article/details/144988244

MLIR学习--使用Polygeist对C代码进行Tiling优化,并用MLIR Pass替换内层循环为外部API

一.背景
二.参考链接
三.操作步骤

一.背景

1.使用Polygeist对C语言实现的矩阵乘,进行Tiling优化
2.编写一个MLIR的Pass对最里面的三层for循环进行替换,替换为一个外部API的调用
3.准备一个测试程序并实现该API,最后一起编译成一个可执行程序
4.备注:本实例只演示流程,并不能得到正确的计算结果(替换后的外部函数调用没有传入任何参数)

二.参考链接

三.操作步骤

1.创建容器

# Create a working directory on the host; it is bind-mounted into the container below.
mkdir cgeist_demo
cd cgeist_demo
# Stop and remove any container of the same name left over from a previous run.
# On a first run no such container exists, so these two commands are expected
# to only report an error.
docker stop cgeist_demo
docker rm cgeist_demo
# Start a detached container from the Intel LLVM build image, with the current
# host directory mounted at /home, which is also the container's working directory.
docker run --gpus all --shm-size=32g -id -e NVIDIA_VISIBLE_DEVICES=all --privileged \
        -v $PWD:/home -w /home \
        --name cgeist_demo --hostname=cgeist_demo ghcr.io/intel/llvm/ubuntu2204_build /bin/bash
# Open an interactive shell inside the running container.
docker exec -ti cgeist_demo bash

2.编译polygeist

# Fetch Polygeist together with its submodules (the cmake step below uses the
# bundled llvm-project checkout).
cd /home
git clone --recursive https://github.com/llvm/Polygeist.git
cd Polygeist
mkdir build
cd build
# Configure the bundled LLVM with clang, mlir and polly enabled and Polygeist
# attached as an external project whose sources live one directory up.
cmake -G Ninja ../llvm-project/llvm \
  -DLLVM_ENABLE_PROJECTS="clang;mlir;polly" \
  -DLLVM_EXTERNAL_PROJECTS="polygeist" \
  -DLLVM_EXTERNAL_POLYGEIST_SOURCE_DIR=.. \
  -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
  -DLLVM_ENABLE_ASSERTIONS=ON \
  -DCMAKE_BUILD_TYPE=Release
# Build everything, then run the Polygeist test targets.
ninja
ninja check-polygeist-opt && ninja check-cgeist

3.准备矩阵乘C代码

tee matmul.c<<-'EOF'    
#define N 200
#define M 300
#define K 400
#define DATA_TYPE float

/*
 * Dense matrix multiply-accumulate: C[i][j] += sum over k of A[i][k] * B[k][j].
 *
 * A is N x K, B is K x M and C is N x M. C is accumulated into rather than
 * overwritten, so the caller must initialize it (e.g. to zero) beforehand.
 */
void matmul(DATA_TYPE A[N][K], DATA_TYPE B[K][M], DATA_TYPE C[N][M]) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < M; j++) {
      for (int k = 0; k < K; k++) {
        C[i][j] += A[i][k] * B[k][j];
      }
    }
  }
}
EOF

4.使用Polygeist将C代码转换为MLIR(Affine Dialect)

# Run cgeist on the `matmul` function of matmul.c with the raise-to-affine and
# polyhedral-opt options, clean the result up with mlir-opt (canonicalize + CSE),
# and store it in output.mlir.
./bin/cgeist matmul.c -function=matmul -S --raise-scf-to-affine --polyhedral-opt | ./bin/mlir-opt --canonicalize --cse > output.mlir
cat output.mlir

输出

module attributes {...} {
  func.func @matmul(%arg0: memref<?x400xf32>, %arg1: memref<?x300xf32>, %arg2: memref<?x300xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
    affine.for %arg3 = 0 to 200 {
      affine.for %arg4 = 0 to 300 {
        affine.for %arg5 = 0 to 400 {
          %0 = affine.load %arg0[%arg3, %arg5] : memref<?x400xf32>
          %1 = affine.load %arg1[%arg5, %arg4] : memref<?x300xf32>
          %2 = arith.mulf %0, %1 : f32
          %3 = affine.load %arg2[%arg3, %arg4] : memref<?x300xf32>
          %4 = arith.addf %3, %2 : f32
          affine.store %4, %arg2[%arg3, %arg4] : memref<?x300xf32>
        }
      }
    }
    return
  }
}

5.使用mlir-opt对MLIR进行Tiling优化

# Tile the affine loop nest with tile sizes 32,32,32 and write the result to tiled.mlir.
./bin/mlir-opt --affine-loop-tile="tile-sizes=32,32,32" output.mlir -o tiled.mlir
cat tiled.mlir

输出

#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 + 32, 200)>
#map2 = affine_map<(d0) -> (d0 + 32, 300)>
#map3 = affine_map<(d0) -> (d0 + 32, 400)>
module attributes {...} {
  func.func @matmul(%arg0: memref<?x400xf32>, %arg1: memref<?x300xf32>, %arg2: memref<?x300xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
    affine.for %arg3 = 0 to 200 step 32 {
      affine.for %arg4 = 0 to 300 step 32 {
        affine.for %arg5 = 0 to 400 step 32 {
          affine.for %arg6 = #map(%arg3) to min #map1(%arg3) {
            affine.for %arg7 = #map(%arg4) to min #map2(%arg4) {
              affine.for %arg8 = #map(%arg5) to min #map3(%arg5) {
                %0 = affine.load %arg0[%arg6, %arg8] : memref<?x400xf32>
                %1 = affine.load %arg1[%arg8, %arg7] : memref<?x300xf32>
                %2 = arith.mulf %0, %1 : f32
                %3 = affine.load %arg2[%arg6, %arg7] : memref<?x300xf32>
                %4 = arith.addf %3, %2 : f32
                affine.store %4, %arg2[%arg6, %arg7] : memref<?x300xf32>
              }
            }
          }
        }
      }
    }
    return
  }
}

6.准备MLIR Pass,用来对最里面的三层affine.for进行替换

tee MyPass.cpp<<-'EOF'  
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/InitAllDialects.h"
#include "mlir/InitAllPasses.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Tools/Plugins/DialectPlugin.h"
#include "mlir/InitAllPasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/IR/Builders.h"
#include "llvm/IR/Function.h"
#include "mlir/Dialect/Func/IR/FuncOps.h.inc"
#include <stdio.h>

using namespace mlir;
namespace {
/// Function pass that replaces innermost three-deep `affine.for` nests with a
/// call to the external function `external_api_call`.
///
/// A nest is recognized purely structurally: an `affine.for` whose body holds
/// no other `affine.for`, whose parent op is an `affine.for`, and whose
/// grandparent op is also an `affine.for`. A zero-argument, zero-result
/// `func.call` is inserted right before the grandparent loop, and the
/// grandparent loop (with everything inside it) is erased afterwards. The
/// callee is declared as a private function in the enclosing module the first
/// time it is needed.
///
/// NOTE(review): the pass is anchored on func.func but appends the callee
/// declaration to the parent module. MLIR's pass documentation restricts
/// operation passes from modifying ops outside the one they run on, so this
/// should probably become a module pass -- confirm before using it outside
/// this demo.
/// NOTE(review): candidate nests that contain one another (irregular nesting)
/// are not handled specially; the demo input is a single perfect nest.
struct ReplaceTripleAffineForPass
    : public PassWrapper<ReplaceTripleAffineForPass, OperationPass<func::FuncOp>> {
  void runOnOperation() override {
    static constexpr const char kExternalApiName[] = "external_api_call";

    func::FuncOp func = getOperation();
    OpBuilder builder(&getContext());

    // The callee declaration lives in the enclosing module; without one there
    // is nowhere to put it.
    ModuleOp module = func->getParentOfType<ModuleOp>();
    if (!module) {
      func.emitError("expected the function to be nested in a builtin.module");
      signalPassFailure();
      return;
    }

    // Outermost loops of the nests that were replaced. They are erased only
    // after the walk has finished, so the traversal never visits a freed op.
    SmallVector<Operation *, 4> loopsToErase;

    // Callee declaration, looked up or created on the first match and reused
    // for every later one.
    func::FuncOp externalFunc;

    func.walk([&](mlir::affine::AffineForOp forOp) {
      // Only innermost loops (no affine.for directly in the body) start a match.
      bool containsLoop =
          llvm::any_of(forOp.getBody()->getOperations(), [](Operation &op) {
            return isa<mlir::affine::AffineForOp>(op);
          });
      if (containsLoop)
        return;

      // The direct parent must be an affine.for ...
      auto parentForOp =
          dyn_cast<mlir::affine::AffineForOp>(forOp->getParentOp());
      if (!parentForOp)
        return;

      // ... and so must the grandparent.
      auto grandParentForOp =
          dyn_cast<mlir::affine::AffineForOp>(parentForOp->getParentOp());
      if (!grandParentForOp)
        return;

      Operation *nestRoot = grandParentForOp.getOperation();

      // Several innermost loops can share one grandparent. Handle each root
      // once; otherwise it would be erased twice and get duplicate calls.
      if (llvm::is_contained(loopsToErase, nestRoot))
        return;

      // The replacement call produces no values, so a loop whose results are
      // still used cannot be dropped.
      if (!nestRoot->use_empty())
        return;

      loopsToErase.push_back(nestRoot);

      // Find an existing declaration of the callee, or declare it as a
      // private function taking and returning nothing.
      if (!externalFunc) {
        SymbolTable symbolTable(module);
        externalFunc = symbolTable.lookup<func::FuncOp>(kExternalApiName);
      }
      if (!externalFunc) {
        auto funcType = builder.getFunctionType(/*inputs=*/TypeRange{},
                                                /*results=*/TypeRange{});
        externalFunc =
            func::FuncOp::create(forOp.getLoc(), kExternalApiName, funcType);
        externalFunc.setPrivate();
        module.push_back(externalFunc);
      }

      // Emit the call right before the nest that is going away.
      builder.setInsertionPoint(grandParentForOp);
      builder.create<func::CallOp>(forOp.getLoc(), externalFunc, ValueRange{});
    });

    // Drop the replaced nests now that the traversal is over.
    for (Operation *loopOp : loopsToErase) {
      loopOp->erase();
    }
  }

  /// Name used to select this pass on the command line / in a pass pipeline.
  mlir::StringRef getArgument() const final { return "replace-triple-affine-for"; }

  /// One-line description of the pass.
  mlir::StringRef getDescription() const final {
    return "Replace the innermost three nested affine.for loops with an external function call.";
  }
};
} // namespace

// Declare and define an explicit MLIR TypeID for the pass class.
// NOTE(review): presumably needed so that TypeID resolution for the pass works
// reliably when it lives in a dynamically loaded plugin library -- confirm.
MLIR_DECLARE_EXPLICIT_TYPE_ID(ReplaceTripleAffineForPass)
MLIR_DEFINE_EXPLICIT_TYPE_ID(ReplaceTripleAffineForPass)

/// Pass-plugin entry point: returns the plugin API version, the plugin name,
/// the LLVM version string, and a callback that registers
/// ReplaceTripleAffineForPass.
extern "C" LLVM_ATTRIBUTE_WEAK PassPluginLibraryInfo mlirGetPassPluginInfo() {
  // Captureless lambda; it converts to the plain function pointer stored in
  // the returned struct.
  auto registerPasses = []() {
    mlir::PassRegistration<ReplaceTripleAffineForPass>();
  };
  return {MLIR_PLUGIN_API_VERSION, "replace-triple-affine-for",
          LLVM_VERSION_STRING, registerPasses};
}
EOF

7.编译MLIR Pass动态库

# Compile MyPass.cpp into the shared library libMyPass.so using the clang++ that
# was just built, with the LLVM/MLIR source and generated include directories on
# the include path and the required MLIR/LLVM libraries linked in. Extra flags
# are taken from llvm-config.
/home/Polygeist/build/bin/clang++ -ggdb -std=c++17 -shared -fPIC MyPass.cpp -o libMyPass.so \
  -I /home/Polygeist/llvm-project/llvm/include/ \
  -I /home/Polygeist/llvm-project/mlir/include/ \
  -I /home/Polygeist/build/tools/mlir/include \
  -L /home/Polygeist/build/lib \
  -lMLIRFuncDialect \
  -lMLIRIR \
  -lMLIRPass \
  -lMLIRTransforms \
  -lLLVMCore \
  -lLLVMSupport \
  `/home/Polygeist/build/bin/llvm-config --cxxflags --ldflags --system-libs --libs core` \
  -pthread

附:基于DRR(TableGen声明式重写规则)的实现方式还没走通,以下仅作记录,后续步骤并不依赖这部分内容

tee MyCustomPatterns.td <<-'EOF'    
include "mlir/Dialect/Affine/IR/AffineOps.td"
include "mlir/IR/PatternBase.td"
include "mlir/Dialect/Func/IR/FuncOps.td"
include "mlir/IR/BuiltinAttributes.td"

// 声明自定义的重写函数
def ApplyAffineForLoopRewrite : NativeCodeCall<"applyAffineForLoopRewrite">;

// 定义模式，匹配一个AffineForOp操作
def ReplaceNestedLoopsWithCall : Pattern<
  (AffineForOp $outerForOp),
  // 重写部分，调用自定义的重写函数，需要用[]包裹
  [(ApplyAffineForLoopRewrite $outerForOp)]
>;
EOF
# Ask mlir-tblgen to generate rewriter code from the pattern file into MyCustomPatterns.inc.
./bin/mlir-tblgen -gen-rewriters -I /home/Polygeist/llvm-project/mlir/include/ MyCustomPatterns.td -o MyCustomPatterns.inc

8.使用mlir-opt加载上面的libMyPass.so,对MLIR进行重写

# Load the plugin and run the pass on every func.func of tiled.mlir.
# Note: the result is written to output.mlir, overwriting the file of the same
# name produced by the cgeist step.
./bin/mlir-opt --load-pass-plugin=./libMyPass.so \
			--pass-pipeline="builtin.module(func.func(replace-triple-affine-for))" ./tiled.mlir -o output.mlir
cat output.mlir

输出

cat output.mlir
module attributes {...} {
  func.func @matmul(%arg0: memref<?x400xf32>, %arg1: memref<?x300xf32>, %arg2: memref<?x300xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
    affine.for %arg3 = 0 to 200 step 32 {
      affine.for %arg4 = 0 to 300 step 32 {
        affine.for %arg5 = 0 to 400 step 32 {
          func.call @external_api_call() : () -> ()
        }
      }
    }
    return
  }
  func.func private @external_api_call()
}

9.将MLIR转换为LLVM IR

# Run the lowering passes (affine, arith, scf -> cf, cf -> llvm, generic
# convert-to-llvm, cast reconciliation) and store the result in fused.mlir.
./bin/mlir-opt --lower-affine --convert-arith-to-llvm --convert-scf-to-cf \
				--convert-cf-to-llvm -convert-to-llvm \
				-reconcile-unrealized-casts output.mlir -o fused.mlir
# Translate the lowered MLIR module to textual LLVM IR.
./bin/mlir-translate --mlir-to-llvmir fused.mlir -o output.ll
cat output.ll

输出

; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
declare ptr @malloc(i64)
declare void @free(ptr)
define void @matmul(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, ptr %14, ptr %15, i64 %16, i64 %17, i64 %18, i64 %19, i64 %20) {
  br label %22
22:                                               ; preds = %37, %21
  %23 = phi i64 [ %38, %37 ], [ 0, %21 ]
  %24 = icmp slt i64 %23, 200
  br i1 %24, label %25, label %39
25:                                               ; preds = %22
  br label %26
26:                                               ; preds = %35, %25
  %27 = phi i64 [ %36, %35 ], [ 0, %25 ]
  %28 = icmp slt i64 %27, 300
  br i1 %28, label %29, label %37
29:                                               ; preds = %26
  br label %30
30:                                               ; preds = %33, %29
  %31 = phi i64 [ %34, %33 ], [ 0, %29 ]
  %32 = icmp slt i64 %31, 400
  br i1 %32, label %33, label %35
33:                                               ; preds = %30
  call void @external_api_call()
  %34 = add i64 %31, 32
  br label %30
35:                                               ; preds = %30
  %36 = add i64 %27, 32
  br label %26
37:                                               ; preds = %26
  %38 = add i64 %23, 32
  br label %22
39:                                               ; preds = %22
  ret void
}
declare void @external_api_call()
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"Debug Info Version", i32 3}

10.准备测试程序,实现external_api_call

tee main.c<<-'EOF'  
#include <stdio.h>
#define N 200
#define M 300
#define K 400
#define DATA_TYPE float

extern void matmul(DATA_TYPE A[N][K], DATA_TYPE B[K][M], DATA_TYPE C[N][M]);
/* Stand-in implementation of the external API: prints one line to stdout
 * each time it is called. */
void external_api_call()
{
  fputs("call external_api_call\n", stdout);
}

/* Test driver: calls matmul once on three zero-filled matrices. */
int main(void)
{
  /* Static storage keeps roughly 1 MB of matrices off the stack and
   * guarantees they are zero-initialized instead of indeterminate. */
  static DATA_TYPE A[N][K];
  static DATA_TYPE B[K][M];
  static DATA_TYPE C[N][M];

  matmul(A, B, C);
  return 0;
}
EOF

11.编译并执行

# Compile the C driver together with the generated LLVM IR into one executable, then run it.
./bin/clang main.c output.ll  -o demo
./demo

输出

call external_api_call
call external_api_call
....