使用调度模板与 AutoTVM、AutoSchedule 优化算子（Operator）

最新推荐文章于 2023-03-20 11:54:54 发布

pkapkaever

最新推荐文章于 2023-03-20 11:54:54 发布

阅读量414

点赞数

文章标签： python

本文链接：https://blog.csdn.net/pkapkaever/article/details/121218748

版权

本文介绍了如何使用AutoTVM和AutoSchedule进行计算图的自动调优。首先展示了AutoTVM的模板调优，通过定义搜索空间并使用RandomTuner寻找最佳配置。然后，探讨了AutoSchedule的自动调度器，它无需手动模板，能自动生成并搜索优化调度。两种方法都在矩阵乘法的例子中进行了应用，并验证了优化后的性能。

摘要由CSDN通过智能技术生成

首先要明白为什么需要自动调优。上一篇中，我们手动把分块（tile）大小定为 bn = 32；但硬件后端多种多样，32 未必在每种硬件上都能得到最好的结果。AutoTVM 和 AutoSchedule（即 TVM 中的 auto_scheduler）可以帮我们在搜索空间中逐个尝试候选配置，找出其中表现最好的一个。（不过请大家想一想：搜索空间本身是有局限的——如果真正的最优配置不在搜索空间里，就不可能被搜到。）
使用AutoTVM进行调优

import logging
import sys

import numpy as np
import tvm
from tvm import te
from tvm.autotvm.task import Task
import tvm.testing

# the module is called `autotvm`
from tvm import autotvm
@autotvm.template("tutorial/matmul")
def matmul_basic(N, L, M, dtype):
    """Declare a matmul and schedule it with a fixed 8x8 tiling.

    Computes ``C[i, j] = sum over k of A[i, k] * B[k, j]`` where ``A`` has
    shape ``(N, L)`` and ``B`` has shape ``(L, M)``. Both output axes are
    split by a hard-coded factor of 8, and the loop nest is reordered so
    that the two inner tile axes are innermost.

    Args:
        N: Number of rows of ``A`` and ``C``.
        L: Length of the reduction dimension.
        M: Number of columns of ``B`` and ``C``.
        dtype: Element type of the input placeholders.

    Returns:
        A tuple ``(schedule, [A, B, C])``.
    """
    # Computation declaration.
    lhs = te.placeholder((N, L), name="A", dtype=dtype)
    rhs = te.placeholder((L, M), name="B", dtype=dtype)
    k_axis = te.reduce_axis((0, L), name="k")
    out = te.compute(
        (N, M),
        lambda i, j: te.sum(lhs[i, k_axis] * rhs[k_axis, j], axis=k_axis),
        name="C",
    )

    # Schedule: tile both output axes by 8 and hoist the reduction loop
    # above the inner tile loops.
    sched = te.create_schedule(out.op)
    stage = sched[out]
    row, col = stage.op.axis
    red = stage.op.reduce_axis[0]

    row_outer, row_inner = stage.split(row, 8)
    col_outer, col_inner = stage.split(col, 8)
    stage.reorder(row_outer, col_outer, red, row_inner, col_inner)

    return sched, [lhs, rhs, out]


# Matrix multiplication template whose tiling is tuned by AutoTVM
@autotvm.template("tutorial/matmul_v1")  # 1. use a decorator
def matmul_v1(N, L, M, dtype):
    """Declare a matmul whose tile sizes are chosen from an AutoTVM config.

    The computation is the same ``(N, L) x (L, M)`` product as a fixed
    schedule would use, but the tile sizes are read from the AutoTVM config
    object instead of being hard-coded: ``tile_x`` is a knob over
    ``[1, 2, 4, 8, 16]`` and ``tile_y`` is a two-way split of the y axis.

    Args:
        N: Number of rows of ``A`` and ``C``.
        L: Length of the reduction dimension.
        M: Number of columns of ``B`` and ``C``.
        dtype: Element type of the input placeholders.

    Returns:
        A tuple ``(schedule, [A, B, C])``.
    """
    lhs = te.placeholder((N, L), name="A", dtype=dtype)
    rhs = te.placeholder((L, M), name="B", dtype=dtype)
    k_axis = te.reduce_axis((0, L), name="k")
    out = te.compute(
        (N, M),
        lambda i, j: te.sum(lhs[i, k_axis] * rhs[k_axis, j], axis=k_axis),
        name="C",
    )

    # schedule
    sched = te.create_schedule(out.op)
    stage = sched[out]
    row, col = stage.op.axis
    red = stage.op.reduce_axis[0]

    # 2. get the config object
    cfg = autotvm.get_config()

    # 3. define search space
    # This is what differs from a fixed schedule: we do not know in advance
    # which tiling performs best on the target hardware, so we declare a
    # search space and let the tuner find the best-performing value in it.
    # (tile_y could alternatively be declared as a knob, exactly like tile_x.)
    cfg.define_knob("tile_x", [1, 2, 4, 8, 16])
    # define_split enumerates the possible ways of splitting the y axis into
    # two axes, using factors of the length of y.
    cfg.define_split("tile_y", row, num_outputs=2)

    # 4. schedule according to config
    # (with a knob, tile_y would be applied via split(y, cfg["tile_y"].val))
    row_outer, row_inner = cfg["tile_y"].apply(sched, out, row)
    col_outer, col_inner = stage.split(col, cfg["tile_x"].val)
    stage.reorder(row_outer, col_outer, red, row_inner, col_inner)

    return sched, [lhs, rhs, out]

# Step 2: run an AutoTVM tuner over the "tutorial/matmul_v1" template
# Problem size: 512x512 by 512x512 float32 matmul, built for the "llvm" target.
N, L, M = 512, 512, 512
task = autotvm.task.create("tutorial/matmul_v1", args=(N, L, M, "float32"),target="llvm")
# logging config (for printing tuning log to the screen)
logging.getLogger("autotvm").setLevel(logging.DEBUG)
logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))

# Build and measure candidates locally.
measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))

# Begin tuning with RandomTuner, log records to file `matmul.log`
# You can use alternatives like XGBTuner.
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
    n_trial=10,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("matmul.log")],
)

# apply history best from log file
# The template is re-invoked inside both contexts so that the schedule is
# built with the logged configuration for the "llvm" target.
with autotvm.apply_history_best("matmul.log"):
    with tvm.target.Target("llvm"):
        s, arg_bufs = matmul_v1(N, L, M, "float32")
        func = tvm.build(s, arg_bufs)

# check correctness
# NumPy's dot product on random inputs serves as the reference result.
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

# Output buffer that the compiled function writes into.
c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-4)

运行结果
在这里插入图片描述
使用AutoSchedule进行调优

与基于模板的AutoTVM依赖手动模板来定义搜索空间不同，自动调度器不需要任何模板。用户只需编写计算声明，无需任何调度命令或模板。自动调度器可以自动生成一个大的搜索空间，并在该空间中找到一个好的调度。

import os

import numpy as np
import tvm
from tvm import te, auto_scheduler


# Define the matrix multiplication (with a bias add) workload
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def matmul_add(N, L, M, dtype):
    """Declare ``out = A @ B + C`` as an auto_scheduler workload.

    Only the computation is declared here; no schedule is written by hand.

    Args:
        N: Number of rows of ``A``, ``C`` and the output.
        L: Length of the reduction dimension.
        M: Number of columns of ``B``, ``C`` and the output.
        dtype: Element type of the three input placeholders.

    Returns:
        The list ``[A, B, C, out]`` of input and output tensors.
    """
    lhs = te.placeholder((N, L), name="A", dtype=dtype)
    rhs = te.placeholder((L, M), name="B", dtype=dtype)
    bias = te.placeholder((N, M), name="C", dtype=dtype)
    k_axis = te.reduce_axis((0, L), name="k")

    # enable automatic layout transform for tensor B
    layout_attrs = {"layout_free_placeholders": [rhs]}
    product = te.compute(
        (N, M),
        lambda i, j: te.sum(lhs[i, k_axis] * rhs[k_axis, j], axis=k_axis),
        name="matmul",
        attrs=layout_attrs,
    )

    # Element-wise add of the third input on top of the matmul result.
    result = te.compute(
        (N, M),
        lambda i, j: product[i, j] + bias[i, j],
        name="out",
    )

    return [lhs, rhs, bias, result]

# Create the search task: a 1024x1024x1024 float32 matmul_add on "llvm".
target = tvm.target.Target("llvm")
N = L = M = 1024
task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target)

# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
# Set the parameters of the auto-scheduler; measurement records are
# written to log_file through the RecordToFile callback.
log_file = "matmul.json"
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10,
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    verbose=2,
)

# Run the search
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)
# Inspect the optimized schedule
print("Lowered TIR:")
print(tvm.lower(sch, args, simple_mode=True))


# Build the tuned schedule and prepare random inputs plus a NumPy reference.
func = tvm.build(sch, args, target)
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = np.random.uniform(size=(N, M)).astype(np.float32)
out_np = a_np.dot(b_np) + c_np

# Copy the inputs to the CPU device and allocate the output buffer there.
dev = tvm.cpu()
a_tvm = tvm.nd.array(a_np, device=dev)
b_tvm = tvm.nd.array(b_np, device=dev)
c_tvm = tvm.nd.array(c_np, device=dev)
out_tvm = tvm.nd.empty(out_np.shape, device=dev)
func(a_tvm, b_tvm, c_tvm, out_tvm)

# Check results
np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

# Evaluate execution time.
# The median result is scaled by 1000 for the "ms" report below
# (presumably the evaluator reports seconds -- confirm against the TVM docs).
evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
print(
    "Execution time of this operator: %.3f ms"
    % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000)
)

# Print the best record from the log in the form of a Python schedule.
print("Equivalent python schedule:")
print(task.print_best(log_file))

def resume_search(task, log_file):
    """Continue tuning ``task`` starting from the records in ``log_file``.

    An XGBModel cost model is updated from the existing log, a SketchPolicy
    is created with a PreloadMeasuredStates callback for the same log, and
    the task is tuned for 5 more measurement trials. New measurements are
    recorded back to the same file.

    Args:
        task: The search task to keep tuning.
        log_file: Path of the log to read earlier records from and to
            write new records to.
    """
    print("Resume search:")

    # Cost model updated from the earlier measurements.
    model = auto_scheduler.XGBModel()
    model.update_from_file(log_file)

    # Search policy with the earlier measurement log preloaded.
    preload = auto_scheduler.PreloadMeasuredStates(log_file)
    policy = auto_scheduler.SketchPolicy(
        task,
        model,
        init_search_callbacks=[preload],
    )

    # Five more measurement trials, recorded to the same log file.
    recorder = auto_scheduler.RecordToFile(log_file)
    options = auto_scheduler.TuningOptions(
        num_measure_trials=5,
        measure_callbacks=[recorder],
    )
    task.tune(options, search_policy=policy)


# Resume tuning of the task, reusing the records already stored in log_file.
resume_search(task, log_file)