首先要知道为啥要用这个调优,上一篇说到,咱们tile选择的bn = 32,但是硬件后端多种多样,不一定说32就能得到最好的结果,所以AutoTVM和AutoSchedule可以帮我们在搜索空间中一个个尝试,找到最好的结果(但是哦,大家想一想,搜索空间有局限对不对?)
使用AutoTVM进行调优
import logging
import sys
import numpy as np
import tvm
from tvm import te
from tvm.autotvm.task import Task
import tvm.testing
# the module is called `autotvm`
from tvm import autotvm
@autotvm.template("tutorial/matmul")
def matmul_basic(N, L, M, dtype):
A = te.placeholder((N, L), name="A", dtype=dtype)
B = te.placeholder((L, M), name="B", dtype=dtype)
k = te.reduce_axis((0, L), name="k")
C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
s = te.create_schedule(C.op)
# schedule
y, x = s[C].op.axis
k = s[C].op.reduce_axis[0]
yo, yi = s[C].split(y, 8)
xo, xi = s[C].split(x, 8)
s[C].reorder(yo, xo, k, yi, xi)
return s, [A, B, C]
###使用AutoTVM进行矩阵乘法
@autotvm.template("tutorial/matmul_v1") # 1. use a decorator
def matmul_v1(N, L, M, dtype):
A = te.placeholder((N, L), name="A", dtype=dtype)
B = te.placeholder((L, M), name="B", dtype=dtype)
k = te.reduce_axis((0, L), name="k")
C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
s = te.create_schedule(C.op)
# schedule
y, x = s[C].op.axis
k = s[C].op.reduce_axis[0]
# 2. get the config object
cfg = autotvm.get_config()
#这是不同的点,我们并不知道如何tile在目标的硬件上表现好一些
#所以定义了这样一个搜索空间,会自己搜索,在这些空间之中表现最好的值
# 3. define search space
#cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
cfg.define_knob("tile_x", [1, 2, 4, 8, 16])
cfg.define_split("tile_y", y, num_outputs=2)
#枚举所有可能的组合,这些组合可以将y轴分成两个轴
#并且以y的长度为因子
# 4. schedule according to config
#yo, yi = s[C].split(y, cfg["tile_y"].val)
yo, yi = cfg["tile_y"].apply(s, C, y)
xo, xi = s[C].split(x, cfg["tile_x"].val)
s[C].reorder(yo, xo, k, yi, xi)
return s, [A, B, C]
#####step2----------Auto-tuners in TVM
N, L, M = 512, 512, 512
task = autotvm.task.create("tutorial/matmul_v1", args=(N, L, M, "float32"),target="llvm")
# logging config (for printing tuning log to the screen)
logging.getLogger("autotvm").setLevel(logging.DEBUG)
logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))
# Begin tuning with RandomTuner, log records to file `matmul.log`
# You can use alternatives like XGBTuner.
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
n_trial=10,
measure_option=measure_option,
callbacks=[autotvm.callback.log_to_file("matmul.log")],
)
# apply history best from log file
with autotvm.apply_history_best("matmul.log"):
with tvm.target.Target("llvm"):
s, arg_bufs = matmul_v1(N, L, M, "float32")
func = tvm.build(s, arg_bufs)
# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)
c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-4)
运行结果
使用AutoSchedule进行调优
与基于模板的AutoTVM依赖手动模板来定义搜索空间不同,自动调度器不需要任何模板。用户只需编写计算声明,无需任何调度命令或模板。自动调度器可以自动生成一个大的搜索空间,并在该空间中找到一个好的调度。
import os
import numpy as np
import tvm
from tvm import te, auto_scheduler
##定义矩阵乘法
@auto_scheduler.register_workload # Note the auto_scheduler decorator
def matmul_add(N, L, M, dtype):
A = te.placeholder((N, L), name="A", dtype=dtype)
B = te.placeholder((L, M), name="B", dtype=dtype)
C = te.placeholder((N, M), name="C", dtype=dtype)
k = te.reduce_axis((0, L), name="k")
matmul = te.compute(
(N, M),
lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
name="matmul",
attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B
)
out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
return [A, B, C, out]
target = tvm.target.Target("llvm")
N = L = M = 1024
task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target)
# Inspect the computational graph
print("Computational DAG:")
print(task.compute_dag)
#设置自动调度器的参数
log_file = "matmul.json"
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=10,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
verbose=2,
)
#运行搜索
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)
#检查优化的shcedule
print("Lowered TIR:")
print(tvm.lower(sch, args, simple_mode=True))
func = tvm.build(sch, args, target)
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = np.random.uniform(size=(N, M)).astype(np.float32)
out_np = a_np.dot(b_np) + c_np
dev = tvm.cpu()
a_tvm = tvm.nd.array(a_np, device=dev)
b_tvm = tvm.nd.array(b_np, device=dev)
c_tvm = tvm.nd.array(c_np, device=dev)
out_tvm = tvm.nd.empty(out_np.shape, device=dev)
func(a_tvm, b_tvm, c_tvm, out_tvm)
# Check results
np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)
# Evaluate execution time.
evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
print(
"Execution time of this operator: %.3f ms"
% (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000)
)
print("Equivalent python schedule:")
print(task.print_best(log_file))
def resume_search(task, log_file):
print("Resume search:")
cost_model = auto_scheduler.XGBModel()
cost_model.update_from_file(log_file)
search_policy = auto_scheduler.SketchPolicy(
task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
)
task.tune(tune_option, search_policy=search_policy)
resume_search(task, log_file)