使用Python API（AutoTVM）

玛了个玛卡巴卡

已于 2024-03-18 15:12:14 修改

阅读量388

点赞数 4

分类专栏： TVM 文章标签： TVM python

于 2024-03-16 00:16:21 首次发布

本文链接：https://blog.csdn.net/Albdon/article/details/136752886

版权

TVM 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

使用Python API（AutoTVM）

第1步：加载
第2步：编译
第3步：运行
第4步：调优
第5步：利用调优结果编译
第6步：运行调优后的模型

代码如下：

################################################################################
# Step 1: load
import onnx
import tvm.relay as relay

# Load the serialized model proto into memory
model = onnx.load("./resnet50-v2.onnx")
# Convert the onnx.ModelProto into a tvm.IRModule (the model) and a dict of
# tvm.nd.NDArray (the parameters)
mod, params = relay.frontend.from_onnx(model, shape={'data':[1, 3, 224, 224]})

################################################################################
# Step 2: compile
import tvm
from tvm.contrib import graph_executor

target = "llvm"

with tvm.transform.PassContext(opt_level=3):  # pass context used while optimizing Relay
    # Build the Relay function so that it can run on the TVM graph executor
    executor_factory = relay.build(mod, target=target, params=params)

# Construct a TVM device from the given device type and id
dev = tvm.device(target, 0)
# Fetch the tvm.runtime.Module from the graph executor factory
runtime_module = executor_factory["default"](dev)
# Wrap the tvm.runtime.Module so that it is convenient to call
module = graph_executor.GraphModule(runtime_module)

################################################################################
# Step 3: run
# Pre-processing
import preprocess
img_data = preprocess.read_img()
# Run
module.set_input(key='data', value=img_data)
module.run()
tvm_output = module.get_output(0, tvm.nd.empty((1, 1000))).numpy()
# Post-processing
import postprocess
postprocess.print_labels(tvm_output)
# Performance measurement
import performance
print("unoptimized: %s" % performance.test_perf(module))

################################################################################
# Step 4: tune
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm

# Tuning configuration
tuning_option = {
    "tuner": "xgb",
    "trials": 20,
    "early_stopping": 100,
    "measure_option": autotvm.measure_option(  # measurement options
        builder=autotvm.LocalBuilder(),  # build locally
        runner=autotvm.LocalRunner()  # run locally
    ),
    "tuning_records": "resnet-50-v2-autotuning.json",
}

# Extract the tuning tasks from the relay.function.Function
tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

# Tune every task
for i, task in enumerate(tasks):
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
    # A task cannot be tried more often than it has configurations. Compute the
    # real trial count once and reuse it for the progress bar, so that the bar
    # total matches the number of trials that are actually requested.
    n_trial = min(tuning_option["trials"], len(task.config_space))
    # Create the xgboost tuner object
    tuner_obj = XGBTuner(task, loss_type="reg")
    # Run the tuning
    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(n_trial, prefix=prefix),
            autotvm.callback.log_to_file(tuning_option["tuning_records"]),
        ],
    )

################################################################################
# Step 5: compile with the tuning results
with autotvm.apply_history_best(tuning_option["tuning_records"]):
    with tvm.transform.PassContext(opt_level=3, config={}):
        executor_factory = relay.build(mod, target=target, params=params)

dev = tvm.device(target, 0)
runtime_module = executor_factory["default"](dev)
module = graph_executor.GraphModule(runtime_module)

################################################################################
# Step 6: run the tuned model
# Run
module.set_input(key='data', value=img_data)
module.run()
tvm_output = module.get_output(0, tvm.nd.empty((1, 1000))).numpy()
# Post-processing
postprocess.print_labels(tvm_output)
# Print the performance numbers
print("optimized: %s" % performance.test_perf(module))

第1步：加载

onnx.load()函数将序列化的模型原型（表示模型结构和参数的数据结构）加载到内存，返回值的类型为onnx.ModelProto：

def load_model(
    f: Union[IO[bytes], str],
    format: Optional[Any] = None,  # pylint: disable=redefined-builtin
    load_external_data: bool = True,
) -> ModelProto:
    """Loads a serialized ModelProto into memory.

    Args:
        f: can be a file-like object (has "read" function) or a string containing a file name
        format: for future use
        load_external_data: Whether to load the external data.
            Set to True if the data is under the same directory of the model.
            If not, users need to call :func:`load_external_data_for_model`
            with directory to load external data from.

    Returns:
        Loaded in-memory ModelProto.
    """


# ``load`` is an alias of ``load_model``; it has to be bound after the
# definition, otherwise the name ``load_model`` does not exist yet.
load = load_model

tvm.relay.frontend.from_onnx()函数将onnx.ModelProto转换为等效的tvm.IRModule，同时返回tvm.nd.NDArray类型的参数：

def from_onnx(
    model,
    shape=None,
    dtype="float32",
    opset=None,
    freeze_params=True,
    convert_config=None,
    export_node_renamed_model_path=None,
):
    """Convert an ONNX model into an equivalent Relay Function.

    ONNX graphs are represented as Python Protobuf objects.
    The companion parameters will be handled automatically.
    However, the input names from the onnx graph are vague, mixing inputs and
    network weights/bias such as "1", "2"...
    For convenience, we rename the `real` input names to "input_0",
    "input_1"... And renaming parameters to "param_0", "param_1"...

    By default, ONNX defines models in terms of dynamic shapes. The ONNX importer
    retains that dynamism upon import, and the compiler attempts to convert the
    model into static shapes at compile time. If this fails, there may still
    be dynamic operations in the model. Not all TVM kernels currently support
    dynamic shapes, please file an issue on discuss.tvm.apache.org
    if you hit an error with dynamic kernels.

    Parameters
    ----------
    model : protobuf object
        ONNX ModelProto after ONNX v1.1.0

    shape : dict of str to tuple, optional
        The input shape to the graph

    dtype : str or dict of str to str
        The input types to the graph

    opset : int, optional
        Override to autodetected opset.
        This can be helpful for some testing.

    freeze_params: bool
        If this parameter is true, the importer will take any provided
        onnx input values (weights, shapes, etc) and embed them into the relay model
        as Constants instead of variables. This allows more aggressive optimizations
        at compile time and helps in making models static if certain inputs represent
        attributes relay would traditionally consider compile-time constants.

    convert_config : Optional[Dict[str, Any]]
        Default config:
            use_nt_batch_matmul : bool = True
                True to convert qualified onnx `matmul` to `nn.batch_matmul` strict to NT format
                (transpose_a=False, transpose_b=True).

    export_node_renamed_model_path : str, optional
        Export the node renamed onnx model to the path.
        Some models do not contain names in their nodes. During the conversion, if names of nodes
        are empty, new names will be assigned based on their op types. The exported model can be the
        reference to spans.

    Returns
    -------
    mod : tvm.IRModule
        The relay module for compilation

    params : dict of str to tvm.nd.NDArray
        The parameter dict to be used by relay
    """

第2步：编译

tvm.ir.transform.PassContext类是运行Relay优化/分析的基础，每个pass上下文都包含若干辅助信息，用于帮助优化pass：

@tvm._ffi.register_object("transform.PassContext")
class PassContext(tvm.runtime.Object):
    """The basis where a Relay optimization/analysis runs on.
    Each pass context contains a number of auxiliary information that is used
    to help an optimization pass. Such information includes the error reporter
    to record the errors during the optimization, etc.

    opt_level : Optional[int]
        The optimization level of this pass.

    required_pass : Optional[Union[List[str], Set[str], Tuple[str]]]
        The list of passes that are required by a certain pass.

    disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]]
        The list of passes that are disabled.

    instruments : Optional[Sequence[PassInstrument]]
        The list of pass instrument implementations.

    config : Optional[Dict[str, Object]]
        Additional configurations for specific passes.
    """

tvm.relay.build()构建Relay函数以在TVM图执行器上运行，返回TVM图执行器的运行时工厂tvm.relay.backend.executor_factory.ExecutorFactoryModule。因为使用的是默认参数executor=Executor("graph")，因此返回的是GraphExecutorFactoryModule：

def build(
    ir_mod,
    target=None,
    target_host=None,
    executor=Executor("graph"),
    runtime=Runtime("cpp"),
    workspace_memory_pools=None,
    constant_memory_pools=None,
    params=None,
    mod_name="default",
):
    # fmt: off
    # pylint: disable=line-too-long
    """Helper function that builds a Relay function to run on TVM graph executor.

    Parameters
    ----------
    ir_mod : :py:class:`~tvm.IRModule`
        The IR module to build. Using relay.Function is deprecated.

    target : None, or any multi-target like object, see Target.canon_multi_target
        For homogeneous compilation, the unique build target.
        For heterogeneous compilation, a dictionary or list of possible build targets.
        Defaults to the current target in the environment if None.

    target_host : None, or any target like object, see Target.canon_target
        Host compilation target, if target is device.

    executor : Optional[Executor]
        The executor configuration with which to build the model.
        Defaults to "graph" if no executor specified.

    runtime : Optional[Runtime]
        Runtime configuration to use when building the model.
        Defaults to "cpp" if no runtime specified.

    workspace_memory_pools : Optional[WorkspaceMemoryPools]
        The object that contains an Array of WorkspacePoolInfo objects
        that hold properties of read-write workspace pools that could be
        used by the inference.

    constant_memory_pools : Optional[ConstantMemoryPools]
        The object that contains an Array of ConstantPoolInfo objects
        that hold properties of read-only pools that could be
        used by the inference.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    mod_name: Optional[str]
        The module name we will build

    Returns
    -------
    factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule
            The runtime factory for the TVM graph executor.
    """

tvm.device()函数使用给定的设备类型和id构造TVM设备：

def device(dev_type, dev_id=0):
    """Construct a TVM device with given device type and id.

    Parameters
    ----------
    dev_type: int or str
        The device type mask or name of the device.

    dev_id : int, optional
        The integer device id

    Returns
    -------
    dev: tvm.runtime.Device
        The corresponding device.

    Examples
    --------
    Device can be used to create reflection of device by
    string representation of the device type.

    .. code-block:: python

      assert tvm.device("cpu", 1) == tvm.cpu(1)
      assert tvm.device("cuda", 0) == tvm.cuda(0)
    """

tvm.contrib.graph_executor.GraphModule类是tvm.runtime.Module的浅包装，使用它可以直接调用tvm.runtime.Module的set_input()、run()、get_output()函数：

class GraphModule(object):
    """Wrapper runtime module.

    This is a thin wrapper of the underlying TVM module.
    you can also directly call set_input, run, and get_output
    of underlying module functions

    Parameters
    ----------
    module : tvm.runtime.Module
        The internal tvm module that holds the actual graph functions.

    Attributes
    ----------
    module : tvm.runtime.Module
        The internal tvm module that holds the actual graph functions.

    Examples
    --------

    .. code-block:: python

        import tvm
        from tvm import relay
        from tvm.contrib import graph_executor

        # build the library using graph executor
        lib = relay.build(...)
        lib.export_library("compiled_lib.so")
        # load it back as a runtime
        lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so")
        # Call the library factory function for default and create
        # a new runtime.Module, wrap with graph module.
        gmod = graph_executor.GraphModule(lib["default"](dev))
        # use the graph module.
        gmod.set_input("x", data)
        gmod.run()
    """

第3步：运行

预处理

preprocess.py

from tvm.contrib.download import download_testdata
from PIL import Image
import numpy as np

def read_img():
    """Download the sample kitten image and convert it into a network input.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (1, 3, 224, 224): NCHW layout, scaled to [0, 1]
        and then normalized per channel with the ImageNet mean and stddev.
    """
    img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
    img_path = download_testdata(img_url, "imagenet_cat.png", module="data")

    # Close the file handle once the pixels are read. Forcing RGB guarantees
    # exactly three channels, which the per-channel statistics below rely on.
    with Image.open(img_path) as image:
        # Resize to 224x224
        resized_image = image.convert("RGB").resize((224, 224))
    img_data = np.asarray(resized_image).astype("float32")

    # ONNX expects NCHW input, so move the channel axis to the front (HWC -> CHW)
    img_data = np.transpose(img_data, (2, 0, 1))

    # Normalize with the ImageNet statistics; the (3, 1, 1) shape broadcasts
    # each channel's mean/stddev over that channel's H x W plane.
    imagenet_mean = np.array([0.485, 0.456, 0.406], dtype="float32").reshape(3, 1, 1)
    imagenet_stddev = np.array([0.229, 0.224, 0.225], dtype="float32").reshape(3, 1, 1)
    norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev

    # Add the batch dimension
    img_data = np.expand_dims(norm_img_data, axis=0)

    return img_data

运行

直接调用tvm.contrib.graph_executor.GraphModule的set_input()、run()、get_output()函数：

class GraphModule(object):
    def set_input(self, key=None, value=None, **params):
        """Set inputs to the module via kwargs

        Parameters
        ----------
        key : int or str
           The input key

        value : the input value.
           The input value

        params : dict of str to NDArray
           Additional arguments
        """
        
    def run(self, **input_dict):
        """Run forward execution of the graph

        Parameters
        ----------
        input_dict: dict of str to NDArray
            Input values to be fed to the graph
        """
        
    def get_output(self, index, out=None):
        """Get index-th output to out

        Parameters
        ----------
        index : int
            The output index

        out : NDArray
            The output array container
        """

后处理

postprocess.py

import numpy as np
from scipy.special import softmax
from tvm.contrib.download import download_testdata

def print_labels(data, top_k=5):
    """Print the most probable class labels for a network output.

    Parameters
    ----------
    data : numpy.ndarray
        Raw output scores of the network. Softmax is applied over all of its
        elements and the result is squeezed before ranking.
    top_k : int, optional
        How many of the highest-probability classes to print. Defaults to 5.
    """
    # Download the label list
    labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
    labels_path = download_testdata(labels_url, "synset.txt", module="data")

    # Explicit encoding so that reading does not depend on the platform locale
    with open(labels_path, "r", encoding="utf-8") as f:
        labels = [line.rstrip() for line in f]

    scores = np.squeeze(softmax(data))
    # argsort is ascending; reverse it to get the most probable class first
    ranks = np.argsort(scores)[::-1]

    for rank in ranks[:top_k]:
        print("class='%s' with probability=%f" % (labels[rank], scores[rank]))

性能测量

performance.py

import timeit
import numpy as np

def test_perf(module):
    timing_number = 10
    timing_repeat = 10
    timing_results_list = (
        np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
        * 1000 / timing_number
    )
    timing_statistics = {
        "mean": np.mean(timing_results_list),
        "median": np.median(timing_results_list),
        "std": np.std(timing_results_list),
    }

    return timing_statistics

第4步：调优

tvm.autotvm.measure_option()设置测量选项，为了测量配置，需要构建并运行它，也就是设置Builder和Runner参数：

def measure_option(builder, runner):
    """
    Set options for measure. To measure a config, we will build it and run it.
    So we have to set options for these two steps.
    They have their own options on timeout, parallel, etc.

    Parameters
    ----------
    builder: Builder
        Specify how to build programs
    runner: Runner
        Specify how to run programs

    Examples
    --------
    # example setting for using local devices
    >>> measure_option = autotvm.measure_option(
    >>>     builder=autotvm.LocalBuilder(),      # use all local cpu cores for compilation
    >>>     runner=autotvm.LocalRunner(          # measure them sequentially
    >>>         number=10,
    >>>         timeout=5)
    >>> )

    Note
    ----
    To make measurement results accurate, you should pick the correct value for the argument
    `number` and `repeat` in Runner(). Some devices need a certain minimum running time to
    "warm up," such as GPUs that need time to reach a performance power state.
    Using `min_repeat_ms` can dynamically adjust `number`, so it is recommended.
    The typical value for NVIDIA GPU is 150 ms.
    """

tvm.autotvm.LocalBuilder类在本地编译代码：

class LocalBuilder(Builder):
    """Run compilation on local machine

    Parameters
    ----------
    timeout: float
        The timeout of a compilation
    n_parallel: int
        The number of tasks run in parallel. "None" will use all cpu cores
    build_kwargs: dict
        If supplied, additional kwargs passed to build_func. Overrides any build_kwargs supplied
        by the Runner.
    build_func: callable or str
        If is 'default', use default build function
        If is 'ndk', use function for android ndk
        If is 'stackvm', use function for stackvm
        If is callable, use it as custom build function, expect lib_format field.
    do_fork: bool
        If False, do not fork when building. Requires n_parallel=1.
    runtime: Optional[Runtime]
        Specify the runtime to generate artifacts for
    """

tvm.autotvm.LocalRunner类在本地运行代码：

class LocalRunner(RPCRunner):
    """Run generated code on local devices.

    Parameters
    ----------
    timeout: float
        The timeout of a compilation
    number: int
        The number of times to run the generated code for taking average.
        We call these runs as one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
        will be automatically increased.
    cooldown_interval: float, optional
        The cool down interval between two measurements.
    enable_cpu_cache_flush: bool
        Whether to flush cache on CPU between repeated measurements.
        Flushing cache can make the measured latency of one operator closer to
        its actual latency during end-to-end inference.
        To make this option effective, the argument `number` should also be set to 1.
        This only has an effect on CPU tasks.

    Note
    ----
    This is a "fake" local mode. We start a silent rpc tracker and rpc server
    for the user. In this way we reuse timeout/isolation mechanism in RPC infrastructure.
    """

tvm.autotvm.task.extract_from_program()函数从要调优的tvm.IRModule或relay.function.Function中提取任务：

def extract_from_program(mod, params, target, target_host=None, ops=None):
    """Extract tuning tasks from a relay program.

    This function is the single program version of extract_from_multiple_program.

    Parameters
    ----------
    mod: tvm.IRModule or relay.function.Function
        The module or function to tune
    params: dict of str to numpy array
        The associated parameters of the program
    target: tvm.target.Target
        The compilation target
    target_host: tvm.target.Target
        The host compilation target
    ops: List[tvm.ir.Op] or None
        List of relay ops to be tuned. If not specified, all tunable ops will be extracted.

    Returns
    -------
    task: Array of autotvm.task.Task
        collected tasks
    """

tvm.autotvm.tuner.XGBTuner类是使用xgboost作为cost模型的调优器，tuner_obj.tune()调用了XGBTuner.tune()进行调优：

class XGBTuner(ModelBasedTuner):
    """Tuner that uses xgboost as cost model

    Parameters
    ----------
    task: Task
        The tuning task
    plan_size: int
        The size of a plan. After `plan_size` trials, the tuner will refit a new cost model
        and do planning for the next `plan_size` trials.
    feature_type: str, optional
        If is 'itervar', use features extracted from IterVar (loop variable).
        If is 'knob', use flatten ConfigEntity directly.
        If is 'curve', use sampled curve feature (relation feature).

        Note on choosing feature type:
        For single task tuning, 'itervar' and 'knob' are good.
        'itervar' is more accurate but 'knob' is much faster.
        There are some constraints on 'itervar', if you meet
        problems with feature extraction when using 'itervar',
        you can switch to 'knob'.

        For cross-shape tuning (e.g. many convolutions with different shapes),
        'itervar' and 'curve' have better transferability,
        'knob' is faster.

        For cross-device or cross-operator tuning, you can use 'curve' only.
    loss_type: str
        If is 'reg', use regression loss to train cost model.
        The cost model predicts the normalized flops.
        If is 'rank', use pairwise rank loss to train cost model.
        The cost model predicts relative rank score.
        If is 'rank-binary', use pairwise rank loss with binarized labels to train cost model.
        The cost model predicts relative rank score.

    num_threads: int, optional
        The number of threads.

    optimizer: str or ModelOptimizer, optional
        If is 'sa', use a default simulated annealing optimizer.
        Otherwise it should be a ModelOptimizer object.

    diversity_filter_ratio: int or float, optional
        If is not None, the tuner will first select
        top-(plan_size * diversity_filter_ratio) candidates according to the cost model
        and then pick batch_size of them according to the diversity metric.

    log_interval: int = 50
        The verbose level.
        If is 0, output nothing.
        Otherwise, output debug information every `log_interval` iterations.
    """

    def tune(self, *args, **kwargs):  # pylint: disable=arguments-differ
        """Run the parent class's ``tune`` and then close the cost model's pool.

        All positional and keyword arguments are forwarded unchanged to the
        parent class's ``tune``.
        """
        super(XGBTuner, self).tune(*args, **kwargs)

        # manually close pool to avoid multiprocessing issues
        self.cost_model._close_pool()

XGBTuner.tune()函数内部调用了XGBTuner的祖先类tvm.autotvm.tuner.Tuner的Tuner.tune()函数：

class Tuner(object):
    """Base class for tuners

    Parameters
    ----------
    task: autotvm.task.Task
        Tuning Task
    """

    def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_prefix="G"):
        """Begin tuning

        Parameters
        ----------
        n_trial: int
            Maximum number of configs to try (measure on real hardware)
        measure_option: dict
            The options for how to measure generated code.
            You should use the return value of autotvm.measure_option for this argument.
        early_stopping: int, optional
            Early stop the tuning when not finding better configs in this number of trials
        callbacks: List of callable
            A list of callback functions. The signature of callback function is
            (Tuner, List of MeasureInput, List of MeasureResult)
            with no return value. These callback functions will be called on
            every measurement pair. See autotvm/tuner/callback.py for some examples.
        si_prefix: str
            One of tvm.autotvm.utils.SI_PREFIXES. The SI prefix to use when reporting FLOPS.
        """

调用XGBTuner.tune()时使用了两个回调函数，其中tvm.autotvm.callback.progress_bar()显示调优的进度条，tvm.autotvm.callback.log_to_file()将调优记录保存到文件中，日志的行以autotvm.record.encode的格式存储：

def progress_bar(total, prefix="", si_prefix="G"):
    """Display progress bar for tuning

    Parameters
    ----------
    total: int
        The total number of trials
    prefix: str
        The prefix of output message
    si_prefix: str
        SI prefix for flops
    """
    
def log_to_file(file_out, protocol="json"):
    """Log the tuning records into file.
    The rows of the log are stored in the format of autotvm.record.encode.

    Parameters
    ----------
    file_out : File or str
        The file to log to.
    protocol: str, optional
        The log protocol. Can be 'json' or 'pickle'

    Returns
    -------
    callback : callable
        Callback function to do the logging.
    """

第5步：利用调优结果编译

这一步与第2步的代码基本一致，只是在编译时应用了调优记录。

tvm.autotvm.ApplyHistoryBest类（代码中通过其别名autotvm.apply_history_best使用）继承自tvm.autotvm.DispatchContext，它是一个调度上下文，表示应用历史最优的配置；我们传入的参数是调优记录的文件路径：

class ApplyHistoryBest(DispatchContext):
    """
    Apply the history best config

    Parameters
    ----------
    records : None, Records, or iterator of Records objects, where a
              Records object is a path-like object, a file-like object,
              or an iterator of (MeasureInput, MeasureResult).

        Collection of tuning records. If multiple Records objects are passed, their
        contents will be merged.
    """