Halide学习笔记----Halide tutorial源码阅读21

最新推荐文章于 2022-03-10 19:57:38 发布

姑苏隐士

最新推荐文章于 2022-03-10 19:57:38 发布

阅读量1.7k

点赞数 1

分类专栏： HALIDE 文章标签：阅读源码

本文链接：https://blog.csdn.net/luzhanbo207/article/details/78906887

版权

HALIDE 专栏收录该内容

22 篇文章 31 订阅

订阅专栏

Halide入门教程21

调度器定义

    // Halide tutorial lesson 21: Auto-Scheduler
    // Halide入门第21课:自动调度器

    // So far we have written Halide schedules by hand, but it is also possible to
    // ask Halide to suggest a reasonable schedule. We call this auto-scheduling.
    // This lesson demonstrates how to use the auto-scheduler to generate a
    // copy-pasteable CPU schedule that can be subsequently improved upon.
    // 前面的调度策略都是手动编写的,但是也可以让Halide给出一个合理的调度策略.我们称之为自动调度.
    // 本课展示了如何用自动调度器生成一个可以复制粘贴的cpu调度策略,可以基于此策略进行提升.

    // On linux or os x, you can compile and run it like so:

    // g++ lesson_21_auto_scheduler_generate.cpp ../tools/GenGen.cpp -g -std=c++11 -fno-rtti -I ../include -L ../bin -lHalide -lpthread -ldl -o lesson_21_generate
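    // ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_false target=host auto_schedule=false
    // ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_true target=host auto_schedule=true
    // g++ lesson_21_auto_scheduler_run.cpp auto_schedule_*.a -std=c++11 -I ../include -I ../tools -ldl -lpthread -o lesson_21_run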
    // ./lesson_21_run

    #include "Halide.h"
    #include <stdio.h>

    using namespace Halide;

    // We will define a generator to auto-schedule.
    // The definition follows the same format as the generators in lesson 15.
    // The pipeline it builds is a Harris corner detector with two outputs.
    class AutoScheduled : public Halide::Generator<AutoScheduled> {
    public:
        // 3-dimensional float input; generate() reads indices 0, 1 and 2 of
        // the third dimension.
        Input<Buffer<float>>  input{"input", 3};
        // Scalar that output2 is multiplied by.
        Input<float>          factor{"factor"};

        // Two 2-dimensional float outputs, both derived from the Harris
        // response computed in generate().
        Output<Buffer<float>> output1{"output1", 2};
        Output<Buffer<float>> output2{"output2", 2};

        // Returns an expression that sums f over the 3x3 neighbourhood
        // centred on (x, y).
        Expr sum3x3(Func f, Var x, Var y) {
            return f(x-1, y-1) + f(x-1, y) + f(x-1, y+1) +
                   f(x, y-1)   + f(x, y)   + f(x, y+1) +
                   f(x+1, y-1) + f(x+1, y) + f(x+1, y+1);
        }

        // Describes the algorithm part of the pipeline (no scheduling here).
        void generate() {
            // For our algorithm, we'll use Harris corner detection.
            // Wrap the input with the repeat_edge boundary condition; the
            // stencils below read neighbours at x-1/x+1 and y-1/y+1.
            Func in_b = BoundaryConditions::repeat_edge(input);

            // Weighted sum of the three input channels.
            gray(x, y) = 0.299f * in_b(x, y, 0) + 0.587f * in_b(x, y, 1) + 0.114f * in_b(x, y, 2);

            // 3x3 difference filter along y (weights 1, 2, 1 scaled by 1/12).
            Iy(x, y) = gray(x-1, y-1)*(-1.0f/12) + gray(x-1, y+1)*(1.0f/12) +
                       gray(x, y-1)*(-2.0f/12) + gray(x, y+1)*(2.0f/12) +
                       gray(x+1, y-1)*(-1.0f/12) + gray(x+1, y+1)*(1.0f/12);

            // 3x3 difference filter along x (weights 1, 2, 1 scaled by 1/12).
            Ix(x, y) = gray(x-1, y-1)*(-1.0f/12) + gray(x+1, y-1)*(1.0f/12) +
                       gray(x-1, y)*(-2.0f/12) + gray(x+1, y)*(2.0f/12) +
                       gray(x-1, y+1)*(-1.0f/12) + gray(x+1, y+1)*(1.0f/12);

            // Products of the derivatives, then their 3x3 box sums.
            Ixx(x, y) = Ix(x, y) * Ix(x, y);
            Iyy(x, y) = Iy(x, y) * Iy(x, y);
            Ixy(x, y) = Ix(x, y) * Iy(x, y);
            Sxx(x, y) = sum3x3(Ixx, x, y);
            Syy(x, y) = sum3x3(Iyy, x, y);
            Sxy(x, y) = sum3x3(Ixy, x, y);
            // Corner response: det - 0.04 * trace^2 of the summed products.
            det(x, y) = Sxx(x, y) * Syy(x, y) - Sxy(x, y) * Sxy(x, y);
            trace(x, y) = Sxx(x, y) + Syy(x, y);
            harris(x, y) = det(x, y) - 0.04f * trace(x, y) * trace(x, y);
            // Both outputs read the response at an offset of (2, 2); output2
            // is additionally scaled by the 'factor' input.
            output1(x, y) = harris(x + 2, y + 2);
            output2(x, y) = factor * harris(x + 2, y + 2);
        }

        // Supplies estimates for the auto-scheduler when auto_schedule is set,
        // otherwise applies a simple hand-written schedule.
        void schedule() {
            if (auto_schedule) {
                // The auto-scheduler requires estimates on all the input/output
                // sizes and parameter values in order to compare different
                // alternatives and decide on a good schedule.

                // To provide estimates (min and extent values) for each dimension
                // of the input image ('input'), we use the
                // set_bounds_estimate() method. set_bounds_estimate() takes in
                // (min, extent) of the corresponding dimension as arguments.
                input.dim(0).set_bounds_estimate(0, 1024);
                input.dim(1).set_bounds_estimate(0, 1024);
                input.dim(2).set_bounds_estimate(0, 3);

                // To provide estimates on the parameter values, we use the
                // set_estimate() method.
                factor.set_estimate(2.0f);

                // To provide estimates (min and extent values) for each dimension
                // of pipeline outputs, we use the estimate() method. estimate()
                // takes in (dim_name, min, extent) as arguments.
                output1.estimate(x, 0, 1024)
                       .estimate(y, 0, 1024);

                output2.estimate(x, 0, 1024)
                       .estimate(y, 0, 1024);

                // Technically, the estimate values can be anything, but the closer
                // they are to the actual use-case values, the better the generated
                // schedule will be.

                // To auto-schedule the pipeline, we don't have to do anything else:
                // every Generator implicitly has a GeneratorParam named "auto_schedule";
                // if this is set to true, Halide will call auto_schedule() on all of
                // our pipeline's outputs automatically.

                // Every Generator also implicitly has a GeneratorParam named "machine_params",
                // which allows you to specify characteristics of the machine architecture
                // for the auto-scheduler; it's generally specified in your Makefile.
                // If none is specified, the default machine parameters for a generic CPU
                // architecture will be used by the auto-scheduler.

                // Let's see some arbitrary but plausible values for the machine parameters.
                //
                //      const int kParallelism = 32;
                //      const int kLastLevelCacheSize = 16 * 1024 * 1024;
                //      const int kBalance = 40;
                //      MachineParams machine_params(kParallelism, kLastLevelCacheSize, kBalance);
                //
                // The arguments to MachineParams are the maximum level of parallelism
                // available, the size of the last-level cache (in bytes; the example
                // value above is 16 MB), and the ratio between the cost of a miss at
                // the last level cache and the cost of arithmetic on the target
                // architecture, in that order.

                // Note that when using the auto-scheduler, no schedule should have
                // been applied to the pipeline; otherwise, the auto-scheduler will
                // throw an error. The current auto-scheduler cannot handle a
                // partially-scheduled pipeline.

                // If HL_DEBUG_CODEGEN is set to 3 or greater, the schedule will be dumped
                // to stdout (along with much other information); a more useful way is
                // to add "schedule" to the -e flag to the Generator. (In CMake and Bazel,
                // this is done using the "extra_outputs" flag.)

                // The generated schedule that is dumped to file is an actual
                // Halide C++ source, which is readily copy-pasteable back into
                // this very same source file with few modifications. Programmers
                // can use this as a starting schedule and iteratively improve the
                // schedule. Note that the current auto-scheduler is only able to
                // generate CPU schedules and only does tiling, simple vectorization
                // and parallelization. It doesn't deal with line buffering, storage
                // reordering, or factoring reductions.

                // At the time of writing, the auto-scheduler will return the
                // following schedule for the estimates and machine parameters
                // declared above when run on this pipeline:
                //
                // Var x_i("x_i");
                // Var x_i_vi("x_i_vi");
                // Var x_i_vo("x_i_vo");
                // Var x_o("x_o");
                // Var x_vi("x_vi");
                // Var x_vo("x_vo");
                // Var y_i("y_i");
                // Var y_o("y_o");
                //
                // Func f0 = pipeline.get_func(3);
                // Func f1 = pipeline.get_func(7);
                // Func f11 = pipeline.get_func(14);
                // Func f2 = pipeline.get_func(4);
                // Func output1 = pipeline.get_func(15);
                // Func output2 = pipeline.get_func(16);
                //
                // {
                //     Var x = f0.args()[0];
                //     f0
                //         .compute_at(f11, x_o)
                //         .split(x, x_vo, x_vi, 8)
                //         .vectorize(x_vi);
                // }
                // {
                //     Var x = f1.args()[0];
                //     f1
                //         .compute_at(f11, x_o)
                //         .split(x, x_vo, x_vi, 8)
                //         .vectorize(x_vi);
                // }
                // {
                //     Var x = f11.args()[0];
                //     Var y = f11.args()[1];
                //     f11
                //         .compute_root()
                //         .split(x, x_o, x_i, 256)
                //         .split(y, y_o, y_i, 128)
                //         .reorder(x_i, y_i, x_o, y_o)
                //         .split(x_i, x_i_vo, x_i_vi, 8)
                //         .vectorize(x_i_vi)
                //         .parallel(y_o)
                //         .parallel(x_o);
                // }
                // {
                //     Var x = f2.args()[0];
                //     f2
                //         .compute_at(f11, x_o)
                //         .split(x, x_vo, x_vi, 8)
                //         .vectorize(x_vi);
                // }
                // {
                //     Var x = output1.args()[0];
                //     Var y = output1.args()[1];
                //     output1
                //         .compute_root()
                //         .split(x, x_vo, x_vi, 8)
                //         .vectorize(x_vi)
                //         .parallel(y);
                // }
                // {
                //     Var x = output2.args()[0];
                //     Var y = output2.args()[1];
                //     output2
                //         .compute_root()
                //         .split(x, x_vo, x_vi, 8)
                //         .vectorize(x_vi)
                //         .parallel(y);
                // }
            } else {
                // This is where you would declare the schedule you have written by
                // hand or paste the schedule generated by the auto-scheduler.
                // We will use a naive schedule here to compare the performance of
                // the autoschedule with a basic schedule.
                gray.compute_root();
                Iy.compute_root();
                Ix.compute_root();
            }
        }
    private:
        // Pure variables shared by generate() and schedule(). 'c' is declared
        // but not referenced anywhere in this class.
        Var x{"x"}, y{"y"}, c{"c"};
        // Intermediate stages, kept as members so that schedule() can refer to
        // the ones it schedules (gray, Iy, Ix).
        Func gray, Iy, Ix, Ixx, Iyy, Ixy, Sxx, Syy, Sxy, det, trace, harris;
    };

    // As in lesson 15, we register our generator and then compile this
    // file along with tools/GenGen.cpp.
    HALIDE_REGISTER_GENERATOR(AutoScheduled, auto_schedule_gen)

    // After compiling this file, see how to use it in
    // lesson_21_auto_scheduler_run.cpp

调用代码

  // Halide tutorial lesson 21: Auto-Scheduler

    // Before reading this file, see lesson_21_auto_scheduler_generate.cpp

    // This is the code that actually uses the Halide pipeline we've
    // compiled. It does not depend on libHalide, so we won't be including
    // Halide.h.
    // 这段代码实际上调用前面定义的Halide pipeline
    // Instead, it depends on the header files that lesson_21_auto_scheduler_generator produced.
    #include "auto_schedule_false.h"
    #include "auto_schedule_true.h"

    // We'll use the Halide::Runtime::Buffer class for passing data into and out of
    // the pipeline.
    #include "HalideBuffer.h"
    #include "halide_benchmark.h"

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <assert.h>

    // Benchmarks the two ahead-of-time compiled pipelines, auto_schedule_false
    // (naive hand-written schedule) and auto_schedule_true (auto-generated
    // schedule), on the same 1024x1024x3 input and prints both timings.
    //
    // argc/argv are unused. Always returns 0: a timing comparison depends on
    // the machine and its load, so an unexpected result is reported as a
    // warning on stderr instead of aborting the program.
    int main(int argc, char **argv) {
        // Let's declare and initialize the input images
        Halide::Runtime::Buffer<float> input(1024, 1024, 3);

        for (int c = 0; c < input.channels(); ++c) {
            for (int y = 0; y < input.height(); ++y) {
                for (int x = 0; x < input.width(); ++x) {
                    input(x, y, c) = rand();
                }
            }
        }

        Halide::Runtime::Buffer<float> output1(1024, 1024);
        Halide::Runtime::Buffer<float> output2(1024, 1024);

        // Run each version of the codes (with no auto-schedule and with
        // auto-schedule) multiple times for benchmarking.

        // Pipeline generated with auto_schedule=false.
        double auto_schedule_off = Halide::Tools::benchmark(2, 5, [&]() {
            auto_schedule_false(input, 2.0f, output1, output2);
        });
        printf("Manual schedule: %gms\n", auto_schedule_off * 1e3);

        // Pipeline generated with auto_schedule=true.
        double auto_schedule_on = Halide::Tools::benchmark(2, 5, [&]() {
            auto_schedule_true(input, 2.0f, output1, output2);
        });
        printf("Auto schedule: %gms\n", auto_schedule_on * 1e3);

        // auto_schedule_on should be faster since in the auto_schedule_off version,
        // the schedule is very simple. This is an expectation about measured
        // timings, not an invariant, so only warn when it does not hold.
        if (!(auto_schedule_on < auto_schedule_off)) {
            fprintf(stderr,
                    "Warning: expected auto_schedule_on < auto_schedule_off, "
                    "but got auto_schedule_on=%gms, auto_schedule_off=%gms\n",
                    auto_schedule_on * 1e3, auto_schedule_off * 1e3);
        }

        return 0;
    }

编写shell脚本,编译和执行代码:

  #!/bin/bash
    #########################################################################
    # File Name: lesson_21_compile.sh
    # Author: xxx
    # mail: xxx
    # Created Time: Thu 21 Dec 2017 11:08:22 PM CST
    #########################################################################

    # Builds the lesson 21 generator, emits the auto-scheduled and the manually
    # scheduled pipelines, then builds and runs the benchmark program.

    # Stop at the first failing step so that later steps never run against
    # missing or stale build outputs.
    set -e

    echo "1. compile the auto_schedule generator"
    g++ lesson_21_auto_scheduler_generate.cpp ../tools/GenGen.cpp -std=c++11 -fno-rtti -I ../include -L ../bin -lHalide -lpthread -ldl -o lesson_21_generate
    echo "2. generate header file and static library with auto_schedule on"
    # -g selects the registered generator (auto_schedule_gen); -f names the
    # generated function and output files.
    ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_true target=host auto_schedule=true
    echo "3. generate header file and static library with auto_schedule off"
    ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_false target=host auto_schedule=false
    echo "4. compile auto_schedule_run"
    g++ lesson_21_auto_scheduler_run.cpp auto_schedule_*.a -std=c++11 -I ../include -I ../tools -ldl -lpthread -o lesson_21_run
    echo "5. test auto_scheduler_run"
    ./lesson_21_run

 $ sh lesson_21_compile.sh

这里给出我的笔记本实测结果,可看出自动调优并没有比默认的调度快

这里写图片描述

修改shell脚本,加入machine_params,并打开-O3优化

   #!/bin/bash
    #########################################################################
    # File Name: lesson_21_compile.sh
    # Author: xxx
    # mail: xxx
    # Created Time: Thu 21 Dec 2017 11:08:22 PM CST
    #########################################################################

    # Same build as the first script, but compiles with -O3, passes explicit
    # machine_params to the auto-scheduler and also emits the schedule files.

    # Stop at the first failing step so that later steps never run against
    # missing or stale build outputs.
    set -e

    echo "1. compile the auto_schedule generator"
    g++ lesson_21_auto_scheduler_generate.cpp ../tools/GenGen.cpp -std=c++11 -fno-rtti -I ../include -L ../bin -lHalide -lpthread -ldl -o lesson_21_generate -O3

    echo "2. generate header file and static library with auto_schedule on"
    # machine_params = parallelism, last-level cache size, balance
    # (32, 16 * 1024 * 1024, 40: the example values from the generator source).
    # NOTE(review): no_runtime presumably leaves the Halide runtime out of this
    # library so that only auto_schedule_false.a carries it; confirm.
    ./lesson_21_generate -o . -g auto_schedule_gen -e static_library,h,schedule -f auto_schedule_true target=host-no_runtime auto_schedule=true machine_params=32,16777216,40

    echo "3. generate header file and static library with auto_schedule off"
    ./lesson_21_generate -o . -g auto_schedule_gen -e static_library,h,schedule -f auto_schedule_false target=host auto_schedule=false

    echo "4. compile auto_schedule_run"
    g++ lesson_21_auto_scheduler_run.cpp auto_schedule_*.a -std=c++11 -I ../include -I ../tools -ldl -lpthread -o lesson_21_run -O3

    echo "5. test auto_scheduler_run"
    ./lesson_21_run

实测结果

这里写图片描述

至此,Halide官方提供的tutorial更新完毕,稍后花一点时间整理一下Halide提供的三篇论文以及Halide源码中app目录和test目录中的案例.

姑苏隐士

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
1
评论
Halide学习笔记----Halide tutorial源码阅读21

Halide入门教程21调度器定义 // Halide tutorial lesson 21: Auto-Scheduler // Halide入门第21课:自动调度器 // So far we have written Halide schedules by hand, but it is also possible to // ask Halide to sugges
复制链接

扫一扫