Halide示例学习四

慷仔

于 2022-05-02 16:42:43 发布

阅读量307

点赞数 1

分类专栏：高性能计算文章标签：性能优化深度学习算法

本文链接：https://blog.csdn.net/pengcode/article/details/124542892

版权

高性能计算专栏收录该内容

12 篇文章 0 订阅

订阅专栏

该博客详细介绍了Halide编程语言中用于优化计算图的技巧，包括compute_at、store_at以及多阶段计算图的组合使用。通过示例代码展示了如何调整计算图的执行顺序，以减少中间结果的存储和计算次数，提高效率。内容涵盖了内建计算、计算根、按坐标计算、循环内存储和向量化等概念。

摘要由CSDN通过智能技术生成

Halide示例学习四

多阶段计算图

#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    Var x("x"), y("y");
    // 两阶段的计算图
    {
        Func producer("producer_default"), consumer("consumer_default");
        // 第一阶段：数学计算
        producer(x, y) = sin(x * y);
        // 第二阶段：求平均
        consumer(x, y) = (producer(x, y) +
                          producer(x, y + 1) +
                          producer(x + 1, y) +
                          producer(x + 1, y + 1)) / 4;
        // 开启追踪
        consumer.trace_stores();
        producer.trace_stores();
        // 实例化为4x4的空间
        printf("\nEvaluating producer-consumer pipeline with default schedule\n");
        // 从这里也可以看出，这两个阶段不是先计算完producer然后再计算consumer
        // 而是最后组成了consumer这个整体的计算公式
        consumer.realize({4, 4});
        // 同下述的公式一致
        // consumer(x, y) = (sin(x * y) +
        //                   sin(x * (y + 1)) +
        //                   sin((x + 1) * y) +
        //                   sin((x + 1) * (y + 1))/4);
        // 等效C代码如下所示
        float result[4][4];
        for (int y = 0; y < 4; y++) {
            for (int x = 0; x < 4; x++) {
                result[y][x] = (sin(x * y) +
                                sin(x * (y + 1)) +
                                sin((x + 1) * y) +
                                sin((x + 1) * (y + 1))) / 4;
            }
        }
        printf("\n");
        // 使用print_loop_nest，将不会出现producer的任何信息，因为被内建再consumer中
        printf("Pseudo-code for the schedule:\n");
        consumer.print_loop_nest();
        printf("\n");
    }

    return 0;
}

# 结果如下所示
# 默认是全部使用内建函数
Store consumer_default.0(0, 0) = 0.210368
Store consumer_default.0(1, 0) = 0.437692
Store consumer_default.0(2, 0) = 0.262604
Store consumer_default.0(3, 0) = -0.153921
Store consumer_default.0(0, 1) = 0.437692
Store consumer_default.0(1, 1) = 0.475816
Store consumer_default.0(2, 1) = 0.003550
Store consumer_default.0(3, 1) = 0.023565
Store consumer_default.0(0, 2) = 0.262604
Store consumer_default.0(1, 2) = 0.003550
Store consumer_default.0(2, 2) = -0.225879
Store consumer_default.0(3, 2) = 0.146372
Store consumer_default.0(0, 3) = -0.153921
Store consumer_default.0(1, 3) = 0.023565
Store consumer_default.0(2, 3) = 0.146372
Store consumer_default.0(3, 3) = -0.237233

多计算图组合示例

#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    Var x("x"), y("y");
    // 使用compute_root来强制使得计算图进行计算
    {
        Func producer("producer_root"), consumer("consumer_root");
        producer(x, y) = sin(x * y);
        consumer(x, y) = (producer(x, y) +
                          producer(x, y + 1) +
                          producer(x + 1, y) +
                          producer(x + 1, y + 1)) / 4;
        // 让Halide在计算consumer前先计算producer
        producer.compute_root();
        consumer.trace_stores();
        producer.trace_stores();
        printf("\nEvaluating producer.compute_root()\n");
        consumer.realize({4, 4});
        // 输出的计算过程中将会出现producer的计算过程
        // 等效C代码
        float result[4][4];
        float producer_storage[5][5];
        // 先计算producer_storage
        for (int y = 0; y < 5; y++) {
            for (int x = 0; x < 5; x++) {
                producer_storage[y][x] = sin(x * y);
            }
        }
        // 然后计算result
        for (int y = 0; y < 4; y++) {
            for (int x = 0; x < 4; x++) {
                result[y][x] = (producer_storage[y][x] +
                                producer_storage[y + 1][x] +
                                producer_storage[y][x + 1] +
                                producer_storage[y + 1][x + 1]) / 4;
            }
        }
        // 实例化最终的输出尺寸4x4的时候，Halide会自动进行尺寸的反向推导
        // 知道producer的尺寸为5x5
        printf("Pseudo-code for the schedule:\n");
        consumer.print_loop_nest();
        printf("\n");
    }

    /* 现在来比较上述两者的差别，特别是数据存取次数、计算次数方面
    全内建计算图 (默认的调度):
    - 分配的临时内存: 0
    - 取数操作: 0
    - 存数操作: 16
    - sin函数的调用次数: 64

    分步计算:
    - 分配的临时内存: 25 floats
    - 取数操作: 64
    - 存数操作: 41
    - sin函数的调用次数: 25
    return 0;
}

#结果
Store producer_root.0(0, 0) = 0.000000
Store producer_root.0(1, 0) = 0.000000
Store producer_root.0(2, 0) = 0.000000
Store producer_root.0(3, 0) = 0.000000
Store producer_root.0(4, 0) = 0.000000
Store producer_root.0(0, 1) = 0.000000
Store producer_root.0(1, 1) = 0.841471
Store producer_root.0(2, 1) = 0.909297
Store producer_root.0(3, 1) = 0.141120
Store producer_root.0(4, 1) = -0.756802
Store producer_root.0(0, 2) = 0.000000
Store producer_root.0(1, 2) = 0.909297
Store producer_root.0(2, 2) = -0.756802
Store producer_root.0(3, 2) = -0.279415
Store producer_root.0(4, 2) = 0.989358
Store producer_root.0(0, 3) = 0.000000
Store producer_root.0(1, 3) = 0.141120
Store producer_root.0(2, 3) = -0.279415
Store producer_root.0(3, 3) = 0.412118
Store producer_root.0(4, 3) = -0.536573
Store producer_root.0(0, 4) = 0.000000
Store producer_root.0(1, 4) = -0.756802
Store producer_root.0(2, 4) = 0.989358
Store producer_root.0(3, 4) = -0.536573
Store producer_root.0(4, 4) = -0.287903
Store consumer_root.0(0, 0) = 0.210368
Store consumer_root.0(1, 0) = 0.437692
Store consumer_root.0(2, 0) = 0.262604
Store consumer_root.0(3, 0) = -0.153921
Store consumer_root.0(0, 1) = 0.437692
Store consumer_root.0(1, 1) = 0.475816
Store consumer_root.0(2, 1) = 0.003550
Store consumer_root.0(3, 1) = 0.023565
Store consumer_root.0(0, 2) = 0.262604
Store consumer_root.0(1, 2) = 0.003550
Store consumer_root.0(2, 2) = -0.225879
Store consumer_root.0(3, 2) = 0.146372
Store consumer_root.0(0, 3) = -0.153921
Store consumer_root.0(1, 3) = 0.023565
Store consumer_root.0(2, 3) = 0.146372
Store consumer_root.0(3, 3) = -0.237233

示意图显示结果如下所示：

在这里插入图片描述

compute_at使用方法

#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    Var x("x"), y("y");
    // 介于内建和非内建之间的部分
    {
        Func producer("producer_y"), consumer("consumer_y");
        producer(x, y) = sin(x * y);
        consumer(x, y) = (producer(x, y) +
                          producer(x, y + 1) +
                          producer(x + 1, y) +
                          producer(x + 1, y + 1)) / 4;
        // 当compute_at(consumer,x)时跟全内建的形式非常相同
        /*
        a.compute_at(b,y)：表示在y这个循环块内插入一块计算在y固定的时候，计算必要的a的代码
        */
        producer.compute_at(consumer, y);
        producer.trace_stores();
        consumer.trace_stores();
        printf("\nEvaluating producer.compute_at(consumer, y)\n");
        consumer.realize({4, 4});
        // 等效C代码如下所示
        float result[4][4];
        for (int y = 0; y < 4; y++) {
            // 为了能够满足计算结果一行的需求，我们知道中间输入至少要两行
            float producer_storage[2][5];
            for (int py = y; py < y + 2; py++) {
                for (int px = 0; px < 5; px++) {
                    producer_storage[py - y][px] = sin(px * py);
                }
            }
            // Compute a scanline of the consumer.
            for (int x = 0; x < 4; x++) {
                result[y][x] = (producer_storage[0][x] +
                                producer_storage[1][x] +
                                producer_storage[0][x + 1] +
                                producer_storage[1][x + 1]) / 4;
            }
        }
        printf("Pseudo-code for the schedule:\n");
        consumer.print_loop_nest();
        printf("\n");

        // 这种做法介于内建和非内建之间，好处就是可以利用数据的局部性
        // 不仅减少了中间结果缓存空间，同时利用了高速缓存实现快速读取

        // producer.compute_at(consumer, y):
        // - 缓存空间: 10 floats
        // - 读: 64
        // - 存: 56
        // - sin调用次数: 40
    }
    return 0;
}

# 结果如下所示
Store producer_y.0(0, 0) = 0.000000
Store producer_y.0(1, 0) = 0.000000
Store producer_y.0(2, 0) = 0.000000
Store producer_y.0(3, 0) = 0.000000
Store producer_y.0(4, 0) = 0.000000
Store producer_y.0(0, 1) = 0.000000
Store producer_y.0(1, 1) = 0.841471
Store producer_y.0(2, 1) = 0.909297
Store producer_y.0(3, 1) = 0.141120
Store producer_y.0(4, 1) = -0.756802
Store consumer_y.0(0, 0) = 0.210368
Store consumer_y.0(1, 0) = 0.437692
Store consumer_y.0(2, 0) = 0.262604
Store consumer_y.0(3, 0) = -0.153921
Store producer_y.0(0, 1) = 0.000000
Store producer_y.0(1, 1) = 0.841471
Store producer_y.0(2, 1) = 0.909297
Store producer_y.0(3, 1) = 0.141120
Store producer_y.0(4, 1) = -0.756802
Store producer_y.0(0, 2) = 0.000000
Store producer_y.0(1, 2) = 0.909297
Store producer_y.0(2, 2) = -0.756802
Store producer_y.0(3, 2) = -0.279415
Store producer_y.0(4, 2) = 0.989358
Store consumer_y.0(0, 1) = 0.437692
Store consumer_y.0(1, 1) = 0.475816
Store consumer_y.0(2, 1) = 0.003550
Store consumer_y.0(3, 1) = 0.023565
Store producer_y.0(0, 2) = 0.000000
Store producer_y.0(1, 2) = 0.909297
Store producer_y.0(2, 2) = -0.756802
Store producer_y.0(3, 2) = -0.279415
Store producer_y.0(4, 2) = 0.989358
Store producer_y.0(0, 3) = 0.000000
Store producer_y.0(1, 3) = 0.141120
Store producer_y.0(2, 3) = -0.279415
Store producer_y.0(3, 3) = 0.412118
Store producer_y.0(4, 3) = -0.536573
Store consumer_y.0(0, 2) = 0.262604
Store consumer_y.0(1, 2) = 0.003550
Store consumer_y.0(2, 2) = -0.225879
Store consumer_y.0(3, 2) = 0.146372
Store producer_y.0(0, 3) = 0.000000
Store producer_y.0(1, 3) = 0.141120
Store producer_y.0(2, 3) = -0.279415
Store producer_y.0(3, 3) = 0.412118
Store producer_y.0(4, 3) = -0.536573
Store producer_y.0(0, 4) = 0.000000
Store producer_y.0(1, 4) = -0.756802
Store producer_y.0(2, 4) = 0.989358
Store producer_y.0(3, 4) = -0.536573
Store producer_y.0(4, 4) = -0.287903
Store consumer_y.0(0, 3) = -0.153921
Store consumer_y.0(1, 3) = 0.023565
Store consumer_y.0(2, 3) = 0.146372
Store consumer_y.0(3, 3) = -0.237233

示意图显示结果：

在这里插入图片描述

store_root配合compute_at使用方法

#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    Var x("x"), y("y");
    // 下述展示了强制中间buffer保持计算结果的情况
    {
        Func producer("producer_root_y"), consumer("consumer_root_y");
        producer(x, y) = sin(x * y);
        consumer(x, y) = (producer(x, y) +
                          producer(x, y + 1) +
                          producer(x + 1, y) +
                          producer(x + 1, y + 1)) / 4;

        // 让producer保持中间计算值
        producer.store_root();
        // ... but compute it as needed per y coordinate of the
        // consumer.
        producer.compute_at(consumer, y);

        producer.trace_stores();
        consumer.trace_stores();

        printf("\nEvaluating producer.store_root().compute_at(consumer, y)\n");
        consumer.realize({4, 4});

        // result/lesson_08_store_root_compute_y.gif
        // 等效C代码如下所示
        float result[4][4];
        float producer_storage[5][5];
        for (int y = 0; y < 4; y++) {

            // Compute enough of the producer to satisfy this scanline
            // of the consumer.
            for (int py = y; py < y + 2; py++) {

                // 跳过已经经过计算的prouducer点
                if (y > 0 && py == y) continue;

                for (int px = 0; px < 5; px++) {
                    producer_storage[py][px] = sin(px * py);
                }
            }

            // Compute a scanline of the consumer.
            for (int x = 0; x < 4; x++) {
                result[y][x] = (producer_storage[y][x] +
                                producer_storage[y + 1][x] +
                                producer_storage[y][x + 1] +
                                producer_storage[y + 1][x + 1]) / 4;
            }
        }

        printf("Pseudo-code for the schedule:\n");
        consumer.print_loop_nest();
        printf("\n");

        // producer.store_root().compute_at(consumer, y):
        // - 分配的中间缓存: 10 floats
        // - 取数: 64 = 4 * 4 * 4
        // - 存数: 41 = 5 * 5 + 4 * 4
        // - sin调用次数: 25 = 5 * 5
        // 关于分配的中间缓存为何是10floats
        // 主要是因为实际的Halide使用了如下所示的循环中间缓存作为缓存
        {
            // Actually store 2 scanlines instead of 5
            float producer_storage[2][5];
            for (int y = 0; y < 4; y++) {
                for (int py = y; py < y + 2; py++) {
                    if (y > 0 && py == y) continue;
                    for (int px = 0; px < 5; px++) {
                        // 循环缓存
                        producer_storage[py & 1][px] = sin(px * py);
                    }
                }
                for (int x = 0; x < 4; x++) {
                    // 循环调用
                    result[y][x] = (producer_storage[y & 1][x] +
                                    producer_storage[(y + 1) & 1][x] +
                                    producer_storage[y & 1][x + 1] +
                                    producer_storage[(y + 1) & 1][x + 1]) / 4;
                }
            }
        }
    }
    return 0;
}

示意图显示结果：

在这里插入图片描述

tile配合compute_at使用方法

#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    Var x("x"), y("y");
    {
        Func producer("producer_tile"), consumer("consumer_tile");
        producer(x, y) = sin(x * y);
        consumer(x, y) = (producer(x, y) +
                          producer(x, y + 1) +
                          producer(x + 1, y) +
                          producer(x + 1, y + 1)) / 4;
        Var x_outer, y_outer, x_inner, y_inner;
        consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);
        producer.compute_at(consumer, x_outer);
        producer.trace_stores();
        consumer.trace_stores();

        printf("\nEvaluating:\n"
               "consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);\n"
               "producer.compute_at(consumer, x_outer);\n");
        consumer.realize({8, 8});

        float result[8][8];

        // For every tile of the consumer:
        for (int y_outer = 0; y_outer < 2; y_outer++) {
            for (int x_outer = 0; x_outer < 2; x_outer++) {
                // 瓦片下标配合
                int x_base = x_outer * 4;
                int y_base = y_outer * 4;
                // 下面这个内循环就是因为compute_as(,x_outer)的缘故，在此处进行了producer的必要的部分计算
                float producer_storage[5][5];
                for (int py = y_base; py < y_base + 5; py++) {
                    for (int px = x_base; px < x_base + 5; px++) {
                        producer_storage[py - y_base][px - x_base] = sin(px * py);
                    }
                }
                // 一个瓦片的内容
                for (int y_inner = 0; y_inner < 4; y_inner++) {
                    for (int x_inner = 0; x_inner < 4; x_inner++) {
                        int x = x_base + x_inner;
                        int y = y_base + y_inner;
                        result[y][x] =
                            (producer_storage[y - y_base][x - x_base] +
                             producer_storage[y - y_base + 1][x - x_base] +
                             producer_storage[y - y_base][x - x_base + 1] +
                             producer_storage[y - y_base + 1][x - x_base + 1]) / 4;
                    }
                }
            }
        }

        printf("Pseudo-code for the schedule:\n");
        consumer.print_loop_nest();
        printf("\n");
    }
    return 0;
}

示意图显示结果：

在这里插入图片描述

compute_at和store_at以及之前的混合技使用方法

#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    Var x("x"), y("y");
    {
        Func producer("producer_mixed"), consumer("consumer_mixed");
        producer(x, y) = sin(x * y);
        consumer(x, y) = (producer(x, y) +
                          producer(x, y + 1) +
                          producer(x + 1, y) +
                          producer(x + 1, y + 1)) / 4;
        // y轴拆分成内16的双层循环
        Var yo, yi;
        consumer.split(y, yo, yi, 16);
        // 并行化最外层循环yo
        consumer.parallel(yo);
        // consumer进行x轴进行4位向量化
        consumer.vectorize(x, 4);
        // 在yo轴内，创建一个中间缓存来保存prouducer计算结果
        producer.store_at(consumer, yo);
        // 在yi轴内，创建一个代码块计算yi内consumer所需的prouducer
        producer.compute_at(consumer, yi);
        // producer的x轴也进行4位向量化
        producer.vectorize(x, 4);


        Buffer<float> halide_result = consumer.realize({160, 160});

        // 自己理解的C等效代码
        {
            float Consumer[160][160];
            // omp loop
            for(int yo = 0; yo < 10; yo++){
                float producer_storage[17][161];// store_at
                for(int yi = 0; yi < 16; yi++){
                    int yy = yo * 16 + yi;
                    // 计算所需的producer_storage
                    // compute_at
                    for(int x4 = 0; x4 < 160; x4++){
                        // 1部分内部应该是向量化4
                        producer_storage[yy&15][x4] = sin(yy*x4);
                        producer_storage[yy&15][x4 + 1] = sin(yy*(x4+1));
                        producer_storage[(yy + 1)&15][x4] = sin((yy + 1)*x4);
                        producer_storage[(yy+1)&15][(x4+1)] = sin((yy+1)*(x4+1));
                    }
                    for(int x4 = 0; x4 < 160; x4++){
                        Consumer[yy][x4] = producer_storage[yy&15][x4];
                        Consumer[yy][x4 + 1] = producer_storage[yy&15][x4 + 1];
                        Consumer[(yy + 1)][x4] = producer_storage[(yy + 1)&15][x4];
                        Consumer[(yy+1)][(x4+1)] = producer_storage[(yy+1)&15][(x4+1)];
                    }
                    
                }
            }
        }
        // 下述是官方的C等效代码
        float c_result[160][160];

        // For every strip of 16 scanlines (this loop is parallel in
        // the Halide version)
        for (int yo = 0; yo < 160 / 16; yo++) {
            int y_base = yo * 16;

            // Allocate a two-scanline circular buffer for the producer
            float producer_storage[2][161];

            // For every scanline in the strip of 16:
            for (int yi = 0; yi < 16; yi++) {
                int y = y_base + yi;

                for (int py = y; py < y + 2; py++) {
                    // store_at的影响
                    if (yi > 0 && py == y) continue;
                    for (int x_vec = 0; x_vec < 160 / 4 + 1; x_vec++) {
                        int x_base = x_vec * 4;
                        // 4 doesn't divide 161, so push the last vector left
                        // (see lesson 05).
                        if (x_base > 161 - 4) x_base = 161 - 4;
                        // If you're on x86, Halide generates SSE code for this part:
                        int x[] = {x_base, x_base + 1, x_base + 2, x_base + 3};
                        float vec[4] = {sinf(x[0] * py), sinf(x[1] * py),
                                        sinf(x[2] * py), sinf(x[3] * py)};
                        producer_storage[py & 1][x[0]] = vec[0];
                        producer_storage[py & 1][x[1]] = vec[1];
                        producer_storage[py & 1][x[2]] = vec[2];
                        producer_storage[py & 1][x[3]] = vec[3];
                    }
                }

                // Now compute consumer for this scanline:
                for (int x_vec = 0; x_vec < 160 / 4; x_vec++) {
                    int x_base = x_vec * 4;
                    // Again, Halide's equivalent here uses SSE.
                    int x[] = {x_base, x_base + 1, x_base + 2, x_base + 3};
                    float vec[] = {
                        (producer_storage[y & 1][x[0]] +
                         producer_storage[(y + 1) & 1][x[0]] +
                         producer_storage[y & 1][x[0] + 1] +
                         producer_storage[(y + 1) & 1][x[0] + 1]) /
                            4,
                        (producer_storage[y & 1][x[1]] +
                         producer_storage[(y + 1) & 1][x[1]] +
                         producer_storage[y & 1][x[1] + 1] +
                         producer_storage[(y + 1) & 1][x[1] + 1]) /
                            4,
                        (producer_storage[y & 1][x[2]] +
                         producer_storage[(y + 1) & 1][x[2]] +
                         producer_storage[y & 1][x[2] + 1] +
                         producer_storage[(y + 1) & 1][x[2] + 1]) /
                            4,
                        (producer_storage[y & 1][x[3]] +
                         producer_storage[(y + 1) & 1][x[3]] +
                         producer_storage[y & 1][x[3] + 1] +
                         producer_storage[(y + 1) & 1][x[3] + 1]) /
                            4};

                    c_result[y][x[0]] = vec[0];
                    c_result[y][x[1]] = vec[1];
                    c_result[y][x[2]] = vec[2];
                    c_result[y][x[3]] = vec[3];
                }
            }
        }
        printf("Pseudo-code for the schedule:\n");
        consumer.print_loop_nest();
        printf("\n");

        // Look on my code, ye mighty, and despair!

        // Let's check the C result against the Halide result. Doing
        // this I found several bugs in my C implementation, which
        // should tell you something.
        for (int y = 0; y < 160; y++) {
            for (int x = 0; x < 160; x++) {
                float error = halide_result(x, y) - c_result[y][x];
                // It's floating-point math, so we'll allow some slop:
                if (error < -0.001f || error > 0.001f) {
                    printf("halide_result(%d, %d) = %f instead of %f\n",
                           x, y, halide_result(x, y), c_result[y][x]);
                    return -1;
                }
            }
        }
    }
    return 0;
}

示意图显示结果：

在这里插入图片描述