halide编程技术指南（连载五）

最新推荐文章于 2024-03-20 10:00:09 发布
Aoulun
最新推荐文章于 2024-03-20 10:00:09 发布
阅读量554
点赞数
分类专栏：深度学习文章标签： halide 人工智能
本文链接：https://blog.csdn.net/Aoulun/article/details/108430270
版权
深度学习专栏收录该内容
45 篇文章 5 订阅
订阅专栏
本文是halide编程指南的连载，已同步至公众号
第九章 多遍（multi-pass）Func、更新定义与归约（reduction）


#include "Halide.h"
#include <stdio.h>
// 我们将在本课使用x86 SSE内部函数.
#ifdef __SSE2__
#include <emmintrin.h>
#endif
// 最后我们还需要一个时钟来做性能测试.
#include "clock.h"
using namespace Halide;
// 加载PNG的支持代码.
#include "halide_image_io.h"
using namespace Halide::Tools;
int main(int argc, char **argv) {
    // 声明下面要使用的一些变量Vars.
    Var x("x"), y("y");

    // 加载要用作输入的灰度图像.
    Buffer<uint8_t> input = load_image("images/gray.png");

    // 可以在多个过程中定义Func。我们先看一个玩具的例子.
    {
        // 第一个定义必须与我们已经看到的一样-从Vars到Expr的映射：
        Func f;
        f(x, y) = x + y;
        // 我们称第一个定义为“纯”定义。

        // 但后面的定义可以包括两边的计算表达式。最简单的例子是修改单个点:
        f(3, 7) = 42;

        // 我们称这些额外的定义为“更新”定义，或“归约”（reduction）定义。归约定义是一种更新定义，它递归地引用函数在同一坐标点上的当前值:
        f(x, y) = f(x, y) + 17;

        // 如果将更新限制为一行，则可以递归地引用同一列中的值:
        f(x, 3) = f(x, 0) * f(x, 10);

        // 类似地，如果我们将更新限制在一个列中，我们可以递归地引用同一行中的其他值。
        f(0, y) = f(0, y) / f(3, y);

        // 一般规则是：更新定义中用到的每个Var，在等号左右两侧对该函数的所有引用里，都必须不加修饰地（不带任何运算）出现在与纯定义中相同的参数位置上。也就是说，只要更新定义用到了某个Var，那么每一处对f的引用中，该Var在纯定义里所占的那个参数位置就必须恰好是这个Var本身，其余位置则可以是任意表达式。因此，以下定义都是合法的更新:
        f(x, 17) = x + 8;
        f(0, y) = y * 8;
        f(x, x + 1) = x + 8;
        f(y/2, y) = f(0, y) * 17;

        // 但下面这些就会出现错误:

        // f(x, 0) = f(x + 1, 0);
        // 右边f的第一个参数必须是'x'，而不是'x+1'.

        // f(y, y + 1) = y + 8;
        // 左边f的第二个参数必须是“y”，而不是“y+1”.

        // f(y, x) = y - x;
        // 左边的f的参数在错误的地方.

        // f(3, 4) = x + y;
        // 自由变量出现在右侧，而不是左侧.

        // 看到这里，我想你更迷糊了，到底啥意思啊。

        // 实现这个函数
        f.realize(100, 101);

        // 对于f的每个实现，每个步骤都在下一个步骤开始之前完整地运行。一个简单的例子，让我们跟踪负载和存储:
        Func g("g");
        g(x, y) = x + y;   // 纯定义
        g(2, 1) = 42;      // 第一次更新定义
        g(x, 0) = g(x, 1); // 第二次更新定义

        g.trace_loads();
        g.trace_stores();

        g.realize(4, 4);
------------------------------------------------------------------------------------------
 > Begin pipeline g.0()
 > Store g.0(0, 0) = 0
 > Store g.0(1, 0) = 1
 > Store g.0(2, 0) = 2
 > Store g.0(3, 0) = 3
 > Store g.0(0, 1) = 1
 > Store g.0(1, 1) = 2
 > Store g.0(2, 1) = 3
 > Store g.0(3, 1) = 4
 > Store g.0(0, 2) = 2
 > Store g.0(1, 2) = 3
 > Store g.0(2, 2) = 4
 > Store g.0(3, 2) = 5
 > Store g.0(0, 3) = 3
 > Store g.0(1, 3) = 4
 > Store g.0(2, 3) = 5
 > Store g.0(3, 3) = 6
 > Store g.0(2, 1) = 42
 > Load g.0(0, 1) = 1
 > Store g.0(0, 0) = 1
 > Load g.0(1, 1) = 2
 > Store g.0(1, 0) = 2
 > Load g.0(2, 1) = 42
 > Store g.0(2, 0) = 42
 > Load g.0(3, 1) = 4
 > Store g.0(3, 0) = 4
 > End pipeline g.0()
-----------------------------------------------------------------------------------------
        // 可视化图如下.
            图91

        // 通过阅读日志，我们可以看到每个过程都是依次应用的，也就是按顺序执行。等价的C是:
        int result[4][4];
        // 纯定义
        for (int y = 0; y < 4; y++) {
            for (int x = 0; x < 4; x++) {
                result[y][x] = x + y;
            }
        }
        // 第一次更新
        result[1][2] = 42;
        // 第二次更新
        for (int x = 0; x < 4; x++) {
            result[0][x] = result[1][x];
        }
    }

    // 将更新过程放入循环中.
    {
        // 从这个纯粹的定义开始:
        Func f;
        f(x, y) = (x + y)/100.0f;

        // 假设我们想要一个更新，把前五十行中的每个值都替换为它自身的平方。我们可以添加50个更新定义:

        // f(x, 0) = f(x, 0) * f(x, 0);
        // f(x, 1) = f(x, 1) * f(x, 1);
        // f(x, 2) = f(x, 2) * f(x, 2);
        // ...
        // f(x, 49) = f(x, 49) * f(x, 49);

        // 或者在C++中等效使用编译时间循环:
        // for (int i = 0; i < 50; i++) {
        //   f(x, i) = f(x, i) * f(x, i);
        // }

        // 但将循环放入生成的代码中更易于管理，也更灵活。我们通过定义一个“归约域”（reduction domain，即RDom）并在更新定义中使用它来实现这一点:
        RDom r(0, 50);
        f(x, r) = f(x, r) * f(x, r);
        Buffer<float> halide_result = f.realize(100, 100);

        // 可视化图.
        图92
        // 等效c代码:
        float c_result[100][100];
        for (int y = 0; y < 100; y++) {
            for (int x = 0; x < 100; x++) {
                c_result[y][x] = (x + y)/100.0f;
            }
        }
        for (int x = 0; x < 100; x++) {
            for (int r = 0; r < 50; r++) {
                // 归约域上的循环位于更新步骤所用到的纯变量的循环内部:
                c_result[r][x] = c_result[r][x] * c_result[r][x];
            }
        }

        // 检查结果是否匹配:
        for (int y = 0; y < 100; y++) {
            for (int x = 0; x < 100; x++) {
                if (fabs(halide_result(x, y) - c_result[y][x]) > 0.01f) {
                    printf("halide_result(%d, %d) = %f instead of %f\n",
                           x, y, halide_result(x, y), c_result[y][x]);
                    return -1;
                }
            }
        }
    }

    // 现在，我们将研究更新定义的实际用途：计算直方图.
    {
        // Some image operations cannot be written cleanly as a pure function
        // from an output coordinate to the value stored there. A histogram is
        // the classic case: the natural formulation walks over the input image
        // and bumps one bucket per pixel. This is how that looks in Halide.
        Func histogram("histogram");

        // Pure definition: every bucket starts at zero.
        histogram(x) = 0;

        // Two-dimensional reduction domain spanning the input image.
        RDom r(0, input.width(), 0, input.height());

        // Update definition: for each point of the reduction domain, add one
        // to the bucket selected by the input intensity at that point.
        histogram(input(r.x, r.y)) += 1;

        Buffer<int> halide_result = histogram.realize(256);

        // Plain C++ equivalent, used below as the reference result.
        int c_result[256] = {0};
        const int rows = input.height();
        const int cols = input.width();
        for (int r_y = 0; r_y < rows; r_y++) {
            for (int r_x = 0; r_x < cols; r_x++) {
                c_result[input(r_x, r_y)]++;
            }
        }

        // Compare the two histograms bucket by bucket.
        for (int bucket = 0; bucket < 256; bucket++) {
            const int expected = c_result[bucket];
            const int actual = halide_result(bucket);
            if (expected == actual) {
                continue;
            }
            printf("halide_result(%d) = %d instead of %d\n",
                   bucket, actual, expected);
            return -1;
        }
    }

    // 计划更新步骤
    {
        // 更新步骤中的纯变量，通常可以并行化、向量化、拆分等.

        // 向量化、拆分或并行化属于归约域的变量比较困难。我们将在以后的课程中讨论这个问题.

        // 考虑一下定义:
        Func f;
        f(x, y) = x * y;
        // 将第0行设置为第8行的值
        f(x, 0) = f(x, 8);
        // 将第0列设置为第8列加2
        f(0, y) = f(8, y) + 2;

        // 每个阶段的纯变量可以独立调度。为了控制纯定义，我们像过去一样安排。以下代码仅对纯定义进行向量化和并行化.
        f.vectorize(x, 4).parallel(y);

        // 我们使用Func::update（int）来获取更新步骤的句柄，以便进行调度。下面的行将第一个更新步骤向量化到x上。对于这个更新步骤，我们不能对y做任何操作，因为它不使用y.
        f.update(0).vectorize(x, 4);

        // 现在我们将第二个更新步骤并行化为大小为4的块.
        Var yo, yi;
        f.update(1).split(y, yo, yi, 4).parallel(yo);

        Buffer<int> halide_result = f.realize(16, 16);

        // 可视化图.

         图93

        // 等效的c:
        int c_result[16][16];

        // 纯步骤,在x中向量化，在y中并行化.
        for (int y = 0; y < 16; y++) { // 应该是一个并行for循环
            for (int x_vec = 0; x_vec < 4; x_vec++) {
                int x[] = {x_vec*4, x_vec*4+1, x_vec*4+2, x_vec*4+3};
                c_result[y][x[0]] = x[0] * y;
                c_result[y][x[1]] = x[1] * y;
                c_result[y][x[2]] = x[2] * y;
                c_result[y][x[3]] = x[3] * y;
            }
        }

        // 第一次更新。向量化x.
        for (int x_vec = 0; x_vec < 4; x_vec++) {
            int x[] = {x_vec*4, x_vec*4+1, x_vec*4+2, x_vec*4+3};
            c_result[0][x[0]] = c_result[8][x[0]];
            c_result[0][x[1]] = c_result[8][x[1]];
            c_result[0][x[2]] = c_result[8][x[2]];
            c_result[0][x[3]] = c_result[8][x[3]];
        }

        // 第二次更新。在y中用大小为4的块并行化.
        for (int yo = 0; yo < 4; yo++) { // 应该是一个并行for循环
            for (int yi = 0; yi < 4; yi++) {
                int y = yo*4 + yi;
                c_result[y][0] = c_result[y][8] + 2;
            }
        }

        // 检查结果是否匹配:
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++) {
                if (halide_result(x, y) != c_result[y][x]) {
                    printf("halide_result(%d, %d) = %d instead of %d\n",
                           x, y, halide_result(x, y), c_result[y][x]);
                    return -1;
                }
            }
        }
    }

    // 这包括如何在使用更新步骤的Func中调度变量，但是涉及计算和存储的producer-consumer关系呢？让我们在producer和consumer对中，以producer的身份审视归约.
    {
        // 因为更新会对存储的数组执行多趟操作，所以将它们内联没有意义。因此它们的默认调度是退而求其次的做法：在其consumer的最内层循环中计算它们。考虑这个小例子:
        Func producer, consumer;
        producer(x) = x*2;
        producer(x) += 10;
        consumer(x) = 2 * producer(x);
        Buffer<int> halide_result = consumer.realize(10);

        // 可视化图.

         图94

        // 等效的c:
        int c_result[10];
        for (int x = 0; x < 10; x++)  {
            int producer_storage[1];
            // producer的纯步骤
            producer_storage[0] = x * 2;
            // producer的更新步骤
            producer_storage[0] = producer_storage[0] + 10;
            // consumer的纯步骤
            c_result[x] = 2 * producer_storage[0];
        }

        // 检查结果匹配
        for (int x = 0; x < 10; x++) {
            if (halide_result(x) != c_result[x]) {
                printf("halide_result(%d) = %d instead of %d\n",
                       x, halide_result(x), c_result[x]);
                return -1;
            }
        }

        // 对于所有其他compute_at / store_at选项，归约（reduction）会被放置在consumer循环嵌套中相应的预期位置。
    }

    // 现在，让我们考察归约作为producer-consumer对中的consumer的情形。这要稍微复杂一些.
    {
        {
            // Case 1: consumer仅在纯步骤中引用producer.
            Func producer, consumer;
            // producer的纯步骤.
            producer(x) = x*17;
            consumer(x) = 2 * producer(x);
            consumer(x) += 50;

            // 在这种情况下，生产者的有效计划是默认计划（内联），以及:
            //
            // 1) producer.compute_at(x), 将producer的计算置于consumer纯步骤中x循环的内部.
            //
            // 2) producer.compute_root(), 可以提前计算所有producer.
            //
            // 3) producer.store_root().compute_at(x), 它在x循环的外部为producer分配存储空间，但在循环内部按需填充。
            //
            // 让我们使用选项1.

            producer.compute_at(consumer, x);

            Buffer<int> halide_result = consumer.realize(10);

            // 可视化图.

             图95

            // 等效C代码:
            int c_result[10];
            // consumer的纯步骤
            for (int x = 0; x < 10; x++)  {
                // producer的纯步骤
                int producer_storage[1];
                producer_storage[0] = x * 17;
                c_result[x] = 2 * producer_storage[0];
            }
            // consumer更新步骤
            for (int x = 0; x < 10; x++) {
                c_result[x] += 50;
            }

            // 所有的纯步骤都是在任何更新步骤之前计算的，因此在x上有两个单独的循环.

            // 检查结果是否一致
            for (int x = 0; x < 10; x++) {
                if (halide_result(x) != c_result[x]) {
                    printf("halide_result(%d) = %d instead of %d\n",
                           x, halide_result(x), c_result[x]);
                    return -1;
                }
            }
        }

        {
            // Case 2: consumer 仅在更新步骤中引用producer
            Func producer, consumer;
            producer(x) = x * 17;
            consumer(x) = 100 - x * 10;
            consumer(x) += producer(x);

            // 在consumer的每个x坐标，我们再次计算producer 。这会将producer 代码放在consumer的更新步骤中，因为这是唯一使用producer的步骤。
            producer.compute_at(consumer, x);

            // 但是请注意，我们并没有写成:
            // producer.compute_at(consumer.update(0), x).
            // 调度是针对Func的Vars完成的，Func的Vars在pure（纯）和update（更新）步骤中共享。

            Buffer<int> halide_result = consumer.realize(10);

            // 可视化图.
             图96
            // 等效的C代码:
            int c_result[10];
            //  consumer的纯步骤
            for (int x = 0; x < 10; x++)  {
                c_result[x] = 100 - x * 10;
            }
            //  consumer的更新步骤
            for (int x = 0; x < 10; x++) {
                //  producer的纯步骤
                int producer_storage[1];
                producer_storage[0] = x * 17;
                c_result[x] += producer_storage[0];
            }


            // 检查结果一致性
            for (int x = 0; x < 10; x++) {
                if (halide_result(x) != c_result[x]) {
                    printf("halide_result(%d) = %d instead of %d\n",
                           x, halide_result(x), c_result[x]);
                    return -1;
                }
            }
        }

        {
            // Case 3: consumer 在共享公共变量的多个步骤中引用producer
            Func producer, consumer;
            producer(x) = x * 17;
            consumer(x) = 170 - producer(x);
            consumer(x) += producer(x)/2;

            // 在consumer的每个x坐标我们再次计算producer 。这将producer 代码放在consumer的纯和更新步骤中。因此，最终有两个独立的producer实现，并且产生冗余的工作。
            producer.compute_at(consumer, x);

            Buffer<int> halide_result = consumer.realize(10);

            // 可视化

             图97

            // 等效C代码:
            int c_result[10];
            //  consumer纯步骤
            for (int x = 0; x < 10; x++)  {
                // producer纯步骤
                int producer_storage[1];
                producer_storage[0] = x * 17;
                c_result[x] = 170 - producer_storage[0];
            }
            // consumer更新步骤
            for (int x = 0; x < 10; x++) {
                // producer纯步骤另外一个copy
                int producer_storage[1];
                producer_storage[0] = x * 17;
                c_result[x] += producer_storage[0]/2;
            }

            // 检查结果一致性
            for (int x = 0; x < 10; x++) {
                if (halide_result(x) != c_result[x]) {
                    printf("halide_result(%d) = %d instead of %d\n",
                           x, halide_result(x), c_result[x]);
                    return -1;
                }
            }
        }

        {
            // Case 4: 在多个步骤中，consumer 引用producer，但是不共享公共变量
           
            Func producer, consumer;
            producer(x, y) = (x * y) / 10 + 8;
            consumer(x, y) = x + y;
            consumer(x, 0) = producer(x, x);
            consumer(0, y) = producer(y, 9-y);

            // 这个例子中，producer.compute_at(consumer, x)和producer.compute_at(consumer, y)都行不通，因为无论哪一个都无法覆盖producer的全部用途（总会漏掉其中一处）。所以，要么将producer内联，要么使用producer.compute_root().

            // 假设我们真的希望producer在consumer两个更新步骤的内层循环中compute_at。halide不允许同一个Func有多个不同的调度，但我们可以为producer创建两个包装函数（wrapper），转而对它们进行调度来解决这个问题:

            // 尝试 2:
            Func producer_1, producer_2, consumer_2;
            producer_1(x, y) = producer(x, y);
            producer_2(x, y) = producer(x, y);

            consumer_2(x, y) = x + y;
            consumer_2(x, 0) += producer_1(x, x);
            consumer_2(0, y) += producer_2(y, 9-y);

            // 包装函数为producer提供了两个独立的句柄，因此可以对它们分别做不同的调度。
            producer_1.compute_at(consumer_2, x);
            producer_2.compute_at(consumer_2, y);

            Buffer<int> halide_result = consumer_2.realize(10, 10);

            // 可视化图.
             图98
             

            // 等效C:
            int c_result[10][10];
            //  consumer纯步骤
            for (int y = 0; y < 10; y++) {
                for (int x = 0; x < 10; x++) {
                    c_result[y][x] = x + y;
                }
            }
            // consumer的第一个更新步骤
            for (int x = 0; x < 10; x++) {
                int producer_1_storage[1];
                producer_1_storage[0] = (x * x) / 10 + 8;
                c_result[0][x] += producer_1_storage[0];
            }
            // consumer的第二个更新步骤
            for (int y = 0; y < 10; y++) {
                int producer_2_storage[1];
                producer_2_storage[0] = (y * (9-y)) / 10 + 8;
                c_result[y][0] += producer_2_storage[0];
            }

            // 检查结果一致性
            for (int y = 0; y < 10; y++) {
                for (int x = 0; x < 10; x++) {
                    if (halide_result(x, y) != c_result[y][x]) {
                        printf("halide_result(%d, %d) = %d instead of %d\n",
                               x, y, halide_result(x, y), c_result[y][x]);
                        return -1;
                    }
                }
            }
        }

        {
            // Case 5: 在consumer的约简域变量下调度producer 

            //我们不仅限于在consumer的纯变量上调度producers 。如果producers 只在reduction domain（RDom）变量的循环中使用，我们也可以在那里安排producers 。
            Func producer, consumer;

            RDom r(0, 5);
            producer(x) = x % 8;
            consumer(x) = x + 10;
            consumer(x) += r + producer(x + r);

            producer.compute_at(consumer, r);

            Buffer<int> halide_result = consumer.realize(10);

            // 可视化图.

             图99

            // 等效C代码:
            int c_result[10];
            //  consumer纯步骤.
            for (int x = 0; x < 10; x++)  {
                c_result[x] = x + 10;
            }
            //consumer更新步骤.
            for (int x = 0; x < 10; x++) {
                // 约化域（RDom）上的循环总是内环.
                for (int r = 0; r < 5; r++) {
                    // 我们在这里安排了producer 的存储和计算。我们只需要一个值
                    int producer_storage[1];
                    // producer.纯步骤
                    producer_storage[0] = (x + r) % 8;

                    //在consumer的更新步骤中使用.
                    c_result[x] += r + producer_storage[0];
                }
            }

            // 检查结果一致性
            for (int x = 0; x < 10; x++) {
                if (halide_result(x) != c_result[x]) {
                    printf("halide_result(%d) = %d instead of %d\n",
                           x, halide_result(x), c_result[x]);
                    return -1;
                }
            }


        }
    }

    // 一个例子.
    {
        // For convolution-like operations the default schedule of a reduction
        // is already a good one. Example: a 5x5 box blur of the grayscale
        // input with a clamp-to-edge boundary condition.

        // Step one: wrap the input in a boundary condition.
        Func clamped = BoundaryConditions::repeat_edge(input);

        // A 5x5 reduction window that starts at (-2, -2).
        RDom r(-2, 5, -2, 5);

        // Sum the 5x5 neighbourhood of every pixel.
        Func local_sum;
        local_sum(x, y) = 0; // accumulate the sum in 32 bits
        local_sum(x, y) += clamped(x + r.x, y + r.y);

        // Divide by 25 to turn the sum into an average.
        Func blurry;
        blurry(x, y) = cast<uint8_t>(local_sum(x, y) / 25);

        Buffer<uint8_t> halide_result = blurry.realize(input.width(), input.height());

        // The default schedule inlines 'clamped' into the update step of
        // 'local_sum': clamped only has a pure definition, so its default
        // schedule is fully inlined. local_sum is then computed at each x
        // coordinate of blurry, because the default schedule for a reduction
        // is compute-innermost.
        // Plain C++ equivalent:

        const int width = input.width();
        const int height = input.height();
        Buffer<uint8_t> c_result(width, height);
        for (int py = 0; py < height; py++) {
            for (int px = 0; px < width; px++) {
                // Pure step of local_sum.
                int acc = 0;
                // Update step of local_sum over the 5x5 window.
                for (int dy = -2; dy < 3; dy++) {
                    const int sy = std::min(std::max(py + dy, 0), height - 1);
                    for (int dx = -2; dx < 3; dx++) {
                        // The clamp is inlined into the update step.
                        const int sx = std::min(std::max(px + dx, 0), width - 1);
                        acc += input(sx, sy);
                    }
                }
                // Pure step of blurry.
                c_result(px, py) = static_cast<uint8_t>(acc / 25);
            }
        }

        // Compare the Halide output with the reference pixel by pixel.
        for (int py = 0; py < height; py++) {
            for (int px = 0; px < width; px++) {
                if (halide_result(px, py) == c_result(px, py)) {
                    continue;
                }
                printf("halide_result(%d, %d) = %d instead of %d\n",
                       px, py, halide_result(px, py), c_result(px, py));
                return -1;
            }
        }
    }

    // Reduction helper.
    {
        // Halide.h provides a set of reduction helper functions. Each one
        // computes a small reduction and schedules it innermost in its
        // consumer. The most useful of them is "sum".
        Func f1;
        RDom r(0, 100);
        f1(x) = sum(r + x) * 7;

        // sum() builds a small anonymous Func to carry out the reduction.
        // Written out by hand it is equivalent to:
        Func f2;
        Func anon;
        anon(x) = 0;
        anon(x) += r + x;
        f2(x) = anon(x) * 7;

        // So f1 is still a pure function even though it mentions a reduction
        // domain: the RDom has been absorbed into the inner anonymous
        // reduction.

        Buffer<int> halide_result_1 = f1.realize(10);
        Buffer<int> halide_result_2 = f2.realize(10);

        // Plain C++ equivalent, used below as the reference result.
        int c_result[10];
        for (int i = 0; i < 10; i++) {
            int total = 0;
            for (int k = 0; k < 100; k++) {
                total += k + i;
            }
            c_result[i] = total * 7;
        }

        // Both Halide variants must agree with the reference.
        for (int i = 0; i < 10; i++) {
            const int expected = c_result[i];
            if (halide_result_1(i) != expected) {
                printf("halide_result_1(%d) = %d instead of %d\n",
                       i, halide_result_1(i), expected);
                return -1;
            }
            if (halide_result_2(i) != expected) {
                printf("halide_result_2(%d) = %d instead of %d\n",
                       i, halide_result_2(i), expected);
                return -1;
            }
        }
    }

    // 使用reduction helpers的复杂的例子.
    {
        // 其他的reduction helpers包括"product"、"minimum"、"maximum"、"argmin"和"argmax"。使用
        // argmin和argmax需要理解tuples（元组），将在下一章介绍。这里我们用minimum和maximum来计算灰度图的局部扩散度（局部最大值与局部最小值之差）。

        // 第一，给输入添加边界条件.
        Func clamped;
        Expr x_clamped = clamp(x, 0, input.width()-1);
        Expr y_clamped = clamp(y, 0, input.height()-1);
        clamped(x, y) = input(x_clamped, y_clamped);

        RDom box(-2, 5, -2, 5);
        // 计算局部最大值减去局部最小值:
        Func spread;
        spread(x, y) = (maximum(clamped(x + box.x, y + box.y)) -
                        minimum(clamped(x + box.x, y + box.y)));

        // 以32条扫描线为单位计算结果
        Var yo, yi;
        spread.split(y, yo, yi, 32).parallel(yo);

        // 在x方向上做矢量化。在求扩散度中，在x的循环内部，进行隐式的向量化计算，包括minimum and maximum helpers，因此它们也被矢量化。
        spread.vectorize(x, 16);

        // 我们将通过在循环缓冲区中填充每个扫描线来应用边界条件。
        clamped.store_at(spread, yo).compute_at(spread, yi);

        Buffer<uint8_t> halide_result = spread.realize(input.width(), input.height());

        // C等价代码几乎太可怕了，无法想象（我花了很长时间调试）。这一次我想同时计时halide版本和C版本，所以我将使用sse intrinsics进行矢量化，并使用openmp来执行并行for循环（您需要使用-fopenmp或类似的代码来编译以获得正确的计时）
        #ifdef __SSE2__

        // 不要包括分配输出缓冲区所需的时间.
        Buffer<uint8_t> c_result(input.width(), input.height());

        #ifdef _OPENMP
        double t1 = current_time();
        #endif

        // 运行100次这样我们就可以平均计时结果了。.
        for (int iters = 0; iters < 100; iters++) {

            #pragma omp parallel for
            for (int yo = 0; yo < (input.height() + 31)/32; yo++) {
                int y_base = std::min(yo * 32, input.height() - 32);

                // 在大小为8的循环缓冲区中计算clamped（8是大于5的最小的2的幂）。每个线程都需要自己独立分配的缓冲区，因此分配必须放在这里

                int clamped_width = input.width() + 4;
                uint8_t *clamped_storage = (uint8_t *)malloc(clamped_width * 8);

                for (int yi = 0; yi < 32; yi++) {
                    int y = y_base + yi;

                    uint8_t *output_row = &c_result(0, y);

                    // 为此扫描线计算clamped ，跳过此切片中已计算的行。
                    int min_y_clamped = (yi == 0) ? (y - 2) : (y + 2);
                    int max_y_clamped = (y + 2);
                    for (int cy = min_y_clamped; cy <= max_y_clamped; cy++) {
                        // 找出使用位掩码填充循环缓冲区的哪一行:
                        uint8_t *clamped_row =
                            clamped_storage + (cy & 7) * clamped_width;

                        // 通过clamped y坐标，找出我们从哪一行读取输入:
                        int clamped_y = std::min(std::max(cy, 0), input.height()-1);
                        uint8_t *input_row = &input(0, clamped_y);

                        // 用扩充填充.
                        for (int x = -2; x < input.width() + 2; x++) {
                            int clamped_x = std::min(std::max(x, 0), input.width()-1);
                            *clamped_row++ = input_row[clamped_x];
                        }
                    }

                    // 现在在x的向量上迭代输出的纯步骤。.
                    for (int x_vec = 0; x_vec < (input.width() + 15)/16; x_vec++) {
                        int x_base = std::min(x_vec * 16, input.width() - 16);

                        // 为minimum 和maximum helpers分配存储空间。一个向量就够了
                        __m128i minimum_storage, maximum_storage;

                        // maximum 的纯步骤是一个零向量
                        maximum_storage = _mm_setzero_si128();

                        //maximum的更新步骤
                        for (int max_y = y - 2; max_y <= y + 2; max_y++) {
                            uint8_t *clamped_row =
                                clamped_storage + (max_y & 7) * clamped_width;
                            for (int max_x = x_base - 2; max_x <= x_base + 2; max_x++) {
                                __m128i v = _mm_loadu_si128(
                                    (__m128i const *)(clamped_row + max_x + 2));
                                maximum_storage = _mm_max_epu8(maximum_storage, v);
                            }
                        }

                        // minimum的纯步骤是一个所有位全为1的向量（即每个字节都是255）。通过让一个值与自身做相等比较来生成它。
                        minimum_storage = _mm_cmpeq_epi32(_mm_setzero_si128(),
                                                          _mm_setzero_si128());

                        // minimum的更新步骤
                        for (int min_y = y - 2; min_y <= y + 2; min_y++) {
                            uint8_t *clamped_row =
                                clamped_storage + (min_y & 7) * clamped_width;
                            for (int min_x = x_base - 2; min_x <= x_base + 2; min_x++) {
                                __m128i v = _mm_loadu_si128(
                                    (__m128i const *)(clamped_row + min_x + 2));
                                minimum_storage = _mm_min_epu8(minimum_storage, v);
                            }
                        }

                        // 计算扩散度 spread.
                        __m128i spread = _mm_sub_epi8(maximum_storage, minimum_storage);

                        // 保存.
                        _mm_storeu_si128((__m128i *)(output_row + x_base), spread);

                    }
                }

                free(clamped_storage);
            }
        }

        // 如果没有启用openmp，请跳过计时比较。否则对C不公平.
        #ifdef _OPENMP
        double t2 = current_time();

        // 现在再次运行halide版本，而不需要jit编译开销。也要运行一百次.
        for (int iters = 0; iters < 100; iters++) {
            spread.realize(halide_result);
        }

        double t3 = current_time();

        // 报告时间。在我的机器上，400万像素的输入都需要3毫秒（快！），这是有意义的，因为它们使用相同的向量化和并行化策略。但是我发现halide更容易读、写、调试、修改和移植
        printf("Halide spread took %f ms. C equivalent took %f ms\n",
               (t3 - t2)/100, (t2 - t1)/100);

        #endif // _OPENMP
        // 检查结果一致性
        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                if (halide_result(x, y) != c_result(x, y)) {
                    printf("halide_result(%d, %d) = %d instead of %d\n",
                           x, y, halide_result(x, y), c_result(x, y));
                    return -1;
                }
            }
        }

        #endif // __SSE2__
    }

    printf("Success!\n");
    return 0;}
图91
图92
图93
图94
图95
图96
图97
图98
图99
Aoulun
关注
0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
halide编程技术指南（连载五）

本文是halide编程指南的连载，已同步至公众号第九章多重传递函数、更新定义和约化#include "Halide.h"#include <stdio.h>// 我们将在本课使用x86 SSE内部函数.#ifdef __SSE2__#include <emmintrin.h>#endif// 最后我们还需要一个时钟来做性能测试.#include "clock.h"using namespace Halide;// 加载PNG的支持代码.#incl.
复制链接

扫一扫