Halide示例学习四
多阶段计算图
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x("x"), y("y");
// 两阶段的计算图
{
Func producer("producer_default"), consumer("consumer_default");
// 第一阶段:数学计算
producer(x, y) = sin(x * y);
// 第二阶段:求平均
consumer(x, y) = (producer(x, y) +
producer(x, y + 1) +
producer(x + 1, y) +
producer(x + 1, y + 1)) / 4;
// 开启追踪
consumer.trace_stores();
producer.trace_stores();
// 实例化为4x4的空间
printf("\nEvaluating producer-consumer pipeline with default schedule\n");
// 从这里也可以看出,这两个阶段不是先计算完producer然后再计算consumer
// 而是最后组成了consumer这个整体的计算公式
consumer.realize({4, 4});
// 同下述的公式一致
// consumer(x, y) = (sin(x * y) +
// sin(x * (y + 1)) +
// sin((x + 1) * y) +
// sin((x + 1) * (y + 1))/4);
// 等效C代码如下所示
float result[4][4];
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
result[y][x] = (sin(x * y) +
sin(x * (y + 1)) +
sin((x + 1) * y) +
sin((x + 1) * (y + 1))) / 4;
}
}
printf("\n");
// 使用print_loop_nest,将不会出现producer的任何信息,因为被内建再consumer中
printf("Pseudo-code for the schedule:\n");
consumer.print_loop_nest();
printf("\n");
}
return 0;
}
# 结果如下所示
# 默认是全部使用内建函数
Store consumer_default.0(0, 0) = 0.210368
Store consumer_default.0(1, 0) = 0.437692
Store consumer_default.0(2, 0) = 0.262604
Store consumer_default.0(3, 0) = -0.153921
Store consumer_default.0(0, 1) = 0.437692
Store consumer_default.0(1, 1) = 0.475816
Store consumer_default.0(2, 1) = 0.003550
Store consumer_default.0(3, 1) = 0.023565
Store consumer_default.0(0, 2) = 0.262604
Store consumer_default.0(1, 2) = 0.003550
Store consumer_default.0(2, 2) = -0.225879
Store consumer_default.0(3, 2) = 0.146372
Store consumer_default.0(0, 3) = -0.153921
Store consumer_default.0(1, 3) = 0.023565
Store consumer_default.0(2, 3) = 0.146372
Store consumer_default.0(3, 3) = -0.237233
多计算图组合示例
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x("x"), y("y");
// 使用compute_root来强制使得计算图进行计算
{
Func producer("producer_root"), consumer("consumer_root");
producer(x, y) = sin(x * y);
consumer(x, y) = (producer(x, y) +
producer(x, y + 1) +
producer(x + 1, y) +
producer(x + 1, y + 1)) / 4;
// 让Halide在计算consumer前先计算producer
producer.compute_root();
consumer.trace_stores();
producer.trace_stores();
printf("\nEvaluating producer.compute_root()\n");
consumer.realize({4, 4});
// 输出的计算过程中将会出现producer的计算过程
// 等效C代码
float result[4][4];
float producer_storage[5][5];
// 先计算producer_storage
for (int y = 0; y < 5; y++) {
for (int x = 0; x < 5; x++) {
producer_storage[y][x] = sin(x * y);
}
}
// 然后计算result
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
result[y][x] = (producer_storage[y][x] +
producer_storage[y + 1][x] +
producer_storage[y][x + 1] +
producer_storage[y + 1][x + 1]) / 4;
}
}
// 实例化最终的输出尺寸4x4的时候,Halide会自动进行尺寸的反向推导
// 知道producer的尺寸为5x5
printf("Pseudo-code for the schedule:\n");
consumer.print_loop_nest();
printf("\n");
}
/* 现在来比较上述两者的差别,特别是数据存取次数、计算次数方面
全内建计算图 (默认的调度):
- 分配的临时内存: 0
- 取数操作: 0
- 存数操作: 16
- sin函数的调用次数: 64
分步计算:
- 分配的临时内存: 25 floats
- 取数操作: 64
- 存数操作: 41
- sin函数的调用次数: 25
return 0;
}
#结果
Store producer_root.0(0, 0) = 0.000000
Store producer_root.0(1, 0) = 0.000000
Store producer_root.0(2, 0) = 0.000000
Store producer_root.0(3, 0) = 0.000000
Store producer_root.0(4, 0) = 0.000000
Store producer_root.0(0, 1) = 0.000000
Store producer_root.0(1, 1) = 0.841471
Store producer_root.0(2, 1) = 0.909297
Store producer_root.0(3, 1) = 0.141120
Store producer_root.0(4, 1) = -0.756802
Store producer_root.0(0, 2) = 0.000000
Store producer_root.0(1, 2) = 0.909297
Store producer_root.0(2, 2) = -0.756802
Store producer_root.0(3, 2) = -0.279415
Store producer_root.0(4, 2) = 0.989358
Store producer_root.0(0, 3) = 0.000000
Store producer_root.0(1, 3) = 0.141120
Store producer_root.0(2, 3) = -0.279415
Store producer_root.0(3, 3) = 0.412118
Store producer_root.0(4, 3) = -0.536573
Store producer_root.0(0, 4) = 0.000000
Store producer_root.0(1, 4) = -0.756802
Store producer_root.0(2, 4) = 0.989358
Store producer_root.0(3, 4) = -0.536573
Store producer_root.0(4, 4) = -0.287903
Store consumer_root.0(0, 0) = 0.210368
Store consumer_root.0(1, 0) = 0.437692
Store consumer_root.0(2, 0) = 0.262604
Store consumer_root.0(3, 0) = -0.153921
Store consumer_root.0(0, 1) = 0.437692
Store consumer_root.0(1, 1) = 0.475816
Store consumer_root.0(2, 1) = 0.003550
Store consumer_root.0(3, 1) = 0.023565
Store consumer_root.0(0, 2) = 0.262604
Store consumer_root.0(1, 2) = 0.003550
Store consumer_root.0(2, 2) = -0.225879
Store consumer_root.0(3, 2) = 0.146372
Store consumer_root.0(0, 3) = -0.153921
Store consumer_root.0(1, 3) = 0.023565
Store consumer_root.0(2, 3) = 0.146372
Store consumer_root.0(3, 3) = -0.237233
示意图显示结果如下所示:
compute_at使用方法
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x("x"), y("y");
// 介于内建和非内建之间的部分
{
Func producer("producer_y"), consumer("consumer_y");
producer(x, y) = sin(x * y);
consumer(x, y) = (producer(x, y) +
producer(x, y + 1) +
producer(x + 1, y) +
producer(x + 1, y + 1)) / 4;
// 当compute_at(consumer,x)时跟全内建的形式非常相同
/*
a.compute_at(b,y):表示在y这个循环块内插入一块计算在y固定的时候,计算必要的a的代码
*/
producer.compute_at(consumer, y);
producer.trace_stores();
consumer.trace_stores();
printf("\nEvaluating producer.compute_at(consumer, y)\n");
consumer.realize({4, 4});
// 等效C代码如下所示
float result[4][4];
for (int y = 0; y < 4; y++) {
// 为了能够满足计算结果一行的需求,我们知道中间输入至少要两行
float producer_storage[2][5];
for (int py = y; py < y + 2; py++) {
for (int px = 0; px < 5; px++) {
producer_storage[py - y][px] = sin(px * py);
}
}
// Compute a scanline of the consumer.
for (int x = 0; x < 4; x++) {
result[y][x] = (producer_storage[0][x] +
producer_storage[1][x] +
producer_storage[0][x + 1] +
producer_storage[1][x + 1]) / 4;
}
}
printf("Pseudo-code for the schedule:\n");
consumer.print_loop_nest();
printf("\n");
// 这种做法介于内建和非内建之间,好处就是可以利用数据的局部性
// 不仅减少了中间结果缓存空间,同时利用了高速缓存实现快速读取
// producer.compute_at(consumer, y):
// - 缓存空间: 10 floats
// - 读: 64
// - 存: 56
// - sin调用次数: 40
}
return 0;
}
# 结果如下所示
Store producer_y.0(0, 0) = 0.000000
Store producer_y.0(1, 0) = 0.000000
Store producer_y.0(2, 0) = 0.000000
Store producer_y.0(3, 0) = 0.000000
Store producer_y.0(4, 0) = 0.000000
Store producer_y.0(0, 1) = 0.000000
Store producer_y.0(1, 1) = 0.841471
Store producer_y.0(2, 1) = 0.909297
Store producer_y.0(3, 1) = 0.141120
Store producer_y.0(4, 1) = -0.756802
Store consumer_y.0(0, 0) = 0.210368
Store consumer_y.0(1, 0) = 0.437692
Store consumer_y.0(2, 0) = 0.262604
Store consumer_y.0(3, 0) = -0.153921
Store producer_y.0(0, 1) = 0.000000
Store producer_y.0(1, 1) = 0.841471
Store producer_y.0(2, 1) = 0.909297
Store producer_y.0(3, 1) = 0.141120
Store producer_y.0(4, 1) = -0.756802
Store producer_y.0(0, 2) = 0.000000
Store producer_y.0(1, 2) = 0.909297
Store producer_y.0(2, 2) = -0.756802
Store producer_y.0(3, 2) = -0.279415
Store producer_y.0(4, 2) = 0.989358
Store consumer_y.0(0, 1) = 0.437692
Store consumer_y.0(1, 1) = 0.475816
Store consumer_y.0(2, 1) = 0.003550
Store consumer_y.0(3, 1) = 0.023565
Store producer_y.0(0, 2) = 0.000000
Store producer_y.0(1, 2) = 0.909297
Store producer_y.0(2, 2) = -0.756802
Store producer_y.0(3, 2) = -0.279415
Store producer_y.0(4, 2) = 0.989358
Store producer_y.0(0, 3) = 0.000000
Store producer_y.0(1, 3) = 0.141120
Store producer_y.0(2, 3) = -0.279415
Store producer_y.0(3, 3) = 0.412118
Store producer_y.0(4, 3) = -0.536573
Store consumer_y.0(0, 2) = 0.262604
Store consumer_y.0(1, 2) = 0.003550
Store consumer_y.0(2, 2) = -0.225879
Store consumer_y.0(3, 2) = 0.146372
Store producer_y.0(0, 3) = 0.000000
Store producer_y.0(1, 3) = 0.141120
Store producer_y.0(2, 3) = -0.279415
Store producer_y.0(3, 3) = 0.412118
Store producer_y.0(4, 3) = -0.536573
Store producer_y.0(0, 4) = 0.000000
Store producer_y.0(1, 4) = -0.756802
Store producer_y.0(2, 4) = 0.989358
Store producer_y.0(3, 4) = -0.536573
Store producer_y.0(4, 4) = -0.287903
Store consumer_y.0(0, 3) = -0.153921
Store consumer_y.0(1, 3) = 0.023565
Store consumer_y.0(2, 3) = 0.146372
Store consumer_y.0(3, 3) = -0.237233
示意图显示结果:
store_root配合compute_at使用方法
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x("x"), y("y");
// 下述展示了强制中间buffer保持计算结果的情况
{
Func producer("producer_root_y"), consumer("consumer_root_y");
producer(x, y) = sin(x * y);
consumer(x, y) = (producer(x, y) +
producer(x, y + 1) +
producer(x + 1, y) +
producer(x + 1, y + 1)) / 4;
// 让producer保持中间计算值
producer.store_root();
// ... but compute it as needed per y coordinate of the
// consumer.
producer.compute_at(consumer, y);
producer.trace_stores();
consumer.trace_stores();
printf("\nEvaluating producer.store_root().compute_at(consumer, y)\n");
consumer.realize({4, 4});
// result/lesson_08_store_root_compute_y.gif
// 等效C代码如下所示
float result[4][4];
float producer_storage[5][5];
for (int y = 0; y < 4; y++) {
// Compute enough of the producer to satisfy this scanline
// of the consumer.
for (int py = y; py < y + 2; py++) {
// 跳过已经经过计算的prouducer点
if (y > 0 && py == y) continue;
for (int px = 0; px < 5; px++) {
producer_storage[py][px] = sin(px * py);
}
}
// Compute a scanline of the consumer.
for (int x = 0; x < 4; x++) {
result[y][x] = (producer_storage[y][x] +
producer_storage[y + 1][x] +
producer_storage[y][x + 1] +
producer_storage[y + 1][x + 1]) / 4;
}
}
printf("Pseudo-code for the schedule:\n");
consumer.print_loop_nest();
printf("\n");
// producer.store_root().compute_at(consumer, y):
// - 分配的中间缓存: 10 floats
// - 取数: 64 = 4 * 4 * 4
// - 存数: 41 = 5 * 5 + 4 * 4
// - sin调用次数: 25 = 5 * 5
// 关于分配的中间缓存为何是10floats
// 主要是因为实际的Halide使用了如下所示的循环中间缓存作为缓存
{
// Actually store 2 scanlines instead of 5
float producer_storage[2][5];
for (int y = 0; y < 4; y++) {
for (int py = y; py < y + 2; py++) {
if (y > 0 && py == y) continue;
for (int px = 0; px < 5; px++) {
// 循环缓存
producer_storage[py & 1][px] = sin(px * py);
}
}
for (int x = 0; x < 4; x++) {
// 循环调用
result[y][x] = (producer_storage[y & 1][x] +
producer_storage[(y + 1) & 1][x] +
producer_storage[y & 1][x + 1] +
producer_storage[(y + 1) & 1][x + 1]) / 4;
}
}
}
}
return 0;
}
示意图显示结果:
tile配合compute_at使用方法
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x("x"), y("y");
{
Func producer("producer_tile"), consumer("consumer_tile");
producer(x, y) = sin(x * y);
consumer(x, y) = (producer(x, y) +
producer(x, y + 1) +
producer(x + 1, y) +
producer(x + 1, y + 1)) / 4;
Var x_outer, y_outer, x_inner, y_inner;
consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);
producer.compute_at(consumer, x_outer);
producer.trace_stores();
consumer.trace_stores();
printf("\nEvaluating:\n"
"consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);\n"
"producer.compute_at(consumer, x_outer);\n");
consumer.realize({8, 8});
float result[8][8];
// For every tile of the consumer:
for (int y_outer = 0; y_outer < 2; y_outer++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
// 瓦片下标配合
int x_base = x_outer * 4;
int y_base = y_outer * 4;
// 下面这个内循环就是因为compute_as(,x_outer)的缘故,在此处进行了producer的必要的部分计算
float producer_storage[5][5];
for (int py = y_base; py < y_base + 5; py++) {
for (int px = x_base; px < x_base + 5; px++) {
producer_storage[py - y_base][px - x_base] = sin(px * py);
}
}
// 一个瓦片的内容
for (int y_inner = 0; y_inner < 4; y_inner++) {
for (int x_inner = 0; x_inner < 4; x_inner++) {
int x = x_base + x_inner;
int y = y_base + y_inner;
result[y][x] =
(producer_storage[y - y_base][x - x_base] +
producer_storage[y - y_base + 1][x - x_base] +
producer_storage[y - y_base][x - x_base + 1] +
producer_storage[y - y_base + 1][x - x_base + 1]) / 4;
}
}
}
}
printf("Pseudo-code for the schedule:\n");
consumer.print_loop_nest();
printf("\n");
}
return 0;
}
示意图显示结果:
compute_at和store_at以及之前的混合技使用方法
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x("x"), y("y");
{
Func producer("producer_mixed"), consumer("consumer_mixed");
producer(x, y) = sin(x * y);
consumer(x, y) = (producer(x, y) +
producer(x, y + 1) +
producer(x + 1, y) +
producer(x + 1, y + 1)) / 4;
// y轴拆分成内16的双层循环
Var yo, yi;
consumer.split(y, yo, yi, 16);
// 并行化最外层循环yo
consumer.parallel(yo);
// consumer进行x轴进行4位向量化
consumer.vectorize(x, 4);
// 在yo轴内,创建一个中间缓存来保存prouducer计算结果
producer.store_at(consumer, yo);
// 在yi轴内,创建一个代码块计算yi内consumer所需的prouducer
producer.compute_at(consumer, yi);
// producer的x轴也进行4位向量化
producer.vectorize(x, 4);
Buffer<float> halide_result = consumer.realize({160, 160});
// 自己理解的C等效代码
{
float Consumer[160][160];
// omp loop
for(int yo = 0; yo < 10; yo++){
float producer_storage[17][161];// store_at
for(int yi = 0; yi < 16; yi++){
int yy = yo * 16 + yi;
// 计算所需的producer_storage
// compute_at
for(int x4 = 0; x4 < 160; x4++){
// 1部分内部应该是向量化4
producer_storage[yy&15][x4] = sin(yy*x4);
producer_storage[yy&15][x4 + 1] = sin(yy*(x4+1));
producer_storage[(yy + 1)&15][x4] = sin((yy + 1)*x4);
producer_storage[(yy+1)&15][(x4+1)] = sin((yy+1)*(x4+1));
}
for(int x4 = 0; x4 < 160; x4++){
Consumer[yy][x4] = producer_storage[yy&15][x4];
Consumer[yy][x4 + 1] = producer_storage[yy&15][x4 + 1];
Consumer[(yy + 1)][x4] = producer_storage[(yy + 1)&15][x4];
Consumer[(yy+1)][(x4+1)] = producer_storage[(yy+1)&15][(x4+1)];
}
}
}
}
// 下述是官方的C等效代码
float c_result[160][160];
// For every strip of 16 scanlines (this loop is parallel in
// the Halide version)
for (int yo = 0; yo < 160 / 16; yo++) {
int y_base = yo * 16;
// Allocate a two-scanline circular buffer for the producer
float producer_storage[2][161];
// For every scanline in the strip of 16:
for (int yi = 0; yi < 16; yi++) {
int y = y_base + yi;
for (int py = y; py < y + 2; py++) {
// store_at的影响
if (yi > 0 && py == y) continue;
for (int x_vec = 0; x_vec < 160 / 4 + 1; x_vec++) {
int x_base = x_vec * 4;
// 4 doesn't divide 161, so push the last vector left
// (see lesson 05).
if (x_base > 161 - 4) x_base = 161 - 4;
// If you're on x86, Halide generates SSE code for this part:
int x[] = {x_base, x_base + 1, x_base + 2, x_base + 3};
float vec[4] = {sinf(x[0] * py), sinf(x[1] * py),
sinf(x[2] * py), sinf(x[3] * py)};
producer_storage[py & 1][x[0]] = vec[0];
producer_storage[py & 1][x[1]] = vec[1];
producer_storage[py & 1][x[2]] = vec[2];
producer_storage[py & 1][x[3]] = vec[3];
}
}
// Now compute consumer for this scanline:
for (int x_vec = 0; x_vec < 160 / 4; x_vec++) {
int x_base = x_vec * 4;
// Again, Halide's equivalent here uses SSE.
int x[] = {x_base, x_base + 1, x_base + 2, x_base + 3};
float vec[] = {
(producer_storage[y & 1][x[0]] +
producer_storage[(y + 1) & 1][x[0]] +
producer_storage[y & 1][x[0] + 1] +
producer_storage[(y + 1) & 1][x[0] + 1]) /
4,
(producer_storage[y & 1][x[1]] +
producer_storage[(y + 1) & 1][x[1]] +
producer_storage[y & 1][x[1] + 1] +
producer_storage[(y + 1) & 1][x[1] + 1]) /
4,
(producer_storage[y & 1][x[2]] +
producer_storage[(y + 1) & 1][x[2]] +
producer_storage[y & 1][x[2] + 1] +
producer_storage[(y + 1) & 1][x[2] + 1]) /
4,
(producer_storage[y & 1][x[3]] +
producer_storage[(y + 1) & 1][x[3]] +
producer_storage[y & 1][x[3] + 1] +
producer_storage[(y + 1) & 1][x[3] + 1]) /
4};
c_result[y][x[0]] = vec[0];
c_result[y][x[1]] = vec[1];
c_result[y][x[2]] = vec[2];
c_result[y][x[3]] = vec[3];
}
}
}
printf("Pseudo-code for the schedule:\n");
consumer.print_loop_nest();
printf("\n");
// Look on my code, ye mighty, and despair!
// Let's check the C result against the Halide result. Doing
// this I found several bugs in my C implementation, which
// should tell you something.
for (int y = 0; y < 160; y++) {
for (int x = 0; x < 160; x++) {
float error = halide_result(x, y) - c_result[y][x];
// It's floating-point math, so we'll allow some slop:
if (error < -0.001f || error > 0.001f) {
printf("halide_result(%d, %d) = %f instead of %f\n",
x, y, halide_result(x, y), c_result[y][x]);
return -1;
}
}
}
}
return 0;
}
示意图显示结果: