默认的维度计算顺序
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
// 该示例演示了如何观察计算的维度变化顺序以及如何改变计算顺序
Var x("x"), y("y");
// 首先来观察一个默认的计算顺序
Func gradient("gradient");
gradient(x, y) = x + y;
gradient.trace_stores();
/* 一般地,图像中x表示列,y表示行
那么一般是先遍历一行,也就是从左往右,从上往下的遍历顺序
columns-列,rows-行
行-列遍历计算顺序
*/
printf("Evaluating gradient row-major\n");
Buffer<int> output = gradient.realize({4, 4});
// 等价的C代码如下所示:
printf("Equivalent C:\n");
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
printf("\n\n");
// 使用print_loop_nest函数可以打印出怎样的调度信息和循环嵌套
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
// 上述将会打印出如下所示的调度信息
// compute gradient:
// for y:
// for x:
// gradient(...) = ...
return 0;
}
# 运行结果如下所示:
Begin pipeline gradient.0()
Tag gradient.0() tag = "func_type_and_dim: 1 0 32 1 2 0 4 0 4"
Store gradient.0(0, 0) = 0
Store gradient.0(1, 0) = 1
Store gradient.0(2, 0) = 2
Store gradient.0(3, 0) = 3
Store gradient.0(0, 1) = 1
Store gradient.0(1, 1) = 2
Store gradient.0(2, 1) = 3
Store gradient.0(3, 1) = 4
Store gradient.0(0, 2) = 2
Store gradient.0(1, 2) = 3
Store gradient.0(2, 2) = 4
Store gradient.0(3, 2) = 5
Store gradient.0(0, 3) = 3
Store gradient.0(1, 3) = 4
Store gradient.0(2, 3) = 5
Store gradient.0(3, 3) = 6
End pipeline gradient.0()
Evaluating gradient row-major
Equivalent C:
Evaluating at x = 0, y = 0: 0
Evaluating at x = 1, y = 0: 1
Evaluating at x = 2, y = 0: 2
Evaluating at x = 3, y = 0: 3
Evaluating at x = 0, y = 1: 1
Evaluating at x = 1, y = 1: 2
Evaluating at x = 2, y = 1: 3
Evaluating at x = 3, y = 1: 4
Evaluating at x = 0, y = 2: 2
Evaluating at x = 1, y = 2: 3
Evaluating at x = 2, y = 2: 4
Evaluating at x = 3, y = 2: 5
Evaluating at x = 0, y = 3: 3
Evaluating at x = 1, y = 3: 4
Evaluating at x = 2, y = 3: 5
Evaluating at x = 3, y = 3: 6
Pseudo-code for the schedule:
produce gradient:
for y:
for x:
gradient(...) = ...
维度顺序重排操作
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Func gradient("gradient_col_major");
Var x,y;
gradient(x, y) = x + y;
gradient.trace_stores();
// 我们可以通过reorder类方法来进行调度的重排
gradient.reorder(y, x);
// 上述交换了x和y这两个轴的计算先后顺序
// 也就是说现在是列-行的计算顺序
printf("Evaluating gradient column-major\n");
Buffer<int> output = gradient.realize({4, 4});
// 以下是等效的C代码
printf("Equivalent C:\n");
for (int x = 0; x < 4; x++) {
for (int y = 0; y < 4; y++) {
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
return 0;
}
#运行结果如下所示
Begin pipeline gradient_col_major.0()
Tag gradient_col_major.0() tag = "func_type_and_dim: 1 0 32 1 2 0 4 0 4"
Store gradient_col_major.0(0, 0) = 0
Store gradient_col_major.0(0, 1) = 1
Store gradient_col_major.0(0, 2) = 2
Store gradient_col_major.0(0, 3) = 3
Store gradient_col_major.0(1, 0) = 1
Store gradient_col_major.0(1, 1) = 2
Store gradient_col_major.0(1, 2) = 3
Store gradient_col_major.0(1, 3) = 4
Store gradient_col_major.0(2, 0) = 2
Store gradient_col_major.0(2, 1) = 3
Store gradient_col_major.0(2, 2) = 4
Store gradient_col_major.0(2, 3) = 5
Store gradient_col_major.0(3, 0) = 3
Store gradient_col_major.0(3, 1) = 4
Store gradient_col_major.0(3, 2) = 5
Store gradient_col_major.0(3, 3) = 6
End pipeline gradient_col_major.0()
Evaluating gradient column-major
Equivalent C:
Evaluating at x = 0, y = 0: 0
Evaluating at x = 0, y = 1: 1
Evaluating at x = 0, y = 2: 2
Evaluating at x = 0, y = 3: 3
Evaluating at x = 1, y = 0: 1
Evaluating at x = 1, y = 1: 2
Evaluating at x = 1, y = 2: 3
Evaluating at x = 1, y = 3: 4
Evaluating at x = 2, y = 0: 2
Evaluating at x = 2, y = 1: 3
Evaluating at x = 2, y = 2: 4
Evaluating at x = 2, y = 3: 5
Evaluating at x = 3, y = 0: 3
Evaluating at x = 3, y = 1: 4
Evaluating at x = 3, y = 2: 5
Evaluating at x = 3, y = 3: 6
Pseudo-code for the schedule:
produce gradient_col_major:
for x:
for y:
gradient_col_major(...) = ...
维度拆分
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x,y;
Func gradient("gradient_split");
gradient(x, y) = x + y;
gradient.trace_stores();
// 以下的split进行了单层循环到多层循环嵌套的变化
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner, 2);
/* 以下的C代码展示了相应的循环嵌套转换结果
原始的x_dim=x_outer*factor+x_inner
*/
printf("Evaluating gradient with x split into x_outer and x_inner \n");
Buffer<int> output = gradient.realize({4, 4});
printf("Equivalent C:\n");
for (int y = 0; y < 4; y++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
for (int x_inner = 0; x_inner < 2; x_inner++) {
int x = x_outer * 2 + x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
/* 打印出来的结果如下所示:
produce gradient_split:
for y:
for x.v0:
for x.v1 in [0, 1]:
gradient_split(...) = ...
*/
gradient.print_loop_nest();
printf("\n");
/*
说明:尽管进行了维度拆分,但是并没有影响那个像素点的计算顺序
*/
return 0;
}
#运行结果如下所示
Begin pipeline gradient_split.0()
Tag gradient_split.0() tag = "func_type_and_dim: 1 0 32 1 2 0 4 0 4"
Store gradient_split.0(0, 0) = 0
Store gradient_split.0(1, 0) = 1
Store gradient_split.0(2, 0) = 2
Store gradient_split.0(3, 0) = 3
Store gradient_split.0(0, 1) = 1
Store gradient_split.0(1, 1) = 2
Store gradient_split.0(2, 1) = 3
Store gradient_split.0(3, 1) = 4
Store gradient_split.0(0, 2) = 2
Store gradient_split.0(1, 2) = 3
Store gradient_split.0(2, 2) = 4
Store gradient_split.0(3, 2) = 5
Store gradient_split.0(0, 3) = 3
Store gradient_split.0(1, 3) = 4
Store gradient_split.0(2, 3) = 5
Store gradient_split.0(3, 3) = 6
End pipeline gradient_split.0()
Evaluating gradient with x split into x_outer and x_inner
Equivalent C:
Evaluating at x = 0, y = 0: 0
Evaluating at x = 1, y = 0: 1
Evaluating at x = 2, y = 0: 2
Evaluating at x = 3, y = 0: 3
Evaluating at x = 0, y = 1: 1
Evaluating at x = 1, y = 1: 2
Evaluating at x = 2, y = 1: 3
Evaluating at x = 3, y = 1: 4
Evaluating at x = 0, y = 2: 2
Evaluating at x = 1, y = 2: 3
Evaluating at x = 2, y = 2: 4
Evaluating at x = 3, y = 2: 5
Evaluating at x = 0, y = 3: 3
Evaluating at x = 1, y = 3: 4
Evaluating at x = 2, y = 3: 5
Evaluating at x = 3, y = 3: 6
Pseudo-code for the schedule:
produce gradient_split:
for y:
for x.v0:
for x.v1 in [0, 1]:
gradient_split(...) = ...
维度融合
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x,y;
Func gradient("gradient_fused");
gradient(x, y) = x + y;
// 维度拆分的反例就是维度融合
// 通过融合可以把双循环变成单循环,同时像素点的计算顺序不变
Var fused;
gradient.fuse(x, y, fused);
printf("Evaluating gradient with x and y fused\n");
Buffer<int> output = gradient.realize({4, 4});
printf("Equivalent C:\n");
for (int fused = 0; fused < 4 * 4; fused++) {
int y = fused / 4;
int x = fused % 4;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
return 0;
}
#运行结果如下所示
Evaluating gradient with x and y fused
Equivalent C:
Evaluating at x = 0, y = 0: 0
Evaluating at x = 1, y = 0: 1
Evaluating at x = 2, y = 0: 2
Evaluating at x = 3, y = 0: 3
Evaluating at x = 0, y = 1: 1
Evaluating at x = 1, y = 1: 2
Evaluating at x = 2, y = 1: 3
Evaluating at x = 3, y = 1: 4
Evaluating at x = 0, y = 2: 2
Evaluating at x = 1, y = 2: 3
Evaluating at x = 2, y = 2: 4
Evaluating at x = 3, y = 2: 5
Evaluating at x = 0, y = 3: 3
Evaluating at x = 1, y = 3: 4
Evaluating at x = 2, y = 3: 5
Evaluating at x = 3, y = 3: 6
Pseudo-code for the schedule:
produce gradient_fused:
for x.v2:
gradient_fused(...) = ...
维度平铺
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
//平铺操作,类似于取一个图像中相邻的几个像素点的操作,比较像卷积操作
Var x,y;
Func gradient("gradient_tiled");
gradient(x, y) = x + y;
gradient.trace_stores();
// 平铺可以是维度拆分+循环重排的组合技能
Var x_outer, x_inner, y_outer, y_inner;
gradient.split(x, x_outer, x_inner, 4);
gradient.split(y, y_outer, y_inner, 4);
gradient.reorder(x_inner, y_inner, x_outer, y_outer);
// 使用tile接口就是如下所示的操作
// gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);
printf("Evaluating gradient in 4x4 tiles\n");
Buffer<int> output = gradient.realize({8, 8});
// 示意图可以查看lesson_05_tiled.gif
printf("Equivalent C:\n");
for (int y_outer = 0; y_outer < 2; y_outer++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
for (int y_inner = 0; y_inner < 4; y_inner++) {
for (int x_inner = 0; x_inner < 4; x_inner++) {
int x = x_outer * 4 + x_inner;
int y = y_outer * 4 + y_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
return 0;
}
结果的动态展示图如下所示:
#运行结果如下所示
Begin pipeline gradient_tiled.0()
Tag gradient_tiled.0() tag = "func_type_and_dim: 1 0 32 1 2 0 8 0 8"
Store gradient_tiled.0(0, 0) = 0
Store gradient_tiled.0(1, 0) = 1
Store gradient_tiled.0(2, 0) = 2
Store gradient_tiled.0(3, 0) = 3
Store gradient_tiled.0(0, 1) = 1
Store gradient_tiled.0(1, 1) = 2
Store gradient_tiled.0(2, 1) = 3
Store gradient_tiled.0(3, 1) = 4
Store gradient_tiled.0(0, 2) = 2
Store gradient_tiled.0(1, 2) = 3
Store gradient_tiled.0(2, 2) = 4
Store gradient_tiled.0(3, 2) = 5
Store gradient_tiled.0(0, 3) = 3
Store gradient_tiled.0(1, 3) = 4
Store gradient_tiled.0(2, 3) = 5
Store gradient_tiled.0(3, 3) = 6
Store gradient_tiled.0(4, 0) = 4
Store gradient_tiled.0(5, 0) = 5
Store gradient_tiled.0(6, 0) = 6
Store gradient_tiled.0(7, 0) = 7
Store gradient_tiled.0(4, 1) = 5
Store gradient_tiled.0(5, 1) = 6
Store gradient_tiled.0(6, 1) = 7
Store gradient_tiled.0(7, 1) = 8
Store gradient_tiled.0(4, 2) = 6
Store gradient_tiled.0(5, 2) = 7
Store gradient_tiled.0(6, 2) = 8
Store gradient_tiled.0(7, 2) = 9
Store gradient_tiled.0(4, 3) = 7
Store gradient_tiled.0(5, 3) = 8
Store gradient_tiled.0(6, 3) = 9
Store gradient_tiled.0(7, 3) = 10
Store gradient_tiled.0(0, 4) = 4
Store gradient_tiled.0(1, 4) = 5
Store gradient_tiled.0(2, 4) = 6
Store gradient_tiled.0(3, 4) = 7
Store gradient_tiled.0(0, 5) = 5
Store gradient_tiled.0(1, 5) = 6
Store gradient_tiled.0(2, 5) = 7
Store gradient_tiled.0(3, 5) = 8
Store gradient_tiled.0(0, 6) = 6
Store gradient_tiled.0(1, 6) = 7
Store gradient_tiled.0(2, 6) = 8
Store gradient_tiled.0(3, 6) = 9
Store gradient_tiled.0(0, 7) = 7
Store gradient_tiled.0(1, 7) = 8
Store gradient_tiled.0(2, 7) = 9
Store gradient_tiled.0(3, 7) = 10
Store gradient_tiled.0(4, 4) = 8
Store gradient_tiled.0(5, 4) = 9
Store gradient_tiled.0(6, 4) = 10
Store gradient_tiled.0(7, 4) = 11
Store gradient_tiled.0(4, 5) = 9
Store gradient_tiled.0(5, 5) = 10
Store gradient_tiled.0(6, 5) = 11
Store gradient_tiled.0(7, 5) = 12
Store gradient_tiled.0(4, 6) = 10
Store gradient_tiled.0(5, 6) = 11
Store gradient_tiled.0(6, 6) = 12
Store gradient_tiled.0(7, 6) = 13
Store gradient_tiled.0(4, 7) = 11
Store gradient_tiled.0(5, 7) = 12
Store gradient_tiled.0(6, 7) = 13
Store gradient_tiled.0(7, 7) = 14
End pipeline gradient_tiled.0()
Evaluating gradient in 4x4 tiles
Equivalent C:
Evaluating at x = 0, y = 0: 0
Evaluating at x = 1, y = 0: 1
Evaluating at x = 2, y = 0: 2
Evaluating at x = 3, y = 0: 3
Evaluating at x = 0, y = 1: 1
Evaluating at x = 1, y = 1: 2
Evaluating at x = 2, y = 1: 3
Evaluating at x = 3, y = 1: 4
Evaluating at x = 0, y = 2: 2
Evaluating at x = 1, y = 2: 3
Evaluating at x = 2, y = 2: 4
Evaluating at x = 3, y = 2: 5
Evaluating at x = 0, y = 3: 3
Evaluating at x = 1, y = 3: 4
Evaluating at x = 2, y = 3: 5
Evaluating at x = 3, y = 3: 6
Evaluating at x = 4, y = 0: 4
Evaluating at x = 5, y = 0: 5
Evaluating at x = 6, y = 0: 6
Evaluating at x = 7, y = 0: 7
Evaluating at x = 4, y = 1: 5
Evaluating at x = 5, y = 1: 6
Evaluating at x = 6, y = 1: 7
Evaluating at x = 7, y = 1: 8
Evaluating at x = 4, y = 2: 6
Evaluating at x = 5, y = 2: 7
Evaluating at x = 6, y = 2: 8
Evaluating at x = 7, y = 2: 9
Evaluating at x = 4, y = 3: 7
Evaluating at x = 5, y = 3: 8
Evaluating at x = 6, y = 3: 9
Evaluating at x = 7, y = 3: 10
Evaluating at x = 0, y = 4: 4
Evaluating at x = 1, y = 4: 5
Evaluating at x = 2, y = 4: 6
Evaluating at x = 3, y = 4: 7
Evaluating at x = 0, y = 5: 5
Evaluating at x = 1, y = 5: 6
Evaluating at x = 2, y = 5: 7
Evaluating at x = 3, y = 5: 8
Evaluating at x = 0, y = 6: 6
Evaluating at x = 1, y = 6: 7
Evaluating at x = 2, y = 6: 8
Evaluating at x = 3, y = 6: 9
Evaluating at x = 0, y = 7: 7
Evaluating at x = 1, y = 7: 8
Evaluating at x = 2, y = 7: 9
Evaluating at x = 3, y = 7: 10
Evaluating at x = 4, y = 4: 8
Evaluating at x = 5, y = 4: 9
Evaluating at x = 6, y = 4: 10
Evaluating at x = 7, y = 4: 11
Evaluating at x = 4, y = 5: 9
Evaluating at x = 5, y = 5: 10
Evaluating at x = 6, y = 5: 11
Evaluating at x = 7, y = 5: 12
Evaluating at x = 4, y = 6: 10
Evaluating at x = 5, y = 6: 11
Evaluating at x = 6, y = 6: 12
Evaluating at x = 7, y = 6: 13
Evaluating at x = 4, y = 7: 11
Evaluating at x = 5, y = 7: 12
Evaluating at x = 6, y = 7: 13
Evaluating at x = 7, y = 7: 14
Pseudo-code for the schedule:
produce gradient_tiled:
for y.v5:
for x.v3:
for y.v6 in [0, 3]:
for x.v4 in [0, 3]:
gradient_tiled(...) = ...
向量化操作
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
// 向量化操作
Var x,y;
Func gradient("gradient_in_vectors");
gradient(x, y) = x + y;
gradient.trace_stores();
// 使用向量化操作,在x86平台上可以使用SSE指令集进行长度为4的向量加速
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner, 4);
gradient.vectorize(x_inner);
// 可以使用以下操作等效上述两个语句
// gradient.vectorize(x, 4);
printf("Evaluating gradient with x_inner vectorized \n");
Buffer<int> output = gradient.realize({8, 4});
// 动态图可见lesson_05_vectors.gif
printf("Equivalent C:\n");
for (int y = 0; y < 4; y++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
// 内部的循环已经被指令集加速进行替换
int x_vec[] = {x_outer * 4 + 0,
x_outer * 4 + 1,
x_outer * 4 + 2,
x_outer * 4 + 3};
int val[] = {x_vec[0] + y,
x_vec[1] + y,
x_vec[2] + y,
x_vec[3] + y};
printf("Evaluating at <%d, %d, %d, %d>, <%d, %d, %d, %d>:"
" <%d, %d, %d, %d>\n",
x_vec[0], x_vec[1], x_vec[2], x_vec[3],
y, y, y, y,
val[0], val[1], val[2], val[3]);
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
return 0;
}
向量化的示意图如下所示:
#运行结果如下所示
Begin pipeline gradient_in_vectors.0()
Tag gradient_in_vectors.0() tag = "func_type_and_dim: 1 0 32 1 2 0 8 0 4"
Store gradient_in_vectors.0(<0, 1, 2, 3>, <0, 0, 0, 0>) = <0, 1, 2, 3>
Store gradient_in_vectors.0(<4, 5, 6, 7>, <0, 0, 0, 0>) = <4, 5, 6, 7>
Store gradient_in_vectors.0(<0, 1, 2, 3>, <1, 1, 1, 1>) = <1, 2, 3, 4>
Store gradient_in_vectors.0(<4, 5, 6, 7>, <1, 1, 1, 1>) = <5, 6, 7, 8>
Store gradient_in_vectors.0(<0, 1, 2, 3>, <2, 2, 2, 2>) = <2, 3, 4, 5>
Store gradient_in_vectors.0(<4, 5, 6, 7>, <2, 2, 2, 2>) = <6, 7, 8, 9>
Store gradient_in_vectors.0(<0, 1, 2, 3>, <3, 3, 3, 3>) = <3, 4, 5, 6>
Store gradient_in_vectors.0(<4, 5, 6, 7>, <3, 3, 3, 3>) = <7, 8, 9, 10>
End pipeline gradient_in_vectors.0()
Evaluating gradient with x_inner vectorized
Equivalent C:
Evaluating at <0, 1, 2, 3>, <0, 0, 0, 0>: <0, 1, 2, 3>
Evaluating at <4, 5, 6, 7>, <0, 0, 0, 0>: <4, 5, 6, 7>
Evaluating at <0, 1, 2, 3>, <1, 1, 1, 1>: <1, 2, 3, 4>
Evaluating at <4, 5, 6, 7>, <1, 1, 1, 1>: <5, 6, 7, 8>
Evaluating at <0, 1, 2, 3>, <2, 2, 2, 2>: <2, 3, 4, 5>
Evaluating at <4, 5, 6, 7>, <2, 2, 2, 2>: <6, 7, 8, 9>
Evaluating at <0, 1, 2, 3>, <3, 3, 3, 3>: <3, 4, 5, 6>
Evaluating at <4, 5, 6, 7>, <3, 3, 3, 3>: <7, 8, 9, 10>
Pseudo-code for the schedule:
produce gradient_in_vectors:
for y:
for x.v7:
vectorized x.v8 in [0, 3]:
gradient_in_vectors(...) = ...
循环展开
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x,y;
Func gradient("gradient_unroll");
gradient(x, y) = x + y;
gradient.trace_stores();
// 循环展开就指被指定的那层循环用重复代码进行替换,消除掉该层
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner, 2);
gradient.unroll(x_inner);
// 上述可以用unroll一个语句代替,如下所示
// gradient.unroll(x, 2);
printf("Evaluating gradient unrolled by a factor of two\n");
Buffer<int> result = gradient.realize({4, 4});
printf("Equivalent C:\n");
for (int y = 0; y < 4; y++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
// 循环展开如下所示
{
int x_inner = 0;
int x = x_outer * 2 + x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
{
int x_inner = 1;
int x = x_outer * 2 + x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
return 0;
}
#运行结果如下所示
Begin pipeline gradient_unroll.0()
Tag gradient_unroll.0() tag = "func_type_and_dim: 1 0 32 1 2 0 4 0 4"
Store gradient_unroll.0(0, 0) = 0
Store gradient_unroll.0(1, 0) = 1
Store gradient_unroll.0(2, 0) = 2
Store gradient_unroll.0(3, 0) = 3
Store gradient_unroll.0(0, 1) = 1
Store gradient_unroll.0(1, 1) = 2
Store gradient_unroll.0(2, 1) = 3
Store gradient_unroll.0(3, 1) = 4
Store gradient_unroll.0(0, 2) = 2
Store gradient_unroll.0(1, 2) = 3
Store gradient_unroll.0(2, 2) = 4
Store gradient_unroll.0(3, 2) = 5
Store gradient_unroll.0(0, 3) = 3
Store gradient_unroll.0(1, 3) = 4
Store gradient_unroll.0(2, 3) = 5
Store gradient_unroll.0(3, 3) = 6
End pipeline gradient_unroll.0()
Evaluating gradient unrolled by a factor of two
Equivalent C:
Evaluating at x = 0, y = 0: 0
Evaluating at x = 1, y = 0: 1
Evaluating at x = 2, y = 0: 2
Evaluating at x = 3, y = 0: 3
Evaluating at x = 0, y = 1: 1
Evaluating at x = 1, y = 1: 2
Evaluating at x = 2, y = 1: 3
Evaluating at x = 3, y = 1: 4
Evaluating at x = 0, y = 2: 2
Evaluating at x = 1, y = 2: 3
Evaluating at x = 2, y = 2: 4
Evaluating at x = 3, y = 2: 5
Evaluating at x = 0, y = 3: 3
Evaluating at x = 1, y = 3: 4
Evaluating at x = 2, y = 3: 5
Evaluating at x = 3, y = 3: 6
Pseudo-code for the schedule:
produce gradient_unroll:
for y:
for x.v9:
unrolled x.v10 in [0, 1]:
gradient_unroll(...) = ...
维度拆分split的特殊情况
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x,y;
Func gradient("gradient_split_7x2");
gradient(x, y) = x + y;
gradient.trace_stores();
// 此时进行拆分,那么将会出现没有维度溢出的情况
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner, 3);
printf("Evaluating gradient over a 7x2 box with x split by three \n");
Buffer<int> output = gradient.realize({7, 2});
// 示意图可以见lesson_05_split_7_by_3.gif
printf("Equivalent C:\n");
for (int y = 0; y < 2; y++) {
for (int x_outer = 0; x_outer < 3; x_outer++) { // Now runs from 0 to 2
for (int x_inner = 0; x_inner < 3; x_inner++) {
int x = x_outer * 3;
// 在计算前,为了防止溢出7之外,这里使用判断进行溢出处理
// 这样的判断导致了不会溢出,最多到6下标这个点
if (x > 4) x = 4;
x += x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
// The general rule is: If we require x from x_min to x_min + x_extent, and
// we split by a factor 'factor', then:
//
// x_outer runs from 0 to (x_extent + factor - 1)/factor
// x_inner runs from 0 to factor
// x = min(x_outer * factor, x_extent - factor) + x_inner + x_min
// In our example, x_min was 0, x_extent was 7, and factor was 3.
return 0;
}
维度拆分的特殊情况的示意图如下所示:
#运行结果如下所示
Begin pipeline gradient_split_7x2.0()
Tag gradient_split_7x2.0() tag = "func_type_and_dim: 1 0 32 1 2 0 7 0 2"
Store gradient_split_7x2.0(0, 0) = 0
Store gradient_split_7x2.0(1, 0) = 1
Store gradient_split_7x2.0(2, 0) = 2
Store gradient_split_7x2.0(3, 0) = 3
Store gradient_split_7x2.0(4, 0) = 4
Store gradient_split_7x2.0(5, 0) = 5
Store gradient_split_7x2.0(4, 0) = 4
Store gradient_split_7x2.0(5, 0) = 5
Store gradient_split_7x2.0(6, 0) = 6
Store gradient_split_7x2.0(0, 1) = 1
Store gradient_split_7x2.0(1, 1) = 2
Store gradient_split_7x2.0(2, 1) = 3
Store gradient_split_7x2.0(3, 1) = 4
Store gradient_split_7x2.0(4, 1) = 5
Store gradient_split_7x2.0(5, 1) = 6
Store gradient_split_7x2.0(4, 1) = 5
Store gradient_split_7x2.0(5, 1) = 6
Store gradient_split_7x2.0(6, 1) = 7
End pipeline gradient_split_7x2.0()
Evaluating gradient over a 7x2 box with x split by three
Equivalent C:
Evaluating at x = 0, y = 0: 0
Evaluating at x = 1, y = 0: 1
Evaluating at x = 2, y = 0: 2
Evaluating at x = 3, y = 0: 3
Evaluating at x = 4, y = 0: 4
Evaluating at x = 5, y = 0: 5
Evaluating at x = 4, y = 0: 4
Evaluating at x = 5, y = 0: 5
Evaluating at x = 6, y = 0: 6
Evaluating at x = 0, y = 1: 1
Evaluating at x = 1, y = 1: 2
Evaluating at x = 2, y = 1: 3
Evaluating at x = 3, y = 1: 4
Evaluating at x = 4, y = 1: 5
Evaluating at x = 5, y = 1: 6
Evaluating at x = 4, y = 1: 5
Evaluating at x = 5, y = 1: 6
Evaluating at x = 6, y = 1: 7
Pseudo-code for the schedule:
produce gradient_split_7x2:
for y:
for x.v11:
for x.v12 in [0, 2]:
gradient_split_7x2(...) = ...
融合、平铺、并行化组合技
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
Var x,y;
// 有时候为了进行并行化操作,我们需要使用融合和平铺来跨维度进行并行化操作
Func gradient("gradient_fused_tiles");
gradient(x, y) = x + y;
gradient.trace_stores();
// 平铺->外部循环融合->外部循环并行
Var x_outer, y_outer, x_inner, y_inner, tile_index;
gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);
gradient.fuse(x_outer, y_outer, tile_index);
gradient.parallel(tile_index);
// 能够使用如下的链式表达式进行上述的替代
// gradient
// .tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4)
// .fuse(x_outer, y_outer, tile_index)
// .parallel(tile_index);
printf("Evaluating gradient tiles in parallel\n");
Buffer<int> output = gradient.realize({8, 8});
// 如此一来,每个瓦片都是并行运行的
printf("Equivalent (serial) C:\n");
// 外部循环应该进行展开
// C中使用 # omp for
for (int tile_index = 0; tile_index < 4; tile_index++) {
int y_outer = tile_index / 2;
int x_outer = tile_index % 2;
for (int y_inner = 0; y_inner < 4; y_inner++) {
for (int x_inner = 0; x_inner < 4; x_inner++) {
int y = y_outer * 4 + y_inner;
int x = x_outer * 4 + x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
return 0;
}
三个组合技的示意图如下所示:
#运行结果如下所示
Begin pipeline gradient_fused_tiles.0()
Tag gradient_fused_tiles.0() tag = "func_type_and_dim: 1 0 32 1 2 0 8 0 8"
Store gradient_fused_tiles.0(0, 0) = 0
Store gradient_fused_tiles.0(1, 0) = 1
Store gradient_fused_tiles.0(2, 0) = 2
Store gradient_fused_tiles.0(3, 0) = 3
Store gradient_fused_tiles.0(0, 1) = 1
Store gradient_fused_tiles.0(1, 1) = 2
Store gradient_fused_tiles.0(2, 1) = 3
Store gradient_fused_tiles.0(3, 1) = 4
Store gradient_fused_tiles.0(0, 2) = 2
Store gradient_fused_tiles.0(1, 2) = 3
Store gradient_fused_tiles.0(2, 2) = 4
Store gradient_fused_tiles.0(3, 2) = 5
Store gradient_fused_tiles.0(0, 3) = 3
Store gradient_fused_tiles.0(1, 3) = 4
Store gradient_fused_tiles.0(2, 3) = 5
Store gradient_fused_tiles.0(3, 3) = 6
Store gradient_fused_tiles.0(4, 0) = 4
Store gradient_fused_tiles.0(4, 4) = 8
Store gradient_fused_tiles.0(0, 4) = 4
Store gradient_fused_tiles.0(5, 0) = 5
Store gradient_fused_tiles.0(1, 4) = 5
Store gradient_fused_tiles.0(5, 4) = 9
Store gradient_fused_tiles.0(6, 0) = 6
Store gradient_fused_tiles.0(2, 4) = 6
Store gradient_fused_tiles.0(6, 4) = 10
Store gradient_fused_tiles.0(7, 0) = 7
Store gradient_fused_tiles.0(3, 4) = 7
Store gradient_fused_tiles.0(4, 1) = 5
Store gradient_fused_tiles.0(7, 4) = 11
Store gradient_fused_tiles.0(0, 5) = 5
Store gradient_fused_tiles.0(5, 1) = 6
Store gradient_fused_tiles.0(1, 5) = 6
Store gradient_fused_tiles.0(4, 5) = 9
Store gradient_fused_tiles.0(2, 5) = 7
Store gradient_fused_tiles.0(5, 5) = 10
Store gradient_fused_tiles.0(6, 1) = 7
Store gradient_fused_tiles.0(3, 5) = 8
Store gradient_fused_tiles.0(6, 5) = 11
Store gradient_fused_tiles.0(7, 1) = 8
Store gradient_fused_tiles.0(0, 6) = 6
Store gradient_fused_tiles.0(7, 5) = 12
Store gradient_fused_tiles.0(4, 2) = 6
Store gradient_fused_tiles.0(1, 6) = 7
Store gradient_fused_tiles.0(4, 6) = 10
Store gradient_fused_tiles.0(5, 2) = 7
Store gradient_fused_tiles.0(5, 6) = 11
Store gradient_fused_tiles.0(2, 6) = 8
Store gradient_fused_tiles.0(6, 2) = 8
Store gradient_fused_tiles.0(6, 6) = 12
Store gradient_fused_tiles.0(3, 6) = 9
Store gradient_fused_tiles.0(7, 2) = 9
Store gradient_fused_tiles.0(7, 6) = 13
Store gradient_fused_tiles.0(0, 7) = 7
Store gradient_fused_tiles.0(4, 3) = 7
Store gradient_fused_tiles.0(1, 7) = 8
Store gradient_fused_tiles.0(4, 7) = 11
Store gradient_fused_tiles.0(5, 3) = 8
Store gradient_fused_tiles.0(2, 7) = 9
Store gradient_fused_tiles.0(6, 3) = 9
Store gradient_fused_tiles.0(5, 7) = 12
Store gradient_fused_tiles.0(3, 7) = 10
Store gradient_fused_tiles.0(7, 3) = 10
Store gradient_fused_tiles.0(6, 7) = 13
Store gradient_fused_tiles.0(7, 7) = 14
End pipeline gradient_fused_tiles.0()
Evaluating gradient tiles in parallel
Equivalent (serial) C:
Evaluating at x = 0, y = 0: 0
Evaluating at x = 1, y = 0: 1
Evaluating at x = 2, y = 0: 2
Evaluating at x = 3, y = 0: 3
Evaluating at x = 0, y = 1: 1
Evaluating at x = 1, y = 1: 2
Evaluating at x = 2, y = 1: 3
Evaluating at x = 3, y = 1: 4
Evaluating at x = 0, y = 2: 2
Evaluating at x = 1, y = 2: 3
Evaluating at x = 2, y = 2: 4
Evaluating at x = 3, y = 2: 5
Evaluating at x = 0, y = 3: 3
Evaluating at x = 1, y = 3: 4
Evaluating at x = 2, y = 3: 5
Evaluating at x = 3, y = 3: 6
Evaluating at x = 4, y = 0: 4
Evaluating at x = 5, y = 0: 5
Evaluating at x = 6, y = 0: 6
Evaluating at x = 7, y = 0: 7
Evaluating at x = 4, y = 1: 5
Evaluating at x = 5, y = 1: 6
Evaluating at x = 6, y = 1: 7
Evaluating at x = 7, y = 1: 8
Evaluating at x = 4, y = 2: 6
Evaluating at x = 5, y = 2: 7
Evaluating at x = 6, y = 2: 8
Evaluating at x = 7, y = 2: 9
Evaluating at x = 4, y = 3: 7
Evaluating at x = 5, y = 3: 8
Evaluating at x = 6, y = 3: 9
Evaluating at x = 7, y = 3: 10
Evaluating at x = 0, y = 4: 4
Evaluating at x = 1, y = 4: 5
Evaluating at x = 2, y = 4: 6
Evaluating at x = 3, y = 4: 7
Evaluating at x = 0, y = 5: 5
Evaluating at x = 1, y = 5: 6
Evaluating at x = 2, y = 5: 7
Evaluating at x = 3, y = 5: 8
Evaluating at x = 0, y = 6: 6
Evaluating at x = 1, y = 6: 7
Evaluating at x = 2, y = 6: 8
Evaluating at x = 3, y = 6: 9
Evaluating at x = 0, y = 7: 7
Evaluating at x = 1, y = 7: 8
Evaluating at x = 2, y = 7: 9
Evaluating at x = 3, y = 7: 10
Evaluating at x = 4, y = 4: 8
Evaluating at x = 5, y = 4: 9
Evaluating at x = 6, y = 4: 10
Evaluating at x = 7, y = 4: 11
Evaluating at x = 4, y = 5: 9
Evaluating at x = 5, y = 5: 10
Evaluating at x = 6, y = 5: 11
Evaluating at x = 7, y = 5: 12
Evaluating at x = 4, y = 6: 10
Evaluating at x = 5, y = 6: 11
Evaluating at x = 6, y = 6: 12
Evaluating at x = 7, y = 6: 13
Evaluating at x = 4, y = 7: 11
Evaluating at x = 5, y = 7: 12
Evaluating at x = 6, y = 7: 13
Evaluating at x = 7, y = 7: 14
Pseudo-code for the schedule:
produce gradient_fused_tiles:
parallel x.v13.v17:
for y.v16 in [0, 3]:
for x.v15 in [0, 3]:
gradient_fused_tiles(...) = ...
所有循环优化的手段组合运用
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
    // Combines all of the scheduling tools on one gradient and verifies the
    // realized buffer against an equivalent C loop nest.
    // Returns 0 on success, -1 if any pixel differs from the expected value.
    Var x,y;
    Func gradient_fast("gradient_fast");
    gradient_fast(x, y) = x + y;

    // Step 1: tile into 64x64 blocks, fuse the two per-tile loops into a
    // single tile index, and parallelize over that index.
    Var x_outer, y_outer, x_inner, y_inner, tile_index;
    gradient_fast
        .tile(x, y, x_outer, y_outer, x_inner, y_inner, 64, 64)
        .fuse(x_outer, y_outer, tile_index)
        .parallel(tile_index);

    // Step 2: inside each tile, tile again into 4x2 sub-tiles, vectorize
    // across x and unroll across y.
    Var x_inner_outer, y_inner_outer, x_vectors, y_pairs;
    gradient_fast
        .tile(x_inner, y_inner, x_inner_outer, y_inner_outer, x_vectors, y_pairs, 4, 2)
        .vectorize(x_vectors)
        .unroll(y_pairs);

    // No explicit split()/reorder() calls appear here; tile() is used in
    // their place.
    Buffer<int> result = gradient_fast.realize({350, 250});

    // See figures/lesson_05_fast.mp4 for a visualization.
    printf("Checking Halide result against equivalent C...\n");

    // The output is 350x250 and the tile size is 64, which does not divide
    // either extent evenly: there are 6 tiles across x and 4 tiles down y.
    const int width = 350;
    const int height = 250;
    const int tile_size = 64;
    const int tiles_x = (width + tile_size - 1) / tile_size;   // 6
    const int tiles_y = (height + tile_size - 1) / tile_size;  // 4

    for (int tile_index = 0; tile_index < tiles_x * tiles_y; tile_index++) {
        // fuse(x_outer, y_outer, tile_index) makes x_outer the inner
        // (fastest-varying) part of the fused index, so the index has to be
        // decoded with the number of tiles along x. Decoding with the
        // number of tiles along y would limit x_outer to 0..3 and leave
        // columns 256..349 unchecked.
        int y_outer = tile_index / tiles_x;
        int x_outer = tile_index % tiles_x;
        for (int y_inner_outer = 0; y_inner_outer < tile_size / 2; y_inner_outer++) {
            for (int x_inner_outer = 0; x_inner_outer < tile_size / 4; x_inner_outer++) {
                // We're vectorized across x. The tile origin is clamped so
                // the last tile stays inside the image.
                int x = std::min(x_outer * tile_size, width - tile_size) + x_inner_outer * 4;
                int x_vec[4] = {x + 0,
                                x + 1,
                                x + 2,
                                x + 3};
                // And we unrolled across y, so the body below appears once
                // per value of y_pairs.
                int y_base = std::min(y_outer * tile_size, height - tile_size) + y_inner_outer * 2;
                {
                    // y_pairs = 0
                    int y = y_base + 0;
                    int y_vec[4] = {y, y, y, y};
                    int val[4] = {x_vec[0] + y_vec[0],
                                  x_vec[1] + y_vec[1],
                                  x_vec[2] + y_vec[2],
                                  x_vec[3] + y_vec[3]};
                    // Check the result.
                    for (int i = 0; i < 4; i++) {
                        if (result(x_vec[i], y_vec[i]) != val[i]) {
                            printf("There was an error at %d %d!\n",
                                   x_vec[i], y_vec[i]);
                            return -1;
                        }
                    }
                }
                {
                    // y_pairs = 1
                    int y = y_base + 1;
                    int y_vec[4] = {y, y, y, y};
                    int val[4] = {x_vec[0] + y_vec[0],
                                  x_vec[1] + y_vec[1],
                                  x_vec[2] + y_vec[2],
                                  x_vec[3] + y_vec[3]};
                    // Check the result.
                    for (int i = 0; i < 4; i++) {
                        if (result(x_vec[i], y_vec[i]) != val[i]) {
                            printf("There was an error at %d %d!\n",
                                   x_vec[i], y_vec[i]);
                            return -1;
                        }
                    }
                }
            }
        }
    }
    printf("\n");

    // Dump the loop nest implied by the schedule.
    printf("Pseudo-code for the schedule:\n");
    gradient_fast.print_loop_nest();
    printf("\n");

    return 0;
}
所有的组合技叠加的效果:
#运行结果如下所示
Checking Halide result against equivalent C...
Pseudo-code for the schedule:
produce gradient_fast:
parallel x.v18.v22:
for y.v21.v24 in [0, 31]:
for x.v20.v23 in [0, 15]:
unrolled y.v21.v26 in [0, 1]:
vectorized x.v20.v25 in [0, 3]:
gradient_fast(...) = ...