# 算法描述与性能优化的解耦——Halide语言 （1）

##### 版权声明：原创作品，欢迎转载，但转载请以超链接形式注明文章来源(planckscale.info)、作者信息和本声明，否则将追究法律责任。

Halide就是这样一门语言。

void box_filter_3x3(const Image &amp; in, Image &amp; blury) {
Image blurx(in.width(), in.height()); // allocate blurx array
for (int y = 0; y &lt; in.height(); y++)
for (int x = 0; x &lt; in.width(); x++)
blurx(x, y) = (in(x - 1, y) + in(x, y) + in(x + 1, y)) / 3;
for (int y = 0; y &lt; in.height(); y++)
for (int x = 0; x &lt; in.width(); x++)
blury(x, y) = (blurx(x, y - 1) + blurx(x, y) + blurx(x, y + 1)) / 3;
}



9.96 ms/megapixel

void box_filter_3x3(const Image &amp; in, Image &amp; blury) {
__m128ione_third = _mm_set1_epi16(21846);
#pragmaomp parallel for
for (int yTile = 0; yTile &lt; in.height(); yTile += 32) {
__m128ia, b, c, sum, avg;
__m128i blurx[(256 / 8)*(32 + 2)]; // allocate tile blurx array
for (int xTile = 0; xTile &lt; in.width(); xTile += 256) {
__m128i*blurxPtr = blurx;
for (int y = -1; y &lt; 32 + 1; y++) {
const uint16_t *inPtr = &amp; (in[yTile + y][xTile]);
for (int x = 0; x &lt; 256; x += 8) {
avg = _mm_mulhi_epi16(sum, one_third);
_mm_store_si128(blurxPtr++, avg);
inPtr += 8;
}
}
blurxPtr = blurx;
for (int y = 0; y &lt; 32; y++) {
__m128i*outPtr = (__m128i*)(&amp; (blury[yTile + y][xTile]));
for (int x = 0; x &lt; 256; x += 8) {
a = _mm_load_si128(blurxPtr + (2 * 256) / 8);
b = _mm_load_si128(blurxPtr + 256 / 8);
avg = _mm_mulhi_epi16(sum, one_third);
_mm_store_si128(outPtr++, avg);
}
}
}
}
}



11x faster than a naïve implementation
0.9 ms/megapixel

// Builds the 3x3 box blur as a Halide pipeline: the algorithm (what each
// pixel is) is stated first, and the schedule (how it is evaluated) is
// attached separately afterwards.
//
//   in      - input function, sampled at (x, y)
//   returns - the function producing the blurred result
Func halide_blur(Func in) {
    Func blur_x, blur_y;
    Var x, y, xi, yi;

    // Algorithm: horizontal 3-tap average, then vertical 3-tap average.
    blur_x(x, y) = (in(x - 1, y) + in(x, y) + in(x + 1, y)) / 3;
    blur_y(x, y) = (blur_x(x, y - 1) + blur_x(x, y) + blur_x(x, y + 1)) / 3;

    // Schedule for the output stage: tile with factors 256 and 32, vectorize
    // the inner x index by 8, and run over y in parallel.
    blur_y.tile(x, y, xi, yi, 256, 32);
    blur_y.vectorize(xi, 8);
    blur_y.parallel(y);

    // Schedule for the intermediate stage: chunk it at x and vectorize by 8.
    blur_x.chunk(x);
    blur_x.vectorize(x, 8);

    return blur_y;
}



0.9 ms/megapixel

Halide目前并没有太多地考虑编译器自动优化的问题，但这是一个漂亮的开端。如果将来在手动优化的同时仍有强大的编译器优化做后盾，将会是一番什么景象？

（未完待续）

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客