heightfield mask blur 节点分析与实现优化（一）

LinerPhong

已于 2022-03-07 20:46:59 修改

阅读量565

点赞数 3

分类专栏： Houdini 文章标签： houdini

于 2022-03-07 20:28:46 首次发布

本文链接：https://blog.csdn.net/LinerPhong/article/details/123337101

版权

Houdini 专栏收录该内容

5 篇文章 1 订阅

订阅专栏

当我们在houdini中对mask进行模糊时，通常需要使用两次heightfield mask blur节点。

第一次使用aware blur，以防止mask模糊后影响到mask值为0的volume。

如果直接使用based on radius blur，则会造成硬边效果。

而aware blur的实现是based on iterations blur，在有限的迭代下无法做到大面积的模糊，因此必须要与based on radius的模糊搭配使用。人总是懒的，因此我决定制作一个mask blur节点，完成我们想要的模糊效果，还可以尝试是否有提升性能的机会。

以下便是在下的想法以及制作过程：

---------------------------------------------------------------------------------------------------------------------------------

在研究heightfield mask blur节点的内部实现后，我发现aware blur是通过opencl实现，算法是基础的图形模糊算法，且有以下几点可以进行优化：

1、heightfield类型的volume在z轴上仅有一层，heightfield mask blur的默认实现是使用了for循环遍历了x、y、z三个轴的volume，我认为z轴的for循环可以省略（当然实际情况得看需求），此优化部分在下面源码中标为红色。

2、我认为mask的值在无特殊需求下始终保持在[0, 1]的范围是必要工作，在mask值的范围在[0, 1]的前提下，aware blur可以省去clamp的步骤，此优化部分在下面源码中标为绿色。

3、使用opencl进行模糊时额外使用了两个layer，__mask以及__scratch，这两个层其实就是height layer以及mask layer的拷贝，我判断可以省略。

原因如下：

opencl的源码实现中__mask layer就是mask layer的拷贝，__scratch layer的存在是为了解决opencl并行运行时会发生的问题（某一线程读取数据之前，该数据就被运行得快的线程重新写入新的值了）。而根据heightfield数据结构的特性，我们完全拥有更好的解决办法。

heightfield 的数据结构如下，如果我们创建一个3×3×1的volume，真正的数据全部都存储在绿色部分，白色和灰色的部分真实存在且值为空。在此volume的条件下，height_stride_x = 1， height_stride_y = 5，height_stride_z = 25，height_stride_offset = 6。也就是说，第1份数据位于height[6]（即缓冲区中的第7个元素），第4份数据位于height[11]（即第12个元素），转换为变量访问的形式——height[height_stride_offset + height_stride_x * gidx + height_stride_y * gidy]，数据部分的gidz始终为0（第0层），因此可以省略来提升一些效率。

又回到刚刚要优化__scratch layer的话题，我们可以给height的每一个索引都加上height_stride_z，把我们更新的值暂时存放在z轴向的第一层（也就是灰色box所在的层），之后通过writeBack写回到height的数据层，这样不就节省了关于__scratch layer的操作了吗！

4、一些名称上的更改，此opencl的height其实是mask层的拷贝，没错，opencl代码中的三个层其实都是mask层的拷贝，毕竟官方源码一定都具有更通用的特点，在某些更加细节的领域应该是可以有优化机会的，我将会把height名称更改为layer，以及在形参前面添加const修饰符更加符合C++代码规范。

#include "interpolate.h"
float lerpConstant( constant float * in, int size, float pos);

/*
 * iterateMaskAwareBlur: one iteration of the mask-aware blur.
 *
 * Each work-item averages the values of `height` over the 3x3x3
 * neighbourhood around its own voxel (the neighbourhood is clipped to
 * [0, res-1] on every axis), blends between the voxel's current value and
 * that average using the squared, clamped mask value as the weight, and
 * stores the result in `scratch`. `height` itself is only read here.
 *
 * height_stride_* / height_stride_offset : used to turn (x, y, z) voxel
 *                                          coordinates into a flat index
 * height_res_*                           : per-axis voxel counts, used to
 *                                          clip the neighbourhood
 * height  : values being blurred (read only in this kernel)
 * mask    : per-voxel blend weight, clamped to [0, 1] before use
 * scratch : receives the blurred value for this voxel
 */
kernel void iterateMaskAwareBlur(
    int height_stride_x,
    int height_stride_y,
    int height_stride_z,
    int height_stride_offset,
    int height_res_x,
    int height_res_y,
    int height_res_z,
    global float * height ,
    global float * mask ,
    global float * scratch
)
{
    const int cx = get_global_id(0);
    const int cy = get_global_id(1);
    const int cz = get_global_id(2);
    const int center = height_stride_offset + height_stride_x * cx
                     + height_stride_y * cy
                     + height_stride_z * cz;

    // Inclusive neighbourhood bounds, clipped to the voxel resolution.
    const int x_lo = max(0, cx-1);
    const int x_hi = min(cx+1, height_res_x-1);
    const int y_lo = max(0, cy-1);
    const int y_hi = min(cy+1, height_res_y-1);
    const int z_lo = max(0, cz-1);
    const int z_hi = min(cz+1, height_res_z-1);

    // Accumulate in z -> y -> x order (x innermost).
    float sum = 0.0f;
    int samples = 0;
    for (int nz = z_lo; nz <= z_hi; ++nz)
    {
        for (int ny = y_lo; ny <= y_hi; ++ny)
        {
            for (int nx = x_lo; nx <= x_hi; ++nx)
            {
                sum += height[height_stride_offset + height_stride_x*nx + height_stride_y*ny + height_stride_z*nz];
                ++samples;
            }
        }
    }

    float result = sum;
    if (samples != 0)
    {
        // If we softened the blur by the mask, a blur of 0.5 would simulate half the iteration count.
        // But the iteration count is the square of the blur radius. So if we want a mask of 0.5 to
        // effectively half the blur radius at that point, we need to multiply by the square of the mask.
        const float weight = clamp(mask[center], 0.0f, 1.0f);
        result = mix(height[center], sum / samples, weight*weight);
    }
    scratch[center] = result;
}

/*
 * writeBack: copies the value held in `scratch` back into `height` for
 * every voxel whose mask value is strictly greater than zero. Voxels with
 * a mask value that is not greater than zero are left untouched.
 *
 * height_stride_* / height_stride_offset : used to turn (x, y, z) voxel
 *                                          coordinates into a flat index
 * height_res_*                           : not used by this kernel
 * height  : destination of the copy
 * mask    : selects which voxels are updated
 * scratch : source of the copy
 */
kernel void writeBack(
    int height_stride_x,
    int height_stride_y,
    int height_stride_z,
    int height_stride_offset,
    int height_res_x,
    int height_res_y,
    int height_res_z,
    global float * height,
    global float * mask,
    global float * scratch
)
{
    const int vx = get_global_id(0);
    const int vy = get_global_id(1);
    const int vz = get_global_id(2);
    const int cell = height_stride_offset + height_stride_x * vx
                   + height_stride_y * vy
                   + height_stride_z * vz;

    // Skip every voxel whose mask is not strictly positive.
    if (!(mask[cell] > 0))
    {
        return;
    }

    height[cell] = scratch[cell];
}

以下是优化后的opencl源代码

/*
 * awareBlur: one iteration of a mask-weighted 3x3 blur over slice 0 of a
 * single-slice (2D) layer, without a separate scratch buffer.
 *
 * For every cell whose mask value is strictly positive, the kernel
 * averages the 3x3 neighbourhood in slice 0 (clipped to [0, res-1] in x
 * and y), blends between the cell's current value and that average with
 * the squared mask value as the weight, and stores the result one z-stride
 * above the cell (layer[idx + layer_stride_z]). Slice 0 is only read here;
 * the companion writeBack kernel copies the stored values back.
 *
 * layer_stride_* / layer_stride_offset : used to turn (x, y) coordinates
 *                                        into a flat index
 * layer_res_x, layer_res_y             : used to clip the neighbourhood
 * layer_res_z                          : not used by this kernel
 * layer : values being blurred; also receives the temporary result
 * mask  : per-cell blend weight and update selector
 *
 * NOTE(review): the temporary write assumes the buffer really contains a
 * padding slice at +layer_stride_z; confirm this for the bound volume,
 * otherwise the write is out of bounds.
 */
kernel void awareBlur(
    const int layer_stride_x,
    const int layer_stride_y,
    const int layer_stride_z,
    const int layer_stride_offset,
    const int layer_res_x,
    const int layer_res_y,
    const int layer_res_z,
    global float * layer,
    global float * mask
)
{
    const int gidx = get_global_id(0);
    const int gidy = get_global_id(1);
    // Only slice 0 is addressed, so the z work-item index is not needed.
    const int idx = layer_stride_offset + layer_stride_x * gidx
                  + layer_stride_y * gidy;

    // Cells that writeBack will not update (mask not > 0) need no work.
    const float m = mask[idx];
    if (!(m > 0.0f)) return;

    const int startx = max(0, gidx-1);
    const int endx = min(gidx+1, layer_res_x-1);
    const int starty = max(0, gidy-1);
    const int endy = min(gidy+1, layer_res_y-1);

    float acc = 0.0f;
    int count = 0;
    for (int y = starty; y <= endy; y++)
    {
        for (int x = startx; x <= endx; x++)
        {
            acc += layer[layer_stride_offset + layer_stride_x*x + layer_stride_y*y];
            count++;
        }
    }

    if (count)
    {
        acc /= count;
        // Scale the blur by the square of the mask: the iteration count is
        // the square of the blur radius, so a mask of 0.5 should halve the
        // effective radius at this cell.
        // m > 0 is already known here, so only the upper bound has to be
        // enforced to keep the mix() weight inside [0, 1]; this costs a
        // single min() instead of a full clamp().
        const float w = min(m, 1.0f);
        acc = mix(layer[idx], acc, w*w);
    }

    // Park the result one z-stride above the cell so that neighbouring
    // work-items still read the un-blurred slice-0 values.
    layer[idx + layer_stride_z] = acc;
}

/*
 * writeBack: second pass of the scratch-free aware blur.
 *
 * For every slice-0 cell whose mask value is strictly positive, copies the
 * value stored one z-stride above the cell (layer[idx + layer_stride_z])
 * back into the cell. Cells whose mask is not greater than zero are left
 * untouched.
 *
 * layer_stride_* / layer_stride_offset : used to turn (x, y) coordinates
 *                                        into a flat index
 * layer_res_*                          : not used by this kernel
 * layer : holds both the cell values and the temporary results
 * mask  : selects which cells are updated
 *
 * NOTE(review): reading layer[idx + layer_stride_z] assumes the buffer
 * contains a padding slice at +layer_stride_z; confirm for the bound
 * volume.
 */
kernel void writeBack(
    const int layer_stride_x,
    const int layer_stride_y,
    const int layer_stride_z,
    const int layer_stride_offset,
    const int layer_res_x,
    const int layer_res_y,
    const int layer_res_z,
    global float * layer,
    global float * mask)
{
    const int gidx = get_global_id(0);
    const int gidy = get_global_id(1);
    // Address slice 0 only, exactly like awareBlur does when it stores the
    // temporary result; adding a z term here would make the two kernels
    // disagree about where that result lives for any non-zero z index.
    const int idx = layer_stride_offset + layer_stride_x * gidx
                  + layer_stride_y * gidy;

    if (mask[idx] > 0)
    {
        layer[idx] = layer[idx + layer_stride_z];
    }
}