这篇文章的目标是:
开发一个Height Field Blur节点的Cuda实现版本。
——————————————————————————————
——————————————————————————————
Houdini的OpenCL节点有着天然的劣势,当我们计算一个较为复杂的地形材质时会频繁地进行CPU-GPU通信,这是必须要避免的,这就是在HDK中引入Cuda的原因。
HDK想要直接编译.cu文件比较复杂,而且不美观,我的做法是将cuda函数用C函数封装,编译成dll文件供HDK使用。
最基础的方框模糊算法是做两层for循环,我们这里直接线性分解。
__global__ void boxBlurXdir(const float* __restrict__ idata, float* odata, int radius, int nx, int ny)
{
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= nx || iy >= ny)
return;
int idx = ix + iy * nx;
int xStart = ix - radius;
int xEnd = ix + radius;
if (xStart < 0) xStart = 0; if (xEnd > nx - 1) xEnd = nx - 1;
int num = 0;
float sum = 0;
for (int i = xStart; i <= xEnd; i++)
{
sum += idata[i + nx * iy];
num++;
}
odata[idx] = sum / num;
}
__global__ void boxBlurYdir(const float* __restrict__ idata, float* odata, int radius, int nx, int ny)
{
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= nx || iy >= ny)
return;
int idx = ix + iy * nx;
int yStart = iy - radius;
int yEnd = iy + radius;
if (yStart < 0) yStart = 0; if (yEnd > ny - 1) yEnd = ny - 1;
int num = 0;
float sum = 0;
for (int i = yStart; i <= yEnd; i++)
{
sum += idata[ix + nx * i];
num++;
}
odata[idx] = sum / num;
}
接下来用C函数封装并导出
//注意,必须要用C编译器编译
#define CUDA_API extern "C" __declspec(dllexport)
CUDA_API void boxBlur(const float* idata, float* odata, int radius, int nx, int ny)
{
dim3 block(BDIMX, BDIMY);
dim3 grid((nx + BDIMX - 1) / BDIMX, (ny + BDIMY - 1) / BDIMY);
int nBytes = nx * ny * sizeof(float);
float* dIdata, * dTemp, * dOdata;
cudaMalloc((float**)&dIdata, nBytes);
cudaMalloc((float**)&dTemp, nBytes);
cudaMalloc((float**)&dOdata, nBytes);
cudaMemcpy(dIdata, idata, nBytes, cudaMemcpyHostToDevice);
boxBlurXdir << <grid, block >> > (dIdata, dTemp, radius, nx, ny);
boxBlurYdir << <grid, block >> > (dTemp, dOdata, radius, nx, ny);
cudaMemcpy(odata, dOdata, nBytes, cudaMemcpyDeviceToHost);
cudaFree(dIdata);
cudaFree(dTemp);
cudaFree(dOdata);
}
接下来是头文件的编写
#pragma once
#pragma once
#include <SOP/SOP_Node.h>
class GEO_PrimVolume;
#define CUDA_API extern "C" __declspec(dllimport)
//将.cu文件生成的dll函数导入
CUDA_API void boxBlur(const float* idata, float* odata, int radius, int nx, int ny);
class SOP_HeightFieldBlur : public SOP_Node
{
public:
SOP_HeightFieldBlur(OP_Network* net, const char* name, OP_Operator* op);
~SOP_HeightFieldBlur() override;
static PRM_Template myTemplateList[];
static OP_Node* myConstructor(OP_Network* net, const char* name, OP_Operator* op);
protected:
OP_ERROR cookMySop(OP_Context& context) override;
};
cpp文件的实现
#include "SOP_HeightFieldBlur.h"
#include "OP/OP_AutoLockInputs.h"
#include <OP/OP_OperatorTable.h>
#include <UT/UT_DSOVersion.h>
#include <UT/UT_String.h>
#include <PRM/PRM_Range.h>
#include <UT/UT_VoxelArray.h>
#include <GEO/GEO_PrimVolume.h>
//CudaLib.lib就是.cu文件生成的lib文件
#pragma comment(lib, "CudaLib.lib")
//定义参数面板的名字
static PRM_Name names[] =
{
PRM_Name("layer", "Layer"),
PRM_Name("radius", "Radius")
};
//定义一个参数的默认值"mask"(0.0f忽略不计,是HDK的风格)
static PRM_Default PRMMaskDefault(0.0f, "mask");
//定义参数的UI滑动面板范围
static PRM_Range PRM0to200Range(PRM_RANGE_RESTRICTED, 0.0f, PRM_RANGE_UI, 200.0f);
//定义参数面板
PRM_Template SOP_HeightFieldBlur::myTemplateList[] =
{
//需要模糊的layer的name属性
PRM_Template(PRM_STRING, 1, &names[0], &PRMMaskDefault, &SOP_Node::primGroupMenu,
0, 0,SOP_Node::getGroupSelectButton(GA_GROUP_PRIMITIVE)),
//模糊半径
PRM_Template(PRM_INT_J, 1, &names[1], PRMoneDefaults, 0, &PRM0to200Range),
PRM_Template()
};
void newSopOperator(OP_OperatorTable* table)
{
table->addOperator(new OP_Operator(
"hdk_heightfield_blur",
"HDK HeightFieldBlur",
SOP_HeightFieldBlur::myConstructor,
SOP_HeightFieldBlur::myTemplateList,
1,
1,
0));
}
OP_Node* SOP_HeightFieldBlur::myConstructor(OP_Network* net, const char* name, OP_Operator* op)
{
return new SOP_HeightFieldBlur(net, name, op);
}
SOP_HeightFieldBlur::SOP_HeightFieldBlur(OP_Network* net, const char* name, OP_Operator* op)
: SOP_Node(net, name, op)
{
mySopFlags.setManagesDataIDs(true);
}
SOP_HeightFieldBlur::~SOP_HeightFieldBlur() {}
OP_ERROR SOP_HeightFieldBlur::cookMySop(OP_Context& context)
{
OP_AutoLockInputs inputs(this);
if (inputs.lock(context) >= UT_ERROR_ABORT)
return error();
duplicatePointSource(0, context);
fpreal t = context.getTime();
UT_String layerName;
evalString(layerName, "layer", 0, t);
//得到primitive层级下name属性的指针,如果为指针空则return error()
GA_Attribute* attrib = gdp->findStringTuple(GA_ATTRIB_PRIMITIVE, "name", 1);
GA_ROHandleS hAttrib(attrib);
if (!hAttrib.isValid())
return error();
//得到对应名字的prim,且只有prim的类型为GEO_PRIMVOLUME,也就是heightfield时才进行模糊操作
GEO_Primitive* prim = gdp->findPrimitiveByName(layerName);
if (prim && prim->getTypeId() == GEO_PRIMVOLUME)
{
GEO_PrimVolume* volume = (GEO_PrimVolume*)prim;
//Houdini的volume存储方式不是传统的线性,因此需要将其转为线程二维数组
//而VoxelArray则是指向volume数据的指针
UT_VoxelArrayHandleF hVoxelArray = volume->getVoxelHandle();
UT_VoxelArrayReadHandleF hrVoxelArray(hVoxelArray);
UT_VoxelArrayF* va = (UT_VoxelArrayF*)&*hrVoxelArray;
int nx, ny, nz;
volume->getRes(nx, ny, nz);
if (nz != 1)
{
UT_WorkBuffer buf;
buf.sprintf("This node only surport heightfied!\n");
addError(SOP_MESSAGE, buf.buffer());
}
int nElem = nx * ny;
int nBytes = nElem * sizeof(float);
float* idata = (float*)malloc(nBytes);
float* odata = (float*)malloc(nBytes);
UTparallelForEachNumber((exint)va->numTiles(),
[&](const UT_BlockedRange<exint>& r)
{
exint curTile = r.begin();
UT_VoxelTileIteratorF vit;
vit.setLinearTile(curTile, va);
for (vit.rewind(); !vit.atEnd(); vit.advance())
{
idata[vit.x() + vit.y() * nx] = vit.getValue();
}
});
int radius = evalInt("radius", 0, t);
//调用模糊函数,结束存在odata指针中
boxBlur(idata, odata, radius, nx, ny);
UT_VoxelArrayWriteHandleF hwVoxelArray = volume->getVoxelWriteHandle();
va = (UT_VoxelArrayF*)&*hwVoxelArray;
UTparallelForEachNumber((exint)va->numTiles(),
[&](const UT_BlockedRange<exint>& r)
{
exint curTile = r.begin();
UT_VoxelTileIteratorF vit;
vit.setLinearTile(curTile, va);
for (vit.rewind(); !vit.atEnd(); vit.advance())
{
//将odata的值写会到当前节点的VoxelArray
vit.setValue(odata[vit.x() + vit.y() * nx]);
}
});
free(idata);
free(odata);
}
else
{
UT_WorkBuffer buf;
buf.sprintf("This node only surport heightfied!\n");
addError(SOP_MESSAGE, buf.buffer());
}
//最后绑定
gdp->bumpAllDataIds();
return error();
}
ok接下来使用hcustom编译hdk,直接看运行结果。
这样的模糊其实在边界处是有问题的,原因是边界处模糊的采样数没有中心处采样数多,那么接下来换一种思路实现boxBlur,通过迭代地执行半径为1的模糊。
CUDA_API void boxBlur(const float* idata, float* odata, int radius, int nx, int ny)
{
dim3 block(BDIMX, BDIMY);
dim3 grid((nx + BDIMX - 1) / BDIMX, (ny + BDIMY - 1) / BDIMY);
int nBytes = nx * ny * sizeof(float);
float* dTemp, * dOdata;
cudaMalloc((float**)&dTemp, nBytes);
cudaMalloc((float**)&dOdata, nBytes);
cudaMemcpy(dOdata, idata, nBytes, cudaMemcpyHostToDevice);
for (int i = 0; i < radius; i++)
{
boxBlurXdir << <grid, block >> > (dOdata, dTemp, 1, nx, ny);
boxBlurYdir << <grid, block >> > (dTemp, dOdata, 1, nx, ny);
}
cudaMemcpy(odata, dOdata, nBytes, cudaMemcpyDeviceToHost);
cudaFree(dTemp);
cudaFree(dOdata);
}
编译运行,ok现在边缘处没有硬痕了。
还可以对mask进行模糊(其实我们参数面板中设置的默认值就是mask)
那么至此,cuda版本的方框模糊实现完毕,这并不是引入cuda的真正好处,只有构建复杂的系统时才能体现这种做法真正的价值,我们拭目以待。
并且之后有时间的话可以实现shared memory版本的blur,感谢观看。