CP就是控制点,SP可以由CP求得:在两端的顶点,CP与SP的值相同,其余的SP都是2个CP的中点。
引入SP的思路其实是这样的:DirectX 11用2个可编程shader(hull shader与domain shader)、共3个shader stage来完成多边形的动态细分。hull shader主要负责计算控制点;这里我们先把控制用的点信息都预先计算出来,在渲染beam时只用hull shader简单控制细分次数,再使用domain shader来实现贝塞尔平滑细分,这样就可以做出速度快、效果好的渲染。
为了进一步提高渲染效率,发挥GPU并行运算的优势,我们将会使用compute shader来计算 CP 与 SP:
#include "PerlinNoise.hlsl"
#define blocksize 512
// One element of the beam vertex buffer written by CSMain.
struct Beam_Vertex
{
// CP: control point. CSMain copies it straight from the shared vertex array.
float3 vCP;
// SP: point derived from CPs. CSMain writes the midpoint (lerp with 0.5) of
// this vertex and the matching vertex of a neighbouring control position.
float3 vSP;
};
// Per-dispatch constants, bound to constant-buffer slot b0.
// The packoffset annotations pin the layout explicitly: eye position and
// width share register c0, the world matrix starts at c1.
cbuffer CB : register( b0 )
{
// Eye position; the FACE_TO_CAMERA path subtracts the transformed control
// point from it to get the view direction.
float3 g_vEyePos : packoffset( c0 );
// Beam width. Used as the offset on each side of the control point in the
// FACE_TO_CAMERA path, and (times 3) as the noise amplitude otherwise.
float g_fWidth : packoffset( c0.w );
// Matrix applied to every control position via mul(float4(pos, 1), g_mWorld).
matrix g_mWorld : packoffset( c1 );
};
// Read-only input (t0): one control position per thread, indexed by DTid.x.
StructuredBuffer<float3> ControlPositions : register(t0);
// Read/write (u0): per-control-point Perlin noise seed. CSMain reads it and
// then advances it by a constant step on every dispatch.
RWStructuredBuffer<float3> PerlinSeeds: register(u0);
// Output (u1): two Beam_Vertex entries per control position
// (indices DTid.x * 2 and DTid.x * 2 + 1).
RWStructuredBuffer<Beam_Vertex> BeamVertexBuffer : register(u1);
// Thread-group shared scratch storage used by CSMain.
//
// The sizes are derived from blocksize so they always match numthreads:
// CSMain indexes sharedCP with the thread id (0 .. blocksize-1) and sharedVB
// with thread id * 2 and thread id * 2 + 1 (0 .. 2*blocksize-1). The previous
// fixed sizes (256 / 512) were overrun by the upper half of a 512-thread group.
#ifdef FACE_TO_CAMERA
// World-space control points, one per thread.
groupshared float3 sharedCP[blocksize];
#endif
// Beam vertices, two per thread.
groupshared float3 sharedVB[blocksize * 2];
// Applies the scalar PerlinNoise function to each component of `input`
// independently and returns the three results packed into a float3.
float3 noise3(float3 input)
{
    return float3(PerlinNoise(input.x),
                  PerlinNoise(input.y),
                  PerlinNoise(input.z));
}
// Compute-shader entry point: builds the beam vertex buffer.
//
// Each thread handles one control position and produces two beam vertices
// (BeamVertexBuffer[2*id] and [2*id + 1]), each with a CP and an SP value:
//   * CP is the vertex position itself.
//   * SP is the midpoint between the vertex and the matching vertex of a
//     neighbouring control position: the previous one for even ids, the next
//     one for odd ids. At either end of the beam there is no such neighbour,
//     so the neighbour index is clamped to the thread's own id, which makes
//     SP equal to CP there.
//
// With FACE_TO_CAMERA defined, the two vertices are placed g_fWidth to either
// side of the noise-displaced control point, perpendicular to both the view
// direction and the local beam direction. Otherwise both vertices are the
// control point displaced by two different Perlin noise samples.
//
// NOTE(review): the groupshared arrays are indexed with the dispatch-wide
// thread id, so this is only valid when a single thread group is dispatched
// (id < blocksize), and it relies on sharedCP / sharedVB holding blocksize
// and 2 * blocksize elements respectively -- confirm against the host code.
[numthreads(blocksize, 1, 1)]
void CSMain( uint3 DTid : SV_DispatchThreadID )
{
    const uint id   = DTid.x;
    const uint VBid = id * 2;

    // Index of the last control point. Assumes ControlPositions is sized to
    // the number of active control points -- TODO confirm with the caller.
    uint cpCount;
    uint cpStride;
    ControlPositions.GetDimensions(cpCount, cpStride);
    const uint lastId = min(max(cpCount, 1u), (uint)blocksize) - 1u;

    // Neighbour ids clamped so the first / last control point never read
    // outside the shared arrays (previously id - 1 wrapped around for id 0
    // and id + 1 ran past the end for the last thread).
    const uint prevId = max(id, 1u) - 1u;
    const uint nextId = (id < lastId) ? id + 1u : id;

#ifdef FACE_TO_CAMERA
    // Load the control point, displace it with noise and move it to world space.
    float3 vPos = ControlPositions[id] + noise3(PerlinSeeds[id]);
    sharedCP[id] = mul( float4(vPos,1), g_mWorld ).xyz;
    // Advance the seed so the noise changes on the next dispatch.
    PerlinSeeds[id] += float3(0.0135,0.0135,0.0135);
    // Every thread must have published its control point before neighbours read it.
    GroupMemoryBarrierWithGroupSync();

    // Sideways direction: perpendicular to the view direction and to the
    // direction between the two neighbouring control points.
    float3 vEyeDir = g_vEyePos - sharedCP[id];
    float3 vFaceNormal = sharedCP[prevId] - sharedCP[nextId];
    float3 vDir = normalize(cross(vEyeDir, vFaceNormal));

    // The two ribbon vertices sit g_fWidth to either side of the control point.
    sharedVB[VBid]   = sharedCP[id] + vDir * g_fWidth;
    sharedVB[VBid+1] = sharedCP[id] - vDir * g_fWidth;
#else
    float fWidth = g_fWidth * 3.0f;
    float3 vPos = mul( float4(ControlPositions[id],1), g_mWorld ).xyz;
    float3 seed = PerlinSeeds[id];
    sharedVB[VBid] = vPos + fWidth * noise3(seed);
    // Offset the seed so the second vertex gets a different noise sample.
    seed += float3(123.0f,123.0f,123.f);
    sharedVB[VBid+1] = vPos + fWidth * noise3(seed);
    // Advance the stored seed so the noise changes on the next dispatch.
    PerlinSeeds[id] += float3( 0.035, 0.035, 0.035 );
#endif
    // All vertices must be in shared memory before neighbours are read below.
    GroupMemoryBarrierWithGroupSync();

    // Calc SP: even ids pair with the previous control position, odd ids with
    // the next one. At the ends pairId == id, so lerp(a, a, 0.5) yields CP.
    const uint pairId = (id % 2 == 0) ? prevId : nextId;
    const uint pairVB = pairId * 2;
    BeamVertexBuffer[VBid].vSP   = lerp(sharedVB[pairVB],   sharedVB[VBid],   0.5);
    BeamVertexBuffer[VBid+1].vSP = lerp(sharedVB[pairVB+1], sharedVB[VBid+1], 0.5);

    // No barrier is needed here: shared memory is only read after the sync above.
    BeamVertexBuffer[VBid].vCP   = sharedVB[VBid];
    BeamVertexBuffer[VBid+1].vCP = sharedVB[VBid+1];
}
注意,这是HLSL代码
代码里我没有写注释,是因为我觉得不写注释也能看懂的代码就不必多此一举,毕竟从命名也可以理解其含义。
这段HLSL代码比较难上手,因为其中还涉及其他算法,但这些都不是什么高深的学问。只要理解GPU的构造,有良好的图形学基础,多思考,攻破它也不是难事。