【NoisyHeart的炼成】Part4:顶点计算_rwstructuredbuffer register-CSDN博客

本文链接：https://blog.csdn.net/u014733621/article/details/24849881

CP就是控制点，SP可以由CP求得：在两端的顶点，CP与SP的值相同，其余的SP都是2个CP的中点。

SP的引入思路是这样的：DirectX 11通过2个shader（hull shader与domain shader）、3个shader stage（hull shader、tessellator、domain shader）来完成多边形的动态细分。hull shader主要负责计算控制点，因此这里我们先把控制用的点信息都预先计算出来，在渲染beam时只用hull shader简单地控制细分次数，再用domain shader实现贝塞尔平滑细分，这样就可以做出速度快、效果好的渲染。

为了进一步提高渲染效率，发挥GPU并行运算的优势，我们将会使用compute shader来计算 CP 与 SP:

#include "PerlinNoise.hlsl"

#define blocksize 512

// Element type of BeamVertexBuffer. CSMain writes two of these per control
// point (indices DTid.x * 2 and DTid.x * 2 + 1).
struct Beam_Vertex
{
	// Position of the beam vertex itself, copied from the group-shared vertex array.
	float3 vCP;
	// Midpoint between this vertex and the matching vertex of a neighbouring
	// control point (computed with lerp(..., 0.5) in CSMain).
	float3 vSP;
};

// Per-dispatch constants, bound to constant-buffer slot b0.
// The packoffset annotations pin the layout explicitly: eye position in c0.xyz,
// width in c0.w, world matrix starting at c1.
// NOTE(review): the host-side structure presumably mirrors this layout - confirm.
cbuffer CB : register( b0 )
{
	// Eye position; only read when FACE_TO_CAMERA is defined, to build the eye direction.
	float3 	g_vEyePos		: packoffset( c0 );
	// Offset distance applied to each control point (scaled by 3 in the non-FACE_TO_CAMERA path).
	float 	g_fWidth		: packoffset( c0.w );
	// Matrix the control positions are multiplied by (row-vector form: mul(v, g_mWorld)).
	matrix	g_mWorld		: packoffset( c1 );
};

// Read-only input: one position per control point, indexed by DTid.x.
StructuredBuffer<float3> ControlPositions : register(t0);
// Read/write noise seeds, one per control point. CSMain feeds them to noise3 and
// then advances them by a fixed increment on every dispatch.
RWStructuredBuffer<float3> PerlinSeeds: register(u0);
// Output: two Beam_Vertex entries per control point (indices DTid.x * 2 and DTid.x * 2 + 1).
RWStructuredBuffer<Beam_Vertex> BeamVertexBuffer : register(u1);

// Group-shared scratch space. Both arrays are sized from the thread-group width
// so that every thread of a group owns its slots: one transformed control point
// and two beam vertices. (With blocksize = 512 this is 6 KB + 12 KB.)
#ifdef FACE_TO_CAMERA
groupshared float3 sharedCP[blocksize];
#endif
groupshared float3 sharedVB[blocksize * 2];

// Applies the scalar PerlinNoise function to each component of `input`
// independently and returns the three results as a vector.
float3 noise3(float3 input)
{
	float3 result;
	result.x = PerlinNoise(input.x);
	result.y = PerlinNoise(input.y);
	result.z = PerlinNoise(input.z);
	return result;
}

// Builds the two beam vertices of one control point per thread and writes, for
// each of them, the vertex position (vCP) and the midpoint towards the matching
// vertex of a neighbouring control point (vSP) into BeamVertexBuffer.
//
// Pairing of neighbours: even points pair with the previous point, odd points
// with the next one. At either end of the beam the neighbour index is clamped
// to the point itself, so vSP == vCP there.
//
// Neighbour data is exchanged through group-shared memory, so points are only
// joined with neighbours inside the same thread group; a beam is expected to
// fit into one group of `blocksize` points.
//
// NOTE(review): the number of beam points is taken from the element count of
// ControlPositions - confirm the buffer is sized to exactly one element per point.
[numthreads(blocksize, 1, 1)]
void CSMain( uint3 DTid : SV_DispatchThreadID )
{
	const uint groupSize  = blocksize;
	const uint pointId    = DTid.x;                 // index into the structured buffers
	const uint localId    = pointId % groupSize;    // index into group-shared memory
	const uint groupFirst = pointId - localId;      // first point handled by this group
	const uint vbId       = pointId * 2;            // first of this point's two output vertices
	const uint localVB    = localId * 2;            // same, inside sharedVB

	uint pointCount, pointStride;
	ControlPositions.GetDimensions(pointCount, pointStride);

	// Clamp neighbour indices to the points this group really owns, so the
	// first/last point never reads outside the shared arrays.
	const uint pointsInGroup = (pointCount > groupFirst) ? min(pointCount - groupFirst, groupSize) : 1;
	const uint lastLocal = pointsInGroup - 1;
	const uint prevLocal = (localId > 0) ? localId - 1 : 0;
	const uint nextLocal = min(localId + 1, lastLocal);

	#ifdef FACE_TO_CAMERA
		// Perturb the control point with noise, move it to world space and
		// publish it so the neighbouring threads can read it.
		const float3 vNoisyPos = ControlPositions[pointId] + noise3(PerlinSeeds[pointId]);
		const float3 vCenter = mul( float4(vNoisyPos, 1), g_mWorld ).xyz;
		sharedCP[localId] = vCenter;
		PerlinSeeds[pointId] += float3(0.0135, 0.0135, 0.0135);
		GroupMemoryBarrierWithGroupSync();

		// Offset direction: perpendicular to both the eye direction and the
		// direction between the two neighbouring control points.
		const float3 vEyeDir = g_vEyePos - vCenter;
		const float3 vAlong  = sharedCP[prevLocal] - sharedCP[nextLocal];
		const float3 vSide   = normalize(cross(vEyeDir, vAlong));

		const float3 vEdgeA = vCenter + vSide * g_fWidth;
		const float3 vEdgeB = vCenter - vSide * g_fWidth;
	#else
		// Both vertices are the world-space control point displaced by noise;
		// the second one uses a seed shifted by 123 so the two differ.
		const float fWidth = g_fWidth * 3.0f;
		const float3 vCenter = mul( float4(ControlPositions[pointId], 1), g_mWorld ).xyz;
		float3 seed = PerlinSeeds[pointId];
		const float3 vEdgeA = vCenter + fWidth * noise3(seed);
		seed += float3(123.0f, 123.0f, 123.0f);
		const float3 vEdgeB = vCenter + fWidth * noise3(seed);
		PerlinSeeds[pointId] += float3(0.035, 0.035, 0.035);
	#endif

	// Publish this point's vertices, then wait until the whole group has done so.
	sharedVB[localVB]     = vEdgeA;
	sharedVB[localVB + 1] = vEdgeB;
	GroupMemoryBarrierWithGroupSync();

	// Calc SP: midpoint towards the paired neighbour (previous for even points,
	// next for odd points).
	const uint partnerVB = ((pointId % 2 == 0) ? prevLocal : nextLocal) * 2;
	BeamVertexBuffer[vbId].vSP     = lerp(sharedVB[partnerVB],     vEdgeA, 0.5);
	BeamVertexBuffer[vbId + 1].vSP = lerp(sharedVB[partnerVB + 1], vEdgeB, 0.5);

	// No shared memory is written after the barrier above, so the vertex
	// positions can be stored without any further synchronisation.
	BeamVertexBuffer[vbId].vCP     = vEdgeA;
	BeamVertexBuffer[vbId + 1].vCP = vEdgeB;
}

注意，这是HLSL代码

代码里我没有写注释，因为我觉得能不写注释的代码就不必多此一举，毕竟从命名也可以理解其含义。

这段HLSL代码比较难上手，因为其中涉及其他算法，但这些都不是什么高深的学问。

理解GPU的构造，有良好的图形基础，多思考，攻破也不是难事。