public:
// Anonymous union: three overlapping views of the same four floats.
// (The enclosing class declaration is outside this excerpt.)
union
{
__declspec(align(16)) __m128 v; // SIMD view; the align(16) request gives the storage the 16-byte alignment that aligned SSE moves (movaps) require
float M[4]; // indexed view of the same four floats
// named view of the same four floats, in the order x, y, z, w
struct
{
float x,y,z,w;
}; // end struct
}; // end union
// Inline SSE sequence: squares the components of vr, sums the x, y and z
// squares horizontally, and takes the square root of that sum, i.e.
// sqrt(x*x + y*y + z*z). The whole register is then written back to vr,
// with the root in the lowest lane.
// NOTE(review): vr and SIMD_SHUFFLE are declared outside this excerpt.
// The lane annotations below follow the original author's assumption that
// vr.w == 1 on entry (so the fourth product is 1) -- confirm against the
// enclosing function. Lane 0 of the final sum does not depend on the w term.
// Convention: register contents are written low lane first, [lane0,lane1,lane2,lane3].
_asm
{
// first we need the dot product of the vector with itself
movaps xmm0, vr.v // aligned load of all four components into xmm0
mulps xmm0, xmm0 // square each lane (lane-wise multiply of xmm0 by itself)
// at this point, xmm0 =
// [ (x * x), (y * y), (z * z), (1*1) ]
// from here on, for brevity, let x, y, z denote the SQUARED components:
// xmm0 = [x,y,z,1]
// we need to sum the x,y,z lanes into a single scalar
// to compute the final dot product:
// dp = x + y + z
// begin
// xmm0: = [x,y,z,1] (note: all regs in low to high order)
// xmm1: = [?,?,?,?]
movaps xmm1, xmm0 // copy the squares into xmm1
// xmm0: = [x,y,z,1]
// xmm1: = [x,y,z,1]
// swap the low and high halves: the two low result lanes are picked from
// the destination (lanes 2,3), the two high result lanes from the source (lanes 0,1)
shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02)
// xmm0: = [x,y,z,1]
// xmm1: = [z,1,x,y]
addps xmm1, xmm0 // pairwise partial sums
// xmm0: = [x ,y ,z ,1]
// xmm1: = [x+z,y+1,x+z,y+1]
// low result lanes come from the destination xmm0 (lanes 1,0),
// high result lanes come from the source xmm1 (lanes 3,2)
shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01)
// xmm0: = [y ,x ,y+1,x+z]
// xmm1: = [x+z,y+1,x+z,y+1]
// finally we can add: lane 0 becomes y + (x+z)
addps xmm0, xmm1
// xmm0: = [x+y+z,x+y+1,x+y+z+1,x+y+z+1]
// xmm1: = [x+z ,y+1 ,x+z ,y+1]
// xmm0 lane 0 contains the dot product
// xmm0 lane 1 contains x+y+1; lanes 2 and 3 contain the dot product + 1
// the low lane now holds the dot product, so take its square root
sqrtss xmm0, xmm0 // scalar sqrt: only lane 0 is replaced, lanes 1-3 keep their sums
movaps vr, xmm0 // aligned store of all four lanes to vr; only lane 0 is the length, lanes 1-3 are leftover partial sums
} // end asm
代码如上所示,这里重点分析shufps指令的用法:shufps xmm_dest, xmm_src, control8
shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01)
这行代码执行前,xmm0 = [x, y, z, 1],xmm1 = [x+z, y+1, x+z, y+1](均按从低位到高位的顺序书写,字的编号从0开始)。现在把xmm0作为目标寄存器,xmm1作为源寄存器,control8的编码(2,3,0,1)的含义就是:
将源操作数中的第2个32位字作为最终结果的第3个字,即x+z
将源操作数中的第3个32位字作为最终结果的第2个字,即y+1
将目标操作数中的第0个32位字作为最终结果的第1个字,即x
将目标操作数中的第1个32位字作为最终结果的第0个字,即y
组合起来xmm0 = [y, x, y+1, x+z],从左到右依次是第0个字到第3个字。
参考书籍:《3D游戏编程大师技巧》下册