使用SIMD指令shufps计算点积-CSDN博客

本文链接：https://blog.csdn.net/oLingXi12/article/details/141030972

public:

union
    {
    // 128-bit SSE view of the four components, declared 16-byte aligned
    __declspec(align(16)) __m128 v;

    // the same storage viewed as an indexable array of four floats
    float M[4];

    // the same storage viewed as individually named components
    struct
         {
         float x;
         float y;
         float z;
         float w;
         };
    };

// Computes the length of vr, sqrt(x*x + y*y + z*z), with SSE instructions and
// writes the whole result register back over vr.
// NOTE(review): vr and SIMD_SHUFFLE are defined outside this snippet. The lane
// diagrams below follow the original author's notes and assume vr.w == 1, so
// the fourth lane after squaring is written as 1 -- confirm against the caller.
_asm
   {
   // step 1: dot product of vr with itself (both operands are the same vector)
   movaps xmm0, vr.v   // aligned 128-bit load: xmm0 = vr's [x, y, z, w]
   mulps xmm0, xmm0    // lane-wise square: xmm0 = [x*x, y*y, z*z, w*w]
   
   // notation from here on (registers written low lane -> high lane):
   //   x, y, z stand for the squared components x*x, y*y, z*z
   //   1 stands for w*w (assumed to be 1*1, see the note at the top)
   // so xmm0 = [x,y,z,1], and the value we want is the horizontal sum
   //   dp = x + y + z
   // which must end up in a single lane

   // step 2: horizontal add using two shuffle + add rounds
   // xmm0: = [x,y,z,1]
   // xmm1: = [?,?,?,?]
   movaps xmm1, xmm0 // duplicate the squares so they can be permuted
   // xmm0: = [x,y,z,1]
   // xmm1: = [x,y,z,1]

   // shufps fills the two low result lanes from the destination register and
   // the two high result lanes from the source register; here that swaps the
   // low and high halves
   shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02) 
   // xmm0: = [x,y,z,1]
   // xmm1: = [z,1,x,y]

   addps xmm1, xmm0
   // xmm0: = [x  ,y  ,z  ,1]
   // xmm1: = [x+z,y+1,x+z,y+1]

   // low lanes come from xmm0 (lanes 1 and 0, i.e. y then x),
   // high lanes come from xmm1 (lanes 3 and 2, i.e. y+1 then x+z)
   shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01) 
   // xmm0: = [y  ,x  ,y+1,x+z]
   // xmm1: = [x+z,y+1,x+z,y+1]

   // second add completes the sum in lane 0
   addps xmm0, xmm1
   // xmm0: = [x+y+z,x+y+1,x+y+z+1,x+y+z+1]
   // xmm1: = [x+z  ,y+1  ,x+z    ,y+1]
   // lane 0 of xmm0 holds the dot product
   // lanes 2 and 3 of xmm0 hold the dot product + 1

   // step 3: scalar square root of lane 0 only; lanes 1-3 are left unchanged
   sqrtss xmm0, xmm0
 
   // store all four lanes back: only the first float is the length, the other
   // three are the leftover partial sums shown above
   // NOTE(review): the load above used vr.v but this store names vr itself --
   // presumably the same address; verify against the declaration of vr
   movaps vr, xmm0 // save results

   } // end asm

代码如上所示，这里重点分析shufps指令的用法：shufps xmm_dest, xmm_src, control8

shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01)

这行代码前，xmm0: = [x ,y ,z ,1]，xmm1: = [x+z,y+1,x+z,y+1],现在把xmm0作为目标寄存器，xmm1作为源寄存器，control8的编码(2,3,0,1)的含义就是：

将源操作数中的第二个32位字作为最终结果的第3个字，x+z

将源操作数中的第三个32位字作为最终结果的第2个字，y+1

将目标操作数中的第零个32位字作为最终结果的第1个字，x

将目标操作数中的第一个32位字作为最终结果的第0个字,y

组合起来xmm0 = [y,x,y+1,x+z]。注意寄存器是按从低到高的顺序书写的，即从左到右依次为第0、1、2、3个字，与上面列举的顺序（第3、2、1、0个字）正好相反。

参考书籍：《3D游戏编程大师技巧》下册