本来想着写一些SSE指令来优化数学运算，但又不知道从何入手。本着不重复造轮子的原则，查看了一下DirectX的数学库（DirectXMath）。这个库里的函数都是内联函数，可以直接看到源代码。看了一下三维向量点乘XMVector3Dot的实现，觉得自己不写是对的：人家这个数学库用了这么久，经过了长期打磨，肯定比自己写的好。下面贴一段代码。
// Computes the dot product of the x, y and z lanes of V1 and V2 (the w lane
// does not contribute) and returns that scalar replicated into all four
// lanes of the result. Exactly one implementation is selected at compile
// time by the _XM_*_INTRINSICS_ macros.
inline XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2)
{
#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: x*x + y*y + z*z, summed left to right.
    const float dot3 = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2];
    XMVECTORF32 splatted;
    splatted.f[3] = dot3;
    splatted.f[2] = dot3;
    splatted.f[1] = dot3;
    splatted.f[0] = dot3;
    return splatted.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Lane-wise products: (x*x, y*y, z*z, w*w).
    float32x4_t products = vmulq_f32( V1, V2 );
    float32x2_t lowPair  = vget_low_f32( products );   // (x*x, y*y)
    float32x2_t highPair = vget_high_f32( products );  // (z*z, w*w)
    // Pairwise add puts x*x + y*y in both lanes.
    lowPair = vpadd_f32( lowPair, lowPair );
    // Broadcast z*z; the w*w lane is dropped here.
    highPair = vdup_lane_f32( highPair, 0 );
    lowPair = vadd_f32( lowPair, highPair );
    // Duplicate the two-lane sum into a full four-lane vector.
    return vcombine_f32( lowPair, lowPair );
#elif defined(_XM_SSE4_INTRINSICS_)
    // Mask 0x7f: high nibble 0x7 multiplies only x, y, z; low nibble 0xf
    // writes the sum to every lane of the result.
    return _mm_dp_ps( V1, V2, 0x7f );
#elif defined(_XM_SSE3_INTRINSICS_)
    // Multiply lane-wise, then AND with g_XMMask3 (expected to clear the
    // w lane) so that w*w cannot leak into the horizontal sums.
    XMVECTOR vSum = _mm_and_ps(_mm_mul_ps(V1,V2), g_XMMask3);
    // Two horizontal adds leave the full sum in all four lanes.
    vSum = _mm_hadd_ps(vSum,vSum);
    vSum = _mm_hadd_ps(vSum,vSum);
    return vSum;
#elif defined(_XM_SSE_INTRINSICS_)
    // Lane-wise products: (x*x, y*y, z*z, w*w).
    XMVECTOR vAccum = _mm_mul_ps(V1,V2);
    // Lane 0 of vShuffled = y*y, lane 1 = z*z.
    XMVECTOR vShuffled = XM_PERMUTE_PS(vAccum,_MM_SHUFFLE(2,1,2,1));
    // Lane 0 of vAccum = x*x + y*y.
    vAccum = _mm_add_ss(vAccum,vShuffled);
    // Move z*z (currently lane 1) into lane 0.
    vShuffled = XM_PERMUTE_PS(vShuffled,_MM_SHUFFLE(1,1,1,1));
    // Lane 0 of vAccum = (x*x + y*y) + z*z.
    vAccum = _mm_add_ss(vAccum,vShuffled);
    // Replicate lane 0 into every lane.
    return XM_PERMUTE_PS(vAccum,_MM_SHUFFLE(0,0,0,0));
#endif
}
代码简单易懂：SSE4分支只需要一条指令（_mm_dp_ps，对应汇编指令dpps）就可以计算点乘，果然硬件的提升才是王道。
SSE4和SSE2的性能差距还是挺大的。手机平台对应的指令集，从上面代码里的_XM_ARM_NEON_INTRINSICS_分支来看，应该是ARM的NEON。先不着急，学好一个，其他的就触类旁通了。
下面通过反汇编看一下生成的汇编代码：
#if defined(_XM_NO_INTRINSICS_)
float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2];
XMVECTORF32 vResult;
vResult.f[0] =
vResult.f[1] =
vResult.f[2] =
vResult.f[3] = fValue;
return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float32x4_t vTemp = vmulq_f32( V1, V2 );
float32x2_t v1 = vget_low_f32( vTemp );
float32x2_t v2 = vget_high_f32( vTemp );
v1 = vpadd_f32( v1, v1 );
v2 = vdup_lane_f32( v2, 0 );
v1 = vadd_f32( v1, v2 );
return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE4_INTRINSICS_)
return _mm_dp_ps( V1, V2, 0x7f );
#elif defined(_XM_SSE3_INTRINSICS_)
XMVECTOR vTemp = _mm_mul_ps(V1,V2);
vTemp = _mm_and_ps(vTemp, g_XMMask3);
vTemp = _mm_hadd_ps(vTemp,vTemp);
return _mm_hadd_ps(vTemp,vTemp);
#elif defined(_XM_SSE_INTRINSICS_)
// Perform the dot product
XMVECTOR vDot = _mm_mul_ps(V1,V2);
009C2F04 movaps xmm0,xmmword ptr [V1]
009C2F08 mulps xmm0,xmmword ptr [V2]
009C2F0C movaps xmmword ptr [ebp-160h],xmm0
009C2F13 movaps xmm0,xmmword ptr [ebp-160h]
009C2F1A movaps xmmword ptr [vDot],xmm0
// x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
009C2F1E movaps xmm0,xmmword ptr [vDot]
009C2F22 shufps xmm0,xmmword ptr [vDot],99h
009C2F27 movaps xmmword ptr [ebp-180h],xmm0
009C2F2E movaps xmm0,xmmword ptr [ebp-180h]
009C2F35 movaps xmmword ptr [vTemp],xmm0
// Result.vector4_f32[0] = x+y
vDot = _mm_add_ss(vDot,vTemp);
009C2F39 movaps xmm0,xmmword ptr [vDot]
009C2F3D addss xmm0,dword ptr [vTemp]
009C2F42 movaps xmmword ptr [ebp-1A0h],xmm0
009C2F49 movaps xmm0,xmmword ptr [ebp-1A0h]
009C2F50 movaps xmmword ptr [vDot],xmm0
// x=Dot.vector4_f32[2]
vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
009C2F54 movaps xmm0,xmmword ptr [vTemp]
009C2F58 shufps xmm0,xmmword ptr [vTemp],55h
009C2F5D movaps xmmword ptr [ebp-1C0h],xmm0
009C2F64 movaps xmm0,xmmword ptr [ebp-1C0h]
009C2F6B movaps xmmword ptr [vTemp],xmm0
// Result.vector4_f32[0] = (x+y)+z
vDot = _mm_add_ss(vDot,vTemp);
009C2F6F movaps xmm0,xmmword ptr [vDot]
009C2F73 addss xmm0,dword ptr [vTemp]
009C2F78 movaps xmmword ptr [ebp-1E0h],xmm0
009C2F7F movaps xmm0,xmmword ptr [ebp-1E0h]
009C2F86 movaps xmmword ptr [vDot],xmm0
// Splat x
return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
009C2F8A movaps xmm0,xmmword ptr [vDot]
// Splat x
return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
009C2F8E shufps xmm0,xmmword ptr [vDot],0
009C2F93 movaps xmmword ptr [ebp-200h],xmm0
009C2F9A movaps xmm0,xmmword ptr [ebp-200h]
#endif
上面是普通SSE分支的反汇编结果，而SSE4分支一条汇编指令就搞定了。通过反汇编看代码时记得使用Debug模式，否则编译器的优化会让你完全摸不着头脑。
8086汇编基本快学完了，今天购买了新书《现代x86汇编语言程序设计》，等看完这本书，上面的汇编理解起来问题就不大了。