; MMX_Trans4x4W row0, row1, row2, row3, tmp
; Transposes a 4x4 matrix of 16-bit words held in four registers.
;   %1..%4 : input rows 0..3, four words each; all four are overwritten
;   %5     : scratch register; overwritten
; Output: transposed rows 0..3 are left in %1, %4, %5, %3 (in that order).
;         %2 holds an intermediate value on exit and is not a result.
; The original note here read "pOut mm1, mm4, mm5, mm3", which is exactly
; %1, %4, %5, %3 for an invocation with mm1, mm2, mm3, mm4, mm5.
; The four steps must stay in this order: each later step reuses as scratch
; a register that an earlier step has finished reading.
%macro MMX_Trans4x4W 5
; Word interleave of rows 0 and 1: high half goes to %5, low half stays in %1.
MMX_XSwap wd, %1, %2, %5
; Row 1 (%2) is no longer needed, so it serves as scratch here.
; Word interleave of rows 2 and 3: high half goes to %2, low half stays in %3.
MMX_XSwap wd, %3, %4, %2
; Dword interleave of the two low halves: output row 1 in %4, output row 0 in %1.
MMX_XSwap dq, %1, %3, %4
; Dword interleave of the two high halves: output row 3 in %3, output row 2 in %5.
MMX_XSwap dq, %5, %2, %3
%endmacro
; MMX_XSwap sfx, a, b, tmp
; Produces both the high and the low interleave of registers a and b.
;   %1 : size suffix appended to punpckh / punpckl (wd and dq are used by
;        MMX_Trans4x4W)
;   %2 : first operand; on exit holds punpckl%1 of (original %2, %3)
;   %3 : second operand; only read
;   %4 : on exit holds punpckh%1 of (original %2, %3)
%macro MMX_XSwap 4
; Save a copy of %2 first, because the punpckl below overwrites %2.
movq %4, %2
; High-half interleave, computed on the saved copy.
punpckh%1 %4, %3
; Low-half interleave, computed in place.
punpckl%1 %2, %3
%endmacro
; Example: transpose the 4x4 word matrix whose rows are in mm0..mm3, using
; mm4 as scratch. The transposed rows 0..3 come back in mm0, mm3, mm4, mm2;
; mm1 is left holding an intermediate value.
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
Assume mm0 = a3a2a1a0, mm1 = b3b2b1b0, mm2 = c3c2c1c0, mm3 = d3d2d1d0 (each register holds four 16-bit words, written with the most significant word on the left).
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 将使得
mm0 = d0c0b0a0, mm3 = d1c1b1a1, mm4 = d2c2b2a2, mm2 = d3c3b3a3
上述展开如下:
MMX_XSwap wd, mm0, mm1, mm4 =>
movq mm4, mm0 => mm4 = mm0 = a3a2a1a0
punpckhwd mm4, mm1 => mm4 = b3a3b2a2
punpcklwd mm0, mm1 => mm0 = b1a1b0a0
MMX_XSwap wd, mm2, mm3, mm1 =>
movq mm1, mm2 => mm1 = mm2 = c3c2c1c0
punpckhwd mm1, mm3 => mm1 = d3c3d2c2
punpcklwd mm2, mm3 => mm2 = d1c1d0c0
MMX_XSwap dq, mm0, mm2, mm3 =>
movq mm3, mm0 => mm3 = mm0 = b1a1b0a0
punpckhdq mm3, mm2 => mm3 = d1c1b1a1
punpckldq mm0, mm2 => mm0 = d0c0b0a0
MMX_XSwap dq, mm4, mm1, mm2 =>
movq mm2, mm4 => mm2 = mm4 = b3a3b2a2
punpckhdq mm2, mm1 => mm2 = d3c3b3a3
punpckldq mm4, mm1 => mm4 = d2c2b2a2
=> mm0 = d0c0b0a0, mm3 = d1c1b1a1, mm4 = d2c2b2a2, mm2 = d3c3b3a3