Assembly x64 Intro - SSE2 2x4x4W Transpose




;-----------------------------------------------------------------------
; SSE2_TransTwo4x4W  %1, %2, %3, %4, %5
; Transposes two side-by-side 4x4 matrices of 16-bit words with six
; SSE2_XSawp interleave steps (word, then dword, then qword granularity).
;
; In:    %1, %2, %3, %4 = rows a, b, c, d (eight words each)
;        %5             = scratch; it is overwritten before it is read
; Out:   (words listed from most significant to least significant)
;        %1 = d4 c4 b4 a4 | d0 c0 b0 a0
;        %2 = d5 c5 b5 a5 | d1 c1 b1 a1
;        %4 = d6 c6 b6 a6 | d2 c2 b2 a2
;        %5 = d7 c7 b7 a7 | d3 c3 b3 a3
; Clobb: %3 (left holding the intermediate d7 c7 b7 a7 | d6 c6 b6 a6)
;
; Example: called with xmm0, xmm1, xmm2, xmm3, xmm4 the results are in
;          xmm0, xmm1, xmm3, xmm4 and xmm2 is clobbered.
;-----------------------------------------------------------------------
%macro SSE2_TransTwo4x4W 5
    SSE2_XSawp wd,  %1, %2, %5      ; a/b words interleaved: low half -> arg1, high half -> arg5
    SSE2_XSawp wd,  %3, %4, %2      ; c/d words interleaved: low half -> arg3, high half -> arg2
    SSE2_XSawp dq,  %1, %3, %4      ; arg1 = columns 0-1, arg4 = columns 2-3
    SSE2_XSawp dq,  %5, %2, %3      ; arg5 = columns 4-5, arg3 = columns 6-7
    SSE2_XSawp qdq, %1, %5, %2      ; arg1 = columns 0 and 4, arg2 = columns 1 and 5
    SSE2_XSawp qdq, %4, %3, %5      ; arg4 = columns 2 and 6, arg5 = columns 3 and 7
%endmacro


;-----------------------------------------------------------------------
; SSE2_XSawp  %1, %2, %3, %4
; Interleave helper used by SSE2_TransTwo4x4W.
;
; %1 = element-size suffix appended to punpckl/punpckh
;      (this file passes wd, dq and qdq)
; %2 = in/out: first source; receives the low-half interleave of %2 and %3
; %3 = in:     second source; not modified
; %4 = out:    receives the high-half interleave of the original %2 and %3
;-----------------------------------------------------------------------
%macro SSE2_XSawp 4
    movdqa      %4, %2          ; keep a copy of the first source before it is overwritten
    punpckl%1   %2, %3          ; low halves of both sources, interleaved
    punpckh%1   %4, %3          ; high halves, taken from the saved copy
%endmacro

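Initial register contents, one 8-word row per register (words are listed from the most significant, w7, down to the least significant, w0); xmm1 is only used as scratch:
xmm2 = aw7aw6aw5aw4aw3aw2aw1aw0,      xmm0 = bw7bw6bw5bw4bw3bw2bw1bw0
xmm3 = cw7cw6cw5cw4cw3cw2cw1cw0,        xmm4 = dw7dw6dw5dw4dw3dw2dw1dw0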

SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1

展开如下:

SSE2_XSawp wd, xmm2, xmm0, xmm1 =>
         movdqa        xmm1, xmm2     => xmm1 = xmm2 = aw7.....aw0 
         punpcklwd    xmm2, xmm0    => xmm2 = bw3aw3bw2aw2bw1aw1bw0aw0
         punpckhwd   xmm1, xmm0    => xmm1 = bw7aw7bw6aw6bw5aw5bw4aw4
SSE2_XSawp wd, xmm3, xmm4, xmm0 =>
         movdqa        xmm0, xmm3    => xmm0 = xmm3 = cw7......cw0
         punpcklwd   xmm3, xmm4    =>  xmm3 = dw3cw3dw2cw2dw1cw1dw0cw0
         punpckhwd  xmm0, xmm4    =>  xmm0 = dw7cw7dw6cw6dw5cw5dw4cw4
SSE2_XSawp dq, xmm2, xmm3, xmm4 =>
         movdqa        xmm4, xmm2   =>  xmm4 = xmm2 = bw3aw3bw2aw2bw1aw1bw0aw0
         punpckldq     xmm2, xmm3  =>  xmm2 = dw1cw1bw1aw1dw0cw0bw0aw0
         punpckhdq    xmm4, xmm3  =>  xmm4 = dw3cw3bw3aw3dw2cw2bw2aw2
SSE2_XSawp dq, xmm1, xmm0, xmm3 =>
         movdqa        xmm3, xmm1   => xmm3 = xmm1 = bw7aw7bw6aw6bw5aw5bw4aw4
         punpckldq    xmm1, xmm0   =>  xmm1 = dw5cw5bw5aw5dw4cw4bw4aw4
         punpckhdq   xmm3, xmm0   =>  xmm3 = dw7cw7bw7aw7dw6cw6bw6aw6
SSE2_XSawp qdq, xmm2, xmm1, xmm0 =>
         movdqa        xmm0, xmm2   =>  xmm0 = xmm2 = dw1cw1bw1aw1dw0cw0bw0aw0
         punpcklqdq  xmm2, xmm1   =>  xmm2 = dw4cw4bw4aw4dw0cw0bw0aw0
         punpckhqdq xmm0, xmm1   =>  xmm0 = dw5cw5bw5aw5dw1cw1bw1aw1
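SSE2_XSawp qdq, xmm4, xmm3, xmm1 =>
         movdqa        xmm1, xmm4   => xmm1 = xmm4 = dw3cw3bw3aw3dw2cw2bw2aw2
         punpcklqdq  xmm4, xmm3   => xmm4 = dw6cw6bw6aw6dw2cw2bw2aw2
         punpckhqdq xmm1, xmm3   => xmm1 = dw7cw7bw7aw7dw3cw3bw3aw3

Result: the transposed rows end up in xmm2 (words 0 and 4), xmm0 (words 1 and 5), xmm4 (words 2 and 6) and xmm1 (words 3 and 7); xmm3 is left holding an intermediate value.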







  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值