Assembly x64 Intro - SSE2 4x8 Load



; SSE2_Load4x8p ptr, r0, r1, r2, r3, r4
; Loads four 16-byte rows starting at [%1] and regroups their 64-bit halves
; with SSE2_XSawp qdq.
;   %1      base address; rows are read at offsets 0x00, 0x10, 0x20, 0x30
;   %2..%6  five registers that receive the result (all are overwritten)
; On exit, with row0..row3 = the data at +0x00, +0x10, +0x20, +0x30
; (written high64 : low64):
;   %2 = row2.low64  : row0.low64
;   %3 = row2.high64 : row0.high64
;   %4 = row3.low64  : row1.low64
;   %5 = row3.high64 : row1.high64
;   %6 = row2 (left as loaded)
; MOVDQ and SSE2_XSawp are defined outside this macro. The accompanying notes
; say MOVDQ is movdqa; if so, every address must be 16-byte aligned.
%macro SSE2_Load4x8p 6
    MOVDQ    %2,    [%1+0x00]           ; row0 -> %2
    MOVDQ    %4,    [%1+0x10]           ; row1 -> %4
    MOVDQ    %6,    [%1+0x20]           ; row2 -> %6
    MOVDQ    %3,    [%1+0x30]           ; row3 -> %3
    SSE2_XSawp qdq, %4, %3, %5          ; (author's question: why are these two steps needed?) pair rows 1 and 3: %4 = low halves, %5 = high halves
    SSE2_XSawp qdq, %2, %6, %3          ; pair rows 0 and 2: %2 = low halves, %3 = high halves; %3 (row3) is overwritten here, so this swap must come second
%endmacro


Note: MOVDQ is defined as movdqa.

; SSE2_XSawp suffix, a, b, tmp        (building block for TRANSPOSE)
; Interleaves the low and high parts of two registers.
;   %1  suffix appended to punpckl/punpckh (SSE2_Load4x8p passes qdq)
;   %2  in: first operand;  out: punpckl%1 of the original %2 with %3
;   %3  in: second operand; not written
;   %4  out: punpckh%1 of the original %2 with %3
; %4 is overwritten before %3 is read, so %4 must not name the same
; register as %3.
%macro SSE2_XSawp 4
    movdqa      %4, %2                  ; keep a copy of %2; the next line destroys it
    punpckl%1   %2, %3                  ; %2 = interleaved low parts of %2 and %3
    punpckh%1   %4, %3                  ; %4 = interleaved high parts of (saved) %2 and %3
%endmacro


如:

    ;Load 4x8
    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5

=> expands to the following (r4 is an int16_t * pointer):

    movdqa   xmm0,        [r4 + 0x00];             // load the first 8 x 16-bit words into the 128-bit xmm0 => xmm0 = aw7aw6aw5aw4aw3aw2aw1aw0

    movdqa   xmm4,         [r4 + 0x10];            // 2nd 8x16 into xmm4 => xmm4 = bw7bw6bw5bw4bw3bw2bw1bw0

    movdqa   xmm5,         [r4 + 0x20];            // 3rd 8x16 into xmm5  => xmm5 = cw7cw6cw5cw4cw3cw2cw1cw0

    movdqa   xmm1,         [r4 + 0x30];            // 4th 8x16 into xmm1 =>   xmm1= dw7dw6dw5dw4dw3dw2dw1dw0

    SSE2_XSawp qdq,     xmm4, xmm1, xmm2 =>

              movdqa         xmm2,   xmm4    => xmm2 = xmm4 = bw7bw6bw5bw4bw3bw2bw1bw0

             punpcklqdq    xmm4,   xmm1    =>  xmm1 = dw7dw6dw5dw4dw3dw2dw1dw0, xmm4 = dw3dw2dw1dw0bw3bw2bw1bw0

             punpckhqdq   xmm2,   xmm1    => xmm1 = dw7dw6dw5dw4dw3dw2dw1dw0, xmm2 = dw7dw6dw5dw4bw7bw6bw5bw4

    SSE2_XSawp  qdq,    xmm0, xmm5, xmm1 =>

             movdqa         xmm1, xmm0       => xmm1 = xmm0 = aw7aw6aw5aw4aw3aw2aw1aw0

             punpcklqdq    xmm0, xmm5      => xmm5 = cw7cw6cw5cw4cw3cw2cw1cw0,  xmm0 = cw3cw2cw1cw0aw3aw2aw1aw0

             punpckhqdq   xmm1, xmm5     => xmm5 = cw7cw6cw5cw4cw3cw2cw1cw0,   xmm1 = cw7cw6cw5cw4aw7aw6aw5aw4          

   




; SSE2_Load4x8p ptr, r0, r1, r2, r3, r4
; Reads four 16-byte rows at [%1+0x00], [%1+0x10], [%1+0x20], [%1+0x30]
; and uses SSE2_XSawp qdq to gather the low 64-bit halves of rows 0/2 and
; rows 1/3 into one register each, and likewise the high halves.
; Result (high64 : low64), row0..row3 being the rows in address order:
;   %2 = row2.low64  : row0.low64
;   %3 = row2.high64 : row0.high64
;   %4 = row3.low64  : row1.low64
;   %5 = row3.high64 : row1.high64
;   %6 = row2 (left as loaded)
; MOVDQ is not defined in this macro; the notes in this file state it is
; movdqa, which would require 16-byte-aligned addresses.
%macro SSE2_Load4x8p 6
    MOVDQ    %2,    [%1+0x00]           ; row0
    MOVDQ    %4,    [%1+0x10]           ; row1
    MOVDQ    %6,    [%1+0x20]           ; row2
    MOVDQ    %3,    [%1+0x30]           ; row3
    SSE2_XSawp qdq, %4, %3, %5          ; rows 1 and 3 -> %4 (low halves), %5 (high halves)
    SSE2_XSawp qdq, %2, %6, %3          ; rows 0 and 2 -> %2 (low halves), %3 (high halves); reuses %3 once row3 has been consumed
%endmacro

; SSE2_XSawp suffix, a, b, tmp        (helper for TRANSPOSE)
; Produces both unpack results of a register pair:
;   %2 <- punpckl%1(%2, %3)
;   %4 <- punpckh%1(original %2, %3)
; %1 is the suffix pasted onto punpckl/punpckh (qdq when called from
; SSE2_Load4x8p). %3 is only read. %4 is clobbered first, so it must be a
; different register from %3.
%macro SSE2_XSawp 4
    movdqa      %4, %2                  ; save %2 before it is overwritten
    punpckl%1   %2, %3                  ; low-part interleave, result in %2
    punpckh%1   %4, %3                  ; high-part interleave, result in %4
%endmacro

    ;Load 4x8
    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
=>  movdqa  xmm0, [r4]
    movdqa  xmm4, [r4 + 16]
    movdqa  xmm5, [r4 + 32]
    movdqa  xmm1, [r4 + 48]
;   隔行置换, xmm0 <=> xmm5, xmm4 <=> xmm1

(xmm00, xmm01等分别表示4个16bit的word, 低高位)

xmm00xmm01
xmm40xmm41
xmm50xmm51
xmm10xmm11


 SSE2_XSawp qdq, xmm4, xmm1, xmm2 =>
   movdqa     xmm2, xmm4
   punpcklqdq xmm4, xmm1
   punpckhqdq xmm2, xmm1
 SSE2_XSawp qdq, xmm0, xmm5, xmm1 =>
   movdqa     xmm1, xmm0
   punpcklqdq xmm0, xmm5
   punpckhqdq xmm1, xmm5




  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值