一 接上一篇哈德玛变换C语言实现
二 哈德玛变换汇编实现
cglobal dct4x4dc, 1,1,5 // cglobal arguments: 1 function argument, 1 general-purpose register, 5 vector registers (meaning of each field is given in the PROLOGUE notes below)
/*
; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
; allocating the specified stack size. If the required stack alignment is
; larger than the known stack alignment the stack will be manually aligned
; and an extra register will be allocated to hold the original stack
; pointer (to not invalidate r0m etc.). To prevent the use of an extra
; register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
*/
0000000000000000 <s264_8_dct4x4dc_mmx2>: // operates in place on the 32 bytes at (%rdi): 16 coefficients of 2 bytes each
// Listing is in AT&T syntax: the source operand comes first, the destination last.
// The registers are the 64-bit MMX registers %mm0..%mm7 (not xmm); each holds one row of four 16-bit lanes.
// Stages: load rows -> butterflies across rows -> 4x4 word transpose -> second butterfly -> rounded halving -> store.
// Notation below: R0..R3 are the four input rows, K is the constant held in mm7.
0: 0f 6f 5f 18 movq 0x18(%rdi),%mm3 // mm3 = R3, bytes 24..31 (the last 8 bytes of the block)
4: 0f 6f 57 10 movq 0x10(%rdi),%mm2 // mm2 = R2, bytes 16..23
8: 0f 6f 4f 08 movq 0x8(%rdi),%mm1 // mm1 = R1, bytes 8..15
c: 0f 6f 07 movq (%rdi),%mm0 // mm0 = R0, bytes 0..7; one row per MMX register
f: 0f 6f 3d 00 00 00 00 movq 0x0(%rip),%mm7 # 16 <s264_8_dct4x4dc_mmx2+0x16> // mm7 = K, a 64-bit constant loaded with RIP-relative addressing; the displacement is 0 only because this .o has not been relocated yet. NOTE(review): presumably 0x8000 in every 16-bit lane (pw_8000 in x264) - confirm against the relocation entry
16: 0f 6f e3 movq %mm3,%mm4 // mm4 = R3 (copy, because paddw overwrites mm3)
19: 0f fd da paddw %mm2,%mm3 // mm3 = R2 + R3, added per 16-bit lane
1c: 0f f9 d4 psubw %mm4,%mm2 // mm2 = R2 - R3
1f: 0f 6f e1 movq %mm1,%mm4 // mm4 = R1 (copy)
22: 0f fd c8 paddw %mm0,%mm1 // mm1 = R0 + R1
25: 0f f9 c4 psubw %mm4,%mm0 // mm0 = R0 - R1
// The instructions above are the pairwise sums/differences that correspond to the C version.
28: 0f 6f e3 movq %mm3,%mm4 // mm4 = R2 + R3 (copy)
2b: 0f fd d9 paddw %mm1,%mm3 // mm3 = A = (R0 + R1) + (R2 + R3)
2e: 0f f9 cc psubw %mm4,%mm1 // mm1 = B = (R0 + R1) - (R2 + R3)
31: 0f 6f e2 movq %mm2,%mm4 // mm4 = R2 - R3 (copy)
34: 0f fd d0 paddw %mm0,%mm2 // mm2 = D = (R0 - R1) + (R2 - R3)
37: 0f f9 c4 psubw %mm4,%mm0 // mm0 = C = (R0 - R1) - (R2 - R3)
// 4x4 transpose of 16-bit words: rows A, B, C, D become columns T0..T3.
3a: 0f 6f e3 movq %mm3,%mm4 // mm4 = A (copy, needed for the high-half interleave)
3d: 0f 61 d9 punpcklwd %mm1,%mm3 // mm3 = [A0 B0 A1 B1]: interleave the low two words of A and B
40: 0f 69 e1 punpckhwd %mm1,%mm4 // mm4 = [A2 B2 A3 B3]: interleave the high two words of A and B
43: 0f 6f c8 movq %mm0,%mm1 // mm1 = C (copy)
46: 0f 61 c2 punpcklwd %mm2,%mm0 // mm0 = [C0 D0 C1 D1]: interleave the low two words of C and D
49: 0f 69 ca punpckhwd %mm2,%mm1 // mm1 = [C2 D2 C3 D3]: interleave the high two words of C and D
4c: 0f 6f d3 movq %mm3,%mm2 // mm2 = [A0 B0 A1 B1] (copy)
4f: 0f 62 d8 punpckldq %mm0,%mm3 // mm3 = T0 = [A0 B0 C0 D0]: combine the low dwords
52: 0f 6a d0 punpckhdq %mm0,%mm2 // mm2 = T1 = [A1 B1 C1 D1]: combine the high dwords
55: 0f 6f c4 movq %mm4,%mm0 // mm0 = [A2 B2 A3 B3] (copy)
58: 0f 62 e1 punpckldq %mm1,%mm4 // mm4 = T2 = [A2 B2 C2 D2]
5b: 0f 6a c1 punpckhdq %mm1,%mm0 // mm0 = T3 = [A3 B3 C3 D3]
// Second butterfly on the transposed data (the tmp[...] names refer to the C version).
5e: 0f 6f ca movq %mm2,%mm1 // mm1 = T1 (copy)
61: 0f fd d3 paddw %mm3,%mm2 // mm2 = s01 = T0 + T1, i.e. tmp[i*4+0] + tmp[i*4+1]
64: 0f f9 d9 psubw %mm1,%mm3 // mm3 = d01 = T0 - T1, i.e. tmp[i*4+0] - tmp[i*4+1]
67: 0f 6f c8 movq %mm0,%mm1 // mm1 = T3 (copy)
6a: 0f fd c4 paddw %mm4,%mm0 // mm0 = s23 = T2 + T3, i.e. tmp[i*4+2] + tmp[i*4+3]
6d: 0f f9 e1 psubw %mm1,%mm4 // mm4 = d23 = T2 - T3, i.e. tmp[i*4+2] - tmp[i*4+3]
// Rounded halving. pavgw computes the UNSIGNED average (a + b + 1) >> 1 per 16-bit lane.
// NOTE(review): assuming K is 0x8000 per lane, "x ^ K" biases a signed lane into unsigned range,
// "K - x" is the biased form of -x, and the final "^ K" removes the bias again - confirm K.
70: 0f 6f cf movq %mm7,%mm1 // mm1 = K
73: 0f ef d7 pxor %mm7,%mm2 // mm2 = s01 ^ K (a second pxor with K restores the original value)
76: 0f f9 c8 psubw %mm0,%mm1 // mm1 = K - s23
79: 0f ef c7 pxor %mm7,%mm0 // mm0 = s23 ^ K
7c: 0f e3 ca pavgw %mm2,%mm1 // mm1 = (mm1 + mm2 + 1) >> 1, unsigned average
7f: 0f e3 c2 pavgw %mm2,%mm0 // mm0 = (mm0 + mm2 + 1) >> 1, unsigned average
82: 0f ef cf pxor %mm7,%mm1 // mm1 ^= K; under the assumption above this is (s01 - s23 + 1) >> 1
85: 0f ef c7 pxor %mm7,%mm0 // mm0 ^= K; under the assumption above this is (s01 + s23 + 1) >> 1
88: 0f 6f ef movq %mm7,%mm5 // mm5 = K
8b: 0f ef df pxor %mm7,%mm3 // mm3 = d01 ^ K
8e: 0f f9 ec psubw %mm4,%mm5 // mm5 = K - d23
91: 0f ef e7 pxor %mm7,%mm4 // mm4 = d23 ^ K
94: 0f e3 eb pavgw %mm3,%mm5 // mm5 = (mm5 + mm3 + 1) >> 1, unsigned average; matches the rounding in the C version
97: 0f e3 e3 pavgw %mm3,%mm4 // mm4 = (mm4 + mm3 + 1) >> 1, unsigned average
9a: 0f ef ef pxor %mm7,%mm5 // mm5 ^= K; under the assumption above this is (d01 - d23 + 1) >> 1
9d: 0f ef e7 pxor %mm7,%mm4 // mm4 ^= K; under the assumption above this is (d01 + d23 + 1) >> 1
a0: 0f 7f 07 movq %mm0,(%rdi) // store mm0 to bytes 0..7
a3: 0f 7f 4f 08 movq %mm1,0x8(%rdi) // store mm1 to bytes 8..15
a7: 0f 7f 6f 10 movq %mm5,0x10(%rdi) // store mm5 to bytes 16..23
ab: 0f 7f 67 18 movq %mm4,0x18(%rdi) // store mm4 to bytes 24..31
af: c3 retq // return; no emms is issued in this function
因为汇编代码里面有很多宏,为了方便读代码,直接拿编译后的.o 反汇编来看,比较直观。
三 疑惑点
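mm7 寄存器(这里用的是 64 位的 MMX 寄存器 mm0~mm7,不是 xmm 寄存器)的一些运算起初没太看明白。需要说明的是,`movq 0x0(%rip),%mm7` 并不是把 rip 指令寄存器的值存入 mm7,而是用 RIP 相对寻址从内存中加载一个 64 位常量;因为 .o 文件还没有重定位,所以偏移量显示为 0。对照 x264 源码,这个常量应为 pw_8000,即每个 16 位通道都是 0x8000。之所以要和 mm7 做各种异或运算,是因为 pavgw 计算的是无符号平均值 (a + b + 1) >> 1:先与 0x8000 异或,把有符号数加上偏置变成无符号数,求平均之后再异或一次去掉偏置,就得到有符号的 (a + b + 1) >> 1;而用 psubw 计算 0x8000 - b,相当于对 -b 加偏置,从而得到 (a - b + 1) >> 1。最终 mm0、mm1、mm5、mm4 中的结果被写回内存,总体上和C语言代码是对得上的。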