一 C语言实现
/*
 * In-place 4x4 inverse transform of a block of DC coefficients.
 *
 * d: 16 coefficients addressed as d[row*4 + col]; overwritten with the result.
 *
 * The same add/subtract butterfly is applied twice. The first pass reads
 * d row by row and writes each row's four outputs into one column of tmp;
 * the second pass reads tmp row by row and writes back into d.
 */
static void idct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    /* Pass 1: butterfly on each row of d, stored transposed into tmp. */
    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    /* Pass 2: the same butterfly on each row of tmp, written back to d. */
    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        /* Author's note: these four stores are the only place that differs
         * from the Hadamard-transform routine -- the results are written
         * as-is, without the rounded division by 2. */
        d[i*4+0] = s01 + s23;
        d[i*4+1] = s01 - s23;
        d[i*4+2] = d01 - d23;
        d[i*4+3] = d01 + d23;
    }
}
二 SIMD汇编实现
; IDCT4x4DC: expands to the idct4x4dc function body.
; The function loads four registers from [r0], runs WALSH4_1D, transposes the
; 4x4 block with TRANSPOSE4x4D, runs WALSH4_1D again and stores the four
; registers back to the same addresses (in-place transform).
; NOTE(review): cglobal, mova, TRANSPOSE4x4D and the m0..m4 register names are
; defined outside this excerpt; "1,1" is presumably the argument/register
; count expected by cglobal -- confirm against its definition.
%macro IDCT4x4DC 0
cglobal idct4x4dc, 1,1
    ; Load the four rows into m3..m0. Rows are 16 bytes apart and WALSH4_1D is
    ; invoked with the 'd' (dword) suffix, so each element is 4 bytes here
    ; (author's note: this differs from the Hadamard-transform version).
    mova m3, [r0+48]
    mova m2, [r0+32]
    mova m1, [r0+16]
    mova m0, [r0+ 0]
    ; First 1-D pass, transpose, second 1-D pass; m4 is passed as the scratch register.
    WALSH4_1D d,0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    WALSH4_1D d,0,1,2,3,4
    ; Store the results from the registers back to memory.
    mova [r0+ 0], m0
    mova [r0+16], m1
    mova [r0+32], m2
    mova [r0+48], m3
    RET
%endmacro ; IDCT4x4DC
; WALSH4_1D: one 1-D pass of the 4-point add/subtract butterfly.
; Takes 6 parameters:
;   %1      element-size suffix appended to padd/psub by SUMSUB_BADC
;   %2..%5  numbers of the four data registers
;   %6      number of a scratch register (forwarded as SUMSUB_BADC's 6th argument)
; Two rounds of paired sum/difference are issued, then SWAP renames registers.
; NOTE(review): SWAP is defined outside this excerpt; presumably it permutes
; the register names so the outputs end up in the expected order -- confirm.
%macro WALSH4_1D 6
    SUMSUB_BADC %1, %5, %4, %3, %2, %6
    SUMSUB_BADC %1, %5, %3, %4, %2, %6
    SWAP %2, %5, %4
%endmacro
; SUMSUB_BADC: two independent sum/difference butterflies, on the register
; pairs (m%2, m%3) and (m%4, m%5). %1 is the element-size suffix for padd/psub.
; With an optional 6th parameter (a scratch register number) the work is
; delegated to SUMSUB_BA; with 5 parameters it is done in place.
; In both forms the first register of each pair ends up holding the sum and
; the second holds (second - first) of the original values.
%macro SUMSUB_BADC 5-6
%if %0==6
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%else
    padd%1 m%2, m%3    ; m%2 = m%2 + m%3
    padd%1 m%4, m%5    ; m%4 = m%4 + m%5
    padd%1 m%3, m%3    ; m%3 = 2 * m%3
    padd%1 m%5, m%5    ; m%5 = 2 * m%5
    psub%1 m%3, m%2    ; m%3 = 2*m%3 - sum = original m%3 - original m%2
    psub%1 m%5, m%4    ; m%5 = 2*m%5 - sum = original m%5 - original m%4
%endif
%endmacro
; SUMSUB_BA: sum/difference butterfly on the register pair (m%2, m%3).
; %1 is the element-size suffix for padd/psub; the optional %4 is a scratch
; register number. After the macro, the register named %2 holds
; (original m%2 + original m%3) and m%3 holds (original m%3 - original m%2).
%macro SUMSUB_BA 3-4
%if %0==3              ; called with 3 parameters: no scratch register
    padd%1 m%2, m%3    ; m%2 = a + b
    padd%1 m%3, m%3    ; m%3 = 2b
    psub%1 m%3, m%2    ; m%3 = 2b - (a + b) = b - a
%elif avx_enabled
    padd%1 m%4, m%2, m%3   ; three-operand form: m%4 = m%2 + m%3, m%2 left intact
    psub%1 m%3, m%2        ; m%3 = m%3 - m%2
    SWAP %2, %4            ; exchange the names %2 and %4 so that %2 refers to the sum
%else
    mova m%4, m%2      ; m%4 = copy of the original m%2
    padd%1 m%2, m%3    ; m%2 = m%2 + m%3
    psub%1 m%3, m%4    ; m%3 = m%3 - original m%2
%endif
%endmacro
三 反汇编代码解读
// Annotated objdump listing (AT&T syntax: "op src,dst", so psubw computes dst = dst - src).
// This build works on MMX registers with 16-bit word arithmetic (paddw/psubw);
// the four rows are loaded 8 bytes apart from the pointer in %rdi.
00000000000000b0 <s264_8_idct4x4dc_mmx>:
b0: 0f 6f 5f 18 movq 0x18(%rdi),%mm3
b4: 0f 6f 57 10 movq 0x10(%rdi),%mm2
b8: 0f 6f 4f 08 movq 0x8(%rdi),%mm1
bc: 0f 6f 07 movq (%rdi),%mm0
// The four movq above load the data into mm3..mm0.
bf: 0f 6f e3 movq %mm3,%mm4 // mm4 = mm3 (saved copy)
c2: 0f fd da paddw %mm2,%mm3 // mm3 = mm3 + mm2
c5: 0f f9 d4 psubw %mm4,%mm2 // mm2 = mm2 - mm4 (mm2 minus the saved mm3)
c8: 0f 6f e1 movq %mm1,%mm4 // mm4 = mm1 (saved copy)
cb: 0f fd c8 paddw %mm0,%mm1 // mm1 = mm1 + mm0
ce: 0f f9 c4 psubw %mm4,%mm0 // mm0 = mm0 - mm4 (mm0 minus the saved mm1)
d1: 0f 6f e3 movq %mm3,%mm4 // mm4 = mm3 (saved copy)
d4: 0f fd d9 paddw %mm1,%mm3 // mm3 = mm3 + mm1
d7: 0f f9 cc psubw %mm4,%mm1 // mm1 = mm1 - mm4 (mm1 minus the saved mm3)
da: 0f 6f e2 movq %mm2,%mm4 // mm4 = mm2 (saved copy)
dd: 0f fd d0 paddw %mm0,%mm2
e0: 0f f9 c4 psubw %mm4,%mm0
e3: 0f 6f e3 movq %mm3,%mm4
// Author's note: the code above is the same as in the Hadamard-transform routine;
// it corresponds to multiplying by the Hadamard matrix on the left.
// The punpck* sequence below interleaves words and then dwords across the four
// registers; it appears to be the 4x4 transpose between the two butterfly passes.
e6: 0f 61 d9 punpcklwd %mm1,%mm3
e9: 0f 69 e1 punpckhwd %mm1,%mm4
ec: 0f 6f c8 movq %mm0,%mm1
ef: 0f 61 c2 punpcklwd %mm2,%mm0
f2: 0f 69 ca punpckhwd %mm2,%mm1
f5: 0f 6f d3 movq %mm3,%mm2
f8: 0f 62 d8 punpckldq %mm0,%mm3
fb: 0f 6a d0 punpckhdq %mm0,%mm2
fe: 0f 6f c4 movq %mm4,%mm0
101: 0f 62 e1 punpckldq %mm1,%mm4
104: 0f 6a c1 punpckhdq %mm1,%mm0
// Author's note: the section below corresponds to multiplying by the Hadamard matrix on the right.
107: 0f 6f c8 movq %mm0,%mm1
10a: 0f fd c4 paddw %mm4,%mm0
10d: 0f f9 e1 psubw %mm1,%mm4
110: 0f 6f ca movq %mm2,%mm1
113: 0f fd d3 paddw %mm3,%mm2
116: 0f f9 d9 psubw %mm1,%mm3
119: 0f 6f c8 movq %mm0,%mm1
11c: 0f fd c2 paddw %mm2,%mm0
11f: 0f f9 d1 psubw %mm1,%mm2
122: 0f 6f cc movq %mm4,%mm1
125: 0f fd e3 paddw %mm3,%mm4
128: 0f f9 d9 psubw %mm1,%mm3
// Store the results back to memory.
12b: 0f 7f 07 movq %mm0,(%rdi)
12e: 0f 7f 57 08 movq %mm2,0x8(%rdi)
132: 0f 7f 5f 10 movq %mm3,0x10(%rdi)
136: 0f 7f 67 18 movq %mm4,0x18(%rdi)
13a: c3 retq
13b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)