# yuv 到 32 位 rgb 的快速转换算法：基于 mmx 的实现

//  If you have any problems, contact me.

// The plain asm function is nearly as fast as the MMX one!

///  asm code

// Saturation lookup table, valid for indices -1024 .. 1023:
//   clip[i] == 0 for i < 0, i for 0 <= i <= 255, 255 for i > 255.
// The pointer refers to the middle of a 2048-byte heap buffer so that
// negative indices are legal. It is null until init_clip() has succeeded.
unsigned char *clip;

// Builds the clip table once.
// Calling it again after a successful build is a no-op, so repeated calls
// no longer leak a fresh 2048-byte buffer each time.
// If the allocation fails, clip is left null instead of being written through.
void  init_clip()
{
    enum { CLIP_REACH = 1024 };   // entries on each side of index 0

    if (clip)
        return;                   // table already built

    unsigned char *storage = (unsigned char*)malloc(2 * CLIP_REACH);
    if (!storage)
        return;                   // out of memory: leave clip null

    unsigned char *centre = storage + CLIP_REACH;
    for (int i = -CLIP_REACH; i < CLIP_REACH; i++)
    {
        centre[i] = (i >= 0) ? ((i <= 255) ? i : 255) : 0;
    }

    // Publish only after the table is completely filled in.
    clip = centre;
}

//  r=1.164(y-16)+1.596(v-128)
//  g=1.164(y-16)-0.391(u-128)-0.813(v-128)
//  b=1.164(y-16)+2.018(u-128)

// Colour-matrix coefficients in fixed point with 6 fractional bits
// (real coefficient * 64, rounded).
static const int p_1164 = 75;     // 1.164 * 64 = 74.5
static const int p_1596 = 102;    // 1.596 * 64 = 102.1
static const int p_0391 = 25;     // 0.391 * 64 = 25.0
static const int p_0813 = 52;     // 0.813 * 64 = 52.0
static const int p_2018 = 129;    // 2.018 * 64 = 129.2

// Byte masks for a 32-bit word holding four packed bytes.
static const int ooffooff = 0x00ff00ff;   // keeps bytes 0 and 2
static const int ffooffoo = 0xff00ff00;   // keeps bytes 1 and 3

// Constant offsets of the expanded formulas
//   r = 1.164*y + 1.596*v - 223
//   g = 1.164*y - 0.391*u - 0.813*v + 135.9
//   b = 1.164*y + 2.018*u - 276.93
// scaled by 32. The two negative offsets are stored as 32768 - offset*32.
// Every value appears twice, so a 32-bit load yields it in both 16-bit halves.
static const short p_223[] = { 25632, 25632 };   // 32768 - 223*32
static const short p_135[] = { 4349, 4349 };     // 135.9*32 = 4348.8
static const short p_277[] = { 23906, 23906 };   // 32768 - 276.93*32 = 23906.2

// Converts one fixed-point colour component (6 fractional bits) to a byte,
// saturating to 0..255. Negative inputs are clamped before the shift so the
// result never depends on how the compiler shifts negative values.
static unsigned char paroll_saturate_q6(int scaled)
{
    if (scaled <= 0)
        return 0;
    scaled >>= 6;
    return (unsigned char)(scaled > 255 ? 255 : scaled);
}

// Converts a planar YUV 4:2:0 picture to 32-bit pixels.
//
//   r = 1.164(y-16) + 1.596(v-128)
//   g = 1.164(y-16) - 0.391(u-128) - 0.813(v-128)
//   b = 1.164(y-16) + 2.018(u-128)
//
// y    : luma plane, w bytes per row, h rows
// u, v : chroma planes, one sample per 2x2 luma pixels
//        (assumes tightly packed rows of (w+1)/2 bytes -- TODO confirm
//        against the caller for widths that are not a multiple of 2)
// r    : destination, 4 bytes per pixel and w*4 bytes per row, rows in the
//        same order as the source. Byte 0 receives blue, byte 1 green,
//        byte 2 red; byte 3 of every pixel is left untouched.
// h, w : height and width of the luma plane
//
// Pixels are handled in groups of four columns and two rows, so w should be
// a multiple of 4 and h a multiple of 2; remainder columns/rows are not
// converted. Null pointers and pictures smaller than one 4x2 group are
// ignored instead of looping on wrapped counters.
//
// The arithmetic uses the same 6-bit fixed-point coefficients as the
// file-level p_1164 .. p_2018 constants. It is plain portable C++ and does
// not depend on the clip table or on inline assembly.
void  paroll_yuv2rgb(unsigned char *y,
                     unsigned char *u,
                     unsigned char *v,
                     unsigned char *r,
                     int h ,
                     int w)
{
    // coefficient * 64, see the formulas above
    enum { K_Y = 75, K_RV = 102, K_GU = 25, K_GV = 52, K_BU = 129 };

    if (!y || !u || !v || !r)
        return;
    if (h < 2 || w < 4)
        return;

    const int rows = h & ~1;               // whole row pairs only
    const int cols = w & ~3;               // whole 4-pixel groups only
    const int chromaStride = (w + 1) >> 1;
    const int outStride = w << 2;

    for (int row = 0; row < rows; ++row)
    {
        unsigned char *out = r;

        for (int col = 0; col < cols; ++col)
        {
            const int luma = K_Y * (y[col] - 16);
            const int cb = u[col >> 1] - 128;   // shared by two columns
            const int cr = v[col >> 1] - 128;

            out[0] = paroll_saturate_q6(luma + K_BU * cb);
            out[1] = paroll_saturate_q6(luma - K_GU * cb - K_GV * cr);
            out[2] = paroll_saturate_q6(luma + K_RV * cr);
            out += 4;                            // byte 3 stays as it was
        }

        y += w;
        r += outStride;
        if (row & 1)
        {
            // Both rows of a pair use the same chroma row; move on after
            // the second one.
            u += chromaStride;
            v += chromaStride;
        }
    }
}

// asm code end

// mmx code begin

#ifdef  __yuv2rgb_mul32

// Bias values, replicated into four 16-bit lanes.
static const short t16[4]  = { 16, 16, 16, 16 };        // luma bias
static const short t128[4] = { 128, 128, 128, 128 };    // chroma bias

// Colour-matrix coefficients, approximately coefficient * 4096
// (12 fractional bits), replicated into four 16-bit lanes.
const short t1164[4] = { 4768, 4768, 4768, 4768 };      // 1.164
const short t1596[4] = { 6538, 6538, 6538, 6538 };      // 1.596
const short t0391[4] = { 1602, 1602, 1602, 1602 };      // 0.391
const short t0813[4] = { 3330, 3330, 3330, 3330 };      // 0.813
const short t2018[4] = { 8266, 8266, 8266, 8266 };      // 2.018

//  r=1.164(y-16)+1.596(v-128)
//  g=1.164(y-16)-0.391(u-128)-0.813(v-128)
//  b=1.164(y-16)+2.018(u-128)

// Selects the "0rgb" byte order in the #ifdef ___0rgb sections.
#define ___0rgb

// MMX conversion of planar YUV 4:2:0 to 32-bit pixels, variant that keeps
// 32-bit intermediate products (selected by __yuv2rgb_mul32).
//
// lpY, lpU, lpV : source planes
// lpRGB         : destination, written as 8-byte stores (two pixels each)
// nSrcHeight    : luma height; the row counter is nSrcHeight>>1
// nSrcWidth     : luma width; the column counter is nSrcWidth>>3
//                 (8 luma bytes are read per pass)
//
// Each pass widens 4 U and 4 V bytes to words, subtracts 128, and forms
// full 32-bit products with pmullw/pmulhw recombined by punpck?wd. The
// products are parked in the t....._10 / t....._32 locals. The 8 luma bytes
// are then reordered, reduced by 16 and multiplied the same way.
//
// NOTE(review): the listing looks incomplete and should be checked against
// the original source before use:
//   * the chroma products reloaded into mm0/mm1 are overwritten before being
//     combined with luma (no padd* appears), and nothing scales the
//     products back down before packssdw;
//   * apart from the two `sub` instructions at the end of the loop, nothing
//     moves esi, edi, eax or edx, so every pass reads the same bytes;
//   * `file:` in front of `//` and the lone `/` line are presumably mangled
//     comment markers and will not assemble as written.
void  VideoPlayer::yuv2rgb4XmmxC420(unsigned char *lpY,
unsigned char *lpU,
unsigned char *lpV,
unsigned char *lpRGB,
int nSrcHeight ,
int nSrcWidth)

{
int rgbwidth=nSrcWidth<<2;// 32 bits 0rgb;
int nyw=nSrcWidth;
int col=nSrcWidth>>3;
int row=nSrcHeight>>1;

// 32-bit chroma products for the current pass; "_10" holds chroma samples
// 1 and 0, "_32" holds samples 3 and 2.
int  t1596v_128_10[2];
int  t1596v_128_32[2];
int  t0813v_128_10[2];
int  t0813v_128_32[2];
int  t0391u_128_10[2];
int  t0391u_128_32[2];
int  t2018u_128_10[2];
int  t2018u_128_32[2];

__asm
{
// esi = U plane, edi = V plane, eax = Y plane, edx = destination,
// ecx = column counter, ebx = row counter
mov        esi,lpU
mov        edi,lpV
mov        eax,lpY
mov        edx,lpRGB
mov        ecx,col
mov        ebx,row

rrr: pxor       mm0,mm0

// widen 4 V and 4 U bytes to words and remove the 128 bias
movq       mm3,qword ptr t128
movd       mm2,dword ptr [edi]          file://00 00 00 00 v3 v2 v1 v0
movd       mm1,dword ptr [esi]          file://00 00 00 00 u3 u2 u1 u0
punpcklbw  mm2,mm0                      file://00 v3 00 v2 00 v1 00 v0
punpcklbw  mm1,mm0                      file://00 u3 00 u2 00 u1 00 u0
psubsw     mm1,mm3                     file://u-128
psubsw     mm2,mm3                     file://v-128
file://compute u,v data
file://t0391u_128
// pmullw gives the low and pmulhw the high 16 bits of each product;
// interleaving them yields four 32-bit products per coefficient
movq       mm7,qword ptr t0391
movq       mm3,mm1
movq       mm4,mm1
pmullw     mm4,mm7
pmulhw     mm3,mm7
movq       mm7,mm4
punpckhwd  mm4,mm3                     file://t0391u_128_32-->mm4
punpcklwd  mm7,mm3                     file://t0391u_128_10-->mm7
movq       qword ptr t0391u_128_32,mm4
movq       qword ptr t0391u_128_10,mm7
file://t2018u_128
movq       mm7,qword ptr t2018
movq       mm3,mm1
pmullw     mm1,mm7
pmulhw     mm3,mm7
movq       mm7,mm1
punpckhwd  mm1,mm3                     file://t2018u_128_32-->mm1
punpcklwd  mm7,mm3                     file://t2018u_128_10-->mm7
movq       qword ptr t2018u_128_32,mm1
movq       qword ptr t2018u_128_10,mm7
file://t1596v_128
movq       mm7,qword ptr t1596
movq       mm3,mm2
movq       mm4,mm2
pmullw     mm4,mm7
pmulhw     mm3,mm7
movq       mm7,mm4
punpckhwd  mm4,mm3                     file://t1596v_128_32-->mm4
punpcklwd  mm7,mm3                     file://t1596v_128_10-->mm7
movq       qword ptr t1596v_128_32,mm4
movq       qword ptr t1596v_128_10,mm7
file://t0813v_128
movq       mm7,qword ptr t0813
movq       mm3,mm2
pmullw     mm2,mm7
pmulhw     mm3,mm7
movq       mm7,mm2
punpckhwd  mm2,mm3                     file://t0813v_128_32-->mm2
punpcklwd  mm7,mm3                     file://t0813v_128_10-->mm7
movq       qword ptr t0813v_128_32,mm2
movq       qword ptr t0813v_128_10,mm7

// first luma row: load 8 bytes, widen to words and reorder to
// (y3 y1 y2 y0) in mm0 and (y7 y5 y6 y4) in mm5
// NOTE(review): movq transfers 64 bits although the operand is tagged
// `dword ptr` -- confirm the assembler accepts this spelling
movq       mm3,dword ptr [eax]          // 76 54 32 10
pxor       mm0,mm0
movq       mm2,mm3
punpcklbw  mm2,mm0                      // 03 02 01 00
punpckhbw  mm3,mm0                      // 07 06 05 04
movq       mm4,mm2
movq       mm5,mm3
punpcklwd  mm2,mm0                      // 00 01 00 00
punpckhwd  mm0,mm4                      // 03 00 02 00
pxor       mm4,mm4
por        mm0,mm2                      // 03 01 02 00--->mm0
movq       mm7,qword ptr t16
punpcklwd  mm3,mm4                      // 00 05 00 04
punpckhwd  mm4,mm5                      // 07 00 06 00
por        mm4,mm3                      // 07 05 06 04-->mm5
psubsw     mm0,mm7                    file://y-16
movq       mm5,mm4
psubsw     mm5,mm7                    file://y-16
file://compute
movq       mm7,qword ptr t1164
movq       mm6,mm0                    file://y3 y1 y2 y0
pmullw     mm6,mm7
pmulhw     mm0,mm7
movq       mm7,mm6
punpckhwd  mm7,mm0  // y3 y1     file://1.164(y-16)-->mm7
punpcklwd  mm6,mm0  // y2 y0     file://1.164(y-16)-->mm6

// NOTE(review): mm0 is loaded with the 1.596(v-128) products and then
// overwritten by `movq mm0,mm1` before being used -- presumably the
// instructions that add it to mm1/mm2 are missing
movq       mm0,qword ptr t1596v_128_10
movq       mm1,mm6  // y2 y0
movq       mm2,mm7  // y3 y1
// r=1.164(y-16)+1.596(v-128)
movq       mm0,mm1
punpckhdq  mm1,mm2  // r3 r2
punpckldq  mm0,mm2  // r1 r0
packssdw   mm0,mm1  // r3 r2 r1 r0 --->mm0

movq       mm1,qword ptr t0391u_128_10
movq       mm4,qword ptr t0813v_128_10
// g=1.164(y-16)-0.391(u-128)-0.813(v-128)
movq       mm2,mm6
movq       mm3,mm7
psubd      mm2,mm1
psubd      mm3,mm1
psubd      mm2,mm4
psubd      mm3,mm4
movq       mm4,mm2
// NOTE(review): this load of the 2.018(u-128) products is overwritten by
// `movq mm1,mm6` below without being used
movq       mm1,qword ptr t2018u_128_10
punpckhdq  mm2,mm3
punpckldq  mm4,mm3
packssdw   mm4,mm2  // g3 g2 g1 g0 --->mm4

//  b=1.164(y-16)+2.018(u-128)
movq       mm1,mm6
punpckhdq  mm1,mm7
punpckldq  mm6,mm7
pxor        mm2,mm2
packssdw   mm6,mm1  // b3 b2 b1 b0 --->mm6

// b-->mm6,g-->mm4,r-->mm0

// saturate to bytes and interleave into 4-byte pixels (pixels 0..3)
#ifdef ___0rgb
packuswb    mm6,mm2
packuswb    mm4,mm2
packuswb    mm0,mm2
punpcklbw   mm6,mm4     // g3 b3 g2 b2 g1 b1 g0 b0  -->mm6
punpcklbw   mm0,mm2     // 00 r3 00 r2 00 r1 00 r0  -->mm0
movq        mm7,mm6
punpcklwd   mm6,mm0     // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd   mm7,mm0     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx], mm6
movq        qword ptr[edx+8], mm7
#else
packuswb    mm0,mm2   file://r
packuswb    mm4,mm2   file://g
packuswb    mm6,mm2   file://b

punpcklbw   mm0,mm4     // g3 r3 g2 r2 g1 r1 g0 r0  -->mm0
punpcklbw   mm6,mm2     // 00 b3 00 b2 00 b1 00 b0  -->mm6
movq        mm7,mm0

punpcklwd   mm0,mm6     // 00 b1 g1 r1 00 r0 g0 b0
punpckhwd   mm7,mm6     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx], mm0
movq        qword ptr[edx+8], mm7
#endif
// same sequence for luma 4..7 (kept in mm5) with the "_32" chroma products
file://compute
movq       mm7,qword ptr t1164
movq       mm6,mm5                    file://y7 y5 y6 y4
pmullw     mm6,mm7
pmulhw     mm5,mm7
movq       mm7,mm6
punpckhwd  mm7,mm5  // y7 y5     file://1.164(y-16)-->mm7
punpcklwd  mm6,mm5  // y6 y4     file://1.164(y-16)-->mm6

movq       mm0,qword ptr t1596v_128_32
movq       mm1,mm6  // y6 y4
movq       mm2,mm7  // y7 y5
// r=1.164(y-16)+1.596(v-128)
movq       mm0,mm1
punpckhdq  mm1,mm2  // r3 r2
punpckldq  mm0,mm2  // r1 r0
packssdw   mm0,mm1  // r3 r2 r1 r0 --->mm0

movq       mm1,qword ptr t0391u_128_32
movq       mm4,qword ptr t0813v_128_32
file://g=1.164(y-16)-0.391(u-128)-0.813(v-128)
movq       mm2,mm6
movq       mm3,mm7
psubd      mm2,mm1
psubd      mm3,mm1
psubd      mm2,mm4
psubd      mm3,mm4
movq       mm1,qword ptr t2018u_128_32
movq       mm4,mm2
punpckhdq  mm2,mm3
punpckldq  mm4,mm3
packssdw   mm4,mm2  // g3 g2 g1 g0 --->mm4

//  b=1.164(y-16)+2.018(u-128)
movq       mm1,mm6
punpckhdq  mm1,mm7
punpckldq  mm6,mm7
pxor        mm2,mm2
packssdw   mm6,mm1  // b3 b2 b1 b0 --->mm6

// b-->mm6,g-->mm4,r-->mm0
#ifdef ___0rgb
packuswb    mm6,mm2
packuswb    mm4,mm2
punpcklbw   mm6,mm4     // g3 b3 g2 b2 g1 b1 g0 b0  -->mm6
packuswb    mm0,mm2
punpcklbw   mm0,mm2     // 00 r3 00 r2 00 r1 00 r0  -->mm0
movq        mm7,mm6
punpcklwd   mm6,mm0     // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd   mm7,mm0     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx+16], mm6
movq        qword ptr[edx+24], mm7
#else
packuswb    mm0,mm2   file://r
packuswb    mm4,mm2   file://g
packuswb    mm6,mm2   file://b

punpcklbw   mm0,mm4     // g3 r3 g2 r2 g1 r1 g0 r0  -->mm0
punpcklbw   mm6,mm2     // 00 b3 00 b2 00 b1 00 b0  -->mm6
movq        mm7,mm0

punpcklwd   mm0,mm6     // 00 b1 g1 r1 00 r0 g0 b0
punpckhwd   mm7,mm6     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx+16], mm0
movq        qword ptr[edx+24], mm7
#endif
/
file://second stage , next row of y

// NOTE(review): eax and edx have not been moved since the first stage, so
// as written this re-reads and re-writes the same row -- presumably the
// instructions stepping to the next row are missing
movq       mm3,dword ptr [eax]          // 76 54 32 10
pxor       mm0,mm0
movq       mm2,mm3
punpcklbw  mm2,mm0                      // 03 02 01 00
punpckhbw  mm3,mm0                      // 07 06 05 04
movq       mm4,mm2
punpcklwd  mm2,mm0                      // 00 01 00 00
punpckhwd  mm0,mm4                      // 03 00 02 00
pxor       mm4,mm4
por        mm0,mm2                      // 03 01 02 00--->mm0
movq       mm7,qword ptr t16
movq       mm5,mm3
punpcklwd  mm3,mm4                      // 00 05 00 04
punpckhwd  mm4,mm5                      // 07 00 06 00
por        mm4,mm3                      // 07 05 06 04-->mm4
psubsw     mm0,mm7                      file://y-16
movq       mm5,mm4
psubsw     mm5,mm7                      file://y-16

file://compute
movq       mm7,qword ptr t1164
movq       mm6,mm0                    file://y3 y1 y2 y0
pmullw     mm6,mm7
pmulhw     mm0,mm7
movq       mm7,mm6
punpckhwd  mm7,mm0  // y3 y1     file://1.164(y-16)-->mm7
punpcklwd  mm6,mm0  // y2 y0     file://1.164(y-16)-->mm6

movq       mm0,qword ptr t1596v_128_10
movq       mm1,mm6  // y2 y0
movq       mm2,mm7  // y3 y1
// r=1.164(y-16)+1.596(v-128)
movq       mm0,mm1
punpckhdq  mm1,mm2  // r3 r2
punpckldq  mm0,mm2  // r1 r0
packssdw   mm0,mm1  // r3 r2 r1 r0 --->mm0

movq       mm1,qword ptr t0391u_128_10
movq       mm4,qword ptr t0813v_128_10
file://g=1.164(y-16)-0.391(u-128)-0.813(v-128)
movq       mm2,mm6
movq       mm3,mm7
psubd      mm2,mm1
psubd      mm3,mm1
psubd      mm2,mm4
psubd      mm3,mm4
movq       mm4,mm2
movq       mm1,qword ptr t2018u_128_10
punpckhdq  mm2,mm3
punpckldq  mm4,mm3
packssdw   mm4,mm2  // g3 g2 g1 g0 --->mm4

//  b=1.164(y-16)+2.018(u-128)
movq       mm1,mm6
punpckhdq  mm1,mm7
punpckldq  mm6,mm7
pxor        mm2,mm2
packssdw   mm6,mm1  // b3 b2 b1 b0 --->mm6

// b-->mm6,g-->mm4,r-->mm0
#ifdef ___0rgb
packuswb    mm6,mm2
packuswb    mm4,mm2
punpcklbw   mm6,mm4     // g3 b3 g2 b2 g1 b1 g0 b0  -->mm6
packuswb    mm0,mm2
punpcklbw   mm0,mm2     // 00 r3 00 r2 00 r1 00 r0  -->mm0
movq        mm7,mm6
punpcklwd   mm6,mm0     // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd   mm7,mm0     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx], mm6
movq        qword ptr[edx+8], mm7
#else
packuswb    mm0,mm2   file://r
packuswb    mm4,mm2   file://g
packuswb    mm6,mm2   file://b

punpcklbw   mm0,mm4     // g3 r3 g2 r2 g1 r1 g0 r0  -->mm0
punpcklbw   mm6,mm2     // 00 b3 00 b2 00 b1 00 b0  -->mm6
movq        mm7,mm0

punpcklwd   mm0,mm6     // 00 b1 g1 r1 00 r0 g0 b0
punpckhwd   mm7,mm6     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx], mm0
movq        qword ptr[edx+8], mm7
#endif
file://compute
movq       mm7,qword ptr t1164
movq       mm6,mm5                    file://y7 y5 y6 y4
pmullw     mm6,mm7
pmulhw     mm5,mm7
movq       mm7,mm6
punpckhwd  mm7,mm5  // y7 y5     file://1.164(y-16)-->mm7
punpcklwd  mm6,mm5  // y6 y4     file://1.164(y-16)-->mm6

movq       mm0,qword ptr t1596v_128_32
movq       mm1,mm6  // y6 y4
movq       mm2,mm7  // y7 y5
// r=1.164(y-16)+1.596(v-128)
movq       mm0,mm1
punpckhdq  mm1,mm2  // r3 r2
punpckldq  mm0,mm2  // r1 r0
packssdw   mm0,mm1  // r3 r2 r1 r0 --->mm0

movq       mm1,qword ptr t0391u_128_32
movq       mm4,qword ptr t0813v_128_32
file://g=1.164(y-16)-0.391(u-128)-0.813(v-128)
movq       mm2,mm6
movq       mm3,mm7
psubd      mm2,mm1
psubd      mm3,mm1
psubd      mm2,mm4
psubd      mm3,mm4
movq       mm1,qword ptr t2018u_128_32
movq       mm4,mm2
punpckhdq  mm2,mm3
punpckldq  mm4,mm3
packssdw   mm4,mm2  // g3 g2 g1 g0 --->mm4

//  b=1.164(y-16)+2.018(u-128)
movq       mm1,mm6
punpckhdq  mm1,mm7
punpckldq  mm6,mm7
pxor        mm2,mm2
packssdw   mm6,mm1  // b3 b2 b1 b0 --->mm6

// b-->mm6,g-->mm4,r-->mm0
#ifdef ___0rgb
packuswb    mm6,mm2
packuswb    mm4,mm2
punpcklbw   mm6,mm4     // g3 b3 g2 b2 g1 b1 g0 b0  -->mm6
packuswb    mm0,mm2
punpcklbw   mm0,mm2     // 00 r3 00 r2 00 r1 00 r0  -->mm0
movq        mm7,mm6
punpcklwd   mm6,mm0     // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd   mm7,mm0     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx+16], mm6
movq        qword ptr[edx+24], mm7
#else
packuswb    mm0,mm2   file://r
packuswb    mm4,mm2   file://g
packuswb    mm6,mm2   file://b

punpcklbw   mm0,mm4     // g3 r3 g2 r2 g1 r1 g0 r0  -->mm0
punpcklbw   mm6,mm2     // 00 b3 00 b2 00 b1 00 b0  -->mm6
movq        mm7,mm0

punpcklwd   mm0,mm6     // 00 b1 g1 r1 00 r0 g0 b0
punpckhwd   mm7,mm6     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx+16], mm0
movq        qword ptr[edx+24], mm7
#endif
// step back one luma row / one output row, then count down the columns
// NOTE(review): these are the only pointer adjustments in the loop; the
// matching forward steps are not present in this listing
sub        eax,nyw
sub        edx,rgbwidth
dec        ecx
jnz        rrr

// row pair finished: reload the column counter and count down the rows
mov        ecx,col
dec        ebx
jnz        rrr
// leave MMX state so that x87 floating point can be used again
emms
}
}

#else

// Colour-matrix coefficients for the 16-bit pmulhw path, replicated into
// four lanes. pmulhw keeps the high 16 bits of the product (a division by
// 65536), so each coefficient is stored as coefficient * 65536 / 2^shift,
// where "shift" is the left shift noted next to the table.
short t1164[4] = { 19071, 19071, 19071, 19071 };   // 1.164 * 16384, <<2
short t1596[4] = { 26149, 26149, 26149, 26149 };   // 1.596 * 16384, <<2
short t0391[4] = { 25625, 25625, 25625, 25625 };   // 0.391 * 65536, <<0
short t0813[4] = { 26641, 26641, 26641, 26641 };   // 0.813 * 32768, <<1
short t2018[4] = { 16532, 16532, 16532, 16532 };   // 2.018 * 8192,  <<3

// Bias values, replicated into four lanes.
short t16[4]  = { 16, 16, 16, 16 };                // luma bias
short t128[4] = { 128, 128, 128, 128 };            // chroma bias

// MMX conversion of planar YUV 4:2:0 to 32-bit pixels, 16-bit variant
// (used when __yuv2rgb_mul32 is not defined).
//
// lpY, lpU, lpV : source planes
// lpRGB         : destination, written as 8-byte stores (two pixels each)
// nSrcHeight    : luma height; the row counter is nSrcHeight>>1
// nSrcWidth     : luma width; the column counter is nSrcWidth>>3
//                 (8 luma bytes are read per pass)
//
// Every term stays in 16-bit lanes: the biased sample is shifted left with
// psllw and multiplied with pmulhw, which keeps the high 16 bits of the
// product. The four chroma terms live in mm1..mm4 for the whole pass (see
// the mmt* aliases). Luma is split into even pixels (mm0) and odd pixels
// (spilled to ty); both halves are converted separately and then
// interleaved in the destination.
//
// NOTE(review): the listing looks incomplete and should be checked against
// the original source before use:
//   * red is packed straight from the 1.596(v-128) term and blue straight
//     from the luma term; no padd* combines luma and chroma;
//   * apart from the two `sub` instructions at the end of the loop, nothing
//     moves esi, edi, eax or edx, so every pass reads the same bytes;
//   * `file:` in front of `//` is presumably a mangled comment marker and
//     will not assemble as written.
void  VideoPlayer::yuv2rgb4XmmxC420(unsigned char *lpY,
unsigned char *lpU,
unsigned char *lpV,
unsigned char *lpRGB,
int           nSrcHeight,
int           nSrcWidth)
{
int rgbwidth=nSrcWidth<<2;// 32 bits rgb0;
int nyw=nSrcWidth;
int col=nSrcWidth>>3;
int row=nSrcHeight>>1;

// registers that hold the four chroma terms for the whole pass
#define  mmt2018u  mm1
#define  mmt0813v  mm2
#define  mmt0391u  mm3
#define  mmt1596v  mm4

// spill slot for the odd-pixel luma terms
__int64  ty;

__asm
{
// esi = U plane, edi = V plane, eax = Y plane, edx = destination,
// ecx = column counter, ebx = row counter
mov        esi,lpU
mov        edi,lpV
mov        eax,lpY
mov        edx,lpRGB
mov        ecx,col
mov        ebx,row

rrr:
pxor       mm0,mm0

movq       mm3,qword ptr t128
movq       mm4,qword ptr t0391
movq       mm5,qword ptr t2018
movq       mm6,qword ptr t1596
movq       mm7,qword ptr t0813

// widen 4 U and 4 V bytes to words
movd       mm1,dword ptr [esi]
movd       mm2,dword ptr [edi]
punpcklbw  mm1,mm0
punpcklbw  mm2,mm0

file://copute u,v
psubsw     mm1,mm3           file://u-128
psubsw     mm2,mm3           file://v-128
// the shift counts below pair with the "<<n" notes on the coefficient tables
movq       mm3,mm1
psllw      mm1,3
pmulhw     mm3,mm4         // t0391u-->mm3
pmulhw     mm1,mm5         // t2018u-->mm1
movq       mm4,mm2
psllw      mm2,1
psllw      mm4,2
pmulhw     mm2,mm7          // t0813v-->mm2
pmulhw     mm4,mm6          // t1596v-->mm4

// first luma row: load 8 bytes and separate even and odd pixels
// NOTE(review): movq transfers 64 bits although the operand is tagged
// `dword ptr` -- confirm the assembler accepts this spelling
movq       mm5,dword ptr [eax]          // 76 54 32 10
pxor       mm0,mm0
movq       mm6,mm5
punpcklbw  mm5,mm0                      // 03 02 01 00
punpckhbw  mm0,mm6                      // 70 60 50 40
por        mm0,mm5                      // 73 62 51 40
pxor       mm6,mm6
pxor       mm5,mm5
punpckhbw  mm6,mm0                    // 70 30 60 20
punpcklbw  mm0,mm5                    // 05 01 04 00
por        mm0,mm6                    // 75 31 64 20
pxor       mm5,mm5
movq       mm6,mm0
punpckhbw  mm6,mm5                    // y7 y5 y3 y1
punpcklbw  mm0,mm5
movq       mm5,qword ptr t16
movq       mm7,qword ptr t1164
psubsw     mm6,mm5
psubsw     mm0,mm5
psllw      mm6,2
psllw      mm0,2
pmulhw     mm6,mm7
pmulhw     mm0,mm7                     // y6 y4 y2 y0 -->mm0
movq       qword ptr ty,mm6            // y7 y5 y3 y1 -->ty

// even pixels of the first row
file://compute
pxor       mm7,mm7
// NOTE(review): mm5 (red) receives only the chroma term and mm0 (blue) only
// the luma term -- presumably the adds are missing here
movq       mm5,mmt1596v
movq       mm6,mm0                     file://copy 1.164(y-16)
psubsw     mm6,mmt0391u                  file://1.164(y-16)-0.391(u-128)
psubsw     mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128)     g-->mm6

// b-->mm0,g-->mm6,r-->mm5
packuswb    mm6,mm7
packuswb    mm0,mm7
punpcklbw   mm0,mm6     // g3 b3 g2 b2 g1 b1 g0 b0  -->mm0
packuswb    mm5,mm7
punpcklbw   mm5,mm7     // 00 r3 00 r2 00 r1 00 r0  -->mm5
movq        mm7,mm0
punpcklwd   mm0,mm5     // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd   mm7,mm5     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx], mm0
movq  mm0,qword ptr ty
movq        qword ptr[edx+8], mm7

// odd pixels of the first row (luma terms reloaded from ty above)
file://compute
pxor       mm7,mm7
movq       mm5,mmt1596v
movq       mm6,mm0                     file://copy 1.164(y-16)
psubsw     mm6,mmt0391u                  file://1.164(y-16)-0.391(u-128)
psubsw     mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128)     g-->mm6

// b-->mm0,g-->mm6,r-->mm5
packuswb    mm6,mm7
packuswb    mm0,mm7
punpcklbw   mm0,mm6     // g3 b3 g2 b2 g1 b1 g0 b0  -->mm0
packuswb    mm5,mm7
punpcklbw   mm5,mm7     // 00 r3 00 r2 00 r1 00 r0  -->mm5
movq        mm7,mm0
// read back the even pixels stored above and interleave them with the odd
// pixels, then store all 8 pixels in order
movq        mm6,[edx]   // 2 0

punpcklwd   mm0,mm5     // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd   mm7,mm5     // 00 r3 g3 b3 00 r2 g2 b2

movq        mm5,mm6
punpckldq   mm6,mm0     // 1 0
punpckhdq   mm5,mm0     // 3 2
movq        mm0,[edx+8] // 4 6
movq        [edx],mm6
movq        [edx+8], mm5
movq        mm6,mm0
punpckhdq   mm0,mm7     // 7 6
punpckldq   mm6,mm7     // 5 4
movq        [edx+24], mm0
movq        [edx+16],mm6

file://next row of y

// NOTE(review): eax and edx have not been moved since the first row, so as
// written this re-reads and re-writes the same row -- presumably the
// instructions stepping to the next row are missing
movq       mm5,dword ptr [eax]          // 76 54 32 10
pxor       mm0,mm0
movq       mm6,mm5
punpcklbw  mm5,mm0                      // 03 02 01 00
punpckhbw  mm0,mm6                      // 70 60 50 40
por        mm0,mm5                      // 73 62 51 40
pxor       mm6,mm6
pxor       mm5,mm5
punpckhbw  mm6,mm0                    // 70 30 60 20
punpcklbw  mm0,mm5                    // 05 01 04 00
por        mm0,mm6                    // 75 31 64 20
pxor       mm5,mm5
movq       mm6,mm0
punpckhbw  mm6,mm5                    // y7 y5 y3 y1
punpcklbw  mm0,mm5
movq       mm5,qword ptr t16
movq       mm7,qword ptr t1164
psubsw     mm6,mm5
psubsw     mm0,mm5
psllw      mm6,2
psllw      mm0,2
pmulhw     mm6,mm7
pmulhw     mm0,mm7                      // y6 y4 y2 y0 -->mm0
movq       qword ptr ty,mm6           // y7 y5 y3 y1 -->ty

// even pixels of the second row
file://compute
pxor       mm7,mm7
movq       mm5,mmt1596v
movq       mm6,mm0                     file://copy 1.164(y-16)
psubsw     mm6,mmt0391u                  file://1.164(y-16)-0.391(u-128)
psubsw     mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128)     g-->mm6

// b-->mm0,g-->mm6,r-->mm5
packuswb    mm6,mm7
packuswb    mm0,mm7
punpcklbw   mm0,mm6     // g3 b3 g2 b2 g1 b1 g0 b0  -->mm0
packuswb    mm5,mm7
punpcklbw   mm5,mm7     // 00 r3 00 r2 00 r1 00 r0  -->mm5
movq        mm7,mm0
punpcklwd   mm0,mm5     // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd   mm7,mm5     // 00 r3 g3 b3 00 r2 g2 b2
movq        qword ptr[edx], mm0
movq        qword ptr[edx+8], mm7

// odd pixels of the second row
file://compute
movq       mm0,qword ptr ty
pxor       mm7,mm7
movq       mm5,mmt1596v
movq       mm6,mm0                     file://copy 1.164(y-16)
psubsw     mm6,mmt0391u                  file://1.164(y-16)-0.391(u-128)
psubsw     mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128)     g-->mm6

// b-->mm0,g-->mm6,r-->mm5
packuswb    mm6,mm7
packuswb    mm0,mm7
punpcklbw   mm0,mm6     // g3 b3 g2 b2 g1 b1 g0 b0  -->mm0
packuswb    mm5,mm7
punpcklbw   mm5,mm7     // 00 r3 00 r2 00 r1 00 r0  -->mm5
movq        mm7,mm0

movq        mm6,[edx]   // 2 0

punpcklwd   mm0,mm5     // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd   mm7,mm5     // 00 r3 g3 b3 00 r2 g2 b2

movq        mm5,mm6
punpckldq   mm6,mm0     // 1 0
punpckhdq   mm5,mm0     // 3 2
movq        mm0,[edx+8] // 4 6
movq        [edx],mm6
movq        [edx+8], mm5
movq        mm6,mm0
punpckhdq   mm0,mm7     // 7 6
punpckldq   mm6,mm7     // 5 4
movq        [edx+24], mm0
movq        [edx+16],mm6

// step back one luma row / one output row, then count down the columns
// NOTE(review): these are the only pointer adjustments in the loop; the
// matching forward steps are not present in this listing
sub        eax,nyw
sub        edx,rgbwidth
dec        ecx
jnz        rrr

// row pair finished: reload the column counter and count down the rows
mov        ecx,col
dec        ebx
jnz        rrr
// leave MMX state so that x87 floating point can be used again
emms
}
}
#endif

• 0
点赞
• 0
收藏
觉得还不错? 一键收藏
• 打赏
• 2
评论

baojinlong

¥1 ¥2 ¥4 ¥6 ¥10 ¥20

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。