大家都知道,使用Overlay平面显示视频可以充分利用显卡的硬件加速功能,支持自由缩放,同时显著降低CPU的消耗。但现在市场上流行的MP4、H.264板卡解压时输出的格式往往是YV12,这种格式在多数显卡上无法正常显示,一般需要转换为YUYV(FOURCC为YUY2)格式进行显示。
一.直接C++代码进行转换
/*
 * Copies one source frame into the display buffer of channel nID.
 *
 * The branch shown here handles a planar YV12 source when the display
 * format (m_pDisp->GetFmt()->biCompression) is packed YUY2.
 *
 * YV12 layout read by this code:
 *   - nWidth*nHeight bytes of Y,
 *   - then the V plane at offset nWidth*nHeight,
 *   - then the U plane at offset nWidth*nHeight*5/4,
 *   with one chroma row of nWidth/2 bytes per two luma rows.
 * YUY2 output: 4 bytes (Y0 U Y1 V) for every pair of horizontal pixels.
 *
 * nWidth is processed in pixel pairs (nWidth>>1 per row), so an odd
 * trailing column is not converted. nLen is not consulted by this branch.
 */
void CDispOverlay::BufferFrame(
    UINT nID,      // channel index
    PBYTE pBuf,    // source data
    UINT nLen,     // buffer length
    DWORD nFmt,    // source format: may be YUYV, RGB24 or YV12
    UINT nWidth,   // width (default: 352)
    UINT nHeight   // height (default: 288)
)
{
    /*
       Conversion of the other formats
    */
    if(MAKEFOURCC('Y','U','Y','2')==m_pDisp->GetFmt()->biCompression
        && MAKEFOURCC('Y','V','1','2')==nFmt)
    {
        // pBuf points to YV12 data
        UINT x=0, y=0;
        BYTE* pDest = (BYTE*)m_pDispData[nID].GetData();
        BYTE* y_src = (BYTE*)pBuf;
        BYTE* v_src = (BYTE*)(pBuf+(nWidth*nHeight));       // V plane follows Y
        BYTE* u_src = (BYTE*)(pBuf+(nWidth*nHeight*5/4));   // U plane follows V
        UINT width=nWidth>>1;   // pixel pairs per row == chroma samples per row
        for (y = 0; y < nHeight; y++)
        {
            for (x = 0; x < width; x++)
            {
                // One YUY2 macropixel: Y0 U Y1 V
                pDest[0] = y_src[x<<1];
                pDest[1] = u_src[x];
                pDest[2] = y_src[(x<<1)+1];
                pDest[3] = v_src[x];
                pDest += 4;
            }
            y_src += nWidth;
            // Chroma is vertically subsampled by 2: move to the next chroma
            // row only after the second (odd) luma row of each pair.
            if (y & 1)
            {
                u_src += width;
                v_src += width;
            }
        }
    }
}
二.使用MMX指令集
int height=nHeight>>1;
for(int y=0;y
{ //about 10% faster than plain C++
__asm{
mov edi,[pDest]
mov ebx,[nWidth]
shr ebx,3
mov esi,[y_src]
mov ecx,[u_src]
mov edx,[v_src]
xor eax,eax ; x=0
align 8
xxloop1:
movd mm1,[edx+4*eax]
movd mm0,[ecx+4*eax]
punpcklbw mm0, mm1 ; [vuvu|vuvu]
movq mm2,[esi+8*eax]
movq mm3, mm2
punpcklbw mm2, mm0 ; [vyuy|vyuy]
movntq [edi], mm2
punpckhbw mm3, mm0 ; [vyuy|vyuy]
movntq [edi+8], mm3
add edi,16
inc eax
cmp eax,ebx
jb xxloop1
add esi,[width]
xor eax,eax
xxloop2:
movd mm1,[edx+4*eax]
movd mm0,[ecx+4*eax]
punpcklbw mm0, mm1 ; [vuvu|vuvu]
movq mm2,[esi+8*eax]
movq mm3, mm2
punpcklbw mm2, mm0 ; [vyuy|vyuy]
movntq [edi], mm2
punpckhbw mm3, mm0 ; [vyuy|vyuy]
movntq [edi+8], mm3
add edi,16
inc eax
cmp eax, ebx
jb xxloop2
add esi,[width]
mov [pDest],edi
mov [y_src],esi
shl ebx,2
add ecx,ebx
add edx,ebx
mov [u_src],ecx
mov [v_src],edx
}
}
__asm EMMS;
三.使用SSE2指令集
// SSE2 replacement for the YV12 -> YUY2 conversion loops. It uses pDest,
// y_src, u_src, v_src and nWidth from the surrounding code. Each pass of the
// outer loop converts two luma rows that share one chroma row.
//
// Constraints visible from the code:
//   - Every inner iteration converts 32 pixels and advances eax by 2 against
//     a bound of nWidth>>4, so nWidth should be a multiple of 32; otherwise
//     the loop reads and writes past the end of the row.
//   - MOVDQA loads from y_src and MOVNTDQ stores to pDest require 16-byte
//     aligned addresses.
//   - The instructions used on XMM registers (MOVDQA, MOVNTDQ, PUNPCKLBW)
//     are SSE2 instructions.
int height=nHeight>>1;
for(int y=0;y<height;y++)
{//about 20% faster than plain C++
    __asm{
        mov edi,[pDest]
        mov ebx,[nWidth]
        shr ebx,4               ; ebx = 16-pixel groups per row
        mov esi,[y_src]
        mov ecx,[u_src]
        mov edx,[v_src]
        xor eax,eax ; x=0
        align 16
    xloop1:                     ; first luma row of the pair
        movq xmm1,mmword ptr [edx+8*eax]    ; 8 V samples
        movq xmm0,mmword ptr [ecx+8*eax]    ; 8 U samples
        movdqa xmm2,xmmword ptr [esi]       ; 16 Y samples
        inc eax
        movq xmm5,mmword ptr [edx+8*eax]    ; next 8 V samples
        punpcklbw xmm0, xmm1 ; [vuvu|vuvu]
        movq xmm4,mmword ptr [ecx+8*eax]    ; next 8 U samples
        movdqa xmm3, xmm2
        movdqa xmm7,xmmword ptr [esi+16]    ; next 16 Y samples
        punpcklbw xmm4,xmm5
        movdqa xmm6,xmm7
        punpcklbw xmm2, xmm0 ; [vyuy|vyuy]
        movntdq [edi], xmm2
        punpckhbw xmm3, xmm0 ; [vyuy|vyuy]
        punpcklbw xmm6,xmm4
        movntdq xmmword ptr [edi+16], xmm3
        punpckhbw xmm7,xmm4
        movntdq [edi+32],xmm6
        add esi,32
        movntdq [edi+48],xmm7
        add edi,64
        inc eax
        cmp eax,ebx
        jb xloop1
        xor eax,eax             ; rewind chroma index; esi is already on the next row
    xloop2:                     ; second luma row, same chroma row
        movq xmm1,mmword ptr [edx+8*eax]
        movq xmm0,mmword ptr [ecx+8*eax]
        movdqa xmm2,xmmword ptr [esi]
        inc eax
        movq xmm5,mmword ptr [edx+8*eax]
        punpcklbw xmm0, xmm1 ; [vuvu|vuvu]
        movq xmm4,mmword ptr [ecx+8*eax]
        movdqa xmm3, xmm2
        movdqa xmm7,xmmword ptr [esi+16]
        punpcklbw xmm4,xmm5
        movdqa xmm6,xmm7
        punpcklbw xmm2, xmm0 ; [vyuy|vyuy]
        movntdq [edi], xmm2
        punpckhbw xmm3, xmm0 ; [vyuy|vyuy]
        punpcklbw xmm6,xmm4
        movntdq xmmword ptr [edi+16], xmm3
        punpckhbw xmm7,xmm4
        movntdq [edi+32],xmm6
        add esi,32
        movntdq [edi+48],xmm7
        add edi,64
        inc eax
        cmp eax, ebx
        jb xloop2
        mov [pDest],edi
        mov [y_src],esi
        shl ebx,3               ; (nWidth>>4)*8 = bytes in one chroma row
        add ecx,ebx
        add edx,ebx
        mov [u_src],ecx
        mov [v_src],edx
    }
}
// XMM registers do not alias the x87/MMX register file, so EMMS is not
// required after this loop; it is kept from the original code.
__asm EMMS;
四.如何判断CPU是否支持MMX和SSE指令集
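#include <cpuid.h> // NOTE(review): header name was lost in extraction; presumably the cpuid.h of the MSDN CPUID sample, which declares _p_info, _cpuid and _CPU_FEATURE_* - confirm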
// Returns true when at least one bit of dwRequestFeature is set in the
// `feature` field of the _p_info record passed to _cpuid().
// NOTE(review): _p_info and _cpuid are declared outside this excerpt;
// presumably _cpuid() fills in the record - confirm against its header.
static bool _IsFeature(DWORD dwRequestFeature)
{
    _p_info info;
    _cpuid(&info);

    const bool bPresent = (info.feature & dwRequestFeature) != 0;
    return bPresent;
}
// Reports whether _IsFeature() finds the _CPU_FEATURE_MMX bit.
// The query runs on the first call only; the result is kept in a
// function-local static and returned on later calls.
bool IsMMX()
{
    static const bool s_bHasMMX = _IsFeature(_CPU_FEATURE_MMX);
    return s_bHasMMX;
}
// Reports whether _IsFeature() finds the _CPU_FEATURE_SSE2 bit.
// The query runs on the first call only; the result is kept in a
// function-local static and returned on later calls.
bool IsSSE2()
{
    static const bool s_bHasSSE2 = _IsFeature(_CPU_FEATURE_SSE2);
    return s_bHasSSE2;
}
// Reports whether _IsFeature() finds the _CPU_FEATURE_SSE bit.
// The query runs on the first call only; the result is kept in a
// function-local static and returned on later calls.
bool IsSSE()
{
    static const bool s_bHasSSE = _IsFeature(_CPU_FEATURE_SSE);
    return s_bHasSSE;
}
// Reports whether _IsFeature() finds the _CPU_FEATURE_3DNOW bit.
// The query runs on the first call only; the result is kept in a
// function-local static and returned on later calls.
bool Is3DNow()
{
    static const bool s_bHas3DNow = _IsFeature(_CPU_FEATURE_3DNOW);
    return s_bHas3DNow;
}