# 可能是最快的 alpha blend 算法汇编源代码

 Intel官方网站有一个ablend_565的快速汇编算法，理论上是是把一块32bit RGBA渲染到16bit的buffer上，我的机器是PIII800,函数在system menory中进行，640*480的256级alpha blending，达到100fps，我想可以满足绝大部分的要求了，在这里，我提供了这个算法的应用，希望可以对大家有所帮助。ablend_565函数，源代码可以直接编译使用，无需其他库函数，感谢intel提供这么好的东西。 首先，我提供一些本人编写的把32bit tga文件读入pRGBABuffer的函数文件尺寸保存在 width,height//-----------------------------------------------------------------------// Name: LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height )// Desc: 读取32bit tga文件到DWORD缓冲里，返回其尺寸// Time: 2002.06.22 00:36// Author: RealRender// Para:// Return:// Note: 这段代码来自directx 7.0 sample中的d3dtextr.cpp，我把他提取了出来// 方便使用//-----------------------------------------------------------------------BOOL LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height ){FILE* file = fopen( strPathname, "rb" );if( NULL == file )return false;struct TargaHeader{BYTE IDLength;BYTE ColormapType;BYTE ImageType;BYTE ColormapSpecification[5];WORD XOrigin;WORD YOrigin;WORD ImageWidth;WORD ImageHeight;BYTE PixelDepth;BYTE ImageDescriptor;} tga;fread( &tga, sizeof(TargaHeader), 1, file );// Only true color, non-mapped images are supportedif( ( 0 != tga.ColormapType ) ||( tga.ImageType != 10 && tga.ImageType != 2 ) ){fclose( file );return false;}// Skip the ID field. 
The first byte of the header is the length of this fieldif( tga.IDLength )fseek( file, tga.IDLength, SEEK_CUR );DWORD m_dwWidth = tga.ImageWidth;DWORD m_dwHeight = tga.ImageHeight;DWORD m_dwBPP = tga.PixelDepth;DWORD *m_pRGBAData = new DWORD[m_dwWidth*m_dwHeight];if( m_pRGBAData == NULL ){fclose(file);return false;}for( DWORD y=0; y{DWORD dwOffset = y*m_dwWidth;if( 0 == ( tga.ImageDescriptor & 0x0010 ) )dwOffset = (m_dwHeight-y-1)*m_dwWidth;for( DWORD x=0; x{if( tga.ImageType == 10 ){BYTE PacketInfo = getc( file );WORD PacketType = 0x80 & PacketInfo;WORD PixelCount = ( 0x007f & PacketInfo ) + 1;if( PacketType ){DWORD b = getc( file );DWORD g = getc( file );DWORD r = getc( file );DWORD a = 0xff;if( m_dwBPP == 32 )a = getc( file );while( PixelCount-- ){m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);x++;}}else{while( PixelCount-- ){BYTE b = getc( file );BYTE g = getc( file );BYTE r = getc( file );BYTE a = 0xff;if( m_dwBPP == 32 )a = getc( file );m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);x++;}}}else{BYTE b = getc( file );BYTE g = getc( file );BYTE r = getc( file );BYTE a = 0xff;if( m_dwBPP == 32 )a = getc( file );m_pRGBAData[dwOffset+x] = (r<<24L)+(g<<16L)+(b<<8L)+(a);x++;}}}fclose( file );// Check for alpha contentfor( DWORD i=0; i<(m_dwWidth*m_dwHeight); i++ ){if( m_pRGBAData[i] & 0x000000ff != 0xff ){//m_bHasAlpha = TRUE;break;}}*pRGBABuffer = m_pRGBAData;*width = m_dwWidth;*height = m_dwHeight;return true;}把32bit buffer分割为rgb和alpha的代码。

// Allocate the 16-bit colour buffer with 8 spare bytes so the start can be
// moved up to an 8-byte boundary.
BYTE* p = new BYTE[lSize*2+8];
// Keep the pointer returned by new[]; that is the one to pass to delete[].
BYTE* pOrig = p;
// Round p UP to the next multiple of 8. Adding p%8 (as before) does not
// align: an address with p%8 == 2 would end up at p%8 == 4.
p += ( 8 - (DWORD)p%8 ) % 8;
WORD* color = (WORD*)p;

//-----------------------------------------------------------------------
// Name: SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )
// Desc: Splits a packed 32-bit pixel buffer into an 8-bit alpha plane and
//       a 16-bit colour plane.
// Time: 2002.06.22 00:36
// Author: RealRender
// Para: pRGBABuffer - lWidth*lHeight pixels, each packed as
//                     (r<<24)|(g<<16)|(b<<8)|a
//       pAlpha      - receives a new BYTE[] holding one alpha byte per pixel
//       pBitmap     - receives a pointer to lWidth*lHeight 16-bit pixels
//                     produced by RGBTo16(); the pointer is 8-byte aligned
//       lWidth, lHeight - image size in pixels
// Return: none
// Note: *pBitmap points up to 7 bytes INTO its allocation when new[] does
//       not return an 8-byte-aligned block, in which case it is not the
//       pointer to hand to delete[].
//       NOTE(review): the interface has no way to return the original
//       pointer - confirm how callers release this buffer.
//-----------------------------------------------------------------------
void SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )
{
    long lSize = lWidth*lHeight;

    // A few spare bytes after the last alpha value: a blender that fetches
    // alpha four bytes at a time looks one group ahead of the last pixel.
    BYTE* alpha = new BYTE[lSize+4];

    // 16 spare bytes: up to 7 are consumed by the alignment step below and
    // at least 8 remain for an 8-byte look-ahead read past the last pixel.
    BYTE* p = new BYTE[lSize*2+16];

    // Round p UP to the next multiple of 8. The previous "p += p%8" did not
    // align (p%8 == 2 became p%8 == 4), and ablend_565 skips all work when
    // its source pointer is not qword aligned.
    p += ( 8 - (DWORD)p%8 ) % 8;
    WORD* color = (WORD*)p;

    DWORD dwPixel;
    DWORD r, g, b, a;
    for( long i = 0; i < lSize; i++ )
    {
        dwPixel = pRGBABuffer[i];
        r = ((dwPixel>>24)&0x000000ff);
        g = ((dwPixel>>16)&0x000000ff);
        b = ((dwPixel>> 8)&0x000000ff);
        a = ((dwPixel>> 0)&0x000000ff);

        alpha[i] = (BYTE)a;

        // Reduce the three 8-bit channels to one 16-bit pixel.
        color[i] = RGBTo16( r, g, b );
    }

    *pAlpha = alpha;
    *pBitmap = color;
}

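// Parameters of ablend_565:

unsigned char *lpAlpha, // base of the 8-bit (256-level) alpha channel, one byte per pixel
unsigned int iAlpPitch, // bytes per row of the alpha channel

unsigned char *lpSrc, // base of the 16-bit (RGB565) source bitmap

unsigned int iSrcX, //
unsigned int iSrcY, // top-left pixel of the region read from the source and alpha buffers

unsigned int iSrcPitch, // bytes per row of the source bitmap

unsigned char *lpDst, // base of the 16-bit (RGB565) destination bitmap

unsigned int iDstX,
unsigned int iDstY, // top-left pixel of the region written in the destination

unsigned int iDstW,
unsigned int iDstH, // width and height, in pixels, of the region to blend

unsigned int iDstPitch // bytes per row of the destination bitmap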
//-----------------------------------------------------------------------
// ablend_565 - blends a rectangle of 16-bit RGB565 source pixels onto a
//              16-bit RGB565 destination using a separate 8-bit alpha
//              plane. Written as 32-bit x86 inline assembly (_asm) with MMX.
//
// lpAlpha / iAlpPitch   base pointer and bytes-per-row of the alpha plane
//                       (one byte per pixel)
// lpSrc / iSrcPitch     base pointer and bytes-per-row of the source
// iSrcX, iSrcY          top-left pixel read from the source AND alpha planes
// lpDst / iDstPitch     base pointer and bytes-per-row of the destination
// iDstX, iDstY          top-left pixel written in the destination
// iDstW, iDstH          size of the blended rectangle in pixels
//
// Per channel the routine evaluates, with a = alpha reduced to the channel
// width (alpha>>2 for green, alpha>>3 for red and blue):
//     green: i = a*sg + (63-a)*dg;   out = ((i+32) + ((i+32)>>6)) >> 6
//     r / b: i = a*s  + (31-a)*d;    out = ((i+16) + ((i+16)>>5)) >> 5
//
// Notes:
//  * If the first source pixel address is not 8-byte aligned the function
//    returns without drawing anything.
//  * Pixels are fetched 8 bytes (and alphas 4 bytes) at a time, including
//    one group of look-ahead, so every buffer needs a few readable bytes
//    past the end of the blended region.
//  * Steps g3, b2, g10, g11, g14, b8, b9, b12, r8, r9, r12, the pointer
//    advances after the early-out paths, the advance before the last tail
//    pixel and the four channel masks were absent from the previous
//    listing and are restored here.
//-----------------------------------------------------------------------
void ablend_565(unsigned char *lpAlpha,unsigned int iAlpPitch,
                unsigned char *lpSrc,unsigned int iSrcX, unsigned int iSrcY,
                unsigned int iSrcPitch, unsigned char *lpDst,
                unsigned int iDstX, unsigned int iDstY,
                unsigned int iDstW, unsigned int iDstH,
                unsigned int iDstPitch)
{
    // Masks for isolating the red, green and blue components of four
    // packed RGB565 pixels.
    static __int64 MASKB=0x001F001F001F001F;
    static __int64 MASKG=0x07E007E007E007E0;
    static __int64 MASKSHIFTG=0x03F003F003F003F0;   // green after a 1-bit right shift
    static __int64 MASKR=0xF800F800F800F800;

    // Constants used by the integer alpha blending equation.
    static __int64 SIXTEEN=0x0010001000100010;      // +16 rounding term (red, blue)
    static __int64 FIVETWELVE=0x0200020002000200;   // +32 rounding term, scaled by 16 (green)
    static __int64 SIXONES=0x003F003F003F003F;      // 63 in every word

    // Nothing to draw. Without this guard a height of zero would wrap the
    // line counter and walk far outside the buffers.
    if( iDstW == 0 || iDstH == 0 )
        return;

    unsigned char *lpLinearDstBp=(iDstX<<1)+(iDstY*iDstPitch)+lpDst; //base pointer for linear destination
    unsigned char *lpLinearSrcBp=(iSrcX<<1)+(iSrcY*iSrcPitch)+lpSrc; //base pointer for linear source
    unsigned char *lpLinearAlpBp=iSrcX+(iSrcY*iAlpPitch)+lpAlpha;    //base pointer for linear alpha

    _asm{
        mov     esi,lpLinearSrcBp       // esi = source pixels
        mov     edi,lpLinearDstBp       // edi = destination pixels
        mov     eax,lpLinearAlpBp       // eax = alpha bytes
        mov     ecx,iDstH               // ecx = lines left
        mov     ebx,iDstW               // ebx = pixels left on this line

        test    esi,6                   // pixels are word aligned, so bits 1-2 decide qword alignment
        jnz     done                    // unaligned source: draw nothing

    primeloop:                          // (re)load the current group of 4 alphas / 4 source pixels
        movd    mm1,[eax]               // mm1 = 00 00 00 00 a3 a2 a1 a0
        pxor    mm2,mm2                 // mm2 = 0
        movq    mm4,[esi]               // g1: mm4 = src3 src2 src1 src0
        punpcklbw mm1,mm2               // mm1 = 00a3 00a2 00a1 00a0

    loopqword:
        mov     edx,[eax]               // edx = the same 4 alphas, for the early-out tests
        test    ebx,0xFFFFFFFC          // fewer than 4 pixels left?
        jz      checkback               // yes: handle the 0-3 pixel tail

        // early out tests
        cmp     edx,0xffffffff          // all four alphas 255?
        je      copyback                // opaque: copy the source pixels
        test    edx,0xffffffff          // all four alphas 0?
        jz      leavefront              // transparent: leave the destination alone

        // ---- blend 4 pixels; green, blue and red are interleaved ----
        movq    mm5,[edi]               // g2: mm5 = dst3 dst2 dst1 dst0
        psrlw   mm1,2                   // mm1 = a>>2 (6-bit alpha for green)
        movq    mm7,MASKSHIFTG          // g3: mm7 = green mask shifted right by 1
        psrlw   mm4,1                   // g3a: src>>1 so green*alpha fits in 16 bits
        movq    mm0,mm1                 // mm0 = 6-bit alpha
        psrlw   mm5,1                   // g3b: dst>>1, same reason
        psrlw   mm1,1                   // mm1 = a>>3 (5-bit alpha for red and blue)
        pand    mm4,mm7                 // g5: mm4 = sg*16
        movq    mm2,SIXONES             // g4: mm2 = 63
        pand    mm5,mm7                 // g7: mm5 = dg*16
        movq    mm3,[esi]               // b1: mm3 = src3 src2 src1 src0
        psubsb  mm2,mm0                 // g6: mm2 = 63-a
        movq    mm7,MASKB               // b2: mm7 = blue mask (also the constant 31)
        pmullw  mm4,mm0                 // g8: mm4 = 16*sg*a
        movq    mm0,[edi]               // b3: mm0 = dst3 dst2 dst1 dst0
        pmullw  mm5,mm2                 // g9: mm5 = 16*dg*(63-a)
        movq    mm2,mm7                 // b4: mm2 = 31
        pand    mm3,mm7                 // b4: mm3 = sb
        pmullw  mm3,mm1                 // b6: mm3 = sb*a
        pand    mm0,mm7                 // b5: mm0 = db
        movq    mm7,[esi]               // r1: mm7 = src3 src2 src1 src0
        paddw   mm4,mm5                 // g10: mm4 = 16*(sg*a + dg*(63-a))
        pand    mm7,MASKR               // r2: mm7 = sr<<11
        psubsb  mm2,mm1                 // b5a: mm2 = 31-a
        paddw   mm4,FIVETWELVE          // g11: mm4 = 16*(i+32)
        pmullw  mm0,mm2                 // b7: mm0 = db*(31-a)
        movq    mm5,mm4                 // g12: mm5 = 16*(i+32)
        psrlw   mm7,11                  // r4: mm7 = sr
        psrlw   mm4,6                   // g13: mm4 = 16*(i+32) >> 6
        paddw   mm4,mm5                 // g14: mm4 = 16*(i+32) + (16*(i+32) >> 6)
        paddw   mm0,mm3                 // b8: mm0 = sb*a + db*(31-a)
        movq    mm5,[edi]               // r3: mm5 = dst3 dst2 dst1 dst0
        paddw   mm0,SIXTEEN             // b9: mm0 = i+16
        pand    mm5,MASKR               // r5: mm5 = dr<<11
        psrlw   mm4,5                   // g15: green result now sits in bits 5-10 (plus junk below)
        movq    mm3,mm0                 // b10: mm3 = i+16
        psrlw   mm0,5                   // b11: mm0 = (i+16)>>5
        psrlw   mm5,11                  // r6: mm5 = dr
        paddw   mm0,mm3                 // b12: mm0 = (i+16) + ((i+16)>>5)
        psrlw   mm0,5                   // b13: mm0 = 000b 000b 000b 000b
        pmullw  mm7,mm1                 // mm7 = sr*a
        pand    mm4,MASKG               // g16: mm4 = 00g0 00g0 00g0 00g0
        pmullw  mm5,mm2                 // r7: mm5 = dr*(31-a)
        por     mm0,mm4                 // mm0 = 00gb 00gb 00gb 00gb

        add     eax,4                   // next 4 alphas
        add     esi,8                   // next 4 source pixels
        add     edi,8                   // next 4 destination pixels

        movd    mm1,[eax]               // preload the next group's alphas
        paddw   mm5,mm7                 // r8: mm5 = sr*a + dr*(31-a)
        paddw   mm5,SIXTEEN             // r9: mm5 = i+16
        pxor    mm2,mm2                 // mm2 = 0
        movq    mm7,mm5                 // r10: mm7 = i+16
        psrlw   mm5,5                   // r11: mm5 = (i+16)>>5
        movq    mm4,[esi]               // g1: preload the next group's source pixels
        paddw   mm5,mm7                 // r12: mm5 = (i+16) + ((i+16)>>5)
        punpcklbw mm1,mm2               // mm1 = 00a3 00a2 00a1 00a0 (next group)
        psrlw   mm5,5                   // r13: mm5 = 000r 000r 000r 000r
        psllw   mm5,11                  // r14: move red back to bits 11-15
        por     mm0,mm5                 // mm0 = 0rgb 0rgb 0rgb 0rgb
        sub     ebx,4                   // 4 pixels done
        movq    [edi-8],mm0             // store the group just blended (edi was already advanced)
        jmp     loopqword

    copyback:
        movq    [edi],mm4               // fully opaque: destination = source

    leavefront:
        add     edi,8                   // step all three pointers past this group;
        add     eax,4                   // without this the same 4 pixels would be
        add     esi,8                   // revisited for the rest of the line
        sub     ebx,4                   // 4 pixels done
        jmp     primeloop

    checkback:
        test    ebx,0xFF                // any pixels left? (ebx is 0..3 here)
        jz      nextline

        // ---- tail: blend a full group, then store only 1-3 results ----
        movq    mm5,[edi]               // g2: mm5 = dst3 dst2 dst1 dst0
        psrlw   mm1,2                   // mm1 = a>>2 (6-bit alpha for green)
        movq    mm7,MASKSHIFTG          // g3: mm7 = green mask shifted right by 1
        psrlw   mm4,1                   // g3a: src>>1 so green*alpha fits in 16 bits
        movq    mm0,mm1                 // mm0 = 6-bit alpha
        psrlw   mm5,1                   // g3b: dst>>1, same reason
        psrlw   mm1,1                   // mm1 = a>>3 (5-bit alpha for red and blue)
        pand    mm4,mm7                 // g5: mm4 = sg*16
        movq    mm2,SIXONES             // g4: mm2 = 63
        pand    mm5,mm7                 // g7: mm5 = dg*16
        movq    mm3,[esi]               // b1: mm3 = src3 src2 src1 src0
        psubsb  mm2,mm0                 // g6: mm2 = 63-a
        movq    mm7,MASKB               // b2: mm7 = blue mask (also the constant 31)
        pmullw  mm4,mm0                 // g8: mm4 = 16*sg*a
        movq    mm0,[edi]               // b3: mm0 = dst3 dst2 dst1 dst0
        pmullw  mm5,mm2                 // g9: mm5 = 16*dg*(63-a)
        movq    mm2,mm7                 // b4: mm2 = 31
        pand    mm3,mm7                 // b4: mm3 = sb
        pmullw  mm3,mm1                 // b6: mm3 = sb*a
        pand    mm0,mm7                 // b5: mm0 = db
        movq    mm7,[esi]               // r1: mm7 = src3 src2 src1 src0
        paddw   mm4,mm5                 // g10: mm4 = 16*(sg*a + dg*(63-a))
        pand    mm7,MASKR               // r2: mm7 = sr<<11
        psubsb  mm2,mm1                 // b5a: mm2 = 31-a
        paddw   mm4,FIVETWELVE          // g11: mm4 = 16*(i+32)
        pmullw  mm0,mm2                 // b7: mm0 = db*(31-a)
        movq    mm5,mm4                 // g12: mm5 = 16*(i+32)
        psrlw   mm7,11                  // r4: mm7 = sr
        psrlw   mm4,6                   // g13: mm4 = 16*(i+32) >> 6
        paddw   mm4,mm5                 // g14: mm4 = 16*(i+32) + (16*(i+32) >> 6)
        paddw   mm0,mm3                 // b8: mm0 = sb*a + db*(31-a)
        movq    mm5,[edi]               // r3: mm5 = dst3 dst2 dst1 dst0
        paddw   mm0,SIXTEEN             // b9: mm0 = i+16
        pand    mm5,MASKR               // r5: mm5 = dr<<11
        psrlw   mm4,5                   // g15: green result now sits in bits 5-10 (plus junk below)
        movq    mm3,mm0                 // b10: mm3 = i+16
        psrlw   mm0,5                   // b11: mm0 = (i+16)>>5
        psrlw   mm5,11                  // r6: mm5 = dr
        paddw   mm0,mm3                 // b12: mm0 = (i+16) + ((i+16)>>5)
        psrlw   mm0,5                   // b13: mm0 = 000b 000b 000b 000b
        pmullw  mm7,mm1                 // mm7 = sr*a
        pand    mm4,MASKG               // g16: mm4 = 00g0 00g0 00g0 00g0
        pmullw  mm5,mm2                 // r7: mm5 = dr*(31-a)
        por     mm0,mm4                 // mm0 = 00gb 00gb 00gb 00gb
        paddw   mm5,mm7                 // r8: mm5 = sr*a + dr*(31-a)
        paddw   mm5,SIXTEEN             // r9: mm5 = i+16
        movq    mm7,mm5                 // r10: mm7 = i+16
        psrlw   mm5,5                   // r11: mm5 = (i+16)>>5
        paddw   mm5,mm7                 // r12: mm5 = (i+16) + ((i+16)>>5)
        psrlw   mm5,5                   // r13: mm5 = 000r 000r 000r 000r
        psllw   mm5,11                  // r14: move red back to bits 11-15
        por     mm0,mm5                 // mm0 = 0rgb 0rgb 0rgb 0rgb

        test    ebx,2                   // 2 or 3 pixels left?
        jz      oneendpixel             // no: exactly one
        movd    [edi],mm0               // store pixels 0 and 1
        psrlq   mm0,32                  // bring pixel 2 down to the low word
        add     edi,4                   // and point at its slot (else it would overwrite pixel 0)
        sub     ebx,2
        jz      nextline                // there were exactly 2

    oneendpixel:
        movd    edx,mm0                 // edx = 0rgb
        mov     [edi],dx                // store the last pixel

    nextline:
        dec     ecx                     // one line done
        jz      done

        mov     eax,lpLinearAlpBp       // step the three line base pointers by
        mov     esi,lpLinearSrcBp       // their pitches and keep them for the
        mov     edi,lpLinearDstBp       // line after this one
        add     eax,iAlpPitch
        add     esi,iSrcPitch
        add     edi,iDstPitch
        mov     lpLinearAlpBp,eax
        mov     ebx,iDstW               // ebx = pixels left on the new line
        mov     lpLinearSrcBp,esi
        mov     lpLinearDstBp,edi
        jmp     primeloop

    done:
        emms                            // leave MMX state so x87 code can run afterwards
    }
}
• 本文已收录于以下专栏：

举报原因： 您举报文章：能是最快的算法alpha blend汇编源代码 色情 政治 抄袭 广告 招聘 骂人 其他 (最多只允许输入30个字)