光栅预乘Alpha处理函数,像素格式为BBGGRRAA(BB为低地址),
各个版本都集齐了(MMX、SSE、SSE2),原先用于桌面透明窗口(UpdateLayeredWindow)。
注意几点:
1. MMX版本一次处理2个pixel、SSE版本一次处理2个pixel(但是指令更简洁)、SSE2版本一次处理4个pixel,
所以效率层面是 MMX < SSE < SSE2。
2. 处理行内剩余像素的时候,指令会取“当前行末剩余像素” + “下一行首个像素”,当处理"最后一行"的时候,
可能会导致“读取内存越界”。一般情况下都不太会有问题,后面4个字节,取出来但是不会用。
MMX和SSE版本都是一次性处理2个pixel,如果光栅宽度是奇数,就会取 “下一行首个像素”。
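3. SSE2版本一次处理4个pixel,当前版本不处理不足4个的剩余像素,因此不会导致“读取内存越界”,但是结果不完整:
当 src_pitch == width*4 时,整块光栅末尾不足4个的像素不会被处理(也不会写入目标);
当 src_pitch != width*4 且光栅宽度不是4的倍数时,函数直接返回,不做任何处理。
如果你要用SSE2版本,保证光栅宽度是4的倍数。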
//< 32位光栅预乘Alpha,使用MMX指令处理
/************************************************************************/
void BltSurface32ToDIB32_SelfMulAlphaMMX( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{
int src_pitch_sub_dst_pitch; //src pointer对齐到下一行scanline,需要跳过多少字节
__asm
{
//取参数,判断width和height是否有任一为0
mov eax, height //eax = heigh
mov ebx, width
mul ebx //width * height
test eax, eax //影响ZF
jz end_pixel
//常量赋值
mov esi, pSrc
mov edi, pDst
pcmpeqd mm5, mm5 //mm5 = 0xffffffff_ffffffff
pcmpeqd mm6, mm6 //mm6 = 0xffffffff_ffffffff
psrld mm5, 8 //mm5 = 0x00ffffff_00ffffff
psrlw mm6, 8 //mm6 = 0x00ff_00ff_00ff_00ff
pxor mm7, mm7 //mm7 = 0x0
//判断pitch
mov edx, src_pitch
shl ebx, 2 //每个像素4个字节, dst_pitch = width * 4
sub edx, ebx //src_pitch - dst_pitch
jnz diff_pitch
//same_pitch:
mov ecx, eax
mov edx, 1 //how many lines,eax和edx构成2层循环
and ecx, 1 //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)
shr eax, 1 //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)
jmp test_pair_pixel
diff_pitch:
mov src_pitch_sub_dst_pitch, edx //src_pitch - dst_pitch
mov eax, width
mov edx, height //how many lines,eax和edx构成2层循环
mov ecx, eax
and ecx, 1 //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)
shr eax, 1 //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)
mov ebx, eax //main loop count on every scanline
jmp test_pair_pixel
loop_line:
mov eax, ebx
loop_pair_pixel:
movq mm0, [esi] //mm0 = 0xaarrggbb_AARRGGBB
movq mm4, mm5 //mm4 = mm5 = 0x00ffffff_00ffffff
movq mm1, mm0 //mm1 = mm0 = 0xaarrggbb_AARRGGBB
movq mm2, mm0 //mm2 = mm0 = 0xaarrggbb_AARRGGBB
pandn mm4, mm0 //保存alpha, mm4 = 0xaa000000_AA000000
movq mm3, mm0 //mm3 = mm0 = 0xaarrggbb_AARRGGBB
punpckhbw mm1, mm7 //扩展每个通道, mm1 = 0x00aa_00rr_00gg_00bb
punpcklbw mm0, mm7 //mm0 = 0x00AA_00RR_00GG_00BB
punpcklbw mm2, mm2 //构建2个象素的alpha, mm2 = 0xAAAA_RRRR_GGGG_BBBB
punpckhbw mm3, mm3 //mm3 = 0xaaaa_rrrr_gggg_bbbb
punpckhwd mm2, mm2 //mm2 = 0xAAAA_AAAA_RRRR_RRRR
punpckhwd mm3, mm3 //mm3 = 0xaaaa_aaaa_rrrr_rrrr
punpckhdq mm2, mm2 //mm2 = 0xAAAA_AAAA_AAAA_AAAA
punpckhdq mm3, mm3 //mm3 = 0xaaaa_aaaa_aaaa_aaaa
pand mm2, mm6 //mm2 = 0x00AA_00AA_00AA_00AA
pand mm3, mm6 //mm3 = 0x00aa_00aa_00aa_00aa
pmullw mm0, mm2 //自乘alpha,字组相乘,取低16位
pmullw mm1, mm3
psrlw mm0, 8 //除以256
psrlw mm1, 8
packuswb mm0, mm0 //合并单个象素
packuswb mm1, mm1
punpckldq mm0, mm1 //将2个象素合并
pand mm0, mm5 //恢复原始alpha
por mm0, mm4
//put_pixel:
movq [edi], mm0
add esi, 8
add edi, 8
dec eax
test_pair_pixel:
jnz loop_pair_pixel
//rest_line_pixel:
jecxz next_line //scanline_rest_pixel不是0就是1
movq mm0, [esi] //mm0 = 0xaarrggbb_AARRGGBB
movq mm4, mm5 //mm4 = mm5 = 0x00ffffff_00ffffff
movq mm1, mm0 //mm1 = mm0 = 0xaarrggbb_AARRGGBB
movq mm2, mm0 //mm2 = mm0 = 0xaarrggbb_AARRGGBB
pandn mm4, mm0 //保存alpha, mm4 = 0xaa000000_AA000000
movq mm3, mm0 //mm3 = mm0 = 0xaarrggbb_AARRGGBB
punpckhbw mm1, mm7 //扩展每个通道, mm1 = 0x00aa_00rr_00gg_00bb
punpcklbw mm0, mm7 //mm0 = 0x00AA_00RR_00GG_00BB
punpcklbw mm2, mm2 //构建2个象素的alpha, mm2 = 0xAAAA_RRRR_GGGG_BBBB
punpckhbw mm3, mm3 //mm3 = 0xaaaa_rrrr_gggg_bbbb
punpckhwd mm2, mm2 //mm2 = 0xAAAA_AAAA_RRRR_RRRR
punpckhwd mm3, mm3 //mm3 = 0xaaaa_aaaa_rrrr_rrrr
punpckhdq mm2, mm2 //mm2 = 0xAAAA_AAAA_AAAA_AAAA
punpckhdq mm3, mm3 //mm3 = 0xaaaa_aaaa_aaaa_aaaa
pand mm2, mm6 //mm2 = 0x00AA_00AA_00AA_00AA
pand mm3, mm6 //mm3 = 0x00aa_00aa_00aa_00aa
pmullw mm0, mm2 //自乘alpha,字组相乘,取低16位
pmullw mm1, mm3
psrlw mm0, 8 //除以256
psrlw mm1, 8
packuswb mm0, mm0 //合并单个象素
packuswb mm1, mm1
punpckldq mm0, mm1 //将2个象素合并
pand mm0, mm5 //恢复原始alpha
por mm0, mm4
movd [edi], mm0
add esi, 4
add edi, 4
next_line:
add esi, src_pitch_sub_dst_pitch //设置指针到下一个src行
dec edx
jnz loop_line
emms //清除mmx指令状态
end_pixel:
}
}
//< 32位光栅预乘Alpha,使用SSE指令处理
/************************************************************************/
void BltSurface32ToDIB32_SelfMulAlphaSSE( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{
int src_pitch_sub_dst_pitch;//src pointer对齐到下一行scanline,需要跳过多少字节
__asm
{
//取参数,判断width和height是否有任一为0
mov eax, height //eax = heigh
mov ebx, width
mul ebx //width * height
test eax, eax //影响ZF
jz end_pixel
//常量赋值
mov esi, pSrc
mov edi, pDst
pcmpeqd mm5, mm5 //mm5 = 0xffffffff_ffffffff
pxor mm7, mm7 //mm7 = 0x0
psrld mm5, 8 //mm5 = 0x00ffffff_00ffffff
//判断pitch
mov edx, src_pitch
shl ebx, 2 //每个像素4个字节, dst_pitch = width * 4
sub edx, ebx //src_pitch - dst_pitch
jnz diff_pitch
//same_pitch:
mov ecx, eax
mov edx, 1 //how many lines,eax和edx构成2层循环
and ecx, 1 //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)
shr eax, 1 //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)
jmp test_pair_pixel
diff_pitch:
mov src_pitch_sub_dst_pitch, edx //src_pitch - dst_pitch
mov eax, width
mov edx, height //how many lines,eax和edx构成2层循环
mov ecx, eax
and ecx, 1 //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)
shr eax, 1 //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)
mov ebx, eax //main loop count on every scanline
jmp test_pair_pixel
loop_line:
mov eax, ebx
loop_pair_pixel:
movq mm0, [esi] //mm0 = 0xaarrggbb_AARRGGBB
movq mm4, mm5 //mm4 = mm5 = 0x00ffffff_00ffffff
movq mm1, mm0 //mm1 = mm0 = 0xaarrggbb_AARRGGBB
pandn mm4, mm0 //保存alpha, mm4 = 0xaa000000_AA000000
punpcklbw mm0, mm7 //mm0 = 0x00AA_00RR_00GG_00BB
punpckhbw mm1, mm7 //mm1 = 0x00aa_00rr_00gg_00bb
pshufw mm2, mm0, 0xff //mm2 = 0x00AA_00AA_00AA_00AA
pshufw mm3, mm1, 0xff //mm3 = 0x00aa_00aa_00aa_00aa
pmullw mm0, mm2 //自乘alpha,字组相乘,取低16位
pmullw mm1, mm3
psrlw mm0, 8 //除以256
psrlw mm1, 8
packuswb mm0, mm0 //合并单个象素
packuswb mm1, mm1
punpckldq mm0, mm1 //将2个象素合并
pand mm0, mm5 //恢复原始alpha
por mm0, mm4
//put_pixel:
MOVNTQ [edi], mm0
add esi, 8
add edi, 8
dec eax
test_pair_pixel:
jnz loop_pair_pixel
//rest_line_pixel:
jecxz next_line //scanline_rest_pixel不是0就是1
movq mm0, [esi] //mm0 = 0xaarrggbb_AARRGGBB
movq mm4, mm5 //mm4 = mm5 = 0x00ffffff_00ffffff
movq mm1, mm0 //mm1 = mm0 = 0xaarrggbb_AARRGGBB
pandn mm4, mm0 //保存alpha, mm4 = 0xaa000000_AA000000
punpcklbw mm0, mm7 //mm0 = 0x00AA_00RR_00GG_00BB
punpckhbw mm1, mm7 //mm1 = 0x00aa_00rr_00gg_00bb
pshufw mm2, mm0, 0xff //mm2 = 0x00AA_00AA_00AA_00AA
pshufw mm3, mm1, 0xff //mm3 = 0x00aa_00aa_00aa_00aa
pmullw mm0, mm2 //自乘alpha,字组相乘,取低16位
pmullw mm1, mm3
psrlw mm0, 8 //除以256
psrlw mm1, 8
packuswb mm0, mm0 //合并单个象素
packuswb mm1, mm1
punpckldq mm0, mm1 //将2个象素合并
pand mm0, mm5 //恢复原始alpha
por mm0, mm4
movd [edi], mm0
add esi, 4
add edi, 4
next_line:
add esi, src_pitch_sub_dst_pitch //设置指针到下一个src行
dec edx
jnz loop_line
emms
end_pixel:
}
}
//< 32位光栅预乘Alpha,使用SSE2指令处理
/************************************************************************/
void BltSurface32ToDIB32_SelfMulAlphaSSE2( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{
int src_pitch_sub_dst_pitch;//src pointer对齐到下一行scanline,需要跳过多少字节
__asm
{
//取参数,判断width和height是否有任一为0
mov eax, height //eax = heigh
mov ebx, width
mul ebx //width * height
test eax, eax //影响ZF
jz end_pixel
//常量赋值
mov esi, pSrc
mov edi, pDst
pcmpeqd xmm6, xmm6 //xmm6 = 0xffffffff_ffffffff_ffffffff_ffffffff
xorps xmm7, xmm7 //xmm7 = 0x0
psrld xmm6, 8 //xmm6 = 0x00ffffff_00ffffff_00ffffff_00ffffff
//判断pitch
mov edx, src_pitch
shl ebx, 2 //每个像素4个字节, dst_pitch = width * 4
cmp edx, ebx //src_pitch - dst_pitch
jnz diff_pitch
//same_pitch:
mov ecx, eax
mov edx, 1 //how many lines,eax和edx构成2层循环
shr eax, 2 //一行上主循环多少次,same_pitch时就是(width*height >> 2)diff_pitchh时就是(width >> 2)
jmp test_quat_pixel
diff_pitch:
and ebx, ~0x0f
sub edx, ebx //src_pitch - dst_pitch
mov src_pitch_sub_dst_pitch, edx //src_pitch - dst_pitch
mov eax, width
mov edx, height //how many lines,eax和edx构成2层循环
test eax, 3 //这里比较特殊,diff_pitch情况下,像素的个数必须是4的倍数,否则会导致Access Violation;因此如果遇到不是4的倍数,则退出
jnz end_pixel
shr eax, 2 //一行上主循环多少次,same_pitch时就是(width*height >> 2)diff_pitchh时就是(width >> 2)
mov ebx, eax //main loop count on every scanline
jmp test_quat_pixel
loop_line:
mov eax, ebx
loop_quat_pixel:
MOVUPS xmm0, [esi] //xmm0 = 0xaarrggbb_AARRGGBB_wwxxyyzz_WWXXYYZZ
MOVAPS xmm5, xmm6 //xmm5 = xmm6 = 0x00ffffff_00ffffff_00ffffff_00ffffff
MOVAPS xmm1, xmm0 //xmm1 = xmm0 = 0xaarrggbb_AARRGGBB_wwxxyyzz_WWXXYYZZ
andnps xmm5, xmm0 //xmm5 = 0xaa000000_AA000000_ww000000_WW000000, 保存alpha
punpcklbw xmm0, xmm7 //xmm0 = 0x00ww_00xx_00yy_00zz_00WW_00XX_00YY_00ZZ
punpckhbw xmm1, xmm7 //xmm1 = 0x00aa_00rr_00gg_00bb_00AA_00RR_00GG_00BB
pshuflw xmm2, xmm0, 0xff //xmm2 = 0xww_00xx_00yy_00zz_00WW_00WW_00WW_00WW
pshuflw xmm3, xmm1, 0xff //xmm3 = 0xaa_00rr_00gg_00bb_00AA_00AA_00AA_00AA
pshufhw xmm2, xmm2, 0xff //xmm2 = 0x00ww_00ww_00ww_00ww_00WW_00WW_00WW_00WW
pshufhw xmm3, xmm3, 0xff //xmm3 = 0x00aa_00aa_00aa_00aa_00AA_00AA_00AA_00AA
pmullw xmm0, xmm2 //自乘alpha,字组相乘,取低16位
pmullw xmm1, xmm3
psrlw xmm0, 8 //除以256,取16位中的高位
psrlw xmm1, 8
packuswb xmm0, xmm0 //合并2个象素
packuswb xmm1, xmm1
PUNPCKLQDQ xmm0, xmm1 //将4个象素合并
andps xmm0, xmm6 //恢复原始alpha
orps xmm0, xmm5
//put_pixel:
MOVNTDQ [edi], xmm0
add esi, 16
add edi, 16
dec eax
test_quat_pixel:
jnz loop_quat_pixel
//sse2模式下不处理剩余像素,非4的倍数;
//next_line:
add esi, src_pitch_sub_dst_pitch //设置指针到下一个src行
dec edx
jnz loop_line
end_pixel:
}
}