F':尝试利用SSE2新增的MOVNTI指令优化CPU写缓冲
// Copies one destination span of dstCount 32-bit pixels, sampling the source
// along a straight line stepped in 16.16 fixed point.
//   pDstLine       - first destination pixel of the span
//   dstCount       - number of pixels to write (values <= 0 write nothing)
//   Ax_16, Ay_16   - per-pixel increments of the source x / y coordinate (16.16)
//   srcx0_16,
//   srcy0_16       - source coordinate of the first pixel (16.16)
//   pSrcLine       - base address of the source pixel data
//   src_byte_width - byte distance between consecutive source rows
// No bounds checks are done here; every sampled coordinate is read as-is.
// The main loop stores with MOVNTI (SSE2 non-temporal store).
// NOTE(review): non-temporal stores are weakly ordered, so the caller is
// presumably expected to issue SFENCE after the last span - confirm at call sites.
// The function is naked + __stdcall: it saves/restores ebx/esi/edi/ebp itself
// and removes its 32 bytes of arguments with "ret 32".
void __declspec(naked) __stdcall PicRotarySSE2_CopyLine(TARGB32* pDstLine,long dstCount,long Ax_16,long Ay_16,
// [esp+ 4] [esp+ 8] [esp+12] [esp+16]
long srcx0_16,long srcy0_16,TARGB32* pSrcLine,long src_byte_width)
// [esp+20] [esp+24] [esp+28] [esp+32]
{
//Assembly implementation that uses the SSE2 MOVNTI instruction to optimise write buffering
asm
{
push ebx
push esi
push edi
push ebp
//esp offset 16: the four pushes above shift every argument by 16 bytes
mov ebx,dword ptr [esp+ 8+16] //ebx = dstCount
mov esi,dword ptr [esp+32+16] //esi = src_byte_width
mov edi,dword ptr [esp+28+16] //edi = pSrcLine
mov eax,dword ptr [esp+24+16] //eax = current source y (16.16), starts at srcy0_16
mov ecx,dword ptr [esp+20+16] //ecx = current source x (16.16), starts at srcx0_16
dec ebx //ebx = dstCount-1, the bound of the two-pixel loop
xor edx,edx //edx = destination pixel index, starts at 0
test ebx,ebx
mov dword ptr [esp+ 8+16],ebx //overwrite the dstCount stack slot with dstCount-1 (mov leaves the flags from test intact)
jle loop_bound //dstCount-1 <= 0: skip the paired loop
jmp loop_begin
align 16
loop_begin:
//---- first pixel of the pair ----
mov ebx,eax
add eax,dword ptr [esp+16+16] //y += Ay_16
sar ebx,16 //integer source row
imul ebx,esi //row * src_byte_width
add ebx,edi //ebx = address of the source row
mov ebp,ecx
add ecx,dword ptr [esp+12+16] //x += Ax_16
sar ebp,16 //integer source column
mov ebx,dword ptr [ebx+ebp*4] //load the 4-byte source pixel
mov ebp,dword ptr [esp+ 4+16] //ebp = pDstLine
MOVNTI dword ptr [ebp+edx*4],ebx //non-temporal store to pDstLine[edx]
//---- second pixel of the pair ----
mov ebx,eax
add eax,dword ptr [esp+16+16] //y += Ay_16
sar ebx,16
imul ebx,esi
mov ebp,ecx
add ecx,dword ptr [esp+12+16] //x += Ax_16
sar ebp,16
add ebx,edi
mov ebx,dword ptr [ebx+ebp*4] //load the 4-byte source pixel
mov ebp,dword ptr [esp+4+16] //ebp = pDstLine
MOVNTI dword ptr [ebp+edx*4+4],ebx //non-temporal store to pDstLine[edx+1]
mov ebx,dword ptr [esp+ 8+16] //reload dstCount-1
add edx,2
cmp edx,ebx
jl loop_begin //continue while edx < dstCount-1
loop_bound:
//exactly one pixel is left when edx == dstCount-1; otherwise nothing remains
cmp edx,ebx
jne loop_bound_end
sar eax,16 //integer source row
imul eax,esi
sar ecx,16 //integer source column
add eax,edi
mov eax,dword ptr [eax+ecx*4] //load the last source pixel
mov ecx,dword ptr [esp+ 4+16] //ecx = pDstLine
mov dword ptr [ecx+edx*4],eax //the trailing pixel is written with an ordinary store
loop_bound_end:
pop ebp
pop edi
pop esi
pop ebx
ret 32 //callee removes the 8 dword arguments
}
}
// Draws Src into Dst rotated by RotaryAngle, scaled by ZoomX/ZoomY and shifted
// by (move_x,move_y). The destination-to-source mapping is expressed as 16.16
// fixed-point coefficients, TRotaryClipData supplies the clipped span of every
// destination row, and each span is copied by PicRotarySSE2_CopyLine.
//   Dst, Src       - destination and source picture regions
//   RotaryAngle    - rotation angle, passed to SinCos
//   ZoomX, ZoomY   - horizontal / vertical scale factors
//   move_x, move_y - translation of the picture inside Dst
// Returns without drawing when the scaled source is negligibly small or when
// the clipper's inti_clip() reports failure.
void PicRotarySSE2(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
    // A scaled size this small is treated as invisible.
    if (fabs(ZoomX*Src.width)<1.0e-4) return;
    if (fabs(ZoomY*Src.height)<1.0e-4) return;

    // Reciprocal zoom factors, obtained with a single division.
    const double invZoomXY=1.0/(ZoomX*ZoomY);
    const double invZoomX=invZoomXY*ZoomY;
    const double invZoomY=invZoomXY*ZoomX;

    double sinA,cosA;
    SinCos(RotaryAngle,sinA,cosA);

    // (centerX,centerY) is the rotation centre: the middle of the source.
    const double centerX=Src.width*0.5;
    const double centerY=Src.height*0.5;

    // 16.16 fixed-point coefficients of the destination-to-source mapping.
    const long Ax_16=(long)(invZoomX*cosA*(1<<16));
    const long Ay_16=(long)(invZoomX*sinA*(1<<16));
    const long Bx_16=(long)(-invZoomY*sinA*(1<<16));
    const long By_16=(long)(invZoomY*cosA*(1<<16));
    const long Cx_16=(long)((-(centerX+move_x)*invZoomX*cosA+(centerY+move_y)*invZoomY*sinA+centerX)*(1<<16));
    const long Cy_16=(long)((-(centerX+move_x)*invZoomX*sinA-(centerY+move_y)*invZoomY*cosA+centerY)*(1<<16));

    // Hand the mapping and both picture sizes to the clipper.
    TRotaryClipData rcData;
    rcData.Ax_16=Ax_16;
    rcData.Ay_16=Ay_16;
    rcData.Bx_16=Bx_16;
    rcData.By_16=By_16;
    rcData.Cx_16=Cx_16;
    rcData.Cy_16=Cy_16;
    rcData.src_width=Src.width;
    rcData.src_height=Src.height;
    rcData.dst_width=Dst.width;
    rcData.dst_height=Dst.height;
    if (!rcData.inti_clip(move_x,move_y)) return;

    // Pass 1: walk downwards from the clipper's starting row.
    TARGB32* pDstLine=(TARGB32*)((TUInt8*)Dst.pdata+(Dst.byte_width*rcData.out_dst_down_y));
    for (;;)
    {
        const long y=rcData.out_dst_down_y;
        if (y>=Dst.height) break;
        if (y>=0)
        {
            const long x0=rcData.get_down_x0();
            PicRotarySSE2_CopyLine(&pDstLine[x0],rcData.get_down_x1()-x0,Ax_16,Ay_16,
                rcData.out_src_x0_16,rcData.out_src_y0_16,Src.pdata,Src.byte_width);
        }
        if (!rcData.next_clip_line_down()) break;
        pDstLine=(TARGB32*)((TUInt8*)pDstLine+Dst.byte_width);
    }

    // Pass 2: walk upwards from the clipper's starting row.
    pDstLine=(TARGB32*)((TUInt8*)Dst.pdata+(Dst.byte_width*rcData.out_dst_up_y));
    for (;;)
    {
        if (!rcData.next_clip_line_up()) break;
        const long y=rcData.out_dst_up_y;
        if (y<0) break;
        pDstLine=(TARGB32*)((TUInt8*)pDstLine-Dst.byte_width);
        if (y<Dst.height)
        {
            const long x0=rcData.get_up_x0();
            PicRotarySSE2_CopyLine(&pDstLine[x0],rcData.get_up_x1()-x0,Ax_16,Ay_16,
                rcData.out_src_x0_16,rcData.out_src_y0_16,Src.pdata,Src.byte_width);
        }
    }

    asm sfence //flush the writes issued by the span copies
}
//速度测试:
//==============================================================================
// PicRotarySSE2 304.2 fps
一张效果图:
//Call parameters used by the benchmark program:
//Performs testcount rotations with random angle, zoom and offset.
//NOTE(review): the call below names PicRotarySSE; presumably the routine being
//measured is substituted here for each benchmark row - confirm.
const long testcount=2000;
long dst_wh=1004;
for (int i=0;i<testcount;++i)
{
//zoom is uniformly distributed over [0.5,1.5] and used for both axes
double zoom=rand()*(1.0/RAND_MAX)+0.5;
//angle: rand()*(PI*2/RAND_MAX), i.e. one full turn assuming PI is the circle constant;
//move_x lies in [-ppicSrc.width,dst_wh], move_y in [-ppicSrc.height,dst_wh].
//The order in which the rand() calls inside this argument list are evaluated is unspecified.
PicRotarySSE(ppicDst,ppicSrc,rand()*(PI*2/RAND_MAX),zoom,zoom,((dst_wh+ppicSrc.width)*rand()*(1.0/RAND_MAX)-ppicSrc.width),(dst_wh+ppicSrc.height)*rand()*(1.0/RAND_MAX)-ppicSrc.height);
}
//ps:如果很多时候源图片绘制时可能落在目标区域的外面,那么需要写一个剪切算法快速排除不必要的绘制
一张测试函数速度的时候生成的图像:
G:旋转测试的结果放到一起:
//注:测试图片都是800*600的图片旋转到1004*1004的图片中心,测试成绩取各个旋转角度的平均速度值
//速度测试: (测试CPU为AMD64x2 4200+(2.37G),单线程)
//==============================================================================
// PicRotary0 34.9 fps
// PicRotary1 62.0 fps
// PicRotary2 134.2 fps
// PicRotary3 280.9 fps
// PicRotarySSE 306.3 fps
// PicRotarySSE2 304.2 fps
//(PicRotarySSE2_Block 316.6 fps (参见《下篇 补充话题》))
补充Intel Core2 4400上的测试成绩:
//速度测试: (测试CPU为Intel Core2 4400(2.00G)单线程)
//==============================================================================
// PicRotary0 58.6 fps
// PicRotary1 82.1 fps
// PicRotary2 167.9 fps
// PicRotary3 334.9 fps
// PicRotarySSE 463.1 fps
// PicRotarySSE2 449.3 fps
//(PicRotarySSE2_Block 351.3 fps (参见《下篇 补充话题》))
//ps:文章的下篇将进一步优化图片旋转的质量(使用二次线性插值、三次卷积插值和MipMap链),并完美地处理边缘的锯齿,并考虑介绍颜色的Alpha Blend混合
(希望读者能在这一系列的文章中不仅能学到旋转和缩放,还能够学到一些优化的基本技巧和思路;也欢迎指出文章中的错误、我没有做到的优化、改进意见等)
本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/housisong/archive/2007/04/27/1586717.aspx