高质量的快速的图像缩放

最新推荐文章于 2024-09-18 08:53:59 发布

citysheep

最新推荐文章于 2024-09-18 08:53:59 发布

阅读量1.4k

点赞数

文章标签： dst 优化测试 table 汇编 opensource

以下汇编部分我编译出来使用时发现图像有黑线，由于不懂汇编所以无法修改。关于缩放以及颜色空间转换的快速算法，大家可以获取VirtualDub的开源代码来研究，里面有全部用汇编完成的代码，性能非常不错。

tag:图像缩放、速度优化、定点数优化、近邻取样插值、二次线性插值、三次线性插值、
MipMap链、三次卷积插值、MMX/SSE优化、CPU缓存优化

摘要:首先给出一个基本的图像缩放算法，然后一步一步的优化其速度和缩放质量；

高质量的快速的图像缩放全文分为:
上篇近邻取样插值和其速度优化
中篇二次线性插值和三次卷积插值
下篇三次线性插值和MipMap链

正文：

为了便于讨论，这里只处理32bit的ARGB颜色；
代码使用C++;涉及到汇编优化的时候假定为x86平台;使用的编译器为vc6;
为了代码的可读性,没有加入异常处理代码;
测试使用的CPU为赛扬2G;

速度测试说明:
只测试内存数据到内存数据的缩放
测试图片都是800*600缩放到1024*768; fps表示每秒钟的帧数,值越大表示函数越快

//Windows GDI相关函数参考速度:
//==============================================================================
// BitBlt 245 fps //is copy 800*600 to 800*600
// BitBlt 159 fps //is copy 1024*768 to 1024*768
// StretchBlt 169 fps //is zoom 800*600 to 1024*768

A: 首先定义图像数据结构:

#define asm __asm

typedef unsigned char TUInt8; // one 8-bit channel value, range [0..255]

// One 32-bit colour value; the four channels are stored in the order B, G, R, A.
struct TARGB32
{
    TUInt8 B; // blue
    TUInt8 G; // green
    TUInt8 R; // red
    TUInt8 A; // alpha
};

struct TPicRegion //Describes one block of pixel data; bundles the fields so they are easy to pass as one parameter
{
TARGB32* pdata; //address of the first pixel of the block
long byte_width; //physical width of one row in bytes (the step from one row to the next);
//abs(byte_width) may be greater than or equal to width*sizeof(TARGB32);
long width; //width in pixels
long height; //height in pixels
};

//Pixel accessor: returns a reference to the pixel at column x, row y of pic.
//The row start is found by stepping y*byte_width bytes from pic.pdata, so the
//row stride is honoured even when it differs from width*sizeof(TARGB32).
inline TARGB32& Pixels(const TPicRegion& pic,const long x,const long y)
{
    TUInt8* row_bytes=(TUInt8*)pic.pdata+pic.byte_width*y;
    TARGB32* row=(TARGB32*)row_bytes;
    return row[x];
}

B: 缩放原理和公式图示:

缩放后图片          原图片
(宽DW,高DH)         (宽SW,高SH)

(Sx-0)/(SW-0)=(Dx-0)/(DW-0) (Sy-0)/(SH-0)=(Dy-0)/(DH-0)
=> Sx=Dx*SW/DW Sy=Dy*SH/DH

C: 缩放算法的一个参考实现

//The simplest possible zoom function (nearest-neighbour sampling); it is
//deliberately written to be slow so the later versions have a baseline.
//Src.pdata points at the source pixels, Dst.pdata at the destination pixels.
//Scales the Src.width*Src.height picture into the Dst.width*Dst.height region.
//
//Dst: destination region; every pixel in [0,Dst.width) x [0,Dst.height) is written.
//Src: source region; sampled at (x*Src.width/Dst.width, y*Src.height/Dst.height).
void PicZoom0(const TPicRegion& Dst,const TPicRegion& Src)
{
    //x is the outer loop and y the inner one, so consecutive iterations move
    //from one row to the next instead of along a row.
    for (long x=0;x<Dst.width;++x)
    {
        for (long y=0;y<Dst.height;++y)
        {
            //Sx=Dx*SW/DW, Sy=Dy*SH/DH, recomputed with a division for every pixel.
            long srcx=(x*Src.width/Dst.width);
            long srcy=(y*Src.height/Dst.height);
            Pixels(Dst,x,y)=Pixels(Src,srcx,srcy);
        }
    }
}

//速度测试:
//==============================================================================
// PicZoom0 6 fps

D: 优化PicZoom0函数

a.PicZoom0函数并没有按照颜色数据在内存中的排列顺序读写(内部循环递增y行
索引)，将造成CPU缓存预读失败和内存颠簸导致巨大的性能损失,(很多硬件都有这种特性,
包括缓存、内存、显存、硬盘等,优化顺序访问，随机访问时会造成巨大的性能损失)
所以先交换x,y循环的顺序:

//Nearest-neighbour zoom of Src into Dst; same mapping as the x-outer baseline
//version, but with the loops swapped so the destination is written row by row.
//
//Dst: destination region; every pixel in [0,Dst.width) x [0,Dst.height) is written.
//Src: source region; sampled at (x*Src.width/Dst.width, y*Src.height/Dst.height).
void PicZoom1(const TPicRegion& Dst,const TPicRegion& Src)
{
    for (long y=0;y<Dst.height;++y)
    {
        //The inner loop walks along one row, i.e. in increasing address order
        //within the destination row.
        for (long x=0;x<Dst.width;++x)
        {
            long srcx=(x*Src.width/Dst.width);
            long srcy=(y*Src.height/Dst.height);
            Pixels(Dst,x,y)=Pixels(Src,srcx,srcy);
        }
    }
}

//速度测试:
//==============================================================================
// PicZoom1 23 fps

b.“(x*Src.width/Dst.width)”表达式中有一个除法运算，它属于很慢的操作(比一般
的加减运算慢几十倍!)，可以使用定点数的方法来优化它；

//Nearest-neighbour zoom of Src into Dst that replaces the per-pixel division
//with a multiplication by a 16.16 fixed-point ratio followed by a shift.
//
//Dst: destination region; every pixel in [0,Dst.width) x [0,Dst.height) is written.
//Src: source region; sampled at ((x*xrIntFloat_16)>>16, (y*yrIntFloat_16)>>16).
//Returns immediately when the destination has zero width or height (this also
//avoids dividing by zero when the ratios are computed).
void PicZoom2(const TPicRegion& Dst,const TPicRegion& Src)
{
    if ((0==Dst.width)||(0==Dst.height)) return;
    //Largest picture size the author states this function can handle: 65536*65536
    unsigned long xrIntFloat_16=(Src.width<<16)/Dst.width+1; //16.16 fixed-point ratio
    unsigned long yrIntFloat_16=(Src.height<<16)/Dst.height+1; //16.16 fixed-point ratio
    //The last sampled column stays inside the source only while
    //  (Dst.width-1)*xrIntFloat_16 < (Src.width<<16)
    //(and likewise for rows). The "+1" bias breaks this for extreme
    //magnification, e.g. Src.width=1, Dst.width=257 gives xrIntFloat_16=256
    //and ((257-1)*256)>>16 == 1.
    //NOTE(review): callers that magnify by such factors must check this bound.
    for (unsigned long y=0;y<Dst.height;++y)
    {
        for (unsigned long x=0;x<Dst.width;++x)
        {
            unsigned long srcx=(x*xrIntFloat_16)>>16;
            unsigned long srcy=(y*yrIntFloat_16)>>16;
            Pixels(Dst,x,y)=Pixels(Src,srcx,srcy);
        }
    }
}

//速度测试:
//==============================================================================
// PicZoom2 86 fps

c. 在x的循环中y一直不变，那么可以提前计算与y相关的值; 1.可以发现srcy的值和x变量无关，可以提前到x轴循环之前；2.展开Pixels函数，优化与y相关的指针计算；

//Nearest-neighbour zoom of Src into Dst using 16.16 fixed-point stepping.
//Everything that depends only on y (the source row pointer) is computed once
//per row, and the pixel accessor is expanded into running pointers/counters.
//
//Dst: destination region; dst_width pixels are written in each of Dst.height rows.
//Src: source region; row (srcy_16>>16), column (srcx_16>>16) is read.
//Returns immediately when the destination has zero width or height.
void PicZoom3(const TPicRegion& Dst,const TPicRegion& Src)
{
    if ((0==Dst.width)||(0==Dst.height)) return;
    unsigned long xrIntFloat_16=(Src.width<<16)/Dst.width+1; //16.16 fixed-point x ratio
    unsigned long yrIntFloat_16=(Src.height<<16)/Dst.height+1; //16.16 fixed-point y ratio
    unsigned long dst_width=Dst.width;
    TARGB32* pDstLine=Dst.pdata;
    unsigned long srcy_16=0; //source row position, 16.16 fixed point
    for (unsigned long y=0;y<Dst.height;++y)
    {
        //The (long) cast keeps the row offset signed, so a negative byte_width
        //is not converted to unsigned long before it is added to the pointer.
        TARGB32* pSrcLine=((TARGB32*)((TUInt8*)Src.pdata+Src.byte_width*(long)(srcy_16>>16)));
        unsigned long srcx_16=0; //source column position, 16.16 fixed point
        for (unsigned long x=0;x<dst_width;++x)
        {
            pDstLine[x]=pSrcLine[srcx_16>>16];
            srcx_16+=xrIntFloat_16;
        }
        srcy_16+=yrIntFloat_16;
        //Advance the destination pointer by one row stride, measured in bytes.
        ((TUInt8*&)pDstLine)+=Dst.byte_width;
    }
}

//速度测试:
//==============================================================================
// PicZoom3 183 fps

d.定点数优化使函数能够处理的最大图片尺寸和缩放结果(肉眼不可察觉的误差)受到了一
定的影响,这里给出一个使用浮点运算的版本,可以在有这种需求的场合使用:

//Floating-point variant of the row-stepping nearest-neighbour zoom.
//The source position is accumulated in double precision and converted to an
//index with the x87 "fist" instruction; the FPU control word is edited on
//entry and restored on exit.
//
//Dst: destination region; dst_width pixels are written in each of Dst.height rows.
//Src: source region; row (long)srcy, column srcx (stored by fist) is read.
//Returns immediately when the destination has zero width or height.
void PicZoom3_float(const TPicRegion& Dst,const TPicRegion& Src)
{
    //Note: this function requires an FPU (and the MSVC x86 inline assembler).

    if ((0==Dst.width)||(0==Dst.height)) return;
    double xrFloat=1.000000001/((double)Dst.width/Src.width);
    double yrFloat=1.000000001/((double)Dst.height/Src.height);

    unsigned short RC_Old;
    unsigned short RC_Edit;
    asm //change the FPU rounding mode so that fist can be used directly
    {
        FNSTCW RC_Old // save the FPU control word; used to restore it at the end
        FNSTCW RC_Edit // save a second copy; this one gets edited
        FWAIT
        OR RC_Edit, 0x0F00 // set RC=11 so the FPU rounds toward zero
                           // NOTE(review): 0x0F00 covers bits 8-11, so bits 8-9 are
                           // set as well as the RC field - confirm this is intended
        FLDCW RC_Edit // load the edited control word
    }

    unsigned long dst_width=Dst.width;
    TARGB32* pDstLine=Dst.pdata;
    double srcy=0;
    for (unsigned long y=0;y<Dst.height;++y)
    {
        TARGB32* pSrcLine=((TARGB32*)((TUInt8*)Src.pdata+Src.byte_width*((long)srcy)));
        /* Plain C++ form of the inner loop, kept for reference:
        double srcx=0;
        for (unsigned long x=0;x<dst_width;++x)
        {
            pDstLine[x]=pSrcLine[(unsigned long)srcx];//the default float-to-integer
                       //conversion is very slow, which is why the inline
                       //assembly below drives the FPU directly.
            srcx+=xrFloat;
        }*/
        asm fld xrFloat //st0==xrFloat
        asm fldz //st0==0 st1==xrFloat
        unsigned long srcx=0;
        for (long x=0;x<dst_width;++x)
        {
            asm fist dword ptr srcx //srcx=(long)st0
            pDstLine[x]=pSrcLine[srcx];
            asm fadd st,st(1) //st0+=st1 st1==xrFloat
        }
        //Pop the two values pushed before the loop.
        asm fstp st
        asm fstp st

        srcy+=yrFloat;
        //Advance the destination pointer by one row stride, measured in bytes.
        ((TUInt8*&)pDstLine)+=Dst.byte_width;
    }

    asm //restore the FPU control word saved on entry
    {
        FWAIT
        FLDCW RC_Old
    }
}

//速度测试:
//==============================================================================
// PicZoom3_float 178 fps

e.注意到这样一个事实:每一行的缩放比例是固定的;那么可以预先建立一个缩放映射表格
来处理缩放映射算法(PicZoom3_Table和PicZoom3_float的实现等价);

//Nearest-neighbour zoom of Src into Dst driven by a lookup table.
//The x mapping is the same for every row, so the source column for each
//destination column is computed once and reused for all rows.
//
//Dst: destination region; dst_width pixels are written in each of Dst.height rows.
//Src: source region; row y*Src.height/Dst.height, column SrcX_Table[x] is read.
//Returns immediately when the destination has zero width or height.
//The table is allocated with new[] and released before returning; allocation
//failure is not handled here.
void PicZoom3_Table(const TPicRegion& Dst,const TPicRegion& Src)
{
    if ((0==Dst.width)||(0==Dst.height)) return;
    unsigned long dst_width=Dst.width;

    //SrcX_Table[x] = source column sampled for destination column x.
    unsigned long* SrcX_Table = new unsigned long[dst_width];
    for (unsigned long x=0;x<dst_width;++x)
    {
        SrcX_Table[x]=(x*Src.width/Dst.width);
    }

    TARGB32* pDstLine=Dst.pdata;
    for (unsigned long y=0;y<Dst.height;++y)
    {
        unsigned long srcy=(y*Src.height/Dst.height);
        //The (long) cast keeps the row offset signed, so a negative byte_width
        //is not converted to unsigned long before it is added to the pointer.
        TARGB32* pSrcLine=((TARGB32*)((TUInt8*)Src.pdata+Src.byte_width*(long)srcy));
        for (unsigned long x=0;x<dst_width;++x)
            pDstLine[x]=pSrcLine[SrcX_Table[x]];
        //Advance the destination pointer by one row stride, measured in bytes.
        ((TUInt8*&)pDstLine)+=Dst.byte_width;
    }

    delete [] SrcX_Table;
}

//速度测试:
//==============================================================================
// PicZoom3_Table 180 fps

f.为了加快缩放，可以采用根据缩放比例动态生成函数的方式来得到更快的缩放函数；这
有点像编译器的工作原理；要实现它需要的工作量比较大(或比较晦涩)就不再实现了；
(动态生成是一种不错的思路，但个人觉得对于缩放，实现它的必要性不大)

g.现代CPU中，在读取数据和写入数据时，都有自动的缓存机制；很容易知道，算法中生
成的数据不会很快再次使用，所以不需要写入缓存的帮助；在SSE指令集中增加了movntq
等指令来完成这个功能；
(尝试过利用CPU显式prefetcht0、prefetchnta预读指令或直接的mov读取指令等速度反
而略有下降:( 但预读在copy算法中速度优化效果很明显 )

//Nearest-neighbour zoom of Src into Dst with the per-row pixel loop written in
//x86 inline assembly (MSVC syntax). Four destination pixels are produced per
//iteration and stored with movntq; the remaining 0..3 pixels of each row are
//copied by a scalar loop.
//
//Dst: destination region; dst_width pixels are written in each of Dst.height rows.
//Src: source region; row (srcy_16>>16), column (srcx_16>>16) is read.
//Returns immediately when the destination has zero width or height.
void PicZoom3_SSE(const TPicRegion& Dst,const TPicRegion& Src)
{
    //Warning: the CPU must support MMX and the movntq instruction

    if ((0==Dst.width)||(0==Dst.height)) return;
    unsigned long xrIntFloat_16=(Src.width<<16)/Dst.width+1; //16.16 fixed-point x ratio
    unsigned long yrIntFloat_16=(Src.height<<16)/Dst.height+1; //16.16 fixed-point y ratio

    unsigned long dst_width=Dst.width;
    TARGB32* pDstLine=Dst.pdata;
    unsigned long srcy_16=0; //source row position, 16.16 fixed point
    for (unsigned long y=0;y<Dst.height;++y)
    {
        TARGB32* pSrcLine=((TARGB32*)((TUInt8*)Src.pdata+Src.byte_width*(srcy_16>>16)));

        asm
        {
            push ebp //ebp is reused as srcx_16 below; all locals are loaded before it is cleared
            mov esi,pSrcLine
            mov edi,pDstLine
            mov edx,xrIntFloat_16
            mov ecx,dst_width
            xor ebp,ebp //srcx_16=0

            and ecx, (not 3) //unrolled 4x: round the count down to a multiple of 4
            TEST ECX,ECX //nop
            jle EndWriteLoop

            //Point edi past the unrolled part and count ecx up from -count to 0.
            lea edi,[edi+ecx*4]
            neg ecx

            //todo: prefetch

        WriteLoop:
            //Pixels 0 and 1: mm0 = { src[srcx_16>>16], src[(srcx_16+xr)>>16] }
            mov eax,ebp
            shr eax,16 //srcx_16>>16
            lea ebx,[ebp+edx]
            movd mm0,[esi+eax*4]
            shr ebx,16 //srcx_16>>16
            PUNPCKlDQ mm0,[esi+ebx*4]
            lea ebp,[ebp+edx*2] //srcx_16 += 2*xrIntFloat_16

            // movntq qword ptr [edi+ecx*4], mm0 //store that does not go through the cache
            asm _emit 0x0F asm _emit 0xE7 asm _emit 0x04 asm _emit 0x8F

            //Pixels 2 and 3, built the same way in mm1.
            mov eax,ebp
            shr eax,16 //srcx_16>>16
            lea ebx,[ebp+edx]
            movd mm1,[esi+eax*4]
            shr ebx,16 //srcx_16>>16
            PUNPCKlDQ mm1,[esi+ebx*4]
            lea ebp,[ebp+edx*2] //srcx_16 += 2*xrIntFloat_16

            // movntq qword ptr [edi+ecx*4+8], mm1 //store that does not go through the cache
            asm _emit 0x0F asm _emit 0xE7 asm _emit 0x4C asm _emit 0x8F asm _emit 0x08

            add ecx, 4
            jnz WriteLoop

            //sfence //flush the writes
            emms
        EndWriteLoop:

            mov ebx,ebp //carry srcx_16 over in ebx before ebp is restored
            pop ebp

            //Border handling: the loop below runs 0,1,2 or 3 times
            //(it could be unrolled into a jump table; omitted)
            mov ecx,dst_width
            and ecx,3
            TEST ECX,ECX
            jle EndLineZoom

            lea edi,[edi+ecx*4]
            neg ecx
        StartBorder:
            mov eax,ebx
            shr eax,16 //srcx_16>>16
            mov eax,[esi+eax*4]
            mov [edi+ecx*4],eax
            add ebx,edx

            inc ECX
            JNZ StartBorder
        EndLineZoom:
        }

        //
        srcy_16+=yrIntFloat_16;
        //Advance the destination pointer by one row stride, measured in bytes.
        ((TUInt8*&)pDstLine)+=Dst.byte_width;
    }
}

//速度测试:
//==============================================================================
// PicZoom3_SSE 301--330 fps (fps在一个范围是因为movntq指令受内存对齐的影响)

E: 缩放效果图：

原图

放大图(x轴放大8倍,y轴放大12倍)

原图

缩小图(缩小到0.66倍)

放大图(放大到1.6倍)

F: 把测试成绩放在一起：

//zoom 800*600 to 1024*768
//==============================================================================
// BitBlt 245 fps //is copy 800*600 to 800*600
// BitBlt 159 fps //is copy 1024*768 to 1024*768
// StretchBlt 169 fps
//
// PicZoom0 6 fps
// PicZoom1 23 fps
// PicZoom2 86 fps
// PicZoom3 183 fps
// PicZoom3_float 178 fps
// PicZoom3_Table 180 fps
// PicZoom3_SSE 301--330 fps (fps在一个范围是因为movntq指令受内存对齐的影响)