文章来源:http://blog.csdn.net/housisong/archive/2009/02/12/3884368.aspx
图形图像处理-之-彩色转化到灰度的速度优化
HouSisong@GMail.com 2009.02.08
tag:灰度算法,速度优化,定点数优化,MMX,SSE,SSE2,CPU缓存优化
摘要:
彩色转化到灰度的速度优化文章包括图形图像处理简单Demo框架和灰度转换的实
现及其速度优化,并演示其使用SIMD指令集的优化;
本篇文章将第一次提供完整的可以编译的图像处理完整项目代码;
(以后会用这个框架逐步改写以前的图形图像处理文章)
正文:
为了便于讨论,这里只处理32bit的ARGB颜色;代码使用C++;使用的编译器为vc2008;
(经过测试代码也可以在DevC++和xcode下编译通过) 测试使用的CPU为AMD64x2 4200+(2.33G);
速度测试说明:
只测试内存数据到内存数据的ARGB32颜色的灰度转化;
测试图片是800*600; fps表示每秒钟的帧数,值越大表示函数越快;
A: 图形图像处理简单Demo框架
我以前写的图形图像处理方面的blog文章都没有完整的可以编译运行的代码,
而仅仅列出了关键的核心代码;经常有网友看了我的文章,但因为不能实际运行看看,
从而对代码的理解不深,也不能把代码移植到自己的项目中使用; 所以决定为我的图形
图像处理系列blog文章建立一个简单的小型的框架;我把它命名为hGraphic32,
它会尽量的小,演示为主,仅支持ARGB32颜色,能够加载和保存bmp图片文件,能够在
多个编译器和平台下编译和运行;
现在就下载完整项目源代码吧: 完整项目源代码
<hGraphic32>文件夹里的文件说明:
"hColor32.h" : 里面定义了32bitARGB颜色类型Color32,它占用4字节,代表一个颜色;
TPixels32Ref是图像数据区的描述信息,可以把它理解为一个"指针",指向了Color32构成的像素区;
IPixels32Buf是图像数据区接口,用于描述一个图像的缓冲区;
"hPixels32.h" : 里面定义了TPixels32类,它实现了IPixels32Buf接口,用于申请和管理一块内存像素;
"hStream.h" : 里面定义了IInputStream输入流接口;
IBufInputStream数据区输入流接口,继承自IInputStream;
TFileInputStream文件输入流类,它实现了IBufInputStream接口;
IOutputStream输出流接口;
TFileOutputStream文件输出流类,它实现了IOutputStream接口;
"hBmpFile.h" : 里面定义了TBmpFile类,它负责加载bmp和保存bmp;
"hGraphic32.h" 文件include了上面的*.h头文件,所以使用的时候,只要#include "hGraphic32.h"就可以了
B: 灰度转化项目
所有的转换和测试代码都在"ColorToGray/ColorToGray.cpp"文件中(带有main函数的命令行程序);
"ColorToGray/win_vc/ColorToGray.sln"是windows系统下的vc2008项目文件(测试的时请设定调试运行目录为"..");
"ColorToGray/win_DevC++/ColorToGray.dev"是windows系统下的DevC++项目文件;
"ColorToGray/macosx_xcode/ColorToGray.xcodeproj"是macosx系统下的xcode项目文件;
你也可以自己建立项目,包含ColorToGray.cpp文件和<hGraphic32>文件夹下的所有文件,就可以编译了;
C: 灰度转化公式和代码实现
文章中用的灰度公式: Gray = R*0.299 + G*0.587 + B*0.114;
代码实现:
view plaincopy to clipboardprint?
// Gray conversion weights for the formula Gray = R*0.299 + G*0.587 + B*0.114.
// The fixed-point variants in this file derive their integer weights from these values.
const double gray_r_coeff=0.299; // weight of the red channel
const double gray_g_coeff=0.587; // weight of the green channel
const double gray_b_coeff=0.114; // weight of the blue channel
// Converts a single pixel to its gray level in floating point.
// Returns src.r*gray_r_coeff + src.g*gray_g_coeff + src.b*gray_b_coeff as a double.
// The alpha channel is not read and no rounding is applied here.
must_inline double toGray_float(const Color32& src){
return (src.r*gray_r_coeff +src.g*gray_g_coeff +src.b*gray_b_coeff);
}
//处理一行
void colorToGrayLine_float(const Color32* src,Color32* dst,long width){
for (long x = 0; x < width; ++x){
int gray=(int)toGray_float(src[x]);
dst[x]=Color32(gray,gray,gray,src[x].a);//R,G,B都设置为相同的亮度值,A不变
}
}
void colorToGray_float(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_float(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
}
// Gray conversion weights for the formula Gray = R*0.299 + G*0.587 + B*0.114.
// The fixed-point variants in this file derive their integer weights from these values.
const double gray_r_coeff=0.299; // weight of the red channel
const double gray_g_coeff=0.587; // weight of the green channel
const double gray_b_coeff=0.114; // weight of the blue channel
// Converts a single pixel to its gray level in floating point.
// Returns src.r*gray_r_coeff + src.g*gray_g_coeff + src.b*gray_b_coeff as a double.
// The alpha channel is not read and no rounding is applied here.
must_inline double toGray_float(const Color32& src){
return (src.r*gray_r_coeff +src.g*gray_g_coeff +src.b*gray_b_coeff);
}
//处理一行
void colorToGrayLine_float(const Color32* src,Color32* dst,long width){
for (long x = 0; x < width; ++x){
int gray=(int)toGray_float(src[x]);
dst[x]=Color32(gray,gray,gray,src[x].a);//R,G,B都设置为相同的亮度值,A不变
}
}
void colorToGray_float(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_float(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
}
//速度测试
//==============================================================================
// colorToGray_float 145.49 FPS
D: 将浮点运算转化为定点数(整数)运算
view plaincopy to clipboardprint?
// Gray level of one pixel computed in fixed point with 16 fractional bits.
// The blue weight is derived from the other two so that the three integer
// weights sum to exactly 1<<16; the result is therefore a weighted average of
// the channels. The final shift discards the fraction (no rounding).
must_inline int toGray_int16(const Color32& src){
    const long fracBits=16;
    const int fixedOne=1<<fracBits;
    // Adding 0.4999999 before the truncating cast rounds the weight to nearest.
    const int weightR=(int)( gray_r_coeff*fixedOne+0.4999999 );
    const int weightG=(int)( gray_g_coeff*fixedOne+0.4999999 );
    const int weightB=fixedOne-weightR-weightG;
    return (src.r*weightR +src.g*weightG +src.b*weightB) >> fracBits;
}
inline void colorToGrayLine_int16(const Color32* src,Color32* dst,long width){
for (long x = 0; x < width; ++x){
int gray=toGray_int16(src[x]);
dst[x]=Color32(gray,gray,gray,src[x].a);
}
}
colorToGray_int16(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_int16(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
// Gray level of one pixel computed in fixed point with 16 fractional bits.
// The blue weight is derived from the other two so that the three integer
// weights sum to exactly 1<<16; the result is therefore a weighted average of
// the channels. The final shift discards the fraction (no rounding).
must_inline int toGray_int16(const Color32& src){
    const long fracBits=16;
    const int fixedOne=1<<fracBits;
    // Adding 0.4999999 before the truncating cast rounds the weight to nearest.
    const int weightR=(int)( gray_r_coeff*fixedOne+0.4999999 );
    const int weightG=(int)( gray_g_coeff*fixedOne+0.4999999 );
    const int weightB=fixedOne-weightR-weightG;
    return (src.r*weightR +src.g*weightG +src.b*weightB) >> fracBits;
}
inline void colorToGrayLine_int16(const Color32* src,Color32* dst,long width){
for (long x = 0; x < width; ++x){
int gray=toGray_int16(src[x]);
dst[x]=Color32(gray,gray,gray,src[x].a);
}
}
void colorToGray_int16(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_int16(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
}
//速度测试
//==============================================================================
// colorToGray_int16 355.33 FPS
E: 做一个简单的循环代码展开
view plaincopy to clipboardprint?
//四路展开
void colorToGrayLine_int16_expand4(const Color32* src,Color32* dst,long width){
long widthFast=width>>2<<2;
for (long x = 0; x < widthFast; x+=4){
int gray0=toGray_int16(src[x ]);
int gray1=toGray_int16(src[x+1]);
dst[x ]=Color32(gray0,gray0,gray0,src[x ].a);
dst[x+1]=Color32(gray1,gray1,gray1,src[x+1].a);
int gray2=toGray_int16(src[x+2]);
int gray3=toGray_int16(src[x+3]);
dst[x+2]=Color32(gray2,gray2,gray2,src[x+2].a);
dst[x+3]=Color32(gray3,gray3,gray3,src[x+3].a);
}
//border
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
colorToGray_int16_expand4(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_int16_expand4(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
//四路展开
void colorToGrayLine_int16_expand4(const Color32* src,Color32* dst,long width){
long widthFast=width>>2<<2;
for (long x = 0; x < widthFast; x+=4){
int gray0=toGray_int16(src[x ]);
int gray1=toGray_int16(src[x+1]);
dst[x ]=Color32(gray0,gray0,gray0,src[x ].a);
dst[x+1]=Color32(gray1,gray1,gray1,src[x+1].a);
int gray2=toGray_int16(src[x+2]);
int gray3=toGray_int16(src[x+3]);
dst[x+2]=Color32(gray2,gray2,gray2,src[x+2].a);
dst[x+3]=Color32(gray3,gray3,gray3,src[x+3].a);
}
//border
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_int16_expand4(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_int16_expand4(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
}
//速度测试
//==============================================================================
// colorToGray_int16_expand4 413.22 FPS
F: 一个特别的版本
在高级语言范围内进行单条指令多数据流计算,减少需要的乘法量;
在乘法运算代价比较高昂的cpu上应该效果不错; (x86上速度可能慢)
view plaincopy to clipboardprint?
// Computes the gray levels of two adjacent pixels with only three multiplications.
// For each channel, the value of src2Color[0] is placed in the low 16 bits of a
// 32-bit word and the value of src2Color[1] in the high 16 bits, so a single
// multiplication by an 8-bit fixed-point weight scales both pixels at once.
// In the returned word, bits 8..15 hold the gray level of src2Color[0] and
// bits 24..31 hold the gray level of src2Color[1]. The weights sum to 1<<8, so
// the low half cannot carry into the high half (assuming 8-bit channel values).
must_inline UInt32 toGray_int8_opMul(const Color32* src2Color){
    const UInt32 weightR=(UInt32)( gray_r_coeff*(1<<8)+0.4999999);
    const UInt32 weightG=(UInt32)( gray_g_coeff*(1<<8)+0.4999999);
    const UInt32 weightB=(1<<8)-weightR-weightG;
    const Color32& first =src2Color[0];
    const Color32& second=src2Color[1];
    const UInt32 packedB=first.b | (second.b<<16);
    const UInt32 packedG=first.g | (second.g<<16);
    const UInt32 packedR=first.r | (second.r<<16);
    return packedB*weightB + packedG*weightG + packedR*weightR;
}
void colorToGrayLine_int8_opMul(const Color32* src,Color32* dst,long width){
long widthFast=width>>2<<2;
for (long x = 0; x < widthFast; x+=4){
UInt32 gray01=toGray_int8_opMul(&src[x ]);
int gray0=(gray01&0x0000FF00)>>8;
int gray1=gray01>>24;
dst[x ]=Color32(gray0,gray0,gray0,src[x ].a);
dst[x+1]=Color32(gray1,gray1,gray1,src[x+1].a);
UInt32 gray23=toGray_int8_opMul(&src[x+2]);
int gray2=(gray23&0x0000FF00)>>8;
int gray3=gray23>>24;
dst[x+2]=Color32(gray2,gray2,gray2,src[x+2].a);
dst[x+3]=Color32(gray3,gray3,gray3,src[x+3].a);
}
//border
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
colorToGray_int8_opMul(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_int8_opMul(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
// Computes the gray levels of two adjacent pixels with only three multiplications.
// For each channel, the value of src2Color[0] is placed in the low 16 bits of a
// 32-bit word and the value of src2Color[1] in the high 16 bits, so a single
// multiplication by an 8-bit fixed-point weight scales both pixels at once.
// In the returned word, bits 8..15 hold the gray level of src2Color[0] and
// bits 24..31 hold the gray level of src2Color[1]. The weights sum to 1<<8, so
// the low half cannot carry into the high half (assuming 8-bit channel values).
must_inline UInt32 toGray_int8_opMul(const Color32* src2Color){
    const UInt32 weightR=(UInt32)( gray_r_coeff*(1<<8)+0.4999999);
    const UInt32 weightG=(UInt32)( gray_g_coeff*(1<<8)+0.4999999);
    const UInt32 weightB=(1<<8)-weightR-weightG;
    const Color32& first =src2Color[0];
    const Color32& second=src2Color[1];
    const UInt32 packedB=first.b | (second.b<<16);
    const UInt32 packedG=first.g | (second.g<<16);
    const UInt32 packedR=first.r | (second.r<<16);
    return packedB*weightB + packedG*weightG + packedR*weightR;
}
void colorToGrayLine_int8_opMul(const Color32* src,Color32* dst,long width){
long widthFast=width>>2<<2;
for (long x = 0; x < widthFast; x+=4){
UInt32 gray01=toGray_int8_opMul(&src[x ]);
int gray0=(gray01&0x0000FF00)>>8;
int gray1=gray01>>24;
dst[x ]=Color32(gray0,gray0,gray0,src[x ].a);
dst[x+1]=Color32(gray1,gray1,gray1,src[x+1].a);
UInt32 gray23=toGray_int8_opMul(&src[x+2]);
int gray2=(gray23&0x0000FF00)>>8;
int gray3=gray23>>24;
dst[x+2]=Color32(gray2,gray2,gray2,src[x+2].a);
dst[x+3]=Color32(gray3,gray3,gray3,src[x+3].a);
}
//border
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_int8_opMul(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_int8_opMul(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
}
//速度测试
//==============================================================================
// colorToGray_int8_opMul 387.97 FPS
G: 内联汇编的MMX实现版本
注意:这里的MMX代码都只支持x86CPU(奔腾MMX以上CPU);
在x64下不再有MMX寄存器,而应该使用SSE的XMM寄存器;
而且在x64模式下vc2008编译器还没有提供内联汇编的直接支持,而必须使用函数指令方式的实现;
GCC编译器也支持内联汇编模式,但是汇编语法不同,请参考相应的说明;
view plaincopy to clipboardprint?
// Converts one row of ARGB32 pixels to gray with inline MMX assembly,
// two pixels (8 bytes) per loop iteration.
//   src   : first source pixel of the row
//   dst   : first destination pixel of the row
//   width : number of pixels to convert
// The weights are 7-bit fixed point (38, 75, 15 for R, G, B; they sum to 128).
// An odd trailing pixel is handled by colorToGrayLine_int16.
// The asm addresses memory through 32-bit registers (eax/ecx/edx).
// This function does not execute emms itself; colorToGray_MMX does so after
// the last row.
void colorToGrayLine_MMX(const Color32* src,Color32* dst,long width){
//const UInt32 gray_r_coeff_7=(UInt32)( gray_r_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_g_coeff_7=(UInt32)( gray_g_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_b_coeff_7=(1<<7)-gray_r_coeff_7-gray_g_coeff_7;
// csMMX_rgb_coeff_w= short[ 0 , gray_r_coeff_7 , gray_g_coeff_7 , gray_b_coeff_7 ]
const UInt64 csMMX_rgb_coeff_w = (((UInt64)0x00000026)<<32) | 0x004b000f;
// Number of pixels covered by the two-at-a-time loop (width rounded down to even).
long widthFast=width>>1<<1;
if (widthFast>0){
asm{
pcmpeqb mm5,mm5 // FF FF FF FF FF FF FF FF
mov ecx,widthFast
pxor mm7,mm7 // 00 00 00 00 00 00 00 00
pcmpeqb mm4,mm4 // FF FF FF FF FF FF FF FF
mov eax,src
mov edx,dst
movq mm6,csMMX_rgb_coeff_w
psrlw mm5,15 // 1 1 1 1
lea eax,[eax+ecx*4] // eax = end of the fast part of the source row
lea edx,[edx+ecx*4] // edx = end of the fast part of the destination row
pslld mm4,24 // FF 00 00 00 FF 00 00 00
neg ecx // ecx = -widthFast; it counts up towards zero
loop_beign:
movq mm0,[eax+ecx*4] // A1 R1 G1 B1 A0 R0 G0 B0
movq mm1,mm0
movq mm3,mm0
punpcklbw mm0,mm7 // 00 A0 00 R0 00 G0 00 B0
punpckhbw mm1,mm7 // 00 A1 00 R1 00 G1 00 B1
pmaddwd mm0,mm6 // R0*r_coeff G0*g_coeff+B0*b_coeff
pmaddwd mm1,mm6 // R1*r_coeff G1*g_coeff+B1*b_coeff
pand mm3,mm4 // A1 00 00 00 A0 00 00 00
packssdw mm0,mm1 // sR1 sG1+sB1 sR0 sG0+sB0
pmaddwd mm0,mm5 // sR1+sG1+sB1 sR0+sG0+sB0
psrld mm0,7 // 00 00 00 Gray1 00 00 00 Gray0
movq mm1,mm0
movq mm2,mm0
pslld mm1,8 // 00 00 Gray1 00 00 00 Gray0 00
por mm0,mm3
pslld mm2,16 // 00 Gray1 00 00 00 Gray0 00 00
por mm0,mm1
por mm0,mm2 // A1 Gray1 Gray1 Gray1 A0 Gray0 Gray0 Gray0
movq [edx+ecx*4],mm0
add ecx,2 // advance by two pixels; the loop ends when ecx reaches zero
jnz loop_beign
}
}
// border: convert the odd trailing pixel, if any, with the scalar version
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_MMX(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_MMX(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
asm{
emms //MMX使用结束
}
}
// Converts one row of ARGB32 pixels to gray with inline MMX assembly,
// two pixels (8 bytes) per loop iteration.
//   src   : first source pixel of the row
//   dst   : first destination pixel of the row
//   width : number of pixels to convert
// The weights are 7-bit fixed point (38, 75, 15 for R, G, B; they sum to 128).
// An odd trailing pixel is handled by colorToGrayLine_int16.
// The asm addresses memory through 32-bit registers (eax/ecx/edx).
// This function does not execute emms itself; colorToGray_MMX does so after
// the last row.
void colorToGrayLine_MMX(const Color32* src,Color32* dst,long width){
//const UInt32 gray_r_coeff_7=(UInt32)( gray_r_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_g_coeff_7=(UInt32)( gray_g_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_b_coeff_7=(1<<7)-gray_r_coeff_7-gray_g_coeff_7;
// csMMX_rgb_coeff_w= short[ 0 , gray_r_coeff_7 , gray_g_coeff_7 , gray_b_coeff_7 ]
const UInt64 csMMX_rgb_coeff_w = (((UInt64)0x00000026)<<32) | 0x004b000f;
// Number of pixels covered by the two-at-a-time loop (width rounded down to even).
long widthFast=width>>1<<1;
if (widthFast>0){
asm{
pcmpeqb mm5,mm5 // FF FF FF FF FF FF FF FF
mov ecx,widthFast
pxor mm7,mm7 // 00 00 00 00 00 00 00 00
pcmpeqb mm4,mm4 // FF FF FF FF FF FF FF FF
mov eax,src
mov edx,dst
movq mm6,csMMX_rgb_coeff_w
psrlw mm5,15 // 1 1 1 1
lea eax,[eax+ecx*4] // eax = end of the fast part of the source row
lea edx,[edx+ecx*4] // edx = end of the fast part of the destination row
pslld mm4,24 // FF 00 00 00 FF 00 00 00
neg ecx // ecx = -widthFast; it counts up towards zero
loop_beign:
movq mm0,[eax+ecx*4] // A1 R1 G1 B1 A0 R0 G0 B0
movq mm1,mm0
movq mm3,mm0
punpcklbw mm0,mm7 // 00 A0 00 R0 00 G0 00 B0
punpckhbw mm1,mm7 // 00 A1 00 R1 00 G1 00 B1
pmaddwd mm0,mm6 // R0*r_coeff G0*g_coeff+B0*b_coeff
pmaddwd mm1,mm6 // R1*r_coeff G1*g_coeff+B1*b_coeff
pand mm3,mm4 // A1 00 00 00 A0 00 00 00
packssdw mm0,mm1 // sR1 sG1+sB1 sR0 sG0+sB0
pmaddwd mm0,mm5 // sR1+sG1+sB1 sR0+sG0+sB0
psrld mm0,7 // 00 00 00 Gray1 00 00 00 Gray0
movq mm1,mm0
movq mm2,mm0
pslld mm1,8 // 00 00 Gray1 00 00 00 Gray0 00
por mm0,mm3
pslld mm2,16 // 00 Gray1 00 00 00 Gray0 00 00
por mm0,mm1
por mm0,mm2 // A1 Gray1 Gray1 Gray1 A0 Gray0 Gray0 Gray0
movq [edx+ecx*4],mm0
add ecx,2 // advance by two pixels; the loop ends when ecx reaches zero
jnz loop_beign
}
}
// border: convert the odd trailing pixel, if any, with the scalar version
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_MMX(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_MMX(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
asm{
emms //MMX使用结束
}
}
//速度测试
//==============================================================================
// colorToGray_MMX 590.84 FPS
H: 优化写缓冲的内联汇编的MMX实现版本
该版本相对于上面的MMX版本只改写了两处:
一是写内存的movq [edx+ecx*4],mm0 改成了 movntq [edx+ecx*4],mm0 绕过缓存
二是函数结束的时候调用sfence刷新写入
完整代码如下:
view plaincopy to clipboardprint?
// Converts one row of ARGB32 pixels to gray with inline MMX assembly,
// two pixels (8 bytes) per loop iteration. Identical to colorToGrayLine_MMX
// except that the result is written with movntq instead of movq.
//   src   : first source pixel of the row
//   dst   : first destination pixel of the row
//   width : number of pixels to convert
// The weights are 7-bit fixed point (38, 75, 15 for R, G, B; they sum to 128).
// An odd trailing pixel is handled by colorToGrayLine_int16.
// This function executes neither sfence nor emms; colorToGray_MMX2 does both
// after the last row.
void colorToGrayLine_MMX2(const Color32* src,Color32* dst,long width){
//const UInt32 gray_r_coeff_7=(UInt32)( gray_r_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_g_coeff_7=(UInt32)( gray_g_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_b_coeff_7=(1<<7)-gray_r_coeff_7-gray_g_coeff_7;
// csMMX_rgb_coeff_w= short[ 0 , gray_r_coeff_7 , gray_g_coeff_7 , gray_b_coeff_7 ]
const UInt64 csMMX_rgb_coeff_w = (((UInt64)0x00000026)<<32) | 0x004b000f;
// Number of pixels covered by the two-at-a-time loop (width rounded down to even).
long widthFast=width>>1<<1;
if (widthFast>0){
asm{
pcmpeqb mm5,mm5 // FF FF FF FF FF FF FF FF
mov ecx,widthFast
pxor mm7,mm7 // 00 00 00 00 00 00 00 00
pcmpeqb mm4,mm4 // FF FF FF FF FF FF FF FF
mov eax,src
mov edx,dst
movq mm6,csMMX_rgb_coeff_w
psrlw mm5,15 // 1 1 1 1
lea eax,[eax+ecx*4] // eax = end of the fast part of the source row
lea edx,[edx+ecx*4] // edx = end of the fast part of the destination row
pslld mm4,24 // FF 00 00 00 FF 00 00 00
neg ecx // ecx = -widthFast; it counts up towards zero
loop_beign:
movq mm0,[eax+ecx*4] // A1 R1 G1 B1 A0 R0 G0 B0
movq mm1,mm0
movq mm3,mm0
punpcklbw mm0,mm7 // 00 A0 00 R0 00 G0 00 B0
punpckhbw mm1,mm7 // 00 A1 00 R1 00 G1 00 B1
pmaddwd mm0,mm6 // R0*r_coeff G0*g_coeff+B0*b_coeff
pmaddwd mm1,mm6 // R1*r_coeff G1*g_coeff+B1*b_coeff
pand mm3,mm4 // A1 00 00 00 A0 00 00 00
packssdw mm0,mm1 // sR1 sG1+sB1 sR0 sG0+sB0
pmaddwd mm0,mm5 // sR1+sG1+sB1 sR0+sG0+sB0
psrld mm0,7 // 00 00 00 Gray1 00 00 00 Gray0
movq mm1,mm0
movq mm2,mm0
pslld mm1,8 // 00 00 Gray1 00 00 00 Gray0 00
por mm0,mm3
pslld mm2,16 // 00 Gray1 00 00 00 Gray0 00 00
por mm0,mm1
por mm0,mm2 // A1 Gray1 Gray1 Gray1 A0 Gray0 Gray0 Gray0
movntq [edx+ecx*4],mm0 // the difference from colorToGrayLine_MMX: store that bypasses the cache
add ecx,2 // advance by two pixels; the loop ends when ecx reaches zero
jnz loop_beign
}
}
// border: convert the odd trailing pixel, if any, with the scalar version
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
colorToGray_MMX2(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_MMX2(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
asm{
sfence //刷新写入
emms
}
// Converts one row of ARGB32 pixels to gray with inline MMX assembly,
// two pixels (8 bytes) per loop iteration. Identical to colorToGrayLine_MMX
// except that the result is written with movntq instead of movq.
//   src   : first source pixel of the row
//   dst   : first destination pixel of the row
//   width : number of pixels to convert
// The weights are 7-bit fixed point (38, 75, 15 for R, G, B; they sum to 128).
// An odd trailing pixel is handled by colorToGrayLine_int16.
// This function executes neither sfence nor emms; colorToGray_MMX2 does both
// after the last row.
void colorToGrayLine_MMX2(const Color32* src,Color32* dst,long width){
//const UInt32 gray_r_coeff_7=(UInt32)( gray_r_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_g_coeff_7=(UInt32)( gray_g_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_b_coeff_7=(1<<7)-gray_r_coeff_7-gray_g_coeff_7;
// csMMX_rgb_coeff_w= short[ 0 , gray_r_coeff_7 , gray_g_coeff_7 , gray_b_coeff_7 ]
const UInt64 csMMX_rgb_coeff_w = (((UInt64)0x00000026)<<32) | 0x004b000f;
// Number of pixels covered by the two-at-a-time loop (width rounded down to even).
long widthFast=width>>1<<1;
if (widthFast>0){
asm{
pcmpeqb mm5,mm5 // FF FF FF FF FF FF FF FF
mov ecx,widthFast
pxor mm7,mm7 // 00 00 00 00 00 00 00 00
pcmpeqb mm4,mm4 // FF FF FF FF FF FF FF FF
mov eax,src
mov edx,dst
movq mm6,csMMX_rgb_coeff_w
psrlw mm5,15 // 1 1 1 1
lea eax,[eax+ecx*4] // eax = end of the fast part of the source row
lea edx,[edx+ecx*4] // edx = end of the fast part of the destination row
pslld mm4,24 // FF 00 00 00 FF 00 00 00
neg ecx // ecx = -widthFast; it counts up towards zero
loop_beign:
movq mm0,[eax+ecx*4] // A1 R1 G1 B1 A0 R0 G0 B0
movq mm1,mm0
movq mm3,mm0
punpcklbw mm0,mm7 // 00 A0 00 R0 00 G0 00 B0
punpckhbw mm1,mm7 // 00 A1 00 R1 00 G1 00 B1
pmaddwd mm0,mm6 // R0*r_coeff G0*g_coeff+B0*b_coeff
pmaddwd mm1,mm6 // R1*r_coeff G1*g_coeff+B1*b_coeff
pand mm3,mm4 // A1 00 00 00 A0 00 00 00
packssdw mm0,mm1 // sR1 sG1+sB1 sR0 sG0+sB0
pmaddwd mm0,mm5 // sR1+sG1+sB1 sR0+sG0+sB0
psrld mm0,7 // 00 00 00 Gray1 00 00 00 Gray0
movq mm1,mm0
movq mm2,mm0
pslld mm1,8 // 00 00 Gray1 00 00 00 Gray0 00
por mm0,mm3
pslld mm2,16 // 00 Gray1 00 00 00 Gray0 00 00
por mm0,mm1
por mm0,mm2 // A1 Gray1 Gray1 Gray1 A0 Gray0 Gray0 Gray0
movntq [edx+ecx*4],mm0 // the difference from colorToGrayLine_MMX: store that bypasses the cache
add ecx,2 // advance by two pixels; the loop ends when ecx reaches zero
jnz loop_beign
}
}
// border: convert the odd trailing pixel, if any, with the scalar version
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_MMX2(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_MMX2(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
asm{
sfence //刷新写入
emms
}
}
//速度测试
//==============================================================================
// colorToGray_MMX2 679.50 FPS
I: 使用MMX函数指令方式的实现
MMX/SSE等特殊指令除了内联汇编来使用外,也可以使用函数指令方式的实现,从而在多种
编译器下都可以使用SIMD相关指令,可移植性也会好很多;
但现在看来,vc对此的优化还不够,还可能遇到编译器的实现bug;
(可以考虑使用intel的编译器编译这些代码,感觉优化能力很不错)
view plaincopy to clipboardprint?
#include <mmintrin.h> //mmx
//#include <mm3dnow.h> //3dnow
#include <xmmintrin.h> //sse
//#include <emmintrin.h> //sse2
//#include <pmmintrin.h> //sse3
//#include <tmmintrin.h> //ssse3
//#include <intrin.h> //sse4a
//#include <smmintrin.h> //sse4.1
//#include <nmmintrin.h> //sse4.2
//----------------------------------
// Converts one row of ARGB32 pixels to gray using MMX intrinsics, two pixels
// per loop iteration. The statements mirror the instructions of the inline
// assembly version colorToGrayLine_MMX one for one.
//   src   : first source pixel of the row
//   dst   : first destination pixel of the row
//   width : number of pixels to convert
// The weights are 7-bit fixed point (38, 75, 15 for R, G, B; they sum to 128).
// An odd trailing pixel is handled by colorToGrayLine_int16.
// _mm_empty() is not called here; colorToGray_MMX_mmh calls it after the last row.
void colorToGrayLine_MMX_mmh(const Color32* src,Color32* dst,long width){
//const UInt32 gray_r_coeff_7=(UInt32)( gray_r_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_g_coeff_7=(UInt32)( gray_g_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_b_coeff_7=(1<<7)-gray_r_coeff_7-gray_g_coeff_7;
// csMMX_rgb_coeff_w= short[ 0 , gray_r_coeff_7 , gray_g_coeff_7 , gray_b_coeff_7 ]
// Number of pixels covered by the two-at-a-time loop (width rounded down to even).
long widthFast=width>>1<<1;
if (widthFast>0){
const UInt64 csMMX_rgb_coeff_w =(((UInt64)0x00000026)<<32) | 0x004b000f;
const __m64 mm6=*(const __m64*)&csMMX_rgb_coeff_w;
const __m64 mm7=_mm_setzero_si64(); // the mm? variables hold the same values as the MMX registers in colorToGrayLine_MMX
__m64 mm5=_mm_cmpeq_pi8(mm7,mm7); // would prefer "__m64 mm5; mm5=_mm_cmpeq_pi8(mm5,mm5);" but that fails :(
const __m64 mm4=_mm_slli_pi32(mm5,24); // alpha mask: FF 00 00 00 FF 00 00 00
mm5=_mm_srli_pi16(mm5,15); // four 16-bit ones: 1 1 1 1
for (long x = 0; x < widthFast; x+=2){
__m64 mm0=*(__m64*)&src[x]; // A1 R1 G1 B1 A0 R0 G0 B0
__m64 mm1=mm0;
__m64 mm3=mm0;
mm0=_mm_unpacklo_pi8(mm0,mm7); // pixel 0 widened to four 16-bit words
mm1=_mm_unpackhi_pi8(mm1,mm7); // pixel 1 widened to four 16-bit words
mm0=_mm_madd_pi16(mm0,mm6); // R0*r_coeff | G0*g_coeff+B0*b_coeff
mm1=_mm_madd_pi16(mm1,mm6); // R1*r_coeff | G1*g_coeff+B1*b_coeff
mm3=_mm_and_si64(mm3,mm4); // keep only the two alpha bytes
mm0=_mm_packs_pi32(mm0,mm1); // partial sums of both pixels as 16-bit words
mm0=_mm_madd_pi16(mm0,mm5); // multiply by 1 and add pairs: one sum per pixel
mm0=_mm_srli_pi32(mm0,7); // drop the 7 fractional bits: Gray1 | Gray0
mm1=mm0;
__m64 mm2=mm0;
mm1=_mm_slli_pi32(mm1,8); // gray moved into the second byte
mm0=_mm_or_si64(mm0,mm3);
mm2=_mm_slli_pi32(mm2,16); // gray moved into the third byte
mm0=_mm_or_si64(mm0,mm1);
mm0=_mm_or_si64(mm0,mm2); // A1 Gray1 Gray1 Gray1 A0 Gray0 Gray0 Gray0
*(__m64*)&dst[x]=mm0;
}
}
// border: convert the odd trailing pixel, if any, with the scalar version
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_MMX_mmh(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_MMX_mmh(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
_mm_empty(); //MMX使用结束
}
#include <mmintrin.h> //mmx
//#include <mm3dnow.h> //3dnow
#include <xmmintrin.h> //sse
//#include <emmintrin.h> //sse2
//#include <pmmintrin.h> //sse3
//#include <tmmintrin.h> //ssse3
//#include <intrin.h> //sse4a
//#include <smmintrin.h> //sse4.1
//#include <nmmintrin.h> //sse4.2
//----------------------------------
// Converts one row of ARGB32 pixels to gray using MMX intrinsics, two pixels
// per loop iteration. The statements mirror the instructions of the inline
// assembly version colorToGrayLine_MMX one for one.
//   src   : first source pixel of the row
//   dst   : first destination pixel of the row
//   width : number of pixels to convert
// The weights are 7-bit fixed point (38, 75, 15 for R, G, B; they sum to 128).
// An odd trailing pixel is handled by colorToGrayLine_int16.
// _mm_empty() is not called here; colorToGray_MMX_mmh calls it after the last row.
void colorToGrayLine_MMX_mmh(const Color32* src,Color32* dst,long width){
//const UInt32 gray_r_coeff_7=(UInt32)( gray_r_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_g_coeff_7=(UInt32)( gray_g_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_b_coeff_7=(1<<7)-gray_r_coeff_7-gray_g_coeff_7;
// csMMX_rgb_coeff_w= short[ 0 , gray_r_coeff_7 , gray_g_coeff_7 , gray_b_coeff_7 ]
// Number of pixels covered by the two-at-a-time loop (width rounded down to even).
long widthFast=width>>1<<1;
if (widthFast>0){
const UInt64 csMMX_rgb_coeff_w =(((UInt64)0x00000026)<<32) | 0x004b000f;
const __m64 mm6=*(const __m64*)&csMMX_rgb_coeff_w;
const __m64 mm7=_mm_setzero_si64(); // the mm? variables hold the same values as the MMX registers in colorToGrayLine_MMX
__m64 mm5=_mm_cmpeq_pi8(mm7,mm7); // would prefer "__m64 mm5; mm5=_mm_cmpeq_pi8(mm5,mm5);" but that fails :(
const __m64 mm4=_mm_slli_pi32(mm5,24); // alpha mask: FF 00 00 00 FF 00 00 00
mm5=_mm_srli_pi16(mm5,15); // four 16-bit ones: 1 1 1 1
for (long x = 0; x < widthFast; x+=2){
__m64 mm0=*(__m64*)&src[x]; // A1 R1 G1 B1 A0 R0 G0 B0
__m64 mm1=mm0;
__m64 mm3=mm0;
mm0=_mm_unpacklo_pi8(mm0,mm7); // pixel 0 widened to four 16-bit words
mm1=_mm_unpackhi_pi8(mm1,mm7); // pixel 1 widened to four 16-bit words
mm0=_mm_madd_pi16(mm0,mm6); // R0*r_coeff | G0*g_coeff+B0*b_coeff
mm1=_mm_madd_pi16(mm1,mm6); // R1*r_coeff | G1*g_coeff+B1*b_coeff
mm3=_mm_and_si64(mm3,mm4); // keep only the two alpha bytes
mm0=_mm_packs_pi32(mm0,mm1); // partial sums of both pixels as 16-bit words
mm0=_mm_madd_pi16(mm0,mm5); // multiply by 1 and add pairs: one sum per pixel
mm0=_mm_srli_pi32(mm0,7); // drop the 7 fractional bits: Gray1 | Gray0
mm1=mm0;
__m64 mm2=mm0;
mm1=_mm_slli_pi32(mm1,8); // gray moved into the second byte
mm0=_mm_or_si64(mm0,mm3);
mm2=_mm_slli_pi32(mm2,16); // gray moved into the third byte
mm0=_mm_or_si64(mm0,mm1);
mm0=_mm_or_si64(mm0,mm2); // A1 Gray1 Gray1 Gray1 A0 Gray0 Gray0 Gray0
*(__m64*)&dst[x]=mm0;
}
}
// border: convert the odd trailing pixel, if any, with the scalar version
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_MMX_mmh(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_MMX_mmh(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
_mm_empty(); //MMX使用结束
}
//速度测试
//==============================================================================
// colorToGray_MMX_mmh 508.69 FPS
优化写缓冲的使用MMX函数指令方式的实现
view plaincopy to clipboardprint?
// Converts one row of ARGB32 pixels to gray using MMX intrinsics, two pixels
// per loop iteration. Same computation as colorToGrayLine_MMX_mmh, but the
// result is written with _mm_stream_pi instead of a plain store.
//   src   : first source pixel of the row
//   dst   : first destination pixel of the row
//   width : number of pixels to convert
// The weights are 7-bit fixed point (38, 75, 15 for R, G, B; they sum to 128).
// An odd trailing pixel is handled by colorToGrayLine_int16.
// Neither _mm_sfence() nor _mm_empty() is called here; colorToGray_MMX2_mmh
// calls both after the last row.
void colorToGrayLine_MMX2_mmh(const Color32* src,Color32* dst,long width){
//const UInt32 gray_r_coeff_7=(UInt32)( gray_r_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_g_coeff_7=(UInt32)( gray_g_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_b_coeff_7=(1<<7)-gray_r_coeff_7-gray_g_coeff_7;
// csMMX_rgb_coeff_w= short[ 0 , gray_r_coeff_7 , gray_g_coeff_7 , gray_b_coeff_7 ]
// Number of pixels covered by the two-at-a-time loop (width rounded down to even).
long widthFast=width>>1<<1;
if (widthFast>0){
const UInt64 csMMX_rgb_coeff_w =(((UInt64)0x00000026)<<32) | 0x004b000f;
const __m64 mm6=*(const __m64*)&csMMX_rgb_coeff_w;
const __m64 mm7=_mm_setzero_si64(); // the mm? variables hold the same values as the MMX registers in colorToGrayLine_MMX
__m64 mm5=_mm_cmpeq_pi8(mm7,mm7); // all bits set
const __m64 mm4=_mm_slli_pi32(mm5,24); // alpha mask: FF 00 00 00 FF 00 00 00
mm5=_mm_srli_pi16(mm5,15); // four 16-bit ones: 1 1 1 1
for (long x = 0; x < widthFast; x+=2){
__m64 mm0=*(__m64*)&src[x]; // A1 R1 G1 B1 A0 R0 G0 B0
__m64 mm1=mm0;
__m64 mm3=mm0;
mm0=_mm_unpacklo_pi8(mm0,mm7); // pixel 0 widened to four 16-bit words
mm1=_mm_unpackhi_pi8(mm1,mm7); // pixel 1 widened to four 16-bit words
mm0=_mm_madd_pi16(mm0,mm6); // R0*r_coeff | G0*g_coeff+B0*b_coeff
mm1=_mm_madd_pi16(mm1,mm6); // R1*r_coeff | G1*g_coeff+B1*b_coeff
mm3=_mm_and_si64(mm3,mm4); // keep only the two alpha bytes
mm0=_mm_packs_pi32(mm0,mm1); // partial sums of both pixels as 16-bit words
mm0=_mm_madd_pi16(mm0,mm5); // multiply by 1 and add pairs: one sum per pixel
mm0=_mm_srli_pi32(mm0,7); // drop the 7 fractional bits: Gray1 | Gray0
mm1=mm0;
__m64 mm2=mm0;
mm1=_mm_slli_pi32(mm1,8); // gray moved into the second byte
mm0=_mm_or_si64(mm0,mm3);
mm2=_mm_slli_pi32(mm2,16); // gray moved into the third byte
mm0=_mm_or_si64(mm0,mm1);
mm0=_mm_or_si64(mm0,mm2); // A1 Gray1 Gray1 Gray1 A0 Gray0 Gray0 Gray0
//*(__m64*)&dst[x]=mm0;
_mm_stream_pi((__m64*)&dst[x],mm0); // streaming store used in place of the plain store above
}
}
// border: convert the odd trailing pixel, if any, with the scalar version
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_MMX2_mmh(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_MMX2_mmh(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
_mm_sfence();//刷新写入
_mm_empty(); //MMX使用结束
}
// Converts one row of ARGB32 pixels to gray using MMX intrinsics, two pixels
// per loop iteration. Same computation as colorToGrayLine_MMX_mmh, but the
// result is written with _mm_stream_pi instead of a plain store.
//   src   : first source pixel of the row
//   dst   : first destination pixel of the row
//   width : number of pixels to convert
// The weights are 7-bit fixed point (38, 75, 15 for R, G, B; they sum to 128).
// An odd trailing pixel is handled by colorToGrayLine_int16.
// Neither _mm_sfence() nor _mm_empty() is called here; colorToGray_MMX2_mmh
// calls both after the last row.
void colorToGrayLine_MMX2_mmh(const Color32* src,Color32* dst,long width){
//const UInt32 gray_r_coeff_7=(UInt32)( gray_r_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_g_coeff_7=(UInt32)( gray_g_coeff*(1<<7)+0.4999999 );
//const UInt32 gray_b_coeff_7=(1<<7)-gray_r_coeff_7-gray_g_coeff_7;
// csMMX_rgb_coeff_w= short[ 0 , gray_r_coeff_7 , gray_g_coeff_7 , gray_b_coeff_7 ]
// Number of pixels covered by the two-at-a-time loop (width rounded down to even).
long widthFast=width>>1<<1;
if (widthFast>0){
const UInt64 csMMX_rgb_coeff_w =(((UInt64)0x00000026)<<32) | 0x004b000f;
const __m64 mm6=*(const __m64*)&csMMX_rgb_coeff_w;
const __m64 mm7=_mm_setzero_si64(); // the mm? variables hold the same values as the MMX registers in colorToGrayLine_MMX
__m64 mm5=_mm_cmpeq_pi8(mm7,mm7); // all bits set
const __m64 mm4=_mm_slli_pi32(mm5,24); // alpha mask: FF 00 00 00 FF 00 00 00
mm5=_mm_srli_pi16(mm5,15); // four 16-bit ones: 1 1 1 1
for (long x = 0; x < widthFast; x+=2){
__m64 mm0=*(__m64*)&src[x]; // A1 R1 G1 B1 A0 R0 G0 B0
__m64 mm1=mm0;
__m64 mm3=mm0;
mm0=_mm_unpacklo_pi8(mm0,mm7); // pixel 0 widened to four 16-bit words
mm1=_mm_unpackhi_pi8(mm1,mm7); // pixel 1 widened to four 16-bit words
mm0=_mm_madd_pi16(mm0,mm6); // R0*r_coeff | G0*g_coeff+B0*b_coeff
mm1=_mm_madd_pi16(mm1,mm6); // R1*r_coeff | G1*g_coeff+B1*b_coeff
mm3=_mm_and_si64(mm3,mm4); // keep only the two alpha bytes
mm0=_mm_packs_pi32(mm0,mm1); // partial sums of both pixels as 16-bit words
mm0=_mm_madd_pi16(mm0,mm5); // multiply by 1 and add pairs: one sum per pixel
mm0=_mm_srli_pi32(mm0,7); // drop the 7 fractional bits: Gray1 | Gray0
mm1=mm0;
__m64 mm2=mm0;
mm1=_mm_slli_pi32(mm1,8); // gray moved into the second byte
mm0=_mm_or_si64(mm0,mm3);
mm2=_mm_slli_pi32(mm2,16); // gray moved into the third byte
mm0=_mm_or_si64(mm0,mm1);
mm0=_mm_or_si64(mm0,mm2); // A1 Gray1 Gray1 Gray1 A0 Gray0 Gray0 Gray0
//*(__m64*)&dst[x]=mm0;
_mm_stream_pi((__m64*)&dst[x],mm0); // streaming store used in place of the plain store above
}
}
// border: convert the odd trailing pixel, if any, with the scalar version
if (width>widthFast)
colorToGrayLine_int16(&src[widthFast],&dst[widthFast],width-widthFast);
}
void colorToGray_MMX2_mmh(const TPixels32Ref& src,const TPixels32Ref& dst){
long width=std::min(src.width,dst.width);
long height=std::min(src.height,dst.height);
Color32* srcLine=src.pdata;
Color32* dstLine=dst.pdata;
for (long y = 0; y < height; ++y){
colorToGrayLine_MMX2_mmh(srcLine,dstLine,width);
src.nextLine(srcLine);
dst.nextLine(dstLine);
}
_mm_sfence();//刷新写入
_mm_empty(); //MMX使用结束
}
//速度测试
//==============================================================================
// colorToGray_MMX2_mmh 540.78 FPS
J:把测试成绩放在一起:
//CPU: AMD64x2 4200+(2.33G) 800*600 to 800*600
//==============================================================================
// colorToGray_float 145.49 FPS
// colorToGray_int16 355.33 FPS
// colorToGray_int16_expand4 413.22 FPS
// colorToGray_int8_opMul 387.97 FPS
// colorToGray_MMX 590.84 FPS
// colorToGray_MMX2 679.50 FPS
// colorToGray_MMX_mmh 508.69 FPS
// colorToGray_MMX2_mmh 540.78 FPS
ps:用SSE的浮点指令的版本/用SSE2整数指令的版本/利用SSE3的水平加指令等的实现版本有机会时再补充
ps:SIMD特殊指令集的使用框架请参见我的<YUV视频格式到RGB32格式转换的速度优化 中篇>一文,从而
根据CPU对指令集的支持情况动态的调用最优的实现函数版本;
本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/housisong/archive/2009/02/12/3884368.aspx