通过前三篇的学习,我们知道了"并行优化 + SSE 优化"是最优的解决方案。这一篇我们用同样的方法,来对图像的亮度进行调节。
Delphi 汇编学习(一)--- 图像灰值化
Delphi 汇编学习(二)--- 学习 SIMD 的痛苦
Delphi 汇编学习(三)--- 图像灰值化的极致优化
{ Adjusts the brightness of one row of 32-bit pixels in place.

  pColor        - first pixel of the row; each pixel is one DWORD laid out as
                  B, G, R, A from the lowest byte upwards.
  intLightValue - signed brightness delta. Positive values brighten, negative
                  values darken. The magnitude is clamped to 255.
  bmpWidth      - number of pixels in the row; values <= 0 do nothing and the
                  width does not have to be a multiple of 4.

  B, G and R are changed with unsigned saturating byte arithmetic, so they
  stop at 0 / 255 instead of wrapping. The fourth (alpha) byte is left
  untouched because its delta byte is always zero.

  Only EAX/RAX, ECX, EDX and XMM0..XMM2 are modified, all of which are
  caller-saved on both Win32 and Win64. }
procedure Light_SSEParallel_Proc(pColor: PRGBQuad; const intLightValue, bmpWidth: Integer);
asm
{$IFDEF WIN64}
        // Win64 passes pColor in RCX, intLightValue in EDX and bmpWidth in R8D.
        // Move them so the code below sees the same layout as on Win32.
        MOV     RAX, RCX
        MOV     ECX, R8D
{$ENDIF}
        // From here on: EAX/RAX = pColor, EDX = intLightValue, ECX = bmpWidth.
        TEST    ECX, ECX
        JLE     @Done                     // empty row: nothing to do

        // XMM2 = per-lane sign mask: all ones if intLightValue < 0, else zero.
        MOVD    XMM2, EDX
        PSHUFD  XMM2, XMM2, 0
        PSRAD   XMM2, 31

        // EDX = min(|intLightValue|, 255). The unsigned compare also clamps
        // the one value NEG cannot negate (-2^31).
        TEST    EDX, EDX
        JNS     @Clamp
        NEG     EDX
@Clamp:
        CMP     EDX, 255
        JBE     @Spread
        MOV     EDX, 255
@Spread:
        // Replicate the magnitude V into the B, G and R bytes: EDX = 00VVVVVV.
        // V <= 255, so the multiplication cannot carry between bytes.
        IMUL    EDX, EDX, $00010101
        MOVD    XMM0, EDX
        PSHUFD  XMM0, XMM0, 0             // XMM0 = |00VVVVVV| x 4

        // Split the delta by sign so the loop itself needs no branch:
        // one of the two amounts is always zero.
        MOVDQA  XMM1, XMM2
        PANDN   XMM1, XMM0                // XMM1 = amount to add      (value >= 0)
        PAND    XMM2, XMM0                // XMM2 = amount to subtract (value <  0)

        // Main loop: 4 pixels (16 bytes) per iteration.
        SUB     ECX, 4
        JL      @Tail                     // fewer than 4 pixels in the row
@Quad:
{$IFDEF WIN64}
        MOVDQU  XMM0, [RAX]
{$ELSE}
        MOVDQU  XMM0, [EAX]
{$ENDIF}
        PADDUSB XMM0, XMM1                // saturating add per byte
        PSUBUSB XMM0, XMM2                // saturating subtract per byte
{$IFDEF WIN64}
        MOVDQU  [RAX], XMM0
        ADD     RAX, 16
{$ELSE}
        MOVDQU  [EAX], XMM0
        ADD     EAX, 16
{$ENDIF}
        SUB     ECX, 4
        JGE     @Quad

@Tail:
        // ECX is (remaining - 4) here; restore it to the 0..3 leftover pixels.
        ADD     ECX, 4
        JZ      @Done
@Single:
        // One pixel at a time so nothing past the end of the row is touched.
{$IFDEF WIN64}
        MOVD    XMM0, DWORD PTR [RAX]
{$ELSE}
        MOVD    XMM0, DWORD PTR [EAX]
{$ENDIF}
        PADDUSB XMM0, XMM1
        PSUBUSB XMM0, XMM2
{$IFDEF WIN64}
        MOVD    DWORD PTR [RAX], XMM0
        ADD     RAX, 4
{$ELSE}
        MOVD    DWORD PTR [EAX], XMM0
        ADD     EAX, 4
{$ENDIF}
        DEC     ECX
        JNZ     @Single
@Done:
end;
{ Adjusts the brightness of a whole bitmap, processing rows in parallel.

  bmp           - bitmap to modify in place. NOTE(review): the row routine
                  treats every pixel as 4 bytes, so bmp is assumed to be a
                  32-bit bitmap; this is not checked here - confirm at callers.
  intLightValue - signed brightness delta passed through to
                  Light_SSEParallel_Proc for every row.

  Author's timing: 11 ms - 13 ms, measured outside the IDE.

  ScanLine must not be used inside TParallel.For, so the address of row 0,
  the distance between rows and the bitmap size are all read once on the
  calling thread; worker threads only do address arithmetic. }
procedure Light_SSEParallel(bmp: TBitmap; const intLightValue: Integer);
var
  FirstRowAddr: NativeInt;
  RowStride: NativeInt;
  RowCount: Integer;
  RowWidth: Integer;
begin
  RowCount := bmp.Height;
  RowWidth := bmp.Width;
  // ScanLine[0] is invalid on an empty bitmap, so leave before touching it.
  if (RowCount <= 0) or (RowWidth <= 0) then
    Exit;

  // NativeInt keeps the full pointer value on 64-bit targets.
  FirstRowAddr := NativeInt(bmp.ScanLine[0]);

  // The stride is derived from two consecutive rows, so it carries the sign
  // of the in-memory row order. A single-row bitmap has no second row to
  // query and does not need a stride.
  if RowCount > 1 then
    RowStride := NativeInt(bmp.ScanLine[1]) - FirstRowAddr
  else
    RowStride := 0;

  TParallel.For(0, RowCount - 1,
    procedure(Y: Integer)
    begin
      Light_SSEParallel_Proc(PRGBQuad(FirstRowAddr + Y * RowStride), intLightValue, RowWidth);
    end);
end;
在我的机器上,4096X4096X32 的图片,调节亮度,耗时在 11 毫秒 --- 13 毫秒之间。还是比较理想的。
如果去除图像的界面绘制用时,实际上这个函数耗时也就只需要 4 毫秒左右的时间。
你也可以将 Light_SSEParallel_Proc 函数中真正执行亮度计算的饱和加减指令(PADDUSB / PSUBUSB)暂时注释掉,再调节亮度,试一下。
这相当于只循环读写像素,不进行亮度计算。耗时仍然是 11 毫秒 --- 13 毫秒,说明 PADDUSB、PSUBUSB 饱和指令基本不耗时,时间主要花在内存读写和界面绘制上。效率非常高。
详细代码:https://github.com/dbyoung720/ImageGray.git
qq交流群:101611228