Delphi 汇编学习(十一)--- 图像旋转的极致优化

17 篇文章 4 订阅
15 篇文章 2 订阅

图像旋转原理:

假设对图片上任意点(x,y),绕一个坐标点(rx0,ry0)逆时针旋转RotaryAngle角度后,
新的坐标设为(x', y'),有公式:
  x'= (x - rx0)*cos(RotaryAngle) + (y - ry0)*sin(RotaryAngle) + rx0 ;
  y'=-(x - rx0)*sin(RotaryAngle) + (y - ry0)*cos(RotaryAngle) + ry0 ;

  那么,根据新的坐标点求源坐标点的公式为:
  x=(x'- rx0)*cos(RotaryAngle) - (y'- ry0)*sin(RotaryAngle) + rx0 ;
  y=(x'- rx0)*sin(RotaryAngle) + (y'- ry0)*cos(RotaryAngle) + ry0 ;

一:标准旋转函数 (耗时:LONG LONG)

// Reference (deliberately unoptimised) rotation.
//
// Every destination pixel (X, Y) is mapped back into the source bitmap with
// the inverse rotation formula and the source colour is copied through
// TCanvas.Pixels.
//
// Parameters:
//   bmpSrc, bmpDst   - source bitmap and destination bitmap (written in place)
//   RotaryAngle      - rotation angle passed to Cos/Sin (radians)
//   CenterX, CenterY - rotation centre
//   MoveX, MoveY     - translation applied to the rotated image
//
// Destination pixels whose source position falls outside bmpSrc are left
// untouched, which matches the behaviour of Optimize02..Optimize06.
procedure Optimize01(bmpSrc, bmpDst: TBitmap; const RotaryAngle: double; const CenterX, CenterY, MoveX, MoveY: Integer);
var
  X, Y      : Integer;
  SrcX, SrcY: Integer;
begin
  for Y := 0 to bmpDst.Height - 1 do
  begin
    for X := 0 to bmpDst.Width - 1 do
    begin
      // Cos/Sin are intentionally evaluated per pixel here: this routine is
      // the baseline that the later variants are measured against.
      SrcX := Round((X - CenterX - MoveX) * Cos(RotaryAngle) - (Y - CenterY - MoveY) * Sin(RotaryAngle) + CenterX);
      SrcY := Round((X - CenterX - MoveX) * Sin(RotaryAngle) + (Y - CenterY - MoveY) * Cos(RotaryAngle) + CenterY);

      // TCanvas.Pixels is documented to return -1 for positions outside the
      // clipping rectangle; copying that value would paint a bogus colour
      // into the destination, so out-of-range source positions are skipped.
      if (SrcX >= 0) and (SrcX < bmpSrc.Width) and (SrcY >= 0) and (SrcY < bmpSrc.Height) then
      begin
        bmpDst.Canvas.Pixels[X, Y] := bmpSrc.Canvas.Pixels[SrcX, SrcY];
      end;
    end;
  end;
end;

这个函数虽然没有实用价值,但原理就是这样的,很简单。下面我们对这个函数一步步地进行优化。

二:优化 Pixels 函数(使用 Delphi 自身的属性,获取位图的像素指针)(耗时:4513 毫秒):

// Rotation using direct access to the bitmaps' pixel memory instead of
// TCanvas.Pixels.
//
// The pixel pointers are taken from the bitmaps' DIB sections (bmBits) and
// indexed as a flat array of 32-bit pixels: index = row * width + column.
// NOTE(review): this presumes both bitmaps are 32 bits per pixel with no row
// padding - confirm at the call site.
//
// Destination pixels whose source position lies outside bmpSrc are left
// unchanged.
procedure Optimize02(bmpSrc, bmpDst: TBitmap; const RotaryAngle: double; const CenterX, CenterY, MoveX, MoveY: Integer);
var
  col, row    : Integer;
  offX, offY  : Integer;
  mapX, mapY  : Integer;
  pixSrc      : PRGBQuadArray;
  pixDst      : PRGBQuadArray;
  wDst, hDst  : Integer;
  wSrc, hSrc  : Integer;
begin
  pixSrc := TBitmapImageAccess(TBMPAccess(bmpSrc).FImage).FDIB.dsBm.bmBits;
  pixDst := TBitmapImageAccess(TBMPAccess(bmpDst).FImage).FDIB.dsBm.bmBits;

  wDst := bmpDst.Width;
  hDst := bmpDst.Height;
  wSrc := bmpSrc.Width;
  hSrc := bmpSrc.Height;

  for row := 0 to hDst - 1 do
    for col := 0 to wDst - 1 do
    begin
      // Offset of the destination pixel from the (translated) rotation centre.
      offX := col - CenterX - MoveX;
      offY := row - CenterY - MoveY;

      // Inverse rotation: where in the source does this pixel come from?
      mapX := Round(offX * Cos(RotaryAngle) - offY * Sin(RotaryAngle) + CenterX);
      mapY := Round(offX * Sin(RotaryAngle) + offY * Cos(RotaryAngle) + CenterY);

      // A single unsigned comparison rejects both negative and too-large
      // coordinates (negative values become huge when viewed as DWORD).
      if (DWORD(mapY) >= DWORD(hSrc)) or (DWORD(mapX) >= DWORD(wSrc)) then
        Continue;

      pixDst[row * wDst + col] := pixSrc[mapY * wSrc + mapX];
    end;
end;

看看这里的循环。       
  SrcX := Round((X - CenterX - MoveX) * Cos(RotaryAngle) - (Y - CenterY - MoveY) * Sin(RotaryAngle) + CenterX);
  SrcY := Round((X - CenterX - MoveX) * Sin(RotaryAngle) + (Y - CenterY - MoveY) * Cos(RotaryAngle) + CenterY);
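其中 Cos(RotaryAngle)、Sin(RotaryAngle) 以及只与 CenterX、CenterY、MoveX、MoveY 有关的项都与循环变量无关,可以拿到循环外先计算好;含 Y 的项 (Y - CenterY - MoveY) 只与行有关,可以拿到上一层循环里计算。这样可以减轻内层循环的计算量。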

三:优化循环(耗时:291 毫秒):

// Rotation with all loop-invariant work hoisted out of the pixel loop.
//
// Cos/Sin and the centre-dependent products are computed once; the terms that
// depend only on the row are computed once per row. The inner loop is left
// with one multiply, one subtract and one Round per coordinate.
//
// Expanding the hoisted terms gives the same mapping as the direct formula:
//   SrcX = (X - CenterX - MoveX) * cos - (Y - CenterY - MoveY) * sin + CenterX
//   SrcY = (X - CenterX - MoveX) * sin + (Y - CenterY - MoveY) * cos + CenterY
//
// Destination pixels whose source position lies outside bmpSrc are left
// unchanged.
procedure Optimize03(bmpSrc, bmpDst: TBitmap; const RotaryAngle: double; const CenterX, CenterY, MoveX, MoveY: Integer);
var
  col, row            : Integer;
  mapX, mapY          : Integer;
  pixSrc              : PRGBQuadArray;
  pixDst              : PRGBQuadArray;
  pivotXCos, pivotXSin: Single;
  pivotYCos, pivotYSin: Single;
  cosA, sinA          : Single;
  wDst, hDst          : Integer;
  wSrc, hSrc          : Integer;
  rowTermX, rowTermY  : Single;
begin
  pixSrc := TBitmapImageAccess(TBMPAccess(bmpSrc).FImage).FDIB.dsBm.bmBits;
  pixDst := TBitmapImageAccess(TBMPAccess(bmpDst).FImage).FDIB.dsBm.bmBits;

  wDst := bmpDst.Width;
  hDst := bmpDst.Height;
  wSrc := bmpSrc.Width;
  hSrc := bmpSrc.Height;

  // Angle-dependent constants, evaluated once per call.
  cosA      := Cos(RotaryAngle);
  sinA      := Sin(RotaryAngle);
  pivotXCos := (CenterX + MoveX) * cosA;
  pivotXSin := (CenterX + MoveX) * sinA;
  pivotYSin := (CenterY + MoveY) * sinA;
  pivotYCos := (CenterY + MoveY) * cosA;

  for row := 0 to hDst - 1 do
  begin
    // Everything that depends on the row but not on the column.
    rowTermX := pivotXCos - pivotYSin - CenterX + row * sinA;
    rowTermY := pivotXSin + pivotYCos - CenterY - row * cosA;

    for col := 0 to wDst - 1 do
    begin
      mapX := Round(col * cosA - rowTermX);
      mapY := Round(col * sinA - rowTermY);

      // Unsigned compare rejects negative and too-large coordinates at once.
      if (DWORD(mapY) >= DWORD(hSrc)) or (DWORD(mapX) >= DWORD(wSrc)) then
        Continue;

      pixDst[row * wDst + col] := pixSrc[mapY * wSrc + mapX];
    end;
  end;
end;

效果已经很不错了。但我们知道 Delphi 浮点运算很慢的。我们尝试一下整数优化。

四:优化浮点运算为整数运算(耗时:180 毫秒):

// Rotation using 24.8 fixed-point integer arithmetic instead of floating
// point.
//
// Cos/Sin are scaled by 256 (1 shl 8) and truncated to integers; every
// coordinate is carried with 8 fractional bits and converted back to a pixel
// index with "shr 8".
//
// Parameters:
//   bmpSrc, bmpDst   - source bitmap and destination bitmap (written in place)
//   RotaryAngle      - rotation angle passed to Cos/Sin (radians)
//   CenterX, CenterY - rotation centre
//   MoveX, MoveY     - translation applied to the rotated image
//
// Destination pixels whose source position lies outside bmpSrc are left
// unchanged.
// NOTE(review): the flat "row * width + column" indexing presumes 32-bit
// pixels without row padding - confirm at the call site.
procedure Optimize04(bmpSrc, bmpDst: TBitmap; const RotaryAngle: double; const CenterX, CenterY, MoveX, MoveY: Integer);
var
  X, Y      : Integer;
  SrcX, SrcY: DWORD;
  srcBits   : PRGBQuadArray;
  dstBits   : PRGBQuadArray;
  cxc, cxs  : Integer;
  cyc, cys  : Integer;
  rac, ras  : Integer;
  kcx, kcy  : Integer;
  dstWidth  : Integer;
  dstHeight : Integer;
  srcWidth  : DWORD;
  srcHeight : DWORD;
  krx, kry  : Integer;
begin
  srcBits := TBitmapImageAccess(TBMPAccess(bmpSrc).FImage).FDIB.dsBm.bmBits;
  dstBits := TBitmapImageAccess(TBMPAccess(bmpDst).FImage).FDIB.dsBm.bmBits;

  dstWidth  := bmpDst.Width;
  dstHeight := bmpDst.Height;
  srcWidth  := bmpSrc.Width;
  srcHeight := bmpSrc.Height;

  // Fixed-point rotation coefficients (8 fractional bits).
  rac := Trunc(Cos(RotaryAngle) * (1 shl 8));
  ras := Trunc(Sin(RotaryAngle) * (1 shl 8));

  // Centre-dependent constants, all in the same fixed-point scale.
  cxc := ((CenterX + MoveX) * rac);
  cxs := ((CenterX + MoveX) * ras);
  cys := ((CenterY + MoveY) * ras);
  cyc := ((CenterY + MoveY) * rac);
  kcx := cxc - cys - CenterX * (1 shl 8);
  kcy := cxs + cyc - CenterY * (1 shl 8);

  for Y := 0 to dstHeight - 1 do
  begin
    // Row-dependent part of the mapping.
    krx := kcx + Y * ras;
    kry := kcy - Y * rac;
    for X := 0 to dstWidth - 1 do
    begin
      // The shift is done on the unsigned view of the value. A negative
      // fixed-point coordinate therefore becomes a very large number that
      // fails the bounds test below. The previous SmallInt(...) cast kept
      // only the low 16 bits, so coordinates beyond +/-32767 could wrap back
      // into the valid range and copy a wrong pixel.
      SrcX := DWORD(X * rac - krx) shr 8;
      SrcY := DWORD(X * ras - kry) shr 8;
      if (SrcY < srcHeight) and (SrcX < srcWidth) then
      begin
        dstBits[Y * dstWidth + X] := srcBits[SrcY * srcWidth + SrcX];
      end;
    end;
  end;
end;

整数优化效果不错。再进一步进行乘法优化试试。

五:乘法优化(耗时:160 毫秒):
看看上面的循环,只有乘法运算是耗时的。我们可不可以将乘法运算也优化掉?当然可以。
因为 ras、rac 的取值在 -256 到 256 之间,而图像大小是 4096×4096,无论旋转多少度,图像的长宽都不会超过 8192,所以我们可以建立一个乘法表,将乘法预先计算好:

// Fills the global multiplication table so that
//   g_RotateTable[m, n] = m * n   for m in -256..256 and n in 0..8192.
// Each row is built by repeated addition, which yields the same products as
// multiplying directly.
procedure InitRotateTable;
var
  factor : Integer;
  index  : Integer;
  product: Integer;
begin
  for factor := -256 to 256 do
  begin
    product := 0; // factor * 0
    for index := 0 to 8192 do
    begin
      g_RotateTable[factor, index] := product;
      Inc(product, factor); // advance to factor * (index + 1)
    end;
  end;
end;

那么优化乘法后的代码如下:

// Fixed-point rotation in which the multiplications by the rotation
// coefficients are replaced by look-ups in g_RotateTable.
//
// Parameters:
//   bmpSrc, bmpDst   - source bitmap and destination bitmap (written in place)
//   RotaryAngle      - rotation angle (radians); used only when ras and rac
//                      are both 0, see below
//   CenterX, CenterY - rotation centre
//   MoveX, MoveY     - translation applied to the rotated image
//   ras, rac         - Sin/Cos of the angle scaled by 256 (8 fractional bits)
//
// When ras and rac are both 0 (the declared defaults) the coefficients are
// derived from RotaryAngle. A real rotation can never have both scaled
// coefficients equal to 0, and using 0/0 as-is would map every destination
// pixel to the same source pixel.
//
// InitRotateTable populates the table for multipliers -256..256 and indices
// 0..8192; it must have been called first, and destination sizes beyond
// that index range are not covered by the table.
//
// Destination pixels whose source position lies outside bmpSrc are left
// unchanged.
procedure Optimize05(bmpSrc, bmpDst: TBitmap; const RotaryAngle: double; const CenterX, CenterY, MoveX, MoveY: Integer; const ras: Integer = 0; rac: Integer = 0);
var
  X, Y      : Integer;
  SrcX, SrcY: DWORD;
  srcBits   : PRGBQuadArray;
  dstBits   : PRGBQuadArray;
  cxc, cxs  : Integer;
  cyc, cys  : Integer;
  kcx, kcy  : Integer;
  dstWidth  : Integer;
  dstHeight : Integer;
  srcWidth  : DWORD;
  srcHeight : DWORD;
  krx, kry  : Integer;
  intOffset : Integer;
  fixSin    : Integer;
  fixCos    : Integer;
begin
  srcBits := TBitmapImageAccess(TBMPAccess(bmpSrc).FImage).FDIB.dsBm.bmBits;
  dstBits := TBitmapImageAccess(TBMPAccess(bmpDst).FImage).FDIB.dsBm.bmBits;

  dstWidth  := bmpDst.Width;
  dstHeight := bmpDst.Height;
  srcWidth  := bmpSrc.Width;
  srcHeight := bmpSrc.Height;

  // Use the caller's coefficients when given, otherwise compute them with
  // the same scale and truncation that Optimize04 uses.
  fixSin := ras;
  fixCos := rac;
  if (fixSin = 0) and (fixCos = 0) then
  begin
    fixCos := Trunc(Cos(RotaryAngle) * (1 shl 8));
    fixSin := Trunc(Sin(RotaryAngle) * (1 shl 8));
  end;

  // Centre-dependent constants in 8-bit fixed point.
  cxc := (CenterX + MoveX) * fixCos;
  cxs := (CenterX + MoveX) * fixSin;
  cys := (CenterY + MoveY) * fixSin;
  cyc := (CenterY + MoveY) * fixCos;
  kcx := cxc - cys - CenterX * (1 shl 8);
  kcy := cxs + cyc - CenterY * (1 shl 8);

  for Y := 0 to dstHeight - 1 do
  begin
    // Row-dependent part of the mapping; table look-up replaces Y * coeff.
    krx       := kcx + g_RotateTable[fixSin, Y];
    kry       := kcy - g_RotateTable[fixCos, Y];
    intOffset := Y * dstWidth;
    for X     := 0 to dstWidth - 1 do
    begin
      // Negative fixed-point values turn into very large unsigned numbers
      // after the shift and are rejected by the bounds test below.
      SrcX := (g_RotateTable[fixCos, X] - krx) shr 8;
      SrcY := (g_RotateTable[fixSin, X] - kry) shr 8;
      if (SrcY < srcHeight) and (SrcX < srcWidth) then
      begin
        dstBits[intOffset + X] := srcBits[srcWidth * SrcY + SrcX];
      end;
    end;
  end;
end;

乘法优化效果一般。尝试进行并行优化试试。

六:并行优化(耗时:12 毫秒):

// Table-driven fixed-point rotation with the rows processed in parallel via
// TParallel.For. Each invocation of the row procedure writes only to its own
// destination row.
//
// Parameters:
//   bmpSrc, bmpDst   - source bitmap and destination bitmap (written in place)
//   RotaryAngle      - rotation angle (radians); used only when ras and rac
//                      are both 0, see below
//   CenterX, CenterY - rotation centre
//   MoveX, MoveY     - translation applied to the rotated image
//   ras, rac         - Sin/Cos of the angle scaled by 256 (8 fractional bits)
//
// When ras and rac are both 0 (the declared defaults) the coefficients are
// derived from RotaryAngle. A real rotation can never have both scaled
// coefficients equal to 0, and using 0/0 as-is would map every destination
// pixel to the same source pixel.
//
// InitRotateTable populates g_RotateTable for multipliers -256..256 and
// indices 0..8192; it must have been called first, and destination sizes
// beyond that index range are not covered by the table.
//
// Destination pixels whose source position lies outside bmpSrc are left
// unchanged.
procedure Optimize06(bmpSrc, bmpDst: TBitmap; const RotaryAngle: double; const CenterX, CenterY, MoveX, MoveY: Integer; const ras: Integer = 0; rac: Integer = 0);
var
  srcBits  : PRGBQuadArray;
  dstBits  : PRGBQuadArray;
  cxc, cxs : Integer;
  cyc, cys : Integer;
  kcx, kcy : Integer;
  dstWidth : Integer;
  dstHeight: Integer;
  srcWidth : DWORD;
  srcHeight: DWORD;
  fixSin   : Integer;
  fixCos   : Integer;
begin
  srcBits := TBitmapImageAccess(TBMPAccess(bmpSrc).FImage).FDIB.dsBm.bmBits;
  dstBits := TBitmapImageAccess(TBMPAccess(bmpDst).FImage).FDIB.dsBm.bmBits;

  dstWidth  := bmpDst.Width;
  dstHeight := bmpDst.Height;
  srcWidth  := bmpSrc.Width;
  srcHeight := bmpSrc.Height;

  // Use the caller's coefficients when given, otherwise compute them with
  // the same scale and truncation that Optimize04 uses.
  fixSin := ras;
  fixCos := rac;
  if (fixSin = 0) and (fixCos = 0) then
  begin
    fixCos := Trunc(Cos(RotaryAngle) * (1 shl 8));
    fixSin := Trunc(Sin(RotaryAngle) * (1 shl 8));
  end;

  // Centre-dependent constants in 8-bit fixed point.
  cxc := (CenterX + MoveX) * fixCos;
  cxs := (CenterX + MoveX) * fixSin;
  cys := (CenterY + MoveY) * fixSin;
  cyc := (CenterY + MoveY) * fixCos;
  kcx := cxc - cys - CenterX * (1 shl 8);
  kcy := cxs + cyc - CenterY * (1 shl 8);

  // One call of the anonymous procedure per destination row.
  TParallel.For(0, dstHeight - 1,
    procedure(Y: Integer)
    var
      X: Integer;
      krx, kry: Integer;
      SrcX, SrcY: DWORD;
      intOffset: Integer;
    begin
      // Row-dependent part of the mapping; table look-up replaces Y * coeff.
      krx := kcx + g_RotateTable[fixSin, Y];
      kry := kcy - g_RotateTable[fixCos, Y];
      intOffset := Y * dstWidth;
      for X := 0 to dstWidth - 1 do
      begin
        // Negative fixed-point values turn into very large unsigned numbers
        // after the shift and are rejected by the bounds test below.
        SrcX := (g_RotateTable[fixCos, X] - krx) shr 8;
        SrcY := (g_RotateTable[fixSin, X] - kry) shr 8;
        if (SrcY < srcHeight) and (SrcX < srcWidth) then
        begin
          dstBits[intOffset + X] := srcBits[srcWidth * SrcY + SrcX];
        end;
      end;
    end);
end;

七:并行+SSE 优化(耗时:12 毫秒):
SSE 优化,就是对这个循环进行 SIMD 改写:

  // Scalar form of the per-row pixel loop that the SSE version vectorises.
  // Columns are visited from the last one down to 0.
  for X := dstWidth - 1 downto 0 do
  begin
    // Source coordinates in 8-bit fixed point; "shr 8" drops the fraction.
    SrcX := (X * rac - krx) shr 8;
    SrcY := (X * ras - kry) shr 8;
    // Copy only when the source position is inside the source bitmap.
    // NOTE(review): presumably SrcX/SrcY are unsigned here, as in
    // Optimize05/06, so that negative coordinates fail this test - confirm.
    if (SrcY < srcHeight) and (SrcX < srcWidth) then
    begin
      dstBits[intOffset + X] := srcBits[SrcY * srcWidth + SrcX];
    end;
  end;

由于参数过多,优化效果并不明显。这里就不贴代码了,具体可以参看源码。


测试环境:WIN10X64 + Delphi11
图片大小:4096*4096*32bits。上面的耗时都是顺时针旋转 15 度的用时。
源码地址:GitHub - dbyoung720/ImageGray: MMX/SSE/SSE2/SSE4/AVX/AVX2/AVX512 optimization
qq交流群:101611228

  • 3
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值