ncnn 的仿射变换对于深度学习的预处理即小图变换进行了优化,速度可达到 OpenCV 的两倍。详细请参考 opencv ncnn warpaffine 性能测试。在具体实现方面,优点是简洁明快,双线性插值采用10bit 量化,比 OpenCV 精度高;缺点是边界填充仅支持常量值。
下面从 ncnn 的测试代码入手进行分析。
test_mat_pixel_affine.cpp
SRAND(7767517);
return test_mat_pixel_affine_0() || test_mat_pixel_affine_1();
test_mat_pixel_affine_0
return 0
|| test_mat_pixel_affine_a(60, 70)
|| test_mat_pixel_affine_b(60, 70)
|| test_mat_pixel_affine_c(60, 70)
|| test_mat_pixel_affine_d(60, 70)
|| test_mat_pixel_affine_e(60, 70)
|| test_mat_pixel_affine_f(60, 70)
|| test_mat_pixel_affine_g(60, 70)
|| test_mat_pixel_affine_a(120, 160)
|| test_mat_pixel_affine_b(120, 160)
|| test_mat_pixel_affine_c(120, 160)
|| test_mat_pixel_affine_d(120, 160)
|| test_mat_pixel_affine_e(120, 160)
|| test_mat_pixel_affine_f(120, 160)
|| test_mat_pixel_affine_g(120, 160)
|| test_mat_pixel_affine_a(220, 330)
|| test_mat_pixel_affine_b(220, 330)
|| test_mat_pixel_affine_c(220, 330)
|| test_mat_pixel_affine_d(220, 330)
|| test_mat_pixel_affine_e(220, 330)
|| test_mat_pixel_affine_f(220, 330)
|| test_mat_pixel_affine_g(220, 330);
test_mat_pixel_affine_a
get_rotation_matrix 生成变换参数矩阵。
// Round-trip check, repeated for 1 to 4 channels: warp a0 into a half-size
// buffer, warp that result back with the inverse matrix, and compare with a0.
for (int c = 1; c <= 4; c++)
{
ncnn::Mat a0 = RandomMat(w, h, c);
float tm[6];
float tm_inv[6];
// angle = 10 degrees, scale = 0.15, centred on (w / 2, h / 2)
ncnn::get_rotation_matrix(10.f, 0.15f, w / 2, h / 2, tm);
ncnn::invert_affine_transform(tm, tm_inv);
// a1 is the half-size intermediate image.
// NOTE(review): the trailing (size_t)c, c arguments are presumably elemsize and elempack - confirm against ncnn::Mat.
ncnn::Mat a1(w / 2, h / 2, (size_t)c, c);
// a2 starts as a copy of a0; the second warp below uses type -233, which
// skips destination pixels whose source lies outside the image, so those
// pixels keep the values copied from a0.
ncnn::Mat a2 = a0.clone();
if (c == 1)
{
ncnn::warpaffine_bilinear_c1(a0, w, h, a1, w / 2, h / 2, tm, 0);
ncnn::warpaffine_bilinear_c1(a1, w / 2, h / 2, a2, w, h, tm_inv, -233);
}
if (c == 2)
{
ncnn::warpaffine_bilinear_c2(a0, w, h, a1, w / 2, h / 2, tm, 0);
ncnn::warpaffine_bilinear_c2(a1, w / 2, h / 2, a2, w, h, tm_inv, -233);
}
if (c == 3)
{
ncnn::warpaffine_bilinear_c3(a0, w, h, a1, w / 2, h / 2, tm, 0);
ncnn::warpaffine_bilinear_c3(a1, w / 2, h / 2, a2, w, h, tm_inv, -233);
}
if (c == 4)
{
ncnn::warpaffine_bilinear_c4(a0, w, h, a1, w / 2, h / 2, tm, 0);
ncnn::warpaffine_bilinear_c4(a1, w / 2, h / 2, a2, w, h, tm_inv, -233);
}
// The round trip must reproduce the original image within the helper's tolerance.
if (CompareNearlyEqual(a0, a2) != 0)
{
fprintf(stderr, "test_mat_pixel_affine_a failed w=%d h=%d c=%d\n", w, h, c);
return -1;
}
}
return 0;
get_rotation_matrix
$$
\begin{aligned}
\begin{bmatrix} x' \\ y' \\ w' \end{bmatrix}
&= \begin{bmatrix} 1 & 0 & t_x \\ 0 & 1 & t_y \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} \cos\theta & -\sin\theta & 0 \\ \sin\theta & \cos\theta & 0 \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} s_x & 0 & 0 \\ 0 & s_y & 0 \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} 1 & 0 & -t_x \\ 0 & 1 & -t_y \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} x \\ y \\ w \end{bmatrix} \\
&= \begin{bmatrix} \cos\theta & -\sin\theta & t_x \\ \sin\theta & \cos\theta & t_y \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} s_x & 0 & 0 \\ 0 & s_y & 0 \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} 1 & 0 & -t_x \\ 0 & 1 & -t_y \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} x \\ y \\ w \end{bmatrix} \\
&= \begin{bmatrix} s_x\cos\theta & -s_y\sin\theta & t_x \\ s_x\sin\theta & s_y\cos\theta & t_y \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} 1 & 0 & -t_x \\ 0 & 1 & -t_y \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} x \\ y \\ w \end{bmatrix} \\
&= \begin{bmatrix} s_x\cos\theta & -s_y\sin\theta & -t_x s_x\cos\theta + t_y s_y\sin\theta + t_x \\ s_x\sin\theta & s_y\cos\theta & -t_x s_x\sin\theta - t_y s_y\cos\theta + t_y \\ 0 & 0 & 1 \end{bmatrix}
\begin{bmatrix} x \\ y \\ w \end{bmatrix}
\end{aligned}
$$
- 平移坐标,使原点位于 $(t_x, t_y)$;
- 旋转 $\theta$;
- 缩放 $(s_x, s_y)$;
- 平移回去。

(矩阵自右向左依次作用于坐标,因此严格来说是先缩放、后旋转;`get_rotation_matrix` 只有一个 `scale` 参数,即 $s_x = s_y$,此时旋转与缩放可以交换。)
// Fills tm[6] with a 2x3 affine matrix, row-major: [tm0 tm1 tm2; tm3 tm4 tm5].
// angle arrives in degrees; convert it to radians.
angle *= (float)(3.14159265358979323846 / 180);
// alpha = scale * cos(angle), beta = scale * sin(angle)
float alpha = cos(angle) * scale;
float beta = sin(angle) * scale;
// Linear part is [alpha beta; -beta alpha].
// NOTE(review): this looks like the same layout as OpenCV's getRotationMatrix2D - confirm.
tm[0] = alpha;
tm[1] = beta;
// Translation chosen so that the centre (dx, dy) maps onto itself.
tm[2] = (1.f - alpha) * dx - beta * dy;
tm[3] = -beta;
tm[4] = alpha;
tm[5] = beta * dx + (1.f - alpha) * dy;
invert_affine_transform
对于参数矩阵求逆。
// Inverts the 2x3 affine matrix tm = [A | t] into tm_inv = [A^-1 | -A^-1 * t].
// D is the determinant of the 2x2 linear part A.
float D = tm[0] * tm[4] - tm[1] * tm[3];
// Replace D by its reciprocal; a singular matrix (D == 0) gives 0, so every
// coefficient computed below becomes zero instead of dividing by zero.
D = D != 0.f ? 1.f / D : 0.f;
// A^-1 = adj(A) / det(A)
float A11 = tm[4] * D;
float A22 = tm[0] * D;
float A12 = -tm[1] * D;
float A21 = -tm[3] * D;
// Inverse translation: b = -A^-1 * t, with t = (tm[2], tm[5])
float b1 = -A11 * tm[2] - A12 * tm[5];
float b2 = -A21 * tm[2] - A22 * tm[5];
// Store in the same row-major 2x3 layout as the input.
tm_inv[0] = A11;
tm_inv[1] = A12;
tm_inv[2] = b1;
tm_inv[3] = A21;
tm_inv[4] = A22;
tm_inv[5] = b2;
warpaffine_bilinear_c1
$$
\begin{bmatrix} x_{src} \\ y_{src} \end{bmatrix} = \begin{bmatrix} m_{00} & m_{01} & t_x \\ m_{10} & m_{11} & t_y \end{bmatrix} \begin{bmatrix} x \\ y \\ 1 \end{bmatrix}
$$
调用同名函数。
return warpaffine_bilinear_c1(src, srcw, srch, srcw, dst, w, h, w, tm, type, v);
warpaffine_bilinear_c1
$$
\mathrm{dst}(x,y)=\mathrm{src}(M_{11}x+M_{12}y+M_{13},\ M_{21}x+M_{22}y+M_{23})
$$
`adelta` 和 `bdelta` 数组中的值只和像素在行中的位置 $x$ 有关,以 10 bit 定点数保存:

$$
\Delta a = M_{11}x, \qquad \Delta b = M_{21}x
$$
// View the bytes of v as the border value(s); this single-channel routine reads border_color[0].
const unsigned char* border_color = (const unsigned char*)&v;
// Bytes left over at the end of each destination row (stride minus the w written pixels).
const int wgap = stride - w;
const unsigned char* src0 = src;
unsigned char* dst0 = dst;
// Clamp an int value into the range of short.
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X), SHRT_MIN), SHRT_MAX)
// Round a float to the nearest int, halves away from zero; the min/max run after the int conversion.
#define SATURATE_CAST_INT(X) (int)::std::min(::std::max((int)((X) + ((X) >= 0.f ? 0.5f : -0.5f)), INT_MIN), INT_MAX)
// Per-column terms tm[0] * x and tm[3] * x in fixed point with 10 fractional
// bits, computed once here so the per-row loop only has to add them.
std::vector<int> adelta(w);
std::vector<int> bdelta(w);
for (int x = 0; x < w; x++)
{
adelta[x] = SATURATE_CAST_INT(tm[0] * x * (1 << 10));
bdelta[x] = SATURATE_CAST_INT(tm[3] * x * (1 << 10));
}
每次取一行中的8个数。
`X0` 和 `Y0` 为当前行对应的值(同样是 10 bit 定点数):

$$
X_0 = M_{12}y + M_{13}, \qquad Y_0 = M_{22}y + M_{23}
$$
`(sx_0, sy_0)` 和 `(sx_7, sy_7)` 为这 8 个像素中首、尾两个像素映射到源图上的坐标。`sxy_inout=1` 表示 8 个像素均落在源图内,`sxy_inout=2` 表示 8 个像素均落在源图外,其余情况(`sxy_inout=0`)需要逐像素判断。
int y = 0;
for (; y < h; y++)
{
int X0 = SATURATE_CAST_INT((tm[1] * y + tm[2]) * (1 << 10));
int Y0 = SATURATE_CAST_INT((tm[4] * y + tm[5]) * (1 << 10));
int x = 0;
for (; x + 7 < w; x += 8)
{
int sxy_inout = 0;
{
int X_0 = X0 + adelta[x];
int Y_0 = Y0 + bdelta[x];
int X_7 = X0 + adelta[x + 7];
int Y_7 = Y0 + bdelta[x + 7];
short sx_0 = SATURATE_CAST_SHORT((X_0 >> 10));
short sy_0 = SATURATE_CAST_SHORT((Y_0 >> 10));
short sx_7 = SATURATE_CAST_SHORT((X_7 >> 10));
short sy_7 = SATURATE_CAST_SHORT((Y_7 >> 10));
if (((unsigned short)sx_0 < srcw - 1 && (unsigned short)sy_0 < srch - 1) && ((unsigned short)sx_7 < srcw - 1 && (unsigned short)sy_7 < srch - 1))
{
// all inside
sxy_inout = 1;
}
else if ((sx_0 < -1 && sx_7 < -1) || (sx_0 >= srcw && sx_7 >= srcw) || (sy_0 < -1 && sy_7 < -1) || (sy_0 >= srch && sy_7 >= srch))
{
// all outside
sxy_inout = 2;
}
}
源像素均在源图内时:
`vaddq_s32` 实现 4 个整型数相加。`[_Xl _Xh]` 为 $\Delta a + X_0 = M_{11}x + M_{12}y + M_{13}$,`[_Yl _Yh]` 为 $\Delta b + Y_0 = M_{21}x + M_{22}y + M_{23}$。
if (sxy_inout == 1)
{
// all inside
#if __ARM_NEON
int32x4_t _Xl = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x));
int32x4_t _Xh = vaddq_s32(vdupq_n_s32(X0), vld1q_s32(adelta.data() + x + 4));
int32x4_t _Yl = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x));
int32x4_t _Yh = vaddq_s32(vdupq_n_s32(Y0), vld1q_s32(bdelta.data() + x + 4));
$$
f(x, y) = \frac{1}{(x_2-x_1)(y_2-y_1)}\begin{bmatrix} x_2 - x & x - x_1 \end{bmatrix}\begin{bmatrix} f(Q_{11}) & f(Q_{12}) \\ f(Q_{21}) & f(Q_{22}) \end{bmatrix}\begin{bmatrix} y_2 - y \\ y - y_1 \end{bmatrix}
$$
`vqshrn_n_s32` 带符号的饱和右移并窄化(移位量为立即数),将 `int` 结果转成了 `short`。
`_sxl` 和 `_sxh` 为对应到源图上的像素横坐标,`_syl` 和 `_syh` 为纵坐标。
`vdupq_n_u32` 将一个标量复制到向量的所有通道。`_v1024m1` 即 $1024-1$,用作取小数部分的掩码。
vreinterpretq_u32_s32 向量重新解释强制转换操作,有符号转无符号。
vmovn_u32 将每个值缩小到原始宽度的一半。
`vcombine_u16` 将两个 64 位的 `uint16x4_t` 向量合并成一个 128 位的 `uint16x8_t` 向量。
`_fx` 和 `_fy` 为 $x$ 和 $y$ 的小数部分(10 bit 定点数)。`_alpha0` 和 `_alpha1` 为 $x_2 - x$ 和 $x - x_1$,`_beta0` 和 `_beta1` 为 $y_2 - y$ 和 $y - y_1$。
vsubq_u16 向量减。
vmull_s16 带符号长乘(向量)。
int16x4_t _sxl = vqshrn_n_s32(_Xl, 10);
int16x4_t _sxh = vqshrn_n_s32(_Xh, 10);
int16x4_t _syl = vqshrn_n_s32(_Yl, 10);
int16x4_t _syh = vqshrn_n_s32(_Yh, 10);
uint32x4_t _v1024m1 = vdupq_n_u32((1 << 10) - 1);
uint16x8_t _fx = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Xh), _v1024m1)));
uint16x8_t _fy = vcombine_u16(vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yl), _v1024m1)), vmovn_u32(vandq_u32(vreinterpretq_u32_s32(_Yh), _v1024m1)));
uint16x8_t _alpha0 = vsubq_u16(vdupq_n_u16(1 << 10), _fx);
uint16x8_t _alpha1 = _fx;
uint16x8_t _beta0 = vsubq_u16(vdupq_n_u16(1 << 10), _fy);
uint16x8_t _beta1 = _fy;
`vaddw_s16` 为有符号宽加。`_a0l` 和 `_a0h` 分别为 4 个 $Q_{11}$ 相对于 `src0` 的偏移(`srcstride * sy + sx`),`_b0l` 和 `_b0h` 分别为 4 个 $Q_{21}$ 的偏移(再加上一行的 `srcstride`)。
vgetq_lane_s32 从一个向量中提取一个通道(元素)。
vld2_lane_u8 从内存中以双向量结构加载两个元素,并将其返回到结果中。 加载的值来自连续的存储器地址。 结构中未加载的元素将按原样返回结果。 n 是要加载的元素的索引。
`_a0a1` 和 `_b0b1` 中原本为空,每次从指定地址向指定通道加载一个 2 元素结构。$Q_{11}$ 和 $Q_{12}$ 的地址是相邻的,$Q_{21}$ 和 $Q_{22}$ 亦然。这样一次 `vld2_lane_u8` 就可以同时加载 $f(Q_{11})$ 和 $f(Q_{12})$,或 $f(Q_{21})$ 和 $f(Q_{22})$,并分别放入两个向量的同一通道。作为对比,TNN 中的 `WarpAffineCalculateOneRow` 调用两次 `vld1_lane_u8`。
`vmovl_u8` 为长移动(move long,并非左移),将读取的 `uint8x8_t` 的每个元素零扩展为 16 位,得到 `uint16x8_t`。
`_a0_0`、`_a1_0`、`_b0_0` 和 `_b1_0` 分别为 $f(Q_{11})$、$f(Q_{12})$、$f(Q_{21})$ 和 $f(Q_{22})$。
int16x4_t _srcstride = vdup_n_s16(srcstride);
int32x4_t _a0l = vaddw_s16(vmull_s16(_srcstride, _syl), _sxl);
int32x4_t _a0h = vaddw_s16(vmull_s16(_srcstride, _syh), _sxh);
int32x4_t _b0l = vaddw_s16(_a0l, _srcstride);
int32x4_t _b0h = vaddw_s16(_a0h, _srcstride);
uint8x8x2_t _a0a1 = uint8x8x2_t();
uint8x8x2_t _b0b1 = uint8x8x2_t();
{
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 0), _a0a1, 0);
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 0), _b0b1, 0);
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 1), _a0a1, 1);
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 1), _b0b1, 1);
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 2), _a0a1, 2);
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 2), _b0b1, 2);
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0l, 3), _a0a1, 3);
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0l, 3), _b0b1, 3);
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 0), _a0a1, 4);
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 0), _b0b1, 4);
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 1), _a0a1, 5);
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 1), _b0b1, 5);
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 2), _a0a1, 6);
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 2), _b0b1, 6);
_a0a1 = vld2_lane_u8(src0 + vgetq_lane_s32(_a0h, 3), _a0a1, 7);
_b0b1 = vld2_lane_u8(src0 + vgetq_lane_s32(_b0h, 3), _b0b1, 7);
}
uint16x8_t _a0_0 = vmovl_u8(_a0a1.val[0]);
uint16x8_t _a1_0 = vmovl_u8(_a0a1.val[1]);
uint16x8_t _b0_0 = vmovl_u8(_b0b1.val[0]);
uint16x8_t _b1_0 = vmovl_u8(_b0b1.val[1]);
vget_low_u16 返回128位输入向量的下半部分。输出是一个64位向量,其元素数为输入向量的一半。
vmlal_u16 无符号乘加。将第二和第三个向量中的对应元素相乘,然后将乘积与第一个输入向量中的对应元素相加。
vqshrn_n_u32 将整数的四字向量中的每个元素右移一个立即数,并将结果放入一个双字向量中,如果发生饱和,则置位粘滞 QC 标志(FPSCR 位[27])。
vqmovn_u16 将操作数向量的每个元素复制到目标向量的相应元素。结果元素是操作数元素宽度的一半,并且值会饱和到结果宽度。
`_a00_0l` 和 `_a00_0h` 为 $f(Q_{11})(x_2 - x) + f(Q_{12})(x - x_1)$,`_b00_0l` 和 `_b00_0h` 为 $f(Q_{21})(x_2 - x) + f(Q_{22})(x - x_1)$,即先对上下两行分别沿 $x$ 方向插值(结果右移 5 位)。
uint16x4_t _a00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_a0_0), vget_low_u16(_alpha0)), vget_low_u16(_a1_0), vget_low_u16(_alpha1)), 5);
uint16x4_t _a00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_a0_0), vget_high_u16(_alpha0)), vget_high_u16(_a1_0), vget_high_u16(_alpha1)), 5);
uint16x4_t _b00_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_low_u16(_b0_0), vget_low_u16(_alpha0)), vget_low_u16(_b1_0), vget_low_u16(_alpha1)), 5);
uint16x4_t _b00_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(vget_high_u16(_b0_0), vget_high_u16(_alpha0)), vget_high_u16(_b1_0), vget_high_u16(_alpha1)), 5);
$$
\begin{aligned}
f(x, y) &= (a_0\alpha_0 + a_1\alpha_1)\beta_0 + (b_0\alpha_0 + b_1\alpha_1)\beta_1 \\
&= f(Q_{11})(x_2 - x)(y_2 - y) + f(Q_{12})(x - x_1)(y_2 - y) \\
&\qquad + f(Q_{21})(x_2 - x)(y - y_1) + f(Q_{22})(x - x_1)(y - y_1)
\end{aligned}
$$
vqmovn_u16 结果元素是操作数元素宽度的一半,并且值会饱和到结果宽度。
vst1_u8 将向量存储到内存中。
uint16x4_t _dst_0l = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0l, vget_low_u16(_beta0)), _b00_0l, vget_low_u16(_beta1)), 15);
uint16x4_t _dst_0h = vqshrn_n_u32(vmlal_u16(vmull_u16(_a00_0h, vget_high_u16(_beta0)), _b00_0h, vget_high_u16(_beta1)), 15);
uint8x8_t _dst = vqmovn_u16(vcombine_u16(_dst_0l, _dst_0h));
vst1_u8(dst0, _dst);
dst0 += 8;
`a0`、`a1`、`b0` 和 `b1` 为相邻 4 个源像素的地址,由这 4 个像素插值得到结果。
#else
for (int xi = 0; xi < 8; xi++)
{
int X = X0 + adelta[x + xi];
int Y = Y0 + bdelta[x + xi];
short sx = SATURATE_CAST_SHORT((X >> 10));
short sy = SATURATE_CAST_SHORT((Y >> 10));
short fx = X & ((1 << 10) - 1);
short fy = Y & ((1 << 10) - 1);
short alpha0 = (1 << 10) - fx;
short alpha1 = fx;
short beta0 = (1 << 10) - fy;
short beta1 = fy;
const unsigned char* a0 = src0 + srcstride * sy + sx;
const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
dst0 += 1;
}
#endif // __ARM_NEON
}
如果 8 个像素全部落在边界外,则赋为指定的边界值;`type` 为 `-233` 时表示跳过不处理。
else if (sxy_inout == 2)
{
// all outside
if (type != -233)
{
#if __ARM_NEON
uint8x8_t _border_color = vdup_n_u8(border_color[0]);
vst1_u8(dst0, _border_color);
#else
for (int xi = 0; xi < 8; xi++)
{
dst0[xi] = border_color[0];
}
#endif // __ARM_NEON
}
else
{
// skip
}
dst0 += 8;
}
如果 8 个像素跨越了边界,逐元素处理:
- 如果不是透明模式且源像素在边界外,则直接取填充值;
- 如果是透明模式(`type == -233`)且源像素的 2×2 邻域没有完全落在源图内(即 `sx`、`sy` 为负,或位于右、下边界上及其外侧),则跳过;
- 否则根据位置独立确定 `a0`、`a1`、`b0` 和 `b1` 的值。
else // if (sxy_inout == 0)
{
for (int xi = 0; xi < 8; xi++)
{
int X = X0 + adelta[x + xi];
int Y = Y0 + bdelta[x + xi];
short sx = SATURATE_CAST_SHORT((X >> 10));
short sy = SATURATE_CAST_SHORT((Y >> 10));
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
{
dst0[0] = border_color[0];
}
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
{
// skip
}
else
{
short fx = X & ((1 << 10) - 1);
short fy = Y & ((1 << 10) - 1);
short alpha0 = (1 << 10) - fx;
short alpha1 = fx;
short beta0 = (1 << 10) - fy;
short beta1 = fy;
short sx1 = sx + 1;
short sy1 = sy + 1;
const unsigned char* a0 = src0 + srcstride * sy + sx;
const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
{
a0 = type != -233 ? border_color : dst0;
}
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
{
a1 = type != -233 ? border_color : dst0;
}
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
{
b0 = type != -233 ? border_color : dst0;
}
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
{
b1 = type != -233 ? border_color : dst0;
}
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
}
dst0 += 1;
}
}
}
处理行尾剩余的元素。
for (; x < w; x++)
{
int X = X0 + adelta[x];
int Y = Y0 + bdelta[x];
short sx = SATURATE_CAST_SHORT((X >> 10));
short sy = SATURATE_CAST_SHORT((Y >> 10));
if (type != -233 && (sx < -1 || sx >= srcw || sy < -1 || sy >= srch))
{
dst0[0] = border_color[0];
}
else if (type == -233 && ((unsigned short)sx >= srcw - 1 || (unsigned short)sy >= srch - 1))
{
// skip
}
else
{
short fx = X & ((1 << 10) - 1);
short fy = Y & ((1 << 10) - 1);
short alpha0 = (1 << 10) - fx;
short alpha1 = fx;
short beta0 = (1 << 10) - fy;
short beta1 = fy;
short sx1 = sx + 1;
short sy1 = sy + 1;
const unsigned char* a0 = src0 + srcstride * sy + sx;
const unsigned char* a1 = src0 + srcstride * sy + sx + 1;
const unsigned char* b0 = src0 + srcstride * (sy + 1) + sx;
const unsigned char* b1 = src0 + srcstride * (sy + 1) + sx + 1;
if ((unsigned short)sx >= srcw || (unsigned short)sy >= srch)
{
a0 = type != -233 ? border_color : dst0;
}
if ((unsigned short)sx1 >= srcw || (unsigned short)sy >= srch)
{
a1 = type != -233 ? border_color : dst0;
}
if ((unsigned short)sx >= srcw || (unsigned short)sy1 >= srch)
{
b0 = type != -233 ? border_color : dst0;
}
if ((unsigned short)sx1 >= srcw || (unsigned short)sy1 >= srch)
{
b1 = type != -233 ? border_color : dst0;
}
dst0[0] = (unsigned char)(((((unsigned short)((a0[0] * alpha0 + a1[0] * alpha1) >> 5) * beta0)) + (((unsigned short)((b0[0] * alpha0 + b1[0] * alpha1) >> 5) * beta1))) >> 15);
}
dst0 += 1;
}
dst0 += wgap;
}
#undef SATURATE_CAST_SHORT
#undef SATURATE_CAST_INT
参考资料:
- opencv ncnn warpaffine 性能测试
- OpenCV warpAffine的天坑
- 学术资讯 | TNN新版本上线!全新特性,更加好用!
- ComputeLibrary/src/core/NEON/kernels/NEWarpKernel.cpp
- Image Transformations
- Bilinear Interpolation
- Bilinear interpolation
- NEON优化
- NEON intrinsics for extracting lanes from a vector into a register
- D.9.5. VLD2_LANE
- ARM Neon 编程(一):读取与存储
- D.9.2. VLD1_LANE
- NEON Programmer’s Guide(2) —— 让编译器帮你实现NEON加速
- 使用ARM NEON Intrinsics加速Video Codec
- NEON intrinsics for converting vectors
- ARM_NEON_CNN编程
- 常用NEON 内置函数记录备用
- NEON intrinsics for loading an N-element structure
- ARM NEON 优化
- WarpAffine
- Decomposition of a nonsquare affine matrix
- Decompose affine transformation (including shear in x and y)
- Decomposing an Affine transformation
- Given this transformation matrix, how do I decompose it into translation, rotation and scale matrices?
- Image Warping
- Affine and Perspective Warping
- 2D Geometrical Transformations
- 5.2DtransformsAhandout.pdf
- Geometric Transformations
- Opencv 仿射变换原理代码解析
- NEON intrinsics for extracting lanes from a vector into a register
- NEON intrinsics for loading an N-element structure