前言
博主在[①ARM Neon]: Neon Intrinsic简介一文中介绍了ARM Neon指令的基本知识,在本篇博客中就归纳总结一些常用的,或者一些特殊好用的Neon指令,持续更新。
Neon指令的查询可以参考使用下面网址:
https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
https://gcc.gnu.org/onlinedocs/gcc-4.6.4/gcc/ARM-NEON-Intrinsics.html
加载存储操作
- vld1/vst1 variants,顺序加载存储,例如vld1q_f32,vst1q_f32:
/* Element/structure load
ini{f0, f1, f2, f3} -> in = {f0, f1, f2, f3}
*/
float32x4_t in = vld1q_f32((const float*)ini);
vst1q_f32((float*)ini, in);
- vld2/vst2 variants,交错加载存储,例如vld2q_f32,vst2q_f32:
/* Element/structure load
ini{f0, f1, f2, f3, -> in.val[0] = {f0, f2, f4, f6}
    f4, f5, f6, f7}    in.val[1] = {f1, f3, f5, f7}
*/
float32x4x2_t in = vld2q_f32((const float*)ini);
vst2q_f32((float*)ini, in);
- vld3/vst3 variants,交错加载存储,例如vld3q_f32,vst3q_f32:
/* Element/structure load
ini{f0, f1, f2, f3, -> in.val[0] = {f0, f3, f6, f9}
f4, f5, f6, f7, in.val[1] = {f1, f4, f7, f10}
f8, f9, f10, f11} in.val[2] = {f2, f5, f8, f11}
*/
float32x4x3_t in = vld3q_f32((const float*)ini);
vst3q_f32((float*)ini, in);
- 同理也有vld4/vst4 variants,适用于例如 float32x4x4_t 返回数据类型,这里就不作更多复述了。
特殊操作
- vdupq_n,set all lanes to the same value,例如vdupq_n_f32,将一个 float32x4x2_t 向量数据初始化为0:
float32x4x2_t a_val;
/* a_val -> {{0.f, 0.f, 0.f, 0.f},
{0.f, 0.f, 0.f, 0.f}}
*/
a_val.val[0] = vdupq_n_f32(0.f);
a_val.val[1] = vdupq_n_f32(0.f);
基本运算
- vnegq,negate,例如vnegq_f32,将一个 float32x4x2_t 向量数据中的一组数取反:
float32x4x2_t a_val;
/* a_val{{f0, f1, f2, f3}, -> {{ f0, f1, f2, f3},
{f4, f5, f6, f7}} {-f4, -f5, -f6, -f7}}
*/
a_val.val[1] = vnegq_f32(a_val.val[1]);
- vmlaq/vmlsq,multiply-accumulate/multiply-subtract,例如vmlaq_f32/vmlsq_f32,做一组数的相乘后相加或相减:
float32x4x2_t a_val, b_val, c_val;
/* a_val{{a0, a1, a2, a3}, b_val{{b0, b1, b2, b3}, c_val{{c0, c1, c2, c3},
{a4, a5, a6, a7}} {b4, b5, b6, b7}} {c4, c5, c6, c7}}
RESULT[i] = a[i] + (b[i] * c[i]), for i= 0 to 3
a_val -> {{a0 + b0 * c0, a1 + b1 * c1, a2 + b2 * c2, a3 + b3 * c3},
          {a4, a5, a6, a7}}
*/
a_val.val[0] = vmlaq_f32(a_val.val[0], b_val.val[0], c_val.val[0]);
/*
RESULT[i] = a[i] - (b[i] * c[i]), for i= 0 to 3
a_val -> {{a0 - b0 * c0, a1 - b1 * c1, a2 - b2 * c2, a3 - b3 * c3},
          {a4, a5, a6, a7}}
*/
a_val.val[0] = vmlsq_f32(a_val.val[0], b_val.val[0], c_val.val[0]);
- vfmaq_laneq_f32,floating-point fused multiply-add to accumulator (vector)。将某个float32x4_t与另外一个float32x4_t中的单个数据进行broadcast乘法,将结果跟第三个float32x4_t相加。
float32x4_t acc_val, a_val, b_val;
/* acc_val{c0, c1, c2, c3}, a_val{a0, a1, a2, a3}, b_val{b0, b1, b2, b3},
// arguments: (float32x4_t a, float32x4_t b, float32x4_t v, const int lane)
acc_val = vfmaq_laneq_f32(acc_val, a_val, b_val, 1); // lane = 1, 取 b_val 中的 b1
acc_val -> {c0 + a0 * b1, c1 + a1 * b1, c2 + a2 * b1, c3 + a3 * b1}