大前端CPU优化技术--NEON intrinsics开篇

最新推荐文章于 2023-09-28 13:55:55 发布

江湖修行

最新推荐文章于 2023-09-28 13:55:55 发布

阅读量763

点赞数

分类专栏： cpu 大前端端异构性能优化文章标签： cpu 性能优化大前端

本文链接：https://blog.csdn.net/jh1988abc/article/details/126612050

版权

大前端同时被 3 个专栏收录

24 篇文章 3 订阅

订阅专栏

端异构性能优化

8 篇文章 0 订阅

订阅专栏

cpu

4 篇文章 1 订阅

订阅专栏

前言

本文继续分析NEON指令，上篇文章中我们浅析了arm指令集，指令格式相关的基础知识，本文会继续深入NEON intrinsics指令的相关内容和作用浅析，内容可能会有点多，为保证大家的学习效果和良好的阅读体验，准备分上下两篇文章讲解，请各位读者耐心阅读。

intrinsics 指令介绍

Load/Store

以解交织的方式加载数据，将内存中的数据加载进neon寄存器

// 以解交织方式加载数据到n个向量寄存器, n为1~4
Result_t vld[n]<q>_type(Scalar_t *p_addr);

//用type类型的内存中第一个值，初始化第一个新vector的所有元素，用内存中第二个值，初始化第二个新vector的所有元素。
vld2_dup_type

以交织的方式存储数据，将neon寄存器中的数据存储到内存

// 将n个寄存器的N通道数据以交织方式存储到内存中, n为1~4
void vst[n]<q>_lane_type(Scalar_t *p_addr, Vector_t M, int N);
// 将元素类型为type格式的vector中指定的某个元素装入内存
vst1_lane_type

Arithmetic

整数和浮点数的加减运算

// 基本的加减操作
Result_t vadd<q>_type(Vector1_t N, Vector2_t M);
Result_t vsub<q>_type(Vector1_t N, Vector2_t M);

// L(Long)类型的指令加减运算，输出向量长度是输入的两倍。
Result_t vaddl_type(Vector1_t N, Vector2_t M);
Result_t vsubl_type(Vector1_t N, Vector2_t M);

// W(Wide)类型的指令加减运算，第一个输入向量的长度是第二个输入向量长度的两倍。
Result_t vaddw_type(Vector1_t N, Vector2_t M);
Result_t vsubw_type(Vector1_t N, Vector2_t M);

// H(half)类型的加减运算；将计算结果除以2。
Result_t vhadd<q>_type(Vector1_t N, Vector2_t M);
Result_t vhsub<q>_type(Vector1_t N, Vector2_t M);

// Q(Saturated)饱和类型的加减操作，结果超出元素的最大值时，元素就取最大值。
Result_t vqadd<q>_type(Vector1_t N, Vector2_t M);
Result_t vqsub<q>_type(Vector1_t N, Vector2_t M);

// RH(Rounding Half)类型的运算：将计算结果除以2并做舍入（注意：NEON 只提供 vrhadd，没有对应的 vrhsub 指令）
Result_t vrhadd<q>_type(Vector1_t N, Vector2_t M);
Result_t vrhsub<q>_type(Vector1_t N, Vector2_t M);

// HN(High half, Narrow)类型的加减操作：取计算结果每个元素的高半部分，输出元素位宽是输入的一半
Result_t vaddhn_type(Vector1_t N, Vector2_t M);
Result_t vsubhn_type(Vector1_t N, Vector2_t M);

// RHN(Rounding, High half, Narrow)类型的加减操作：在取高半部分之前先做舍入
Result_t vraddhn_type(Vector1_t N, Vector2_t M);
Result_t vrsubhn_type(Vector1_t N, Vector2_t M);

Multiply

整型和浮点型的乘法运算, 参与计算的都是向量

// 基本乘法操作
Result_t vmul<q>_type(Vector1_t N, Vector2_t M);

// L(Long)类型的乘法操作，输出元素位宽是输入的两倍，防止溢出
Result_t vmull_type(Vector1_t N, Vector2_t M);

// QDL(Saturated, Double, Long)类型的乘法操作，当结果溢出时，取饱和值
Result_t vqdmull_type(Vector1_t N, Vector2_t M);

// 基本的乘加和乘减操作
Result_t vmla<q>_type(Vector1_t N, Vector2_t M, Vector3_t P);
Result_t vmls<q>_type(Vector1_t N, Vector2_t M, Vector3_t P);

// L(Long)类型的乘加和乘减操作
Result_t vmlal_type(Vector1_t N, Vector2_t M, Vector3_t P);
Result_t vmlsl_type(Vector1_t N, Vector2_t M, Vector3_t P);

// QDL(Saturated, Double, Long)类型的乘加和乘减操作
Result_t vqdmlal_type(Vector1_t N, Vector2_t M, Vector3_t P);
Result_t vqdmlsl_type(Vector1_t N, Vector2_t M, Vector3_t P);

// QDH(Saturated, Double, Half)类型的乘法操作：乘积加倍后取高半部分，溢出时取饱和值
Result_t vqdmulh<q>_type(Vector1_t N, Vector2_t M);

// QRDH(Saturated, Rounding, Double, Half)类型的乘法操作：乘积加倍并舍入后取高半部分
Result_t vqrdmulh<q>_type(Vector1_t N, Vector2_t M);

带通道类型的乘法操作

// L(Long)类型的带通道乘法操作
Result_t vmull_lane_type(Vector1_t N, Vector2_t M, int n);

// 基本的乘加和乘减操作
Result_t vmla<q>_lane_type(Vector1_t N, Vector2_t M, Vector3_t P, int n);
Result_t vmls<q>_lane_type(Vector1_t N, Vector2_t M, Vector3_t P, int n);

// L(long) 类型的乘加和乘减操作
Result_t vmlal_lane_type(Vector1_t N, Vector2_t M, Vector3_t P, int n);
Result_t vmlsl_lane_type(Vector1_t N, Vector2_t M, Vector3_t P, int n);

// QDL(Saturated, Double, long) 类型的乘加和乘减操作
Result_t vqdmlal_lane_type(Vector1_t N, Vector2_t M, Vector3_t P, int n);
Result_t vqdmlsl_lane_type(Vector1_t N, Vector2_t M, Vector3_t P, int n);

// QDH(Saturated, Double, Half) 类型的操作
Result_t vqdmulh<q>_lane_type(Vector1_t N, Vector2_t M, int n);

vqdmull_lane_type

vqdmulh_lane_type

// QRDH(Saturated, Rounding, Double, Half)类型的乘法操作
vqrdmulhq_lane_type

向量和标量的乘法

// 基本的向量和标量的乘法
Result_t vmul<q>_n_type(Vector_t N, Scalar_t M);

// L(Long) 类型的向量和标量的乘法
Result_t vmull_n_type(Vector_t N, Scalar_t M);

// QDL(Saturated, Double, long) 类型的向量和标量的乘法
Result_t vqdmull_n_type(Vector_t N, Scalar_t M);

// QDH(Saturated, Double, Half) 类型的向量和标量的乘法
Result_t vqdmulh<q>_n_type(Vector_t N, Scalar_t M);

// QRDH(Saturated, Rounding, Double, Half) 类型的向量和标量的乘法
Result_t vqrdmulh<q>_n_type(Vector_t N, Scalar_t M);

// L(Long) 类型的乘加和乘减操作
Result_t vmlal_n_type(Vector1_t N, Vector2_t M, Scalar_t P);
Result_t vmlsl_n_type(Vector1_t N, Vector2_t M, Scalar_t P);

// QDL(Saturated, Double, long) 类型的乘加和乘减
Result_t vqdmlal_n_type(Vector1_t N, Vector2_t M, Scalar_t P);
Result_t vqdmlsl_n_type(Vector1_t N, Vector2_t M, Scalar_t P);

// QRDH(Saturated, Rounding, Double, Half)类型的乘法操作
Result_t vqrdmulhq_n_type(Vector1_t N, Vector2_t M, Scalar_t P);

比较类型

注： eq 表示相等， ge 表示大于或等于， gt 表示大于， le 表示小于或等于， lt 表示小于

逻辑比较操作，比较结果为true，输出向量的对应通道将被设置为全 1，否则设置为全0 。

Result_t vceq<q>_type(Vector1_t N, Vector2_t M);
vceqq_type
Result_t vcge<q>_type(Vector1_t N, Vector2_t M);
vcgeq_type
Result_t vcle<q>_type(Vector1_t N, Vector2_t M);
vcleq_type
Result_t vcgt<q>_type(Vector1_t N, Vector2_t M);
vcgtq_type
Result_t vclt<q>_type(Vector1_t N, Vector2_t M);
vcltq_type

向量的绝对值比较，比较结果为true时，输出向量对应通道将被设置为全1，否则设置为全0。

Result_t vcage<q>_type(Vector1_t N, Vector2_t M);
vcageq_f32
Result_t vcale<q>_type(Vector1_t N, Vector2_t M);
vcaleq_f32
Result_t vcalt<q>_type(Vector1_t N, Vector2_t M);
vcaltq_f32
Result_t vcagt<q>_type(Vector1_t N, Vector2_t M);
vcagtq_f32

逻辑类

按位与\或\非\异或操作

Result_t vand<q>_type(Vector1_t N, Vector2_t M);
Result_t vorr<q>_type(Vector1_t N, Vector2_t M);
Result_t vmvn<q>_type(Vector_t N);
Result_t veor<q>_type(Vector1_t N, Vector2_t M);

其他

// 按通道做按位与测试：当 N 与 M 对应通道按位与的结果不为 0 时，将输出向量对应通道设置为全 1，否则设置为全 0
Result_t vtst<q>_type(Vector1_t N, Vector2_t M);

// M 作为 mask，标识是否对 N 做清零操作。当 M 中某位为 1, 则将 N 中对应位清零，ri = ~M & N
Result_t vbic<q>_type(Vector1_t N, Vector2_t M);

//  ri = ai | (~bi)
Result_t vorn<q>_type(Vector1_t N, Vector2_t M);

// 按位选择，参数为(N, M, P)。第一个参数 N 作为 mask，按位 select。当 N 中某位是 1 时，将选择 M 中对应位作为输出，否则选择 P 中对应位，即 ri = (N & M) | (~N & P)
Result_t vbsl<q>_type(Vector1_t N, Vector2_t M, Vector3_t P);

数据类型转换

浮点数之间的转化, 以及浮点类型与整数类型之间的转化

// f32、u32、s32之间的转换。f32转为整数时向零取整（截断小数部分）；在f32转到u32时，如果是负数，则转换后饱和为0
Result_t vcvt_type1_type2
// 单精度浮点转化为整数类型
Result_t vcvt<q>_type_f32(Vector_t N);

// 整数类型转化为单精度浮点
Result_t vcvt<q>_f32_type(Vector_t N);

// f32转化为f16（函数名中目标类型在前、源类型在后）
Result_t vcvt_f16_f32(Vector_t N);

// f16转化为f32
Result_t vcvt_f32_f16(Vector_t N);

数据重排

向量提取

//取第2个输入vector的低n个元素放入新vector的高位，新vector剩下的元素取自第1个输入vector最高的几个元素(可实现vector内元素位置的移动)
Result_t vext<q>_type(Vector1_t N, Vector2_t M, int n);
vextq_type

查找操作

//第二个vector是索引，根据索引去第一个vector（相当于数组）中搜索相应的元素，并输出新的vector，超过范围的索引返回的是0.
Result_t vtbl[n]_type(Vector1_t N, Vector2_t M);
// 跟vtbl[n]_type功能一样，不过搜索到的元素是用来替换第一个vector中的元素，并输出替换后的新vector，当索引超出范围时，则不替换第一个vector中相应的元素。
Result_t vtbx[n]_type(Vector1_t N, Vector2_t M, Vector3_t P);

向量翻转

Result_t vrev64<q>_type(Vector_t N);
Result_t vrev32<q>_type(Vector_t N);
Result_t vrev16<q>_type(Vector_t N);

vrev16<q>_type 按照 16bit 为块，块内数据按照 8bit 为单位进行翻转。
vrev32<q>_type 按照 32bit 为块，块内数据按照 8bit，16bit 为单位进行翻转。
vrev64<q>_type 按照 64bit 为块，块内数据按照8bit, 16bit, 32bit为单位进行翻转。

交叉和解交叉操作

// 交织操作，将两个输入vector的元素通过交叉生成一个有两个vector的矩阵
Result_t vzip<q>_type(Vector1_t N, Vector2_t M);

// 解交织操作，将两个输入vector的元素通过反交叉生成一个有两个vector的矩阵（通过这个可实现n-way 交织）
Result_t vuzp<q>_type(Vector1_t N, Vector2_t M);

Shift

立即数类型的位移

// 基本的立即数左移和右移
Result_t vshr<q>_n_type(Vector_t N, int n);
Result_t vshl<q>_n_type(Vector_t N, int n);

// R(rounding) 类型的右移操作
Result_t vrshr<q>_n_type(Vector_t N, int n);

// Q(Saturated) 类型的立即数左移操作，结果溢出时取饱和值
Result_t vqshl<q>_n_type(Vector_t N, int n);

// 右移累加操作
Result_t vsra<q>_n_type(Vector1_t N, Vector2_t M, int n);

// R(rounding) 类型的右移累加操作
Result_t vrsraq_n_type(Vector1_t N, Vector2_t M, int n);

//  ri = N << n 输入vector是有符号，输出vector是无符号
Result_t vqshlu_n_type(Vector_t N, int n);
// Q(Saturated) 类型的左移操作,而且输入是有符号,输出是无符号的
Result_t vqshluq_n_type(Vector_t N, int n);


// N(Narrow) 类型的右移操作
Result_t vshrn_n_type(Vector_t N, int n);

// QN(Saturated, Narrow) 类型的右移操作, 而且输入是有符号,输出是无符号的
Result_t vqshrun_n_type(Vector_t N, int n);

// QRN(Saturated, Rounding, Narrow) 类型的右移操作, 而且输入是有符号,输出是无符号的
Result_t vqrshrun_n_type(Vector_t N, int n);

// QN(Saturated, Narrow) 类型的右移操作
Result_t vqshrn_n_type(Vector_t N, int n);

// RN(Rounding, Narrow) 类型的右移操作
Result_t vrshrn_n_type(Vector_t N, int n);

// QRN(Saturated, Rounding, Narrow) 类型的右移操作
Result_t vqrshrn_n_type(Vector_t N, int n);

// L(Long) 类型的左移操作，输出元素位宽是输入的两倍
Result_t vshll_n_type(Vector_t N, int n);

非立即数类型的位移

// 左移
Result_t vshlq_type(Vector1_t N, Vector2_t M);

// Q(Saturated) 类型的左移操作
Result_t vqshl<q>_type(Vector1_t N, Vector2_t M);

// R(Rounding) 类型的左移操作
Result_t vrshl<q>_type(Vector1_t N, Vector2_t M);

移位并插入

// 将向量 M 中各个通道先右移 n 位, 然后将移动后的元素插入到 N 对应的元素中,
// 并保持 N 中每个元素的高 n 位不变
Result_t vsri<q>_n_type(Vector1_t N, Vector2_t M, int n);
vsri_n_type:
vsriq_n_type:

// 将向量 M 中各个通道先左移 n 位, 然后将移动后的元素插入到 N 对应的元素中,
// 并保持 N 中每个元素的低 n 位不变
Result_t vsli<q>_n_type(Vector1_t N, Vector2_t M, int n);
vsli_n_type:
vsliq_n_type: