[code=c]
#include <stdio.h>
#include <arm_neon.h>
/*
 * Sums a fixed array of floats using ARM NEON intrinsics.
 *
 * Complete groups of four elements are accumulated lane-wise in a 4-lane
 * vector; the four lanes are then added together and any remaining
 * (fewer than four) elements are added with scalar code. The total is
 * printed to stdout.
 *
 * Returns 0.
 */
int main(void) {
    float arr[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    int len = (int)(sizeof(arr) / sizeof(arr[0]));
    int dim_x4 = len / 4;   // number of complete 4-element groups
    int left_x4 = len % 4;  // elements left over after the last full group

    // 4-lane accumulator with every lane initialised to 0.
    float32x4_t sum_vec = vdupq_n_f32(0.0f);
    for (int dim = 0; dim < dim_x4; dim++) {
        // Load four consecutive floats starting at the current group.
        float32x4_t data_vec = vld1q_f32(arr + dim * 4);
        // Lane-wise add into the running accumulator.
        sum_vec = vaddq_f32(sum_vec, data_vec);
    }

    // Horizontal reduction: add the four accumulator lanes together.
    float32_t sum = vgetq_lane_f32(sum_vec, 0) + vgetq_lane_f32(sum_vec, 1)
                  + vgetq_lane_f32(sum_vec, 2) + vgetq_lane_f32(sum_vec, 3);

    // Derive the tail start from the group count rather than from the
    // pointer used by the last loop iteration, so it is also correct when
    // the vector loop never runs (fewer than four elements).
    const float *tail = arr + dim_x4 * 4;
    for (int left = 0; left < left_x4; left++) {
        sum += tail[left];  // scalar accumulation of the leftover elements
    }

    printf("sum = %f\n", sum);
    return 0;
}
[/code]