1.使用NEON intrinsic指令,实现下图所示vsum传统串行函数的向量化
要求:
a) #include <arm_neon.h>
b) 函数接口声明为: int vsum_arm_neon(int *v, int n)
c) 主函数分别调用vsum和vsum_arm_neon接口,加入clock计时函数,对比向量化前后的性能。
d) 初始化长度n为1024000,数组v元素初始为0~10之间随机数
e) 编译选项使用–O1优化等级
f) 存储int元素的128位向量数据结构名称:int32x4_t
g) 向量vs记录累加结果,其初始化使用函数:vdupq_n_s32
h) 累加结果向量vs各元素的最终求和,使用:vaddvq_s32
i) 可在ARM官网大全,查看每条Intrinsic函数用法:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <arm_neon.h>
int vsum(int *v, int n) {
int sum = 0;
for (int i = 0; i < n; i++) {
sum += v[i];
}
return sum;
}
int vsum_arm_neon(int *v, int n) {
int32x4_t vs = vdupq_n_s32(0);
int32x4_t *vv = (int32x4_t*) v;
int32x4_t vtemp;
int i;
for (i = 0; i < n / 4; i++) {
vtemp = vld1q_s32(&vv[i]);
vs = vaddq_s32(vs, vtemp);
}
int32_t sum = vaddvq_s32(vs);
for (i = i * 4; i < n; i++) {
sum += v[i];
}
return sum;
}
int main() {
int n = 1024000;
int v[n];
// 初始化数组v为0~10之间的随机数
srand(time(NULL));
for (int i = 0; i < n; i++) {
v[i] = rand() % 11;
}
clock_t start_time = clock();
int sum = vsum(v, n);
clock_t end_time = clock();
printf("vsum elapsed time: %.4f seconds\n", (double) (end_time - start_time) / CLOCKS_PER_SEC);
start_time = clock();
sum = vsum_arm_neon(v, n);
end_time = clock();
printf("vsum_arm_neon elapsed time: %.4f seconds\n", (double) (end_time - start_time) / CLOCKS_PER_SEC);
return 0;
}
2.使用omp simd指导语句,对图像数据格式转换函数-bgra2rgb
(如下图所示),进行向量化改写,并对比改写前后时间。
要求:
a) 转换结果需正确
b) 打印对比bgra2rgb函数运行时间
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <arm_neon.h>
using namespace std;
void bgra2rgb(const uint8_t *src, uint8_t *dst, int w, int h)
{
#pragma omp parallel for simd
for (int i = 0; i < h; ++i)
{
for (int j = 0; j < w; j++)
{
dst[(i * w + j) * 3] = src[(i * w + j) * 4 + 2];
dst[(i * w + j) * 3 + 1] = src[(i * w + j) * 4 + 1];
dst[(i * w + j) * 3 + 2] = src[(i * w + j) * 4];
}
}
}
int main()
{
int nloop = 100;
const int w = 480;
const int h = 640;
uint8_t bgra_mat[w * h * 4];
uint8_t rgb_mat[w * h * 3];
srand(100);
for (int i = 0; i < w * h * 4; i++)
{
bgra_mat[i] = rand() % 256;
}
clock_t t = clock();
for (int iloop = 0; iloop < nloop; iloop++)
bgra2rgb(bgra_mat, rgb_mat, w, h);
t = clock() - t;
cout << "bgra[4-6] data:" << (int)bgra_mat[4] << "," << (int)bgra_mat[5] << "," << (int)bgra_mat[6] << endl;
cout << "rgb[3-5] data:" << (int)rgb_mat[3] << "," << (int)rgb_mat[4] << "," << (int)rgb_mat[5] << endl;
cout << "seq cost time(clock):" << t / nloop << endl;
}
3.使用了如下图所示的omp simd语句来实现bgra2rgb函数自动向量化。通过手动向量化来实现
a) 借助interleaving交叉存取技术的ld4、st3指令
b) 可参考如下图所示的RGB to BGR的汇编层面实现过程
c)向量化函数声明如下:
void bgra2rgb_neon(const int8_t *src, int8_t *dst, int w, int h);
d)使用intrinsic函数,手动向量化图像数据结构转化函数bgra2rgb。
f)对比bgra2rgb原始函数和bgra2rgb_neon向量化函数的运行结果和时间
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <arm_neon.h>
using namespace std;
/*
void bgra2rgb(const uint8_t *src, uint8_t *dst, int w, int h)
{
#pragma omp parallel for simd
for (int i = 0; i < h; ++i)
{
for (int j = 0; j < w; j++)
{
dst[(i * w + j) * 3] = src[(i * w + j) * 4 + 2];
dst[(i * w + j) * 3 + 1] = src[(i * w + j) * 4 + 1];
dst[(i * w + j) * 3 + 2] = src[(i * w + j) * 4];
}
}
}
*/
void bgra2rgb(const uint8_t *src, uint8_t *dst, int w, int h)
{
for (int i = 0; i < h; ++i)
{
for (int j = 0; j < w; j += 4)
{
uint8x8x4_t vsrc = vld4_u8(&src[(i * w + j) * 4]);
uint8x8x3_t vdst;
vdst.val[0] = vsrc.val[2]; // B
vdst.val[1] = vsrc.val[1]; // G
vdst.val[2] = vsrc.val[0]; // R
vst3_u8(&dst[(i * w + j) * 3], vdst);
}
}
}
int main()
{
int nloop = 100;
const int w = 480;
const int h = 640;
uint8_t bgra_mat[w * h * 4];
uint8_t rgb_mat[w * h * 3];
srand(100);
for (int i = 0; i < w * h * 4; i++)
{
bgra_mat[i] = rand() % 256;
}
clock_t t = clock();
for (int iloop = 0; iloop < nloop; iloop++)
bgra2rgb(bgra_mat, rgb_mat, w, h);
t = clock() - t;
cout << "bgra[4-6] data:" << (int)bgra_mat[4] << "," << (int)bgra_mat[5] << "," << (int)bgra_mat[6] << endl;
cout << "rgb[3-5] data:" << (int)rgb_mat[3] << "," << (int)rgb_mat[4] << "," << (int)rgb_mat[5] << endl;
cout << "seq cost time(clock):" << t / nloop << endl;
}
4.实现针对32位浮点型向量的neon版本除法接口
目标:
实现neon版本除法接口,并在主函数中调用该接口,保证计算结果正确。
要求:
a) 接口声明为:float32x4_t neon_vdivq_f32(float32x4_t op1, float32x4_t op2)
以实现形如r[i]=op1[i]/op2[i]
的向量高精度除法。
其中op1、op2为操作数向量,r为返回值向量。三者皆为float32x4_t
类型。
b) 在主函数中,使用长度为32的一维数组作为输入来调用neon_vdivq_f32
接口,并验证其正确性。
提示:
vrecpeq_f32
vrecpsq_f32
牛顿 - 拉夫逊迭代
#include <stdio.h>
#include <arm_neon.h>
float32x4_t vdivq_f32_neon(float32x4_t op1, float32x4_t op2)
{
float32x4_t reciprocal = vrecpeq_f32(op2); // 计算 op2 的倒数
reciprocal = vrecpsq_f32(op2, reciprocal); // 进行牛顿-拉夫逊迭代优化
return vmulq_f32(op1, reciprocal); // 返回 op1 与 op2 的逐元素除法结果
}
int main()
{
float arr_op1[4] = {1.0f, 2.0f, 3.0f, 4.0f};
float arr_op2[4] = {0.5f, 1.0f, 2.0f, 0.0f}; // 注意处理除数为0的情况
float32x4_t op1 = vld1q_f32(arr_op1);
float32x4_t op2 = vld1q_f32(arr_op2);
float32x4_t result = vdivq_f32_neon(op1, op2);
float res[4];
vst1q_f32(res, result);
for (int i = 0; i < 4; i++) {
printf("Result[%d]: %f\n", i, res[i]);
}
return 0;
}
5.使用simd和循环展开技术,对矩阵向量乘法dgemv进行改写,以实现y=Ax计算的性能优化。其中A是大小为n x m的矩阵,x、y是长度为m的向量。
要求
a) 外层i循环进行循环展开
b) 内层j循环进行simd向量化
c) 对比串行和优化版本的运行时间,确保结果正确性
提示
vdupq_n_f64:标量填充
vfmaq_f64:融合乘加操作
vaddvq_f64:向量内部求和
#include <iostream>
#include <ctime>
#include <omp.h>
#include <arm_neon.h>
using namespace std;
/*
void dgemv(const int n, const int m, const double *const A,
const double *const x, double *y)
{
for (int i = 0; i < n; i++)
{
double y_tmp = 0.0;
for (int j = 0; j < m; j++)
{
y_tmp += A[i * m + j] * x[j];
}
y[i] = y_tmp;
}
return;
}
*/
void dgemv(const int n, const int m, const double *const A,
const double *const x, double *y)
{
for (int i = 0; i < n; i +=2) // 外层i循环进行循环展开
{
float64x2_t y_tmp = vdupq_n_f64(0.0);
float64x2_t y_tmp0 = vdupq_n_f64(0.0);
for (int j = 0; j < m; j +=2) // 内层j循环进行simd向量化
{
float64x2_t x_vec = vld1q_f64(&x[j]);
y_tmp = vfmaq_f64(y_tmp, vld1q_f64(&A[i * m + j]), x_vec);
y_tmp0 = vfmaq_f64(y_tmp0, vld1q_f64(&A[(i + 1) * m + j]), x_vec);
}
y[i] = vaddvq_f64(y_tmp);
y[i + 1] = vaddvq_f64(y_tmp0);
}
}
int main()
{
int nloop = 10;
int n = 1024;
double *mat_A = new double[n * n];
double *vec_x = new double[n];
double *vec_y = new double[n];
srand(0);
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
}
vec_y[i] = 0.0;
}
clock_t t = clock();
for (int iloop = 0; iloop < nloop; iloop++)
{
dgemv(n, n, mat_A, vec_x, vec_y);
}
t = clock() - t;
cout << "y[0]:" << vec_y[0] << endl;
cout << "y[1]:" << vec_y[1] << endl;
cout << "y[2]:" << vec_y[2] << endl;
cout << "y[3]:" << vec_y[3] << endl;
cout << "y[4]:" << vec_y[4] << endl;
cout << "gemv ave cost time(clock):" << t / nloop << endl;
}
6.图像旋转
目标:
a) 将512x512大小的Lena图像(上图)逆时针旋转90度
b) 使用neon intrinsic函数进行优化
提示:
先转置(借助vtrn指令)
再水平翻转(借助vrev指令)
#include <arm_neon.h>
void rotateImageNeon(uint8_t* src, uint8_t* dst, int width, int height) {
// 逐行处理
for (int y = 0; y < height; y++) {
// 每次处理16个像素,因为NEON寄存器可以同时处理16个8位整数
for (int x = 0; x < width; x += 16) {
// 加载16个像素到NEON寄存器
uint8x16_t pixels = vld1q_u8(src + y * width + x);
// 转置
pixels = vtrnq_u8(pixels, pixels);
// 水平翻转
pixels = vrev64q_u8(pixels);
// 存储结果
vst1q_u8(dst + (width - x - 16) * height + y, pixels);
}
}
}