NEON加速系列文章
前言
一、NEON矩阵乘法优化 (任意阶)
Matrix multiplication using ARM NEON intrinsics.
二、使用步骤
1. Code listing
代码如下(示例):
#include <stdio.h>
#include <sys/time.h>
#include <stdint.h>
#include <string.h>
#include <arm_neon.h>
/*
 * Elapsed time from t0 to t1, expressed in milliseconds.
 *
 * The seconds and microseconds fields are differenced separately (in their
 * native integer types) and only then converted to double, so a negative
 * microsecond delta is handled naturally by the final sum.
 */
double sub_time(struct timeval t1, struct timeval t0)
{
    const double sec_delta = t1.tv_sec - t0.tv_sec;
    const double usec_delta = t1.tv_usec - t0.tv_usec;

    /* seconds -> ms, microseconds -> ms */
    const double ms_from_sec = sec_delta * 1000;
    const double ms_from_usec = usec_delta / 1000;

    return ms_from_sec + ms_from_usec;
}
#define N 16
/*
 * Scalar reference product: out = a * b.
 *
 * a, b and out are row-major n x n matrices stored contiguously.
 * out is overwritten (not accumulated into), so repeated calls are idempotent.
 */
static void matmul_scalar(const float *a, const float *b, float *out, int n)
{
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            float sum = 0.0f;
            for (int m = 0; m < n; m++) {
                sum += a[row * n + m] * b[m * n + col];
            }
            out[row * n + col] = sum;
        }
    }
}

/*
 * NEON product: out = a * b for row-major n x n matrices of any order n.
 *
 * Each output row is produced four columns at a time: for every m, the scalar
 * a[row][m] is broadcast to all lanes and multiplied with the four adjacent
 * elements b[m][col..col+3], accumulating into one vector register. Columns
 * left over when n is not a multiple of 4 are handled by a scalar tail loop.
 * out is overwritten, not accumulated into.
 */
static void matmul_neon(const float *a, const float *b, float *out, int n)
{
    for (int row = 0; row < n; row++) {
        int col = 0;

        /* Vector body: four output columns per iteration. */
        for (; col + 4 <= n; col += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (int m = 0; m < n; m++) {
                acc = vmlaq_f32(acc,
                                vdupq_n_f32(a[row * n + m]),
                                vld1q_f32(&b[m * n + col]));
            }
            vst1q_f32(&out[row * n + col], acc);
        }

        /* Scalar tail: remaining 0..3 columns. */
        for (; col < n; col++) {
            float sum = 0.0f;
            for (int m = 0; m < n; m++) {
                sum += a[row * n + m] * b[m * n + col];
            }
            out[row * n + col] = sum;
        }
    }
}

/*
 * Benchmark driver: multiplies two N x N matrices REPEAT times with the
 * scalar routine and with the NEON routine, printing the elapsed time in
 * milliseconds and the resulting matrix for each so they can be compared.
 */
int main(void)
{
    enum { REPEAT = 10000 };

    float a1[N][N], c1[N][N], a2[N][N], c2[N][N];
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a1[i][j] = 1;
            c1[i][j] = 2;
            a2[i][j] = 1;
            c2[i][j] = 2;
        }
    }

    float d[N][N] = {{0}};
    float e[N][N] = {{0}};
    struct timeval t1, t0;

    /* Scalar baseline. Each repetition recomputes d from scratch so the
     * printed matrix is the actual product rather than REPEAT times it. */
    gettimeofday(&t0, NULL);
    for (int rep = 0; rep < REPEAT; rep++) {
        matmul_scalar(&a1[0][0], &c1[0][0], &d[0][0], N);
    }
    gettimeofday(&t1, NULL);
    printf("basic time used: %0.3f.\n", sub_time(t1, t0));
    for (int j = 0; j < N; j++) {
        for (int k = 0; k < N; k++) {
            printf("%f\t", d[j][k]);
        }
        printf("\n");
    }

    /* NEON version: a genuine per-element product, not a single broadcast
     * dot product, so it stays correct for non-uniform inputs. */
    gettimeofday(&t0, NULL);
    for (int rep = 0; rep < REPEAT; rep++) {
        matmul_neon(&a2[0][0], &c2[0][0], &e[0][0], N);
    }
    gettimeofday(&t1, NULL);
    printf("neon time used: %0.3f.\n", sub_time(t1, t0));
    for (int j = 0; j < N; j++) {
        for (int k = 0; k < N; k++) {
            printf("%f\t", e[j][k]);
        }
        printf("\n");
    }

    return 0;
}
三、其它NEON加速实现后续更新
总结
Feedback and corrections are welcome!