armv8,c,neon
code
#include <arm_neon.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
/**
 * Return the current wall-clock time in milliseconds.
 *
 * The value is built from gettimeofday(): whole seconds are scaled to
 * milliseconds and the microsecond part is added as a fraction. Only
 * differences between two calls are meaningful for timing.
 *
 * @return Milliseconds as a double, or 0.0 if gettimeofday() fails.
 */
double get_current_time(void)
{
    struct timeval now;

    if (gettimeofday(&now, NULL) != 0) {
        return 0.0;
    }
    /* Convert in double so the seconds term cannot overflow an integer type. */
    return (double)now.tv_sec * 1000.0 + (double)now.tv_usec / 1000.0;
}
/**
 * Scalar reference implementation: out[i] = |src[i]| for i in [0, count).
 *
 * @param src   Input array with at least `count` readable elements.
 * @param out   Output array with at least `count` writable elements.
 * @param count Number of elements to process; values <= 0 do nothing.
 */
void abs_c(const float *src, float *out, int count)
{
    /* A negative count must not be compared as an unsigned value, or the
     * loop bound would wrap to a huge number and run out of bounds. */
    if (count <= 0) {
        return;
    }

    for (int i = 0; i < count; i++) {
        /* fabsf keeps the computation in single precision. */
        out[i] = fabsf(src[i]);
    }
}
/**
 * NEON-intrinsics implementation: out[i] = |src[i]| for i in [0, count).
 *
 * Full groups of four floats are handled with vld1q_f32 / vabsq_f32 /
 * vst1q_f32; any remaining 1-3 elements are handled by a scalar tail so
 * the function never touches memory beyond `count` elements.
 *
 * @param src   Input array with at least `count` readable elements.
 * @param out   Output array with at least `count` writable elements.
 * @param count Number of elements to process; values <= 0 do nothing.
 */
void abs_neon(const float *src, float *out, int count)
{
    if (count <= 0) {
        return;
    }

    /* Largest multiple of 4 that does not exceed count. */
    const int vec_end = count - (count % 4);
    int i = 0;

    for (; i < vec_end; i += 4) {
        float32x4_t lanes = vld1q_f32(src + i);
        vst1q_f32(out + i, vabsq_f32(lanes));
    }

    /* Scalar tail for counts that are not a multiple of the vector width. */
    for (; i < count; i++) {
        out[i] = fabsf(src[i]);
    }
}
void abs_assembly(float* src, float* out, int count)
{
int i = 10;
asm volatile(
"1: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"fabs v0.4s, v0.4s \n"
"subs %2, %2, #4 \n"
"st1 {v0.4s}, [%0], #16 \n"
"bgt 1b \n "
:"=r"(out) // 出现在输出列表中的变量,必须出现在输入列表中
:"r"(src),
"r"(count),
"0"(out)
:"cc", "memory", "v0"
);
}
/**
 * Benchmark driver: fills two buffers with random values in [-1, 1), then
 * times abs_c, abs_neon and abs_assembly over the same input and prints the
 * average time per call in milliseconds.
 *
 * @return 0 on success, 1 if the buffers cannot be allocated.
 */
int main(void)
{
    const int num_ = 160000;
    const int loop = 2;
    double start, end, cur;

    float *src_a = malloc(sizeof *src_a * (size_t)num_);
    float *src_b = malloc(sizeof *src_b * (size_t)num_);
    if (src_a == NULL || src_b == NULL) {
        fprintf(stderr, "failed to allocate benchmark buffers\n");
        free(src_a);
        free(src_b);
        return 1;
    }

    for (int i = 0; i < num_; i++) {
        src_a[i] = (rand() / (RAND_MAX + 1.0)) * 2 - 1;
        src_b[i] = (rand() / (RAND_MAX + 1.0)) * 2 - 1;
    }

#ifdef __aarch64__ // macro predefined by the compiler
    printf("test on aarch64 plateform \n");
#endif
#ifdef __ARM_NEON // macro predefined by the ARM GCC compiler
    printf("test on ARM platform \n");
#endif

    // warm up
    for (int i = 0; i < 10; i++) {
        abs_c(src_a, src_b, num_);
    }

    // test for c
    start = get_current_time();
    for (int i = 0; i < loop; i++) {
        abs_c(src_a, src_b, num_);
    }
    end = get_current_time();
    cur = (end - start) / loop;
    printf("c test:%f | time:%f ms \n", 0., cur);

    // test for neon
    start = get_current_time();
    for (int i = 0; i < loop; i++) {
        abs_neon(src_a, src_b, num_);
    }
    end = get_current_time();
    cur = (end - start) / loop;
    printf("neon:%f | time:%f ms \n", 1., cur);

    // test for neon assembly
    start = get_current_time();
    for (int i = 0; i < loop; i++) {
        abs_assembly(src_a, src_b, num_);
    }
    end = get_current_time();
    cur = (end - start) / loop;
    printf("assembly:%f | time:%f ms \n", 2., cur);

    free(src_a);
    free(src_b);
    return 0;
}
输出
c test:0.000000 | time:1.094116 ms
neon:1.000000 | time:0.740479 ms
assembly:2.000000 | time:0.140991 ms
参考