#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#define __USE_GNU
#include "common.h"
/*
 * One benchmark record: sixteen uint32_t fields, uiV1 .. uiV16, declared
 * back to back.
 *
 * uint32_t is taken from <stdint.h> rather than being re-declared here as
 * `unsigned int`, so the field width is exactly 32 bits on every platform
 * and the name cannot clash with the standard definition.
 */
struct data {
    uint32_t uiV1;
    uint32_t uiV2;
    uint32_t uiV3;
    uint32_t uiV4;
    uint32_t uiV5;
    uint32_t uiV6;
    uint32_t uiV7;
    uint32_t uiV8;
    uint32_t uiV9;
    uint32_t uiV10;
    uint32_t uiV11;
    uint32_t uiV12;
    uint32_t uiV13;
    uint32_t uiV14;
    uint32_t uiV15;
    uint32_t uiV16;
};
/*
 * Number of elements in astData: 10 * 2^20.
 * The expansion is parenthesized so that LEN behaves as a single value in
 * any expression (e.g. `x / LEN` or `x % LEN`), not only where it happens to
 * be used on its own.
 */
#define LEN (1024 * 1024 * 10)
/*
 * Global array that both fill routines write into: LEN elements of
 * struct data. It is defined at file scope without an initializer, so it has
 * static storage duration and starts out zero-initialized.
 * NOTE(review): with 16 four-byte fields per element this is about 640 MiB;
 * confirm the target machine can back that much memory.
 */
struct data astData[LEN];
/*
 * Fill the first N elements of the global astData array, assigning the
 * sixteen fields of each element in declaration order (uiV1, uiV2, ...,
 * uiV16).
 *
 * Despite the name, nothing is multiplied: every field receives the element
 * index i plus a small constant. The values stored in each field are the
 * same as in multiplydataB; only the order of the assignments differs.
 *
 * N: number of elements to fill. No bounds check is performed, so the
 *    caller must pass N <= LEN (the length of astData).
 */
void multiplydataA(unsigned N) {
unsigned i;
for(i=0; i<N; i++) {
/* Fields are written in the order they are declared in struct data. */
astData[i].uiV1 = i + 1;
astData[i].uiV2 = i + 2;
astData[i].uiV3 = i + 3;
astData[i].uiV4 = i + 4;
astData[i].uiV5 = i + 5;
astData[i].uiV6 = i + 6;
astData[i].uiV7 = i + 2;
astData[i].uiV8 = i + 3;
astData[i].uiV9 = i + 4;
astData[i].uiV10 = i + 5;
astData[i].uiV11 = i + 1;
astData[i].uiV12 = i + 2;
astData[i].uiV13 = i + 3;
astData[i].uiV14 = i + 4;
astData[i].uiV15 = i + 5;
astData[i].uiV16 = i + 6;
}
}
/*
 * Fill the first N elements of the global astData array with the same
 * per-field values as multiplydataA, but assign the fields in a strided
 * order: uiV1, uiV5, uiV9, uiV13, then uiV2, uiV6, uiV10, uiV14, and so on,
 * i.e. each group of four assignments steps through every fourth field.
 *
 * Despite the name, nothing is multiplied: every field receives the element
 * index i plus a small constant.
 *
 * N: number of elements to fill. No bounds check is performed, so the
 *    caller must pass N <= LEN (the length of astData).
 */
void multiplydataB(unsigned N) {
unsigned i;
for(i=0; i<N; i++) {
/* Group 1: fields 1, 5, 9, 13. */
astData[i].uiV1 = i + 1;
astData[i].uiV5 = i + 5;
astData[i].uiV9 = i + 4;
astData[i].uiV13 = i + 3;
/* Group 2: fields 2, 6, 10, 14. */
astData[i].uiV2 = i + 2;
astData[i].uiV6 = i + 6;
astData[i].uiV10 = i + 5;
astData[i].uiV14 = i + 4;
/* Group 3: fields 3, 7, 11, 15. */
astData[i].uiV3 = i + 3;
astData[i].uiV7 = i + 2;
astData[i].uiV11 = i + 1;
astData[i].uiV15 = i + 5;
/* Group 4: fields 4, 8, 12, 16. */
astData[i].uiV4 = i + 4;
astData[i].uiV8 = i + 3;
astData[i].uiV12 = i + 2;
astData[i].uiV16 = i + 6;
}
}
int main(int argc, char ** argv)
{
long long t1, t2, diff;
unsigned N = LEN;
int i;
multiplydataA(N);
t1 = getustime();
multiplydataA(N);
t2 = getustime();
diff = t2 - t1;
printf("time1:%lld\n", diff);
t1 = getustime();
multiplydataA(N);
t2 = getustime();
diff = t2 - t1;
printf("time1:%lld\n", diff);
t1 = getustime();
multiplydataB(N);
t2 = getustime();
diff = t2 - t1;
printf("time2:%lld\n", diff);
t1 = getustime();
multiplydataB(N);
t2 = getustime();
diff = t2 - t1;
printf("time2:%lld\n", diff);
return 0;
}
编译时注意带上 -O2 优化选项:
[feixd@hs-10-20-30-160 SIMD]$ gcc -o test main_test.c common.c -O2
[feixd@hs-10-20-30-160 SIMD]$ taskset -c 5 ./test
time1:57789 us
time1:57782 us
time2:85175 us
time2:84992 us
[feixd@hs-10-20-30-160 SIMD]$
cpu:
Intel® Xeon® Gold 6250 CPU @ 3.90GHz
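按上面的结果,multiplydataA 的耗时比 multiplydataB 少约 32%(约 57.8 ms 对约 85 ms);反过来说,multiplydataB 比 multiplydataA 慢了约 47%。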
背后的原理:
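Intel 的 CPU 上增加了一批 128 位的寄存器,也就是 16B(不再是早期一次只处理一个 word,即 4B 或 8B)。无论是取指、译码,还是把数据取进寄存器,都可能一次处理连续的 16B。core 上的指令是流水线执行的,也就是同时有多条连续的指令在执行;如果一次取进寄存器的数据能被多条指令用到,就节约了把数据取进寄存器的时间,也就提升了代码效率。对应到上面的例子:multiplydataA 按字段在结构体中的声明顺序连续赋值,相邻的 4 个 uint32_t 字段正好是连续的 16B;multiplydataB 则每次跨过 4 个字段(16B)跳着赋值。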
想想平时该如何写代码?
连续化:尽量按照数据在内存中的布局顺序,连续地访问数据。