看看SIMD的魅力



#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define __USE_GNU
#include "common.h"

/*
 * One benchmark record: sixteen consecutive 32-bit unsigned fields.
 *
 * uint32_t comes from <stdint.h> instead of a local typedef, so the name
 * cannot clash with (or disagree with) the definition in the system headers.
 *
 * All members share one type, so they sit back to back in declaration
 * order: uiV1 at offset 0, uiV16 at offset 15 * sizeof(uint32_t).
 */
struct data{
	uint32_t uiV1;
	uint32_t uiV2;
	uint32_t uiV3;
	uint32_t uiV4;
	uint32_t uiV5;
	uint32_t uiV6;
	uint32_t uiV7;
	uint32_t uiV8;
	uint32_t uiV9;
	uint32_t uiV10;
	uint32_t uiV11;
	uint32_t uiV12;
	uint32_t uiV13;
	uint32_t uiV14;
	uint32_t uiV15;
	uint32_t uiV16;
};



/*
 * Number of records in astData: 10 * 1024 * 1024 = 10485760.
 * The replacement list is parenthesized so that LEN stays a single operand
 * when it appears inside a larger expression such as x / LEN or x % LEN.
 */
#define LEN  (1024*1024*10)
/*
 * Global buffer that both fill routines write into: LEN records of
 * struct data. Defined at file scope, so it has static storage duration
 * and starts out zero-initialized.
 * NOTE(review): with 64-byte records this is about 640 MiB - confirm the
 * target machine can back that much memory.
 */
struct data astData[LEN];

/*
 * Fill the first N records of astData, storing the sixteen fields of each
 * record in declaration order (uiV1, uiV2, ..., uiV16), i.e. at ascending
 * addresses inside the record.
 *
 * N: number of records to fill. A value larger than the capacity of
 *    astData is clamped to that capacity, so the loop can never write past
 *    the end of the array.
 *
 * The order of the stores inside the loop is what this benchmark measures;
 * do not reorder them.
 */
void multiplydataA(unsigned N) {
    const size_t capacity = sizeof astData / sizeof astData[0];
    unsigned i;

    if (N > capacity) {
        N = (unsigned)capacity;
    }

    for (i = 0; i < N; i++) {
        astData[i].uiV1 = i + 1;
        astData[i].uiV2 = i + 2;
        astData[i].uiV3 = i + 3;
        astData[i].uiV4 = i + 4;
        astData[i].uiV5 = i + 5;
        astData[i].uiV6 = i + 6;
        astData[i].uiV7 = i + 2;
        astData[i].uiV8 = i + 3;
        astData[i].uiV9 = i + 4;
        astData[i].uiV10 = i + 5;
        astData[i].uiV11 = i + 1;
        astData[i].uiV12 = i + 2;
        astData[i].uiV13 = i + 3;
        astData[i].uiV14 = i + 4;
        astData[i].uiV15 = i + 5;
        astData[i].uiV16 = i + 6;
    }
}

/*
 * Fill the first N records of astData with the same per-field values that
 * multiplydataA stores, but in an interleaved order: fields 1, 5, 9, 13,
 * then 2, 6, 10, 14, then 3, 7, 11, 15, then 4, 8, 12, 16. Consecutive
 * stores therefore land four fields apart instead of on adjacent fields.
 *
 * N: number of records to fill. A value larger than the capacity of
 *    astData is clamped to that capacity, so the loop can never write past
 *    the end of the array.
 *
 * The order of the stores inside the loop is what this benchmark measures;
 * do not reorder them.
 */
void multiplydataB(unsigned N) {
    const size_t capacity = sizeof astData / sizeof astData[0];
    unsigned i;

    if (N > capacity) {
        N = (unsigned)capacity;
    }

    for (i = 0; i < N; i++) {
        /* fields 1, 5, 9, 13 */
        astData[i].uiV1 = i + 1;
        astData[i].uiV5 = i + 5;
        astData[i].uiV9 = i + 4;
        astData[i].uiV13 = i + 3;

        /* fields 2, 6, 10, 14 */
        astData[i].uiV2 = i + 2;
        astData[i].uiV6 = i + 6;
        astData[i].uiV10 = i + 5;
        astData[i].uiV14 = i + 4;

        /* fields 3, 7, 11, 15 */
        astData[i].uiV3 = i + 3;
        astData[i].uiV7 = i + 2;
        astData[i].uiV11 = i + 1;
        astData[i].uiV15 = i + 5;

        /* fields 4, 8, 12, 16 */
        astData[i].uiV4 = i + 4;
        astData[i].uiV8 = i + 3;
        astData[i].uiV12 = i + 2;
        astData[i].uiV16 = i + 6;
    }
}

/*
 * Benchmark driver.
 *
 * Runs multiplydataA once untimed, then times two runs of multiplydataA
 * (printed as "time1") followed by two runs of multiplydataB (printed as
 * "time2"), each over all LEN records. Every printed value is the
 * difference between two getustime() readings.
 *
 * NOTE(review): getustime() is not defined in this file (presumably it is
 * declared in common.h) and is assumed to return microseconds - confirm.
 *
 * Always returns 0.
 */
int main(int argc, char ** argv)
{
	long long t1, t2, diff;
	unsigned N = LEN;

	/* The command line is not used. */
	(void)argc;
	(void)argv;

	/* Untimed first pass; presumably a warm-up so that the first touch of
	 * the buffer is not included in the measurements below. */
	multiplydataA(N);

	t1 = getustime();
	multiplydataA(N);
	t2 = getustime();
	diff = t2 - t1;
	printf("time1:%lld\n", diff);

	t1 = getustime();
	multiplydataA(N);
	t2 = getustime();
	diff = t2 - t1;
	printf("time1:%lld\n", diff);

	t1 = getustime();
	multiplydataB(N);
	t2 = getustime();
	diff = t2 - t1;

	printf("time2:%lld\n", diff);

	t1 = getustime();
	multiplydataB(N);
	t2 = getustime();
	diff = t2 - t1;

	printf("time2:%lld\n", diff);
	return 0;
}

注意带上 -O2
[feixd@hs-10-20-30-160 SIMD]$ gcc -o test main_test.c common.c -O2
[feixd@hs-10-20-30-160 SIMD]$ taskset -c 5 ./test
time1:57789 us
time1:57782 us
time2:85175 us
time2:84992 us
[feixd@hs-10-20-30-160 SIMD]$

cpu:
Intel® Xeon® Gold 6250 CPU @ 3.90GHz
按上面的数据,A 的耗时约为 B 的 68%(57789 / 85175),也就是 B 比 A 慢了约 47%。

背后的原理:
Intel 的 CPU 上增加了一批 128 位的寄存器,也就是 16 B(不再是早期的一次一个 word,即 4 B 或 8 B 了)。无论取指、译码,还是把数据取进寄存器,都可能是一次连续的 16 B;core 上的指令是流水线执行的,也就是同时有多条连续的指令在执行。如果一次取进寄存器的数据能被多条指令用到,就节约了把数据取进寄存器的时间,也就提升了代码效率。

想想平时该如何写代码?
连续化

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值