一 intrinsic实现
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Horizontal sum of four vectors of 32-bit integers.
 *
 * Returns a vector whose lane i holds the sum of the four 32-bit lanes of
 * the i-th argument: { sum(x0), sum(x1), sum(x2), sum(x3) }.
 * The additions are plain 32-bit adds, so each sum wraps modulo 2^32.
 *
 * Implemented as a 4x4 transpose of the inputs followed by three vertical
 * additions.
 */
static inline __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3)
{
    /* Stage 1: interleave the 32-bit lanes of neighbouring inputs. */
    const __m128i t0 = _mm_unpacklo_epi32(x0, x1); /* x0[0] x1[0] x0[1] x1[1] */
    const __m128i t1 = _mm_unpacklo_epi32(x2, x3); /* x2[0] x3[0] x2[1] x3[1] */
    const __m128i t2 = _mm_unpackhi_epi32(x0, x1); /* x0[2] x1[2] x0[3] x1[3] */
    const __m128i t3 = _mm_unpackhi_epi32(x2, x3); /* x2[2] x3[2] x2[3] x3[3] */

    /*
     * Stage 2: interleave 64-bit halves to finish the transpose.  A 32-bit
     * unpack here would interleave single lanes again and leave the result
     * in x0, x2, x1, x3 order instead of x0, x1, x2, x3.
     */
    const __m128i r0 = _mm_unpacklo_epi64(t0, t1); /* x0[0] x1[0] x2[0] x3[0] */
    const __m128i r1 = _mm_unpackhi_epi64(t0, t1); /* x0[1] x1[1] x2[1] x3[1] */
    const __m128i r2 = _mm_unpacklo_epi64(t2, t3); /* x0[2] x1[2] x2[2] x3[2] */
    const __m128i r3 = _mm_unpackhi_epi64(t2, t3); /* x0[3] x1[3] x2[3] x3[3] */

    /* Row k now holds lane k of every input; adding the rows sums each input. */
    return _mm_add_epi32(_mm_add_epi32(r0, r1), _mm_add_epi32(r2, r3));
}
/*
 * Demo driver: passes four constant vectors through hsum4() and prints the
 * four 32-bit lanes of the result to stderr.
 */
int main(void)
{
    const unsigned int a[4] = {1, 1, 1, 1};
    const unsigned int b[4] = {2, 2, 2, 2};
    const unsigned int c[4] = {3, 3, 3, 3};
    const unsigned int d[4] = {4, 4, 4, 4};
    unsigned int e[4];

    /*
     * The language gives plain unsigned int arrays no 16-byte alignment
     * guarantee, so move data with explicit unaligned loads/stores instead
     * of dereferencing pointers cast to __m128i *.
     */
    const __m128i ret = hsum4(_mm_loadu_si128((const __m128i *)a),
                              _mm_loadu_si128((const __m128i *)b),
                              _mm_loadu_si128((const __m128i *)c),
                              _mm_loadu_si128((const __m128i *)d));
    _mm_storeu_si128((__m128i *)e, ret);

    /* %u matches the unsigned int arguments. */
    fprintf(stderr, "%u %u %u %u\n", e[0], e[1], e[2], e[3]);
    return 0;
}
二 汇编实现
用汇编(NASM 语法,x86-64)实现上面的 hsum4:分别求 4 个数组各自 4 个元素的和
; void sumInt(unsigned int *a, unsigned int *b, unsigned int *c,
;             unsigned int *d, unsigned int *e);
;
; Each of a, b, c, d points to four 32-bit integers.  On return:
;   e[0] = a[0]+a[1]+a[2]+a[3]
;   e[1] = b[0]+b[1]+b[2]+b[3]
;   e[2] = c[0]+c[1]+c[2]+c[3]
;   e[3] = d[0]+d[1]+d[2]+d[3]
; (32-bit wrapping adds).
;
; Arguments are taken from rdi, rsi, rdx, rcx, r8 in that order.
; The input arrays are only read; all intermediates stay in registers.
; All memory accesses use movdqu, so no pointer needs 16-byte alignment.
; Clobbers xmm0-xmm5.  Uses SSE2 instructions only.
global sumInt
sumInt:
    movdqu     xmm0, [rdi]      ; xmm0 = a0 a1 a2 a3
    movdqu     xmm1, [rsi]      ; xmm1 = b0 b1 b2 b3
    movdqu     xmm2, [rdx]      ; xmm2 = c0 c1 c2 c3
    movdqu     xmm3, [rcx]      ; xmm3 = d0 d1 d2 d3

    ; Stage 1: interleave 32-bit lanes of (a,b) and of (c,d).
    movdqa     xmm4, xmm0       ; keep a copy of a for the high half
    punpckldq  xmm0, xmm1       ; t0 = a0 b0 a1 b1
    punpckhdq  xmm4, xmm1       ; t2 = a2 b2 a3 b3
    movdqa     xmm5, xmm2       ; keep a copy of c for the high half
    punpckldq  xmm2, xmm3       ; t1 = c0 d0 c1 d1
    punpckhdq  xmm5, xmm3       ; t3 = c2 d2 c3 d3

    ; Stage 2: interleave 64-bit halves to complete the 4x4 transpose.
    movdqa     xmm1, xmm0       ; copy of t0
    punpcklqdq xmm0, xmm2       ; x0 = a0 b0 c0 d0
    punpckhqdq xmm1, xmm2       ; x1 = a1 b1 c1 d1
    movdqa     xmm3, xmm4       ; copy of t2
    punpcklqdq xmm4, xmm5       ; x2 = a2 b2 c2 d2
    punpckhqdq xmm3, xmm5       ; x3 = a3 b3 c3 d3

    ; Vertical adds: lane i ends up holding the sum of the i-th array.
    paddd      xmm0, xmm1       ; x0 + x1
    paddd      xmm4, xmm3       ; x2 + x3
    paddd      xmm0, xmm4       ; x0 + x1 + x2 + x3

    movdqu     [r8], xmm0       ; store the four sums to e
    ret                         ; return to the caller
三 调用汇编函数 sumInt 的 C 语言测试代码
#include <stdio.h>
/*
 * Implemented in assembly.  a, b, c and d each point to four unsigned ints;
 * the four per-array sums are written to e[0..3].
 */
extern void sumInt(unsigned int *a, unsigned int *b, unsigned int *c, unsigned int *d, unsigned int *e);

/*
 * Test driver: calls sumInt() on four constant arrays and prints the four
 * values it stored in e.
 */
int main(void)
{
    /*
     * The assembly routine may access these buffers with aligned 128-bit
     * moves, so request 16-byte alignment explicitly instead of relying on
     * whatever the compiler happens to choose for a 16-byte array.
     */
    _Alignas(16) unsigned int a[4] = {1, 1, 1, 1};
    _Alignas(16) unsigned int b[4] = {2, 2, 2, 2};
    _Alignas(16) unsigned int c[4] = {3, 3, 3, 3};
    _Alignas(16) unsigned int d[4] = {4, 4, 4, 4};
    _Alignas(16) unsigned int e[4] = {0, 0, 0, 0};

    sumInt(a, b, c, d, e);

    /* %u matches the unsigned int arguments. */
    printf("%u %u %u %u\n", e[0], e[1], e[2], e[3]);
    return 0;
}
四 输出结果
test# ./sumtest
4 8 12 16