前面讲过用 xmm 寄存器计算 128 位的 SAD(Sum of Absolute Differences,绝对差之和)。下面看看 256 位如何用 ymm 寄存器计算,
使用的指令有:
vxorpd
vmovdqa
vpsadbw
汇编代码
; Export both routines so the C driver (which declares them `extern`) can link to them.
global sum
global sum256
;-----------------------------------------------------------------------
; void sum(unsigned char *buf, unsigned char *sum)
; Syntax: NASM (Intel operand order).  ABI: SysV AMD64.  Requires AVX.
; In:    rdi = buf, 16 source bytes (no alignment required)
;        rsi = sum, 16-byte output buffer (no alignment required)
; Out:   qword [rsi + 0] = buf[0] + ... + buf[7]
;        qword [rsi + 8] = buf[8] + ... + buf[15]
; Clobb: xmm0, xmm1 (both volatile under SysV)
;-----------------------------------------------------------------------
sum:
        vpxor   xmm0, xmm0, xmm0        ; all-zero reference: SAD against 0 is a plain byte sum
        vmovdqu xmm1, [rdi]             ; unaligned load: callers pass ordinary char arrays
        vpsadbw xmm1, xmm1, xmm0        ; one partial sum per 8-byte half, zero-extended to 64 bits
        vmovdqu [rsi], xmm1             ; unaligned store, same reason as the load
        ret
;-----------------------------------------------------------------------
; void sum256(unsigned char *buf, unsigned char *sum)
; Syntax: NASM (Intel operand order).  ABI: SysV AMD64.  Requires AVX2.
; In:    rdi = buf, 32 source bytes (no alignment required)
;        rsi = sum, 32 bytes; READ as the SAD reference, then overwritten
; Out:   for each 8-byte group k = 0..3:
;          qword [rsi + 8*k] = sum over i = 0..7 of |buf[8k+i] - sum[8k+i]|
;        For a plain byte sum the caller must zero `sum` before the call.
; Clobb: ymm1 (volatile under SysV)
;-----------------------------------------------------------------------
sum256:
        vmovdqu ymm1, [rdi]             ; unaligned load: callers pass ordinary char arrays
        vpsadbw ymm1, ymm1, [rsi]       ; four independent SADs, one per 64-bit group
        vmovdqu [rsi], ymm1             ; overwrite the reference bytes with the four results
        vzeroupper                      ; leave upper ymm halves clean before returning to C/SSE code
        ret
C 代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
extern void sum(unsigned char *buf, unsigned char *sum);
extern void sum256(unsigned char *buf, unsigned char *sum);
/*
 * Demo driver for sum256(): fills a 32-byte source buffer with 0x02 and a
 * 32-byte destination buffer with 0x08, calls the assembly routine, then
 * prints every destination byte in decimal on stdout.
 *
 * Returns 0 unconditionally.
 */
int main(void)
{
    /*
     * sum256 is an external assembly routine working on 256-bit vectors;
     * aligning both buffers to 32 bytes keeps them valid operands even for
     * aligned vector loads/stores, which fault on misaligned addresses.
     */
    _Alignas(32) unsigned char data[32];
    _Alignas(32) unsigned char dst[32];

    memset(data, 0x02, sizeof data);
    memset(dst, 0x8, sizeof dst);

    sum256(data, dst);

    fprintf(stderr, "ret ## \n");
    for (size_t i = 0; i < sizeof dst; i++)
        printf("%d ", dst[i]);
    printf("\n");   /* terminate the output line */
    return 0;
}
运行结果
48 0 0 0 0 0 0 0 48 0 0 0 0 0 0 0 48 0 0 0 0 0 0 0 48 0 0 0 0 0 0 0
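vpsadbw 每次最多只在 64 位(8 个字节)的分组内求和:256 位的 ymm 寄存器被分成 4 组,每组各得到一个结果(上面的 4 个 48,即每组 8 个字节的 |2−8| 之和)。把 256 位一次全部累加,CPU 成本太高。
需要自己再把这 4 个部分和累加起来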