SIMD CRC16 AVX512

之前写过一个 pshufb + pdep 的版本, 速度只快了 2~3 倍; 最近学习了 AVX512, 就用 AVX512 把 CRC16 里用到的 16 位按位反转 (InvertUint16) 重写了一遍, 一次处理 32 个 16 位字 (64 字节)。果然是 AVX512, 比逐位循环的 C 版本快了近百倍:

; Everything below, including the read-only lookup tables, is emitted into
; the code segment.
.code

; shuf: 64 bit-index bytes, loaded by InvertUint16_ASM as the index operand of
; VPSHUFBITQMB. Each run of 16 bytes counts down through one 16-bit span of a
; quadword (15..0, 31..16, 47..32, 63..48), so result bit i of the produced
; mask is taken from bit shuf[i] of the data quadword, i.e. every 16-bit word
; of that quadword comes out bit-reversed.
shuf db 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

db 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16

db 47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32

db 63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48

; mask1..mask8: dword index vectors (16 dwords each), loaded by
; InvertUint16_ASM as the index operand of VPERMD. maskN alternates between
; the two dword indices 2(N-1) and 2(N-1)+1, so the permute copies source
; quadword N-1 into all eight quadword lanes of its destination.
mask1 dd 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1

mask2 dd 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3

mask3 dd 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5

mask4 dd 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7

mask5 dd 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9

mask6 dd 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11

mask7 dd 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13

mask8 dd 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15

; ---------------------------------------------------------------------------
; InvertUint16_ASM
;
; Reverses the bit order inside each of 32 consecutive 16-bit words
; (64 bytes read, 64 bytes written) using AVX-512.
;
; Arguments (Microsoft x64 calling convention):
;   rcx = destination buffer, 64 bytes are stored (unaligned access is fine)
;   rdx = source buffer, 64 bytes are loaded (unaligned access is fine)
; Returns: nothing.
; Clobbers: zmm0-zmm3, k1; VZEROUPPER clears the upper bits of zmm0-zmm15.
; Requires: AVX512F (VPERMD zmm), AVX512BW (KMOVQ), AVX512_BITALG
;           (VPSHUFBITQMB). No CPU feature check is performed here.
;
; Method, repeated for source quadword n = 0..7:
;   1. VPERMD with mask(n+1) copies source quadword n into all eight quadword
;      lanes of zmm1.
;   2. VPSHUFBITQMB builds a 64-bit mask in k1: mask bit i is the bit of the
;      corresponding quadword lane selected by index byte shuf[i]. Because
;      shuf counts down inside every 16-bit span, k1 ends up holding the four
;      16-bit words of quadword n, each with its bits reversed.
;   3. KMOVQ stores those 8 bytes at destination offset 8*n.
; ---------------------------------------------------------------------------
InvertUint16_ASM proc

    vmovdqu64 zmm0, zmmword ptr [rdx]       ; all 32 source words
    vmovdqu64 zmm2, zmmword ptr shuf        ; bit indices for VPSHUFBITQMB

    ; The permutation indices live in zmm3 rather than zmm7: xmm6-xmm15 are
    ; callee-saved in the Microsoft x64 convention, so overwriting zmm7
    ; without saving xmm7 would corrupt a register owned by the caller.

    ; words 0..3 (source quadword 0)
    vmovdqu64 zmm3, zmmword ptr [mask1]
    vpermd zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq qword ptr [rcx], k1

    ; words 4..7 (source quadword 1)
    vmovdqu64 zmm3, zmmword ptr [mask2]
    vpermd zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq qword ptr [rcx + 8], k1

    ; words 8..11 (source quadword 2)
    vmovdqu64 zmm3, zmmword ptr [mask3]
    vpermd zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq qword ptr [rcx + 16], k1

    ; words 12..15 (source quadword 3)
    vmovdqu64 zmm3, zmmword ptr [mask4]
    vpermd zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq qword ptr [rcx + 24], k1

    ; words 16..19 (source quadword 4)
    vmovdqu64 zmm3, zmmword ptr [mask5]
    vpermd zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq qword ptr [rcx + 32], k1

    ; words 20..23 (source quadword 5)
    vmovdqu64 zmm3, zmmword ptr [mask6]
    vpermd zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq qword ptr [rcx + 40], k1

    ; words 24..27 (source quadword 6)
    vmovdqu64 zmm3, zmmword ptr [mask7]
    vpermd zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq qword ptr [rcx + 48], k1

    ; words 28..31 (source quadword 7)
    vmovdqu64 zmm3, zmmword ptr [mask8]
    vpermd zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq qword ptr [rcx + 56], k1

    ; Leave the upper vector state clean so that SSE code executed by the
    ; caller afterwards does not pay an AVX/SSE transition penalty.
    vzeroupper
    ret

InvertUint16_ASM endp

; The C++ driver in this listing declares and calls the routine under the
; name "foo", which is otherwise undefined. Provide that entry point as a
; tail jump so both names resolve to the same code; rcx/rdx pass through
; untouched.
foo proc

    jmp InvertUint16_ASM

foo endp

end

#include <intrin.h>
#include <Windows.h>
#include <stdio.h>
#include <string.h>

extern "C" void foo(unsigned short* DesBuf, unsigned short* SrcBuf);

/*
 * Scalar reference implementation: mirrors the low 16 bits of one word.
 *
 * Reads SrcBuf[0], reverses the order of its bits 0..15 (bit 0 becomes
 * bit 15, bit 1 becomes bit 14, ...) and stores the result in DesBuf[0].
 * DesBuf and SrcBuf may point at the same word.
 *
 * Returns DesBuf.
 */
unsigned short* InvertUint16(unsigned short* DesBuf, unsigned short* SrcBuf)
{
    unsigned int remaining = SrcBuf[0];
    unsigned short mirrored = 0;

    /* Consume the source from its lowest bit upward while the destination
       position walks from bit 15 downward. */
    for (int target = 15; target >= 0; --target)
    {
        if ((remaining & 1u) != 0)
        {
            mirrored = (unsigned short)(mirrored | (1u << target));
        }
        remaining >>= 1;
    }

    *DesBuf = mirrored;
    return DesBuf;
}

/*
 * Benchmark driver.
 *
 * Bit-reverses the same 32 16-bit words 10,000,000 times, first with the
 * scalar InvertUint16() and then with the assembly routine foo(), and prints
 * the elapsed __rdtsc() ticks of each pass ("C:" for the scalar pass, "A:"
 * for the assembly pass). The last line printed is the memcmp() result of
 * the two output buffers; 0 means both implementations agree.
 */
int main()
{
    const int kWords = 32;          // 16-bit words handled per foo() call (64 bytes)
    const int kRounds = 10000000;   // repetitions of each timed pass

    // The buffers are declared as unsigned short rather than char so they are
    // suitably aligned for 16-bit access and can be handed to both routines
    // without pointer casts (no type-punning through char arrays).
    unsigned short buff1[kWords]{};     // output of the scalar pass
    unsigned short buff2[kWords]{};     // output of the assembly pass
    unsigned short buff3[kWords];       // shared input
    // Same 64 input bytes as before; the literal's terminating NUL is not copied.
    memcpy(buff3, "abcdefghijklmnopabcdefghijklmnopabcdefghijklmnopabcdefghijklmnop", sizeof(buff3));

    volatile unsigned short* p;
    volatile DWORD64 start, end;

    // Scalar pass: one call per word.
    start = __rdtsc();
    for (int count = 0; count < kRounds; count++)
    {
        for (int i = 0; i < kWords; i++)
        {
            p = InvertUint16(&buff1[i], &buff3[i]);
        }
    }
    end = __rdtsc();
    printf("C:%I64u\n", end - start);

    // Assembly pass: one call converts all 32 words.
    start = __rdtsc();
    for (int count = 0; count < kRounds; count++)
    {
        foo(buff2, buff3);
    }
    end = __rdtsc();
    printf("A:%I64u\n", end - start);

    printf("%d\n", memcmp(buff1, buff2, sizeof(buff1)));
}

结果:

C:12596389379

A:129669843

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值