之前写了个 pshufb + pdep 版本的 16 位整数按位反转, 速度只快了 2~3 倍; 最近学习了 AVX512, 就拿 AVX512 (VPSHUFBITQMB) 重写了一下, 果然是 AVX512 啊, 快了近百倍:
.code
; Bit-selector table for VPSHUFBITQMB (64 control bytes, one per result bit).
; Within every group of 16 bytes the indices run from high to low, so result
; bit b is taken from source bit (b xor 15) of the selected qword, which
; mirrors the bit order inside each 16-bit word.
shuf    db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
        db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
        db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
        db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48

; Dword index vectors for VPERMD. maskN repeats the dword pair (2N-2, 2N-1)
; eight times, i.e. it copies source qword N-1 into every 64-bit lane.
mask1   dd 8 dup ( 0,  1)
mask2   dd 8 dup ( 2,  3)
mask3   dd 8 dup ( 4,  5)
mask4   dd 8 dup ( 6,  7)
mask5   dd 8 dup ( 8,  9)
mask6   dd 8 dup (10, 11)
mask7   dd 8 dup (12, 13)
mask8   dd 8 dup (14, 15)
; InvertUint16_ASM - reverse the bit order inside each of 32 consecutive
; 16-bit words (64 bytes in, 64 bytes out).
;
; Arguments (Microsoft x64 calling convention, matching the C++ prototype
; that passes DesBuf first and SrcBuf second):
;   rcx = destination buffer, 64 bytes are written
;   rdx = source buffer, 64 bytes are read
;
; Method, repeated for each of the 8 source qwords:
;   1. VPERMD with maskN replicates source qword N-1 into all eight 64-bit
;      lanes of zmm1.
;   2. VPSHUFBITQMB picks, for every control byte in zmm2 (the shuf table),
;      the addressed bit of the corresponding lane and packs the 64 selected
;      bits into k1.
;   3. k1 therefore holds four bit-reversed words; it is stored as one qword.
;
; Only volatile registers are touched (zmm0-zmm3, k1). The index vector lives
; in zmm3 rather than zmm7 because XMM6-XMM15 are callee-saved under the
; Microsoft x64 convention and this routine has no prologue to preserve them.
; Requires AVX512F, AVX512BW (KMOVQ) and AVX512_BITALG (VPSHUFBITQMB).
InvertUint16_ASM proc
    vmovdqu64    zmm0, zmmword ptr [rdx]        ; all 64 source bytes
    vmovdqu64    zmm2, zmmword ptr shuf         ; bit selectors, loaded once

    ; source qword 0 -> destination bytes 0..7
    vmovdqu64    zmm3, zmmword ptr [mask1]
    vpermd       zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq        qword ptr [rcx], k1

    ; source qword 1 -> destination bytes 8..15
    vmovdqu64    zmm3, zmmword ptr [mask2]
    vpermd       zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq        qword ptr [rcx + 8], k1

    ; source qword 2 -> destination bytes 16..23
    vmovdqu64    zmm3, zmmword ptr [mask3]
    vpermd       zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq        qword ptr [rcx + 16], k1

    ; source qword 3 -> destination bytes 24..31
    vmovdqu64    zmm3, zmmword ptr [mask4]
    vpermd       zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq        qword ptr [rcx + 24], k1

    ; source qword 4 -> destination bytes 32..39
    vmovdqu64    zmm3, zmmword ptr [mask5]
    vpermd       zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq        qword ptr [rcx + 32], k1

    ; source qword 5 -> destination bytes 40..47
    vmovdqu64    zmm3, zmmword ptr [mask6]
    vpermd       zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq        qword ptr [rcx + 40], k1

    ; source qword 6 -> destination bytes 48..55
    vmovdqu64    zmm3, zmmword ptr [mask7]
    vpermd       zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq        qword ptr [rcx + 48], k1

    ; source qword 7 -> destination bytes 56..63
    vmovdqu64    zmm3, zmmword ptr [mask8]
    vpermd       zmm1, zmm3, zmm0
    vpshufbitqmb k1, zmm1, zmm2
    kmovq        qword ptr [rcx + 56], k1

    vzeroupper                                  ; avoid AVX->SSE transition stalls in the caller
    ret
InvertUint16_ASM endp
end
#include <intrin.h>
#include <Windows.h>
#include <stdio.h>
// The assembly listing exports the AVX-512 routine under the name
// InvertUint16_ASM; there is no symbol called foo to link against.
// Reverses the bits of each of the 32 uint16 values in SrcBuf into DesBuf
// (64 bytes read, 64 bytes written).
extern "C" void InvertUint16_ASM(unsigned short* DesBuf, unsigned short* SrcBuf);

// Forwarder that keeps the short name used by the benchmark call sites.
static inline void foo(unsigned short* DesBuf, unsigned short* SrcBuf)
{
    InvertUint16_ASM(DesBuf, SrcBuf);
}
// Scalar reference implementation: writes SrcBuf[0] with its 16 bits in
// mirrored order (bit 0 <-> bit 15, bit 1 <-> bit 14, ...) to DesBuf[0].
// Returns DesBuf.
unsigned short* InvertUint16(unsigned short* DesBuf, unsigned short* SrcBuf)
{
    unsigned int remaining = SrcBuf[0];
    unsigned short mirrored = 0;

    // Peel bits off the low end of the input and push them into the low end
    // of the result; after 16 steps the first bit peeled sits in bit 15.
    for (int step = 0; step < 16; ++step)
    {
        mirrored = (unsigned short)((mirrored << 1) | (remaining & 1u));
        remaining >>= 1;
    }

    DesBuf[0] = mirrored;
    return DesBuf;
}
int main()
{
char buff1[64]{};
char buff2[64]{};
char buff3[65] = "abcdefghijklmnopabcdefghijklmnopabcdefghijklmnopabcdefghijklmnop";
volatile unsigned short* p;
volatile DWORD64 start, end;
start = __rdtsc();
for(int count = 0; count < 10000000; count++)
for (int i = 0; i < 32; i++)
{
p = InvertUint16((unsigned short*)(buff1+i*2), (unsigned short*)(buff3 + i * 2));
}
end = __rdtsc();
printf("C:%I64u\n", end-start);
start = __rdtsc();
for (int count = 0; count < 10000000; count++)
foo((unsigned short*)buff2, (unsigned short*)buff3);
end = __rdtsc();
printf("A:%I64u\n", end - start);
printf("%d\n", memcmp((const void*)buff1, buff2, 64));
}
结果:
C:12596389379
A:129669843