一 memset 只能按单个字节填充(每个字节都被设为同一个值)
如果需要把 int64 数组的每个元素初始化为任意值,需要用 C 语言循环完成。
/*
 * InitMEM(tt) - store the value 5 into elements 0..1023 of tt.
 *
 * tt may be an array or any pointer expression; it is parenthesized so that
 * arguments such as `buf + 1` index correctly.  The argument is evaluated on
 * every iteration, so it must not have side effects.
 *
 * The body is wrapped in do { } while (0) so that `InitMEM(x);` is a single
 * statement and can sit in an unbraced if/else.  The loop index has a long,
 * macro-specific name so it cannot capture an `i` used in the argument.
 */
#define InitMEM(tt) \
    do { \
        for (int init_mem_idx_ = 0; init_mem_idx_ < 1024; init_mem_idx_++) \
            (tt)[init_mem_idx_] = 5; \
    } while (0)
二 可以用 AVX-512 的 vpbroadcastb/w/d/q 指令把值广播到 zmm 寄存器,然后用 vmovdqu8/16/32/64 把 zmm 寄存器的内容写入内存
global avx512_1_memset
global avx512_2_memset
global avx512_4_memset
global avx512_8_memset
extern printf
; ---------------------------------------------------------------------------
; int avx512_1_memset(void *addr, int value, int size)
;
; Fill `size` bytes at `addr` with the low byte of `value`.
;   rdi = addr, esi = value, edx = size (System V AMD64 argument registers)
; Returns 0 in eax.  size <= 0 writes nothing.
;
; Strategy: 64-byte unaligned stores; the final store is placed at
; addr + size - 64 so it may overlap the previous one instead of running past
; the end.  Buffers shorter than 64 bytes use one masked store.
; Uses vpbroadcastb / vmovdqu8 / kmovq, i.e. needs AVX-512BW.
; ---------------------------------------------------------------------------
avx512_1_memset:
    movsxd  rdx, edx                ; size is a 32-bit int; upper half of rdx is not defined on entry
    xor     eax, eax                ; return value
    test    rdx, rdx
    jle     .done                   ; nothing to do for size <= 0
    vpbroadcastb zmm0, esi          ; GPR source must be a 32-bit register; its low byte is replicated
    cmp     rdx, 64
    jb      .partial
    lea     rcx, [rdi + rdx - 64]   ; address of the last full 64-byte window
.fill:
    cmp     rdi, rcx
    jae     .last
    vmovdqu8 [rdi], zmm0
    add     rdi, 64
    jmp     .fill
.last:
    vmovdqu8 [rcx], zmm0            ; final window, may overlap the previous store
    jmp     .cleanup
.partial:
    mov     ecx, edx                ; 1..63 bytes
    mov     r8d, 1
    shl     r8, cl
    dec     r8                      ; r8 = (1 << size) - 1: one mask bit per byte to write
    kmovq   k1, r8
    vmovdqu8 [rdi]{k1}, zmm0        ; masked-off bytes are not written
.cleanup:
    vzeroupper                      ; clear upper vector state before returning to non-AVX code
.done:
    ret
; ---------------------------------------------------------------------------
; int avx512_2_memset(void *addr, int value, int size)
;
; Fill `size` bytes at `addr` with the low 16 bits of `value`, repeated.
;   rdi = addr, esi = value, edx = size in BYTES (System V AMD64)
; Returns 0 in eax.  size is rounded down to a multiple of 2; a result of
; zero or a negative size writes nothing.
;
; Strategy: 64-byte unaligned stores; the final store is placed at
; addr + size - 64 so it may overlap the previous one instead of running past
; the end.  Buffers shorter than 64 bytes use one masked store.
; Uses vpbroadcastw / vmovdqu16 / kmovd, i.e. needs AVX-512BW.
; ---------------------------------------------------------------------------
avx512_2_memset:
    movsxd  rdx, edx                ; size is a 32-bit int; upper half of rdx is not defined on entry
    xor     eax, eax                ; return value
    and     rdx, -2                 ; whole words only, keeps the overlapping tail pattern-aligned
    jle     .done                   ; zero or negative after rounding
    vpbroadcastw zmm0, esi          ; GPR source must be a 32-bit register; its low word is replicated
    cmp     rdx, 64
    jb      .partial
    lea     rcx, [rdi + rdx - 64]   ; address of the last full 64-byte window
.fill:
    cmp     rdi, rcx
    jae     .last
    vmovdqu16 [rdi], zmm0
    add     rdi, 64
    jmp     .fill
.last:
    vmovdqu16 [rcx], zmm0           ; final window, may overlap the previous store
    jmp     .cleanup
.partial:
    mov     ecx, edx
    shr     ecx, 1                  ; 1..31 words
    mov     r8d, 1
    shl     r8d, cl
    dec     r8d                     ; one mask bit per word to write
    kmovd   k1, r8d
    vmovdqu16 [rdi]{k1}, zmm0       ; masked-off words are not written
.cleanup:
    vzeroupper                      ; clear upper vector state before returning to non-AVX code
.done:
    ret
; ---------------------------------------------------------------------------
; int avx512_4_memset(void *addr, int value, int size)
;
; Fill `size` bytes at `addr` with the 32-bit `value`, repeated.
;   rdi = addr, esi = value, edx = size in BYTES (System V AMD64)
; Returns 0 in eax.  size is rounded down to a multiple of 4; a result of
; zero or a negative size writes nothing.
;
; Strategy: 64-byte unaligned stores; the final store is placed at
; addr + size - 64 so it may overlap the previous one instead of running past
; the end.  Buffers shorter than 64 bytes use one masked store.
; Uses vpbroadcastd / vmovdqu32 / kmovw (AVX-512F).
; ---------------------------------------------------------------------------
avx512_4_memset:
    movsxd  rdx, edx                ; size is a 32-bit int; upper half of rdx is not defined on entry
    xor     eax, eax                ; return value
    and     rdx, -4                 ; whole dwords only, keeps the overlapping tail pattern-aligned
    jle     .done                   ; zero or negative after rounding
    vpbroadcastd zmm0, esi          ; the broadcast must run: without it zmm0 holds stale data.
                                    ; The GPR source is the 32-bit register, not rsi.
    cmp     rdx, 64
    jb      .partial
    lea     rcx, [rdi + rdx - 64]   ; address of the last full 64-byte window
.fill:
    cmp     rdi, rcx
    jae     .last
    vmovdqu32 [rdi], zmm0
    add     rdi, 64
    jmp     .fill
.last:
    vmovdqu32 [rcx], zmm0           ; final window, may overlap the previous store
    jmp     .cleanup
.partial:
    mov     ecx, edx
    shr     ecx, 2                  ; 1..15 dwords
    mov     r8d, 1
    shl     r8d, cl
    dec     r8d                     ; one mask bit per dword to write
    kmovw   k1, r8d
    vmovdqu32 [rdi]{k1}, zmm0       ; masked-off dwords are not written
.cleanup:
    vzeroupper                      ; clear upper vector state before returning to non-AVX code
.done:
    ret
; ---------------------------------------------------------------------------
; int avx512_8_memset(void *addr, int value, int size)
;
; Fill `size` bytes at `addr` with `value` sign-extended to 64 bits, repeated.
;   rdi = addr, esi = value, edx = size in BYTES (System V AMD64)
; Returns 0 in eax.  size is rounded down to a multiple of 8; a result of
; zero or a negative size writes nothing.
;
; `value` is declared `int` by the C callers in this file, so only esi is
; defined on entry; it is sign-extended here.  To pass full 64-bit patterns,
; change the C prototype to a 64-bit type and drop the movsxd on rsi.
;
; Strategy: 64-byte unaligned stores; the final store is placed at
; addr + size - 64 so it may overlap the previous one instead of running past
; the end.  Buffers shorter than 64 bytes use one masked store.
; Uses vpbroadcastq / vmovdqu64 / kmovw (AVX-512F).
; ---------------------------------------------------------------------------
avx512_8_memset:
    movsxd  rdx, edx                ; size is a 32-bit int; upper half of rdx is not defined on entry
    movsxd  rsi, esi                ; same for value: widen the int to the 64-bit element
    xor     eax, eax                ; return value
    and     rdx, -8                 ; whole qwords only, keeps the overlapping tail pattern-aligned
    jle     .done                   ; zero or negative after rounding
    vpbroadcastq zmm0, rsi
    cmp     rdx, 64
    jb      .partial
    lea     rcx, [rdi + rdx - 64]   ; address of the last full 64-byte window
.fill:
    cmp     rdi, rcx
    jae     .last
    vmovdqu64 [rdi], zmm0
    add     rdi, 64
    jmp     .fill
.last:
    vmovdqu64 [rcx], zmm0           ; final window, may overlap the previous store
    jmp     .cleanup
.partial:
    mov     ecx, edx
    shr     ecx, 3                  ; 1..7 qwords
    mov     r8d, 1
    shl     r8d, cl
    dec     r8d                     ; one mask bit per qword to write
    kmovw   k1, r8d
    vmovdqu64 [rdi]{k1}, zmm0       ; masked-off qwords are not written
.cleanup:
    vzeroupper                      ; clear upper vector state before returning to non-AVX code
.done:
    ret
; Data labels.  None of the routines visible in this file reference them.
; NOTE(review): presumably debugging leftovers that went with `extern printf`;
; confirm before removing.  No section directive is visible here, so these
; bytes are emitted into whatever section is current at this point.

; NUL-terminated printf-style format string; 10 is the newline character.
format:
db "src address %p", 10, 0
; NUL-terminated printf-style format string; 10 is the newline character.
msg:
db "run to here %d", 10, 0
; One 32-bit value, initialised to 0.
count:
dd 0
C语言调用代码
#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>
#include <string.h>
/*
 * Fill routines implemented in assembly; the digit in each name is the
 * element width in bytes.
 *
 *   addr  - start of the buffer to fill
 *   value - element value to replicate
 *   size  - buffer length in bytes
 *
 * NOTE(review): the routines store 64 bytes at a time; keep size a multiple
 * of 64 unless the assembly side is known to handle shorter tails.
 * The caller in this file ignores the return value.
 */
extern int avx512_1_memset(void *addr, int value, int size);
extern int avx512_2_memset(void *addr, int value, int size);
extern int avx512_4_memset(void *addr, int value, int size);
extern int avx512_8_memset(void *addr, int value, int size);
/*
 * InitMEM(tt) - store the value 5 into elements 0..1023 of tt.
 *
 * tt may be an array or any pointer expression; it is parenthesized so that
 * arguments such as `buf + 1` index correctly.  The argument is evaluated on
 * every iteration, so it must not have side effects.
 *
 * The body is wrapped in do { } while (0) so that `InitMEM(x);` is a single
 * statement and can sit in an unbraced if/else.  The loop index has a long,
 * macro-specific name so it cannot capture an `i` used in the argument.
 */
#define InitMEM(tt) \
    do { \
        for (int init_mem_idx_ = 0; init_mem_idx_ < 1024; init_mem_idx_++) \
            (tt)[init_mem_idx_] = 5; \
    } while (0)
/*
 * Benchmark driver: fills a 1024-element int64_t array one million times with
 * avx512_8_memset, prints the elapsed wall-clock time in milliseconds, then
 * dumps the array so the fill pattern can be checked by eye.
 *
 * Returns 0.
 */
int main(void)
{
    enum { ITERATIONS = 1000000 };
    int64_t tt[1024];
    const size_t tt_len = sizeof tt / sizeof tt[0];

    memset(tt, 0x0, sizeof tt);
    printf("tt address %p\n", (void *)tt); /* %p requires a void * argument */

    struct timeval start = {0};
    gettimeofday(&start, NULL);
    for (int iter = 0; iter < ITERATIONS; iter++) {
        avx512_8_memset(tt, 5, (int)sizeof tt);
        /*
         * Baselines to swap in for comparison:
         *   memset(tt, 0x5, sizeof tt);
         *   InitMEM(tt);
         */
    }
    struct timeval end = {0};
    gettimeofday(&end, NULL);

    /*
     * Take the difference in microseconds first and convert once, instead of
     * truncating each timestamp to milliseconds separately.
     */
    long long elapsed_us = (long long)(end.tv_sec - start.tv_sec) * 1000000LL
                         + (long long)(end.tv_usec - start.tv_usec);
    printf("cost ms %ld\n", (long)(elapsed_us / 1000));

    for (size_t i = 0; i < tt_len; i++)
        printf("%lld ", (long long)tt[i]); /* cast: int64_t has no fixed printf length modifier */
    printf("\n");
    return 0;
}
三 执行耗时对比
C 语言代码使用 -O3 选项编译,编译器已对循环做了自动向量化(vectorize)优化。
自己写的汇编实现