Linux, memory, memcmp: several implementations and a performance comparison
In this article:
- Preface
- Analysis
- 1. kernel memcmp
- 2. x64 memcmp
- 3. glibc memcmp
- 4. unsigned long memcmp
- 5. benchmark
- Afterword
Preface
memcmp is about as basic as library functions get. Below, a few implementations are selected and their performance compared.
Analysis
1. kernel memcmp
The code is from linux4.4/lib/string.c:
```c
int memcmp(const void *cs, const void *ct, size_t count)
{
    const unsigned char *su1, *su2;
    int res = 0;

    for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
        if ((res = *su1 - *su2) != 0)
            break;
    return res;
}
```
It compares one byte at a time in a loop; the brevity and clarity of C are on full display.
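As a quick sanity check of those semantics (this little driver is my own, not from the kernel), the return value carries the difference of the first pair of bytes that disagree:

```c
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char a[] = "abcd";
    const char b[] = "abce";

    /* First difference: 'd' (0x64) vs 'e' (0x65), so the result is
     * negative; the kernel version returns exactly 0x64 - 0x65 = -1. */
    printf("%d\n", memcmp(a, b, 4));
    printf("%d\n", memcmp(a, a, 4));   /* equal buffers: prints 0 */
    return 0;
}
```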
2. x64 memcmp
The code is from uksm: https://github.com/dolohow/uksm/blob/master/uksm-4.9.patch
```c
int memcmpx86_64(void *s1, void *s2, size_t n)
{
    size_t num = n / 8;
    register int res;

    __asm__ __volatile__ (
        "testq %q3,%q3\n\t"
        "repe; cmpsq\n\t"
        "je 1f\n\t"
        "sbbq %q0,%q0\n\t"
        "orq $1,%q0\n"
        "1:"
        : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
        : "0" (0)
        : "cc");

    return res;
}
```
Because this function compares page contents, the size is always a multiple of 8, so no extra tail handling is needed.
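For readers who rarely use the string instructions: `repe; cmpsq` walks `%rsi` and `%rdi` 8 bytes at a time, decrementing `%rcx`, and stops at the first unequal quadword; the `sbbq`/`orq` pair then turns the carry flag into -1 or +1. A rough C model of what the inline assembly computes (my own sketch, not code from uksm):

```c
#include <stddef.h>

/* Rough C model of "repe; cmpsq" plus the sbb/or trick.  Assumes n is a
 * multiple of 8, matching the page-comparison use case above. */
static int memcmp_qword_model(const void *s1, const void *s2, size_t n)
{
    const unsigned long *p = s1, *q = s2;
    size_t num = n / 8;                 /* %rcx: remaining qwords */

    while (num != 0) {
        unsigned long a = *p++, b = *q++;
        num--;
        if (a != b)
            /* sbb/or collapses the result to -1 or +1; only the sign is
             * meaningful, and it reflects the qwords as little-endian
             * integers, not byte-order memcmp semantics. */
            return (a < b) ? -1 : 1;
    }
    return 0;                           /* the "je 1f" path: all equal */
}
```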
3. glibc memcmp
The code is from glibc-2.23/sysdeps/x86_64/memcmp.S.
The implementation below is hand-written assembly tuned for x86-64. The xmm registers are 16 bytes wide, so each comparison covers 16 bytes at once, which is far more efficient:
```asm
ENTRY (memcmp)
	test	%rdx, %rdx
	jz	L(finz)
	cmpq	$1, %rdx
	jle	L(finr1b)
	subq	%rdi, %rsi
	movq	%rdx, %r10
	cmpq	$32, %r10
	jge	L(gt32)

/* Handle small chunks and last block of less than 32 bytes.  */
L(small):
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi), %eax
	movzbl	(%rdi, %rsi), %edx
	subq	$1, %r10
	je	L(finz1)
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi), %eax
	movzwl	(%rdi, %rsi), %edx
	subq	$2, %r10
	je	L(fin2_7)
	addq	$2, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi), %eax
	movl	(%rdi, %rsi), %edx
	subq	$4, %r10
	je	L(fin2_7)
	addq	$4, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi), %rax
	movq	(%rdi, %rsi), %rdx
	subq	$8, %r10
	je	L(fin2_7)
	addq	$8, %rdi
	cmpq	%rdx, %rax
	jnz	L(fin2_7)
L(s16b):
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	xorl	%eax, %eax
	subl	$0xffff, %edx
	jz	L(finz)
	bsfl	%edx, %ecx
	leaq	(%rdi, %rcx), %rcx
	movzbl	(%rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
L(finr1b):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %edx
L(finz1):
	subl	%edx, %eax
L(exit):
	ret

	.p2align 4,, 4
L(fin2_7):
	cmpq	%rdx, %rax
	jz	L(finz)
	movq	%rax, %r11
	subq	%rdx, %r11
	bsfq	%r11, %rcx
	sarq	$3, %rcx
	salq	$3, %rcx
	sarq	%cl, %rax
	movzbl	%al, %eax
	sarq	%cl, %rdx
	movzbl	%dl, %edx
	subl	%edx, %eax
	ret

	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax
	ret

/* For blocks bigger than 32 bytes
   1. Advance one of the addr pointer to be 16B aligned.
   2. Treat the case of both addr pointers aligned to 16B
      separately to avoid movdqu.
   3. Handle any blocks of greater than 64 consecutive bytes with
      unrolling to reduce branches.
   4. At least one addr pointer is 16B aligned, use memory version
      of pcmbeqb.  */
	.p2align 4,, 4
L(gt32):
	movq	%rdx, %r11
	addq	%rdi, %r11
	movq	%rdi, %r8
	andq	$15, %r8
	jz	L(16am)
	/* Both pointers may be misaligned.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	neg	%r8
	leaq	16(%rdi, %r8), %rdi
L(16am):
	/* Handle two 16B aligned pointers separately.  */
	testq	$15, %rsi
	jz	L(ATR)
	testq	$16, %rdi
	jz	L(A32)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
L(A32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)
	/* Pre-unroll to be ready for unrolled 64B loop.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(A64):
	movq	%r11, %r10
	andq	$-64, %r10
	cmpq	%r10, %rdi
	jge	L(mt32)

L(A64main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A64main)

L(mt32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)

L(A32main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A32main)
L(mt16):
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)

	.p2align 4,, 4
L(neq):
	bsfl	%edx, %ecx
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi
	movzbl	(%rsi,%rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
L(ATR):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	je	L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(ATR64):
	cmpq	%rdi, %r10
	je	L(mt32)

L(ATR64main):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)

L(ATR32res):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%r10, %rdi
	jne	L(ATR32res)

	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)
	/* Align to 16byte to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)
```
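The workhorse of all the loops above is the movdqu/pcmpeqb/pmovmskb sequence: load 16 bytes from each buffer, compare all 16 byte pairs in one instruction, and collapse the lane results into a 16-bit mask that equals 0xffff only when every byte matched. Here is that single step rewritten with SSE2 intrinsics and GCC's __builtin_ctz, purely as an illustration of mine, not glibc code:

```c
#include <emmintrin.h>   /* SSE2 intrinsics */

/* Compare one 16-byte block: returns the index of the first differing
 * byte, or -1 if the blocks are equal.  Mirrors the movdqu/pcmpeqb/
 * pmovmskb/bsf sequence used throughout the assembly above. */
static int cmp16(const void *a, const void *b)
{
    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    /* 0xff in every lane where the bytes match */
    __m128i eq = _mm_cmpeq_epi8(va, vb);
    /* one mask bit per lane: 0xffff means all 16 bytes equal */
    unsigned mask = (unsigned)_mm_movemask_epi8(eq);

    if (mask == 0xffff)          /* the "subl $0xffff, %edx; jz" test */
        return -1;
    /* first zero bit = first mismatching byte (bsf in the assembly) */
    return __builtin_ctz(~mask & 0xffff);
}
```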
4. unsigned long memcmp
This is method 1 with one change: it compares an unsigned long at a time instead of a byte.
```c
int kmemcmp(const void *cs, const void *ct, size_t count)
{
    const unsigned long *su1, *su2;
    int res = 0;

    /* Assumes count is a multiple of sizeof(unsigned long). */
    for (su1 = cs, su2 = ct; 0 < count;) {
        if (*su1 != *su2) {
            /* Report inequality explicitly.  Truncating (*su1 - *su2)
             * to int, as the direct port of method 1 would, can yield 0
             * for unequal words; and either way the sign reflects the
             * words as little-endian integers, not memcmp() byte order. */
            res = (*su1 > *su2) ? 1 : -1;
            break;
        }
        su1 += 1;
        su2 += 1;
        count -= sizeof(unsigned long);
    }
    return res;
}
```
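If the caller needs real memcmp ordering rather than a plain equal/not-equal answer, one fix (a sketch of my own, under the same multiple-of-8 assumption) is to scan word by word but drop back to a byte comparison once a differing word is found:

```c
#include <stddef.h>
#include <string.h>

/* Word-wise scan with byte-order-correct ordering: same fast path as
 * kmemcmp, but the final verdict is delegated to memcmp() so the sign
 * matches byte order even on little-endian machines. */
int kmemcmp_ordered(const void *cs, const void *ct, size_t count)
{
    const unsigned long *su1 = cs, *su2 = ct;

    for (; count > 0; ++su1, ++su2, count -= sizeof(unsigned long))
        if (*su1 != *su2)
            return memcmp(su1, su2, sizeof(unsigned long));
    return 0;
}
```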
5. benchmark
Compiled with gcc at -O0, two 4 KB buffers are compared repeatedly until 2 GB in total has been processed, and the run time is measured:
- Method 1: about 20 s
- Method 2: about 0.64 s
- Method 3: about 0.20 s
- Method 4: about 2.50 s

Method 4 takes roughly one eighth of method 1's time, which is exactly what the logic predicts: it compares 8 bytes per iteration instead of 1. The XMM-register version is the fastest of all.
Compiled with gcc at -O2, same workload (two 4 KB buffers, 2 GB compared in total):
- Method 1: about 3.4 s
- Method 2: about 0.62 s
- Method 3: about 0.20 s
- Method 4: about 0.42 s

Method 4 is still roughly one eighth of method 1, but both have sped up dramatically, and method 4 now even beats method 2. gcc's optimizer clearly pulls its weight.
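The article does not show the benchmark harness itself; a minimal reconstruction of the setup described above (two 4 KB buffers with equal contents, compared repeatedly until 2 GB has been processed) might look like the sketch below. The buffer pattern, iteration count, and timing details are my own assumptions. Build with `gcc -O0` or `gcc -O2` and substitute each implementation for the memcmp call.

```c
#include <stdio.h>
#include <string.h>
#include <time.h>

#define PAGE  4096UL
#define TOTAL (2UL << 30)            /* 2 GB compared in total */

int main(void)
{
    static unsigned char a[PAGE], b[PAGE];  /* equal contents: worst case */
    unsigned long i, iters = TOTAL / PAGE;  /* 512 Ki comparisons */
    struct timespec t0, t1;
    int sink = 0;

    memset(a, 0x5a, PAGE);
    memset(b, 0x5a, PAGE);

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (i = 0; i < iters; i++)
        sink |= memcmp(a, b, PAGE);  /* swap in the variant under test */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("%d %.2fs\n", sink,
           (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
    return 0;
}
```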
Afterword
It really bears out the saying: nothing hurts like a comparison.