memchr_sse.s
/*
 * void *memchr_sse(const void *s, int c, size_t n)
 *
 * Find the first byte equal to (unsigned char)c in the n bytes at s,
 * comparing 16 bytes at a time with SSE2.
 *
 * Syntax: GNU as, AT&T operand order.   ABI: System V AMD64 (ELF).
 * In:     rdi = s, esi = c (only the low byte is used), rdx = n
 * Out:    rax = address of the first match, or 0 if there is none
 * Clobb:  rcx, rdx, rdi, xmm0-xmm4, flags
 * Stack:  none (leaf function)
 *
 * Data is read in whole 16-byte chunks, so bytes just outside
 * [s, s+n) that share a chunk with the buffer may be loaded; hits on
 * such bytes are discarded by the shift in .Lcrosscache and by the
 * length checks in the .Lmatches*_1 exits.
 */
        .text
        .globl  memchr_sse
        .type   memchr_sse, @function
        .p2align 4
memchr_sse:
        movd    %esi, %xmm1
        mov     %rdi, %rcx
        punpcklbw %xmm1, %xmm1
        test    %rdx, %rdx
        jz      .Lreturn_null                   /* n == 0: nothing to search */
        punpcklbw %xmm1, %xmm1
        and     $63, %rcx                       /* rcx = offset of s in its 64-byte line */
        pshufd  $0, %xmm1, %xmm1                /* xmm1 = c replicated into all 16 bytes */
        cmp     $48, %rcx
        ja      .Lcrosscache                    /* a 16-byte load at s would leave the line */

        /* First 16 bytes, unaligned load that stays inside the line. */
        movdqu  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches_1
        sub     $16, %rdx
        jbe     .Lreturn_null                   /* n <= 16 and no hit */
        add     $16, %rdi
        and     $15, %rcx
        and     $-16, %rdi                      /* rdi = next 16-byte boundary after s */
        add     %rcx, %rdx                      /* rdx = bytes left counted from rdi (<= n - 1, cannot wrap) */
        sub     $64, %rdx
        jbe     .Lexit_loop
        jmp     .Lloop_prolog

        .p2align 4
.Lcrosscache:
        and     $15, %rcx                       /* rcx = misalignment of s within its chunk */
        and     $-16, %rdi
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        /* Check if there is a match. */
        pmovmskb %xmm0, %eax
        /* Remove the leading bytes (those before s). */
        sar     %cl, %eax
        test    %eax, %eax
        je      .Lunaligned_no_match
        /* Check which byte is a match. */
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     .Lreturn_null                   /* hit lies at or beyond s + n */
        add     %rdi, %rax
        add     %rcx, %rax
        ret

        .p2align 4
.Lunaligned_no_match:
        /*
         * rdx must become n + rcx - 16 (bytes left after this chunk).
         * rcx < 16, so compute it as n - (16 - rcx); adding rcx to n
         * first would wrap around for n close to SIZE_MAX and make the
         * search stop early with a NULL result.
         */
        neg     %rcx
        add     $16, %rcx                       /* rcx = 16 - misalignment = bytes of s in this chunk */
        sub     %rcx, %rdx
        jbe     .Lreturn_null
        add     $16, %rdi
        sub     $64, %rdx
        jbe     .Lexit_loop

        /*
         * Invariant at .Lloop_prolog: rdi is 16-byte aligned and more
         * than 64 bytes remain from rdi, so the next four chunks are
         * entirely inside the buffer and hits need no length check.
         */
        .p2align 4
.Lloop_prolog:
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches
        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     .Lmatches16
        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches32
        movdqa  48(%rdi), %xmm4
        pcmpeqb %xmm1, %xmm4
        add     $64, %rdi
        pmovmskb %xmm4, %eax
        test    %eax, %eax
        jnz     .Lmatches0
        test    $0x3f, %rdi
        jz      .Lalign64_loop                  /* already on a 64-byte boundary */

        /* Second unrolled block before rounding rdi down to 64. */
        sub     $64, %rdx
        jbe     .Lexit_loop
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches
        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     .Lmatches16
        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches32
        movdqa  48(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        add     $64, %rdi
        test    %eax, %eax
        jnz     .Lmatches0
        /* Round rdi down to a 64-byte boundary and give the bytes
           stepped back over to the remaining count. */
        mov     %rdi, %rcx
        and     $-64, %rdi
        and     $63, %rcx
        add     %rcx, %rdx

        /* Main loop: 64 bytes per iteration, rdi 64-byte aligned. */
        .p2align 4
.Lalign64_loop:
        sub     $64, %rdx
        jbe     .Lexit_loop
        movdqa  (%rdi), %xmm0
        movdqa  16(%rdi), %xmm2
        movdqa  32(%rdi), %xmm3
        movdqa  48(%rdi), %xmm4
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm1, %xmm2
        pcmpeqb %xmm1, %xmm3
        pcmpeqb %xmm1, %xmm4
        /* OR the four compare results together (bytes are 0x00/0xff). */
        pmaxub  %xmm0, %xmm3
        pmaxub  %xmm2, %xmm4
        pmaxub  %xmm3, %xmm4
        pmovmskb %xmm4, %eax
        add     $64, %rdi
        test    %eax, %eax
        jz      .Lalign64_loop

        /* A hit is somewhere in the last 64 bytes: locate the chunk. */
        sub     $64, %rdi
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     .Lmatches16
        /* xmm3 was overwritten by pmaxub above, so redo chunk 32. */
        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pcmpeqb 48(%rdi), %xmm1
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches32
        pmovmskb %xmm1, %eax
        bsf     %eax, %eax
        lea     48(%rdi, %rax), %rax
        ret

        /*
         * Tail: 1..64 bytes remain from rdi and rdx holds that count
         * minus 64.
         */
        .p2align 4
.Lexit_loop:
        add     $32, %rdx                       /* rdx = remaining - 32 */
        jle     .Lexit_loop_32
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches
        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     .Lmatches16
        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches32_1
        sub     $16, %rdx                       /* rdx = remaining - 48 */
        jle     .Lreturn_null
        pcmpeqb 48(%rdi), %xmm1
        pmovmskb %xmm1, %eax
        test    %eax, %eax
        jnz     .Lmatches48_1
        xor     %eax, %eax
        ret

        .p2align 4
.Lexit_loop_32:
        add     $32, %rdx                       /* rdx = remaining (1..32) */
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches_1
        sub     $16, %rdx
        jbe     .Lreturn_null
        pcmpeqb 16(%rdi), %xmm1
        pmovmskb %xmm1, %eax
        test    %eax, %eax
        jnz     .Lmatches16_1
        xor     %eax, %eax
        ret

        /* Exits without a length check: the chunk is fully in range. */
        .p2align 4
.Lmatches0:
        /* rdi was already advanced by 64; the hit is in the chunk at rdi - 16. */
        bsf     %eax, %eax
        lea     -16(%rax, %rdi), %rax
        ret

        .p2align 4
.Lmatches:
        bsf     %eax, %eax
        add     %rdi, %rax
        ret

        .p2align 4
.Lmatches16:
        bsf     %eax, %eax
        lea     16(%rax, %rdi), %rax
        ret

        .p2align 4
.Lmatches32:
        bsf     %eax, %eax
        lea     32(%rax, %rdi), %rax
        ret

        /* Exits with a length check: rdx = bytes still in range counted
           from the start of the chunk; a hit at index >= rdx is rejected. */
        .p2align 4
.Lmatches_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     .Lreturn_null
        add     %rdi, %rax
        ret

        .p2align 4
.Lmatches16_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     .Lreturn_null
        lea     16(%rdi, %rax), %rax
        ret

        .p2align 4
.Lmatches32_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     .Lreturn_null
        lea     32(%rdi, %rax), %rax
        ret

        .p2align 4
.Lmatches48_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     .Lreturn_null
        lea     48(%rdi, %rax), %rax
        ret

        .p2align 4
.Lreturn_null:
        xor     %eax, %eax
        ret
        .size   memchr_sse, .-memchr_sse

/* Mark the object as not needing an executable stack. */
        .section .note.GNU-stack,"",@progbits
测试stub
stub.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include "common.h"
/* Implemented in memchr_sse.s; same contract as the C library's memchr(). */
extern void *memchr_sse(const void *s, int c, size_t n);

/*
 * Micro-benchmark driver: fills a 1024-byte buffer with 'A', plants a
 * single '\r' at offset 1022, and times one search for it.
 *
 * Build with -DUSE_MEMCHR_SSE to time memchr_sse(); without it the C
 * library's memchr() is timed.
 *
 * get_cycle_count() is declared in common.h (not shown here); it is
 * presumably a CPU cycle counter -- confirm against that header.
 *
 * Prints the offset of the hit and the elapsed count; returns 0.
 */
int main(int argc, char **argv)
{
    char text[1024];
    void *result = NULL;
    uint64_t begin, end;

    (void)argc;
    (void)argv;

    memset(text, 'A', sizeof(text));
    text[1022] = '\r';

    begin = get_cycle_count();
#ifdef USE_MEMCHR_SSE
    result = memchr_sse(text, '\r', sizeof(text));
#else
    result = memchr(text, '\r', sizeof(text));
#endif
    end = get_cycle_count();

    if (result) {
        /* Pointer difference is a ptrdiff_t (%td); the cycle delta is
           widened explicitly so the format matches on every ABI. */
        printf("result @ %td cost %llu\n",
               (char *)result - text,
               (unsigned long long)(end - begin));
    }
    return 0;
}
编译
gcc -march=corei7 -O3 memchr_sse.s stub.c -o stub
测试平台:
Intel(R) Xeon(R) CPU E31230 @ 3.20GHz
memchr 测试结果
# ./stub
result @ 1022 cost 1404
# ./stub
result @ 1022 cost 1600
# ./stub
result @ 1022 cost 1452
# ./stub
result @ 1022 cost 1388
# ./stub
result @ 1022 cost 1440
memchr_sse 测试结果
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 568
# ./stub
result @ 1022 cost 572
# ./stub
result @ 1022 cost 612
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 520