memchr SSE 加速

memchr_sse.s

/*
 * void *memchr_sse(const void *s, int c, size_t n)
 *
 * Syntax: GNU as, AT&T operand order.
 * ABI:    System V AMD64; leaf routine, does not touch the stack.
 * In:     rdi = s, rsi = c (only the low byte is broadcast and compared),
 *         rdx = n
 * Out:    rax = address of the first byte in s[0..n) equal to the low byte
 *               of c, or 0 when there is no such byte (including n == 0)
 * Clobb:  rax, rcx, rdx, rdi, xmm0-xmm4, flags
 *
 * Note: whole 16-byte vectors are loaded, so bytes outside s[0..n) may be
 * read (for example the first movdqu reads 16 bytes whatever n is); matches
 * found in the tail are range-checked against the remaining length.
 */
.text
.globl memchr_sse
/* .p2align gives a 16-byte boundary on every target; plain ".align 4"
   means only 4 bytes on x86 ELF.  */
.p2align 4,0x90
memchr_sse:
	movd	%rsi, %xmm1
	mov	%rdi, %rcx

	/* Two byte-interleaves replicate the low byte of c into the low
	   dword of xmm1; pshufd below spreads it over all 16 lanes.  */
	punpcklbw %xmm1, %xmm1
	test	%rdx, %rdx
	jz	L_return_null
	punpcklbw %xmm1, %xmm1

	/* rcx = offset of s inside its 64-byte line.  */
	and	$63, %rcx
	pshufd	$0, %xmm1, %xmm1

	/* With an offset above 48 an unaligned 16-byte load would cross the
	   64-byte line, so that case starts from an aligned load instead.  */
	cmp	$48, %rcx
	ja	L_crosscache

	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L_matches_1
	sub	$16, %rdx
	jbe	L_return_null
	/* Round s + 16 down to a 16-byte boundary and give back to the
	   length the bytes that will be examined a second time.  rdx was
	   just reduced by 16 and rcx < 16, so this add cannot wrap.  */
	add	$16, %rdi
	and	$15, %rcx
	and	$-16, %rdi
	add	%rcx, %rdx
	sub	$64, %rdx
	jbe	L_exit_loop
	jmp	L_loop_prolog

	.p2align 4
L_crosscache:
	/* rcx = misalignment of s within 16 bytes, rdi = s rounded down.  */
	and	$15, %rcx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes (those located before s).  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L_unaligned_no_match
	/* Check which byte is a match.  */
	bsf	%eax, %eax

	/* The match only counts if its index (relative to s) is below n.  */
	sub	%rax, %rdx
	jbe	L_return_null
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L_unaligned_no_match:
	/* The first aligned block held 16 - rcx bytes of the buffer.
	   Compute "rdx - (16 - rcx)" rather than "(rdx + rcx) - 16": the
	   addition wraps around when n is close to SIZE_MAX and would make
	   the search stop with NULL although a match lies ahead.  rcx is
	   not needed afterwards (it is reloaded before its next use).  */
	neg	%rcx
	add	$16, %rcx
	sub	%rcx, %rdx
	jbe	L_return_null
	add	$16, %rdi
	sub	$64, %rdx
	jbe	L_exit_loop

	.p2align 4
L_loop_prolog:
	/* Here rdi is 16-byte aligned and more than 64 bytes remain (rdx
	   already has those 64 subtracted).  Scan 64 bytes, one vector at a
	   time, so an early match returns quickly.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L_matches
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L_matches16
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L_matches32
	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L_matches0
	/* Already on a 64-byte boundary?  Then enter the main loop.  */
	test	$0x3f, %rdi
	jz	L_align64_loop
	/* Otherwise scan one more 64-byte group vector by vector.  */
	sub	$64, %rdx
	jbe	L_exit_loop
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L_matches
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L_matches16
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L_matches32
	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L_matches0
	/* Round rdi down to a 64-byte boundary; the bytes stepped back over
	   (already scanned) are added to the length again.  */
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %rcx
	add	%rcx, %rdx

	.p2align 4
L_align64_loop:
	/* Main loop: rdi is 64-byte aligned, rdx = bytes left from rdi.  */
	sub	$64, %rdx
	jbe	L_exit_loop
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Merge the four compare results so that a single mask tells
	   whether any of the 64 bytes matched.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L_align64_loop
	/* A match is somewhere in the group just scanned; step back to it.  */
	sub	$64, %rdi

	/* xmm0 and xmm2 still hold their own compare results.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L_matches
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L_matches16
	/* xmm3 was overwritten by the merge, so compare block 2 again.  */
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	/* The pattern in xmm1 is no longer needed; compare into it.  */
	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L_matches32
	/* Blocks 0-2 are clean, so the match must be in the last block.  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L_exit_loop:
	/* Tail: at most 64 bytes remain and rdx = remaining - 64.  After
	   the add, rdx = remaining - 32; non-positive means the tail fits
	   in the first two vectors.  */
	add	$32, %rdx
	jle	L_exit_loop_32
	/* More than 32 bytes remain: the first two vectors are entirely in
	   range, so a match in them needs no length check.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L_matches
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L_matches16
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L_matches32_1
	sub	$16, %rdx
	jle	L_return_null
	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L_matches48_1
	xor	%eax, %eax
	ret

	.p2align 4
L_exit_loop_32:
	/* Undo the bias: rdx = bytes remaining from rdi (at most 32).  */
	add	$32, %rdx
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L_matches_1
	sub	$16, %rdx
	jbe	L_return_null
	pcmpeqb	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L_matches16_1
	xor	%eax, %eax
	ret

	/* Match in the 4th vector of a group, reached after rdi was already
	   advanced by 64: the byte is at rdi - 64 + 48 + index.  */
	.p2align 4
L_matches0:
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	/* L_matches / L_matches16 / L_matches32: match at the given vector
	   offset from rdi, known to lie inside the buffer.  */
	.p2align 4
L_matches:
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L_matches16:
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L_matches32:
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	/* L_matches*_1: as above, but rdx holds the number of valid bytes in
	   the vector, so a match at index >= rdx lies past the end and
	   yields NULL.  */
	.p2align 4
L_matches_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L_return_null
	add	%rdi, %rax
	ret

	.p2align 4
L_matches16_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L_return_null
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L_matches32_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L_return_null
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L_matches48_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L_return_null
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L_return_null:
	/* Writing eax zeroes all of rax.  */
	xor	%eax, %eax
	ret
.type memchr_sse, @function
.size memchr_sse, .-memchr_sse

测试stub

stub.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include "common.h"
extern void *memchr_sse(const void *s, int c, size_t n);
/*
 * Micro-benchmark driver: fills a 1 KiB buffer with 'A', plants a single
 * '\r' at offset 1022, and reports where the search found it together with
 * the difference between two get_cycle_count() readings taken around the
 * call.  Swap the commented line to measure memchr_sse instead of memchr.
 *
 * Prints nothing if the byte is not found; always returns 0.
 */
int main(int argc, char **argv)
{
	char text[1024];
	void *result = NULL;
	uint64_t begin, end;

	/* Command-line arguments are not used. */
	(void)argc;
	(void)argv;

	memset(text, 'A', sizeof(text));
	text[1022] = '\r';

	begin = get_cycle_count();
	//result = memchr_sse(text, '\r', sizeof(text));
	result = memchr(text, '\r', sizeof(text));
	end = get_cycle_count();

	if (result) {
		/* %td matches the ptrdiff_t produced by the pointer subtraction
		   (done on char *, since arithmetic on void * is not standard C);
		   the cycle delta is cast so that %llu is correct wherever
		   uint64_t is not unsigned long. */
		printf("result @ %td cost %llu\n",
		       (char *)result - text,
		       (unsigned long long)(end - begin));
	}
	return 0;
}


编译

gcc -march=corei7 -O3 memchr_sse.s stub.c -o stub

测试平台:

Intel(R) Xeon(R) CPU E31230 @ 3.20GHz

memchr 测试结果

# ./stub
result @ 1022 cost 1404
# ./stub
result @ 1022 cost 1600
# ./stub
result @ 1022 cost 1452
# ./stub
result @ 1022 cost 1388
# ./stub
result @ 1022 cost 1440

memchr_sse 测试结果
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 568
# ./stub
result @ 1022 cost 572
# ./stub
result @ 1022 cost 612
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 520


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值