memcpy优化比较

    测试:

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <malloc.h>
typedef unsigned int u_int;

/* Both macros are fully parenthesized so they expand safely inside larger
 * expressions (e.g. `x % N`). */
#define N (1024*1024*16)	/* bytes copied per call: 16 MiB */
#define M (1000)		/* number of calls in each timed run */

/*
 * Copy routines under test.
 *
 * Argument order differs between the two families:
 *   - the Dst/Src routines take the destination first, then the source;
 *   - the sse_copy* routines load from p1 and store to p2.
 */
void movsb_cpy1(void* Dst,void* Src,size_t Maxsize) ;
void movsd_cpy(void* Dst,void* Src,size_t Maxsize) ;
void cpp_cpy(void* Dst,void* Src,size_t Maxsize) ;
void sse_copy1(void *p1, void *p2, size_t n);
void sse_copy2(void *p1, void *p2, size_t n);
void sse_copy3(void *p1, void *p2, size_t n);
void mov_cpy0(void* Dst,void* Src,size_t Maxsize);

int main()
{
	u_int a[4] = {(1U<<24) + (2U<<16) + (3U<<8) + 4};
	u_int *p = a;

	u_int * p1 = (u_int*)_aligned_malloc(N, 16);
	u_int * p2 = (u_int*)_aligned_malloc(N, 16);
	int x;
	
	clock_t k1, k2;

	x=M;
	k1 = clock();
	while(x--)
		movsb_cpy1(p1,p2,N);
	k2 = clock();
	printf("movsb_cpy1:  %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);  

	x=M;
	k1 = clock();
	while(x--)
		movsd_cpy(p1,p2,N);
	k2 = clock();
	printf("movsd_cpy:  %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);  

	x=M;
	k1 = clock();
	while(x--)
		sse_copy1(p1,p2,N);
	k2 = clock();
	printf("sse_copy1:  %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); 

	x=M;
	k1 = clock();
	while(x--)
		sse_copy2(p1,p2,N);
	k2 = clock();
	printf("sse_copy2:  %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); 

	x=M;
	k1 = clock();
	while(x--)
		sse_copy3(p1,p2,N);
	k2 = clock();
	printf("sse_copy3:  %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);  

	x=M;
	k1 = clock();
	while(x--)
		memcpy(p1,p2,N);
	k2 = clock();
	printf("memcpy:  %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); 


	x=M;
	k1 = clock();
	while(x--)
		cpp_cpy(p1,p2,N);
	k2 = clock();
	printf("cpp_cpy:  %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); 

	x=M;
	k1 = clock();
	while(x--)
		mov_cpy0(p1,p2,N);
	k2 = clock();
	printf("mov_cpy0:  %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC);

	_aligned_free(p1);
	_aligned_free(p2);

	return 0;
}


/*
 * Copies n bytes from p1 to p2 (note: source first, destination second)
 * using aligned SSE2 loads, non-temporal stores and software prefetch.
 *
 * Whole 128-byte blocks go through xmm0-xmm7; the remaining n % 128 bytes
 * are copied with REP MOVSB, so any n (including 0) is handled.
 *
 * Preconditions: MSVC x86 inline assembly; p1 and p2 must be 16-byte
 * aligned whenever n >= 128, because MOVDQA and MOVNTDQ fault on
 * unaligned addresses.
 */
void sse_copy3(void *p1, void *p2, size_t n)
{
	__asm
	{
		mov	esi,	p1
		mov	edi,	p2
		mov	ecx,	n
		shr	ecx,	7		// number of 128-byte blocks
		jz	COPY_TAIL		// none: do not enter the loop with ecx == 0
LOOP1:
		// Prefetch the block the next iteration will load. The address must be
		// based on esi: the C symbol p1 names the parameter's stack slot, not
		// the data it points to.
		prefetchnta	[esi+128]
		prefetchnta	[esi+160]
		prefetchnta	[esi+192]
		prefetchnta	[esi+224]

		movdqa	xmm0,	[esi]
		movdqa	xmm1,	[esi+16]
		movdqa	xmm2,	[esi+32]
		movdqa	xmm3,	[esi+48]
		movdqa	xmm4,	[esi+64]
		movdqa	xmm5,	[esi+80]
		movdqa	xmm6,	[esi+96]
		movdqa	xmm7,	[esi+112]

		movntdq	[edi], xmm0
		movntdq	[edi+16], xmm1
		movntdq	[edi+32], xmm2
		movntdq	[edi+48], xmm3
		movntdq	[edi+64], xmm4
		movntdq	[edi+80], xmm5
		movntdq	[edi+96], xmm6
		movntdq	[edi+112], xmm7

		add	esi,	128
		add	edi,	128
		sub	ecx,	1
		jnz	LOOP1

		sfence				// order the non-temporal stores before later stores
COPY_TAIL:
		mov	ecx,	n
		and	ecx,	127		// leftover bytes after the last full block
		rep	movsb			// no-op when ecx == 0
	}
}

/*
 * Copies n bytes from p1 to p2 (note: source first, destination second)
 * using unaligned SSE loads/stores (MOVUPS) and software prefetch.
 *
 * Whole 128-byte blocks go through xmm0-xmm7; the remaining n % 128 bytes
 * are copied with REP MOVSB, so any n (including 0) is handled. MOVUPS
 * places no alignment requirement on p1 or p2.
 *
 * Requires MSVC x86 inline assembly.
 */
void sse_copy1(void *p1, void *p2, size_t n)
{
	__asm
	{
		mov	esi,	p1
		mov	edi,	p2
		mov	ecx,	n
		shr	ecx,	7		// number of 128-byte blocks
		jz	COPY_TAIL		// none: do not enter the loop with ecx == 0
LOOP1:
		// Prefetch the block the next iteration will load. The address must be
		// based on esi: the C symbol p1 names the parameter's stack slot, not
		// the data it points to.
		prefetchnta	[esi+128]
		prefetchnta	[esi+160]
		prefetchnta	[esi+192]
		prefetchnta	[esi+224]

		movups	xmm0,	[esi]
		movups	xmm1,	[esi+16]
		movups	xmm2,	[esi+32]
		movups	xmm3,	[esi+48]
		movups	xmm4,	[esi+64]
		movups	xmm5,	[esi+80]
		movups	xmm6,	[esi+96]
		movups	xmm7,	[esi+112]

		movups	[edi], xmm0
		movups	[edi+16], xmm1
		movups	[edi+32], xmm2
		movups	[edi+48], xmm3
		movups	[edi+64], xmm4
		movups	[edi+80], xmm5
		movups	[edi+96], xmm6
		movups	[edi+112], xmm7

		add	esi,	128
		add	edi,	128
		sub	ecx,	1
		jnz	LOOP1
COPY_TAIL:
		mov	ecx,	n
		and	ecx,	127		// leftover bytes after the last full block
		rep	movsb			// no-op when ecx == 0
	}
}

/*
 * Copies n bytes from p1 to p2 (note: source first, destination second)
 * using aligned SSE2 loads (MOVDQA) and non-temporal stores (MOVNTDQ),
 * without software prefetch.
 *
 * Whole 128-byte blocks go through xmm0-xmm7; the remaining n % 128 bytes
 * are copied with REP MOVSB, so any n (including 0) is handled.
 *
 * Preconditions: MSVC x86 inline assembly; p1 and p2 must be 16-byte
 * aligned whenever n >= 128, because MOVDQA and MOVNTDQ fault on
 * unaligned addresses.
 */
void sse_copy2(void *p1, void *p2, size_t n)
{
	__asm
	{
		mov	esi,	p1
		mov	edi,	p2
		mov	ecx,	n
		shr	ecx,	7		// number of 128-byte blocks
		jz	COPY_TAIL		// none: do not enter the loop with ecx == 0
LOOP1:
		movdqa	xmm0,	[esi]
		movdqa	xmm1,	[esi+16]
		movdqa	xmm2,	[esi+32]
		movdqa	xmm3,	[esi+48]
		movdqa	xmm4,	[esi+64]
		movdqa	xmm5,	[esi+80]
		movdqa	xmm6,	[esi+96]
		movdqa	xmm7,	[esi+112]

		movntdq	[edi], xmm0
		movntdq	[edi+16], xmm1
		movntdq	[edi+32], xmm2
		movntdq	[edi+48], xmm3
		movntdq	[edi+64], xmm4
		movntdq	[edi+80], xmm5
		movntdq	[edi+96], xmm6
		movntdq	[edi+112], xmm7

		add	esi,	128
		add	edi,	128
		sub	ecx,	1
		jnz	LOOP1

		sfence				// order the non-temporal stores before later stores
COPY_TAIL:
		mov	ecx,	n
		and	ecx,	127		// leftover bytes after the last full block
		rep	movsb			// no-op when ecx == 0
	}
}

void movsb_cpy1(void* Dst,void* Src,size_t Maxsize) 
{ 
	__asm 
	{ 
		mov esi,[Src] 
		mov edi,[Dst] 
		mov ecx, [Maxsize] 
		rep movsb 

	} 
} 

/*
 * Copies Maxsize bytes from Src to Dst one byte at a time with plain
 * MOV instructions (the hand-written assembly baseline).
 *
 * Both pointers are advanced after every byte, and a zero Maxsize returns
 * immediately instead of letting the counter wrap around.
 *
 * Requires MSVC x86 inline assembly.
 */
void mov_cpy0(void* Dst,void* Src,size_t Maxsize)
{
	__asm
	{
		mov	esi,	Src
		mov	edi,	Dst
		mov	ecx,	Maxsize
		test	ecx,	ecx
		jz	DONE			// nothing to copy
NEXT_BYTE:
		mov	al,	byte ptr [esi]
		mov	byte ptr [edi],	al
		add	esi,	1		// step to the next source byte
		add	edi,	1		// step to the next destination byte
		sub	ecx,	1
		jnz	NEXT_BYTE
DONE:
	}
}

void movsd_cpy(void* Dst,void* Src,size_t Maxsize) 
{ 
	__asm 
	{ 
		mov esi,[Src] 
		mov edi,[Dst] 
		mov ecx, [Maxsize] 
		shr ecx,2 
		rep movsd 
	} 
} 

//

/*
 * Copies Maxsize bytes from Src to Dst one byte at a time in plain C
 * (the compiler-generated baseline for the benchmark).
 *
 * Both cursors advance after every byte, so the whole range is copied
 * rather than the first byte being rewritten Maxsize times.
 * The regions must not overlap in a way that a forward copy would corrupt.
 */
void cpp_cpy(void* Dst,void* Src,size_t Maxsize)
{
	unsigned char *out = (unsigned char*)Dst;
	const unsigned char *in = (const unsigned char*)Src;

	while(Maxsize--)
	{
		*out++ = *in++;
	}
}





movsb_cpy1:  14.485000 // 使用了rep的两个函数性能都不错
movsd_cpy:  14.797000
sse_copy1:  16.390000 // 在这里,prefetch好像没有发挥作用
sse_copy2:  10.313000 // movdqa 与 movntdq 比 movups 快得多
sse_copy3:  10.343000
memcpy:  14.469000 // 标准函数其实也是优化过的
cpp_cpy:  108.656000 // c++逐个字节拷贝
mov_cpy0:  109.563000 // 汇编逐个字节拷贝;如果改为逐个int拷贝,耗时降到约26s,速度提高到约4倍


可见,

【1】rep很不错,可以学会使用

【2】使用 movdqa、movntdq 等 SSE 对齐指令时,内存必须按16字节对齐,这一点很重要




  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值