测试:
#include <stdio.h> #include <string.h> #include <time.h> #include <stdlib.h> #include <malloc.h> typedef unsigned int u_int; #define N 1024*1024*16 #define M 1000 void movsb_cpy1(void* Dst,void* Src,size_t Maxsize) ; void movsd_cpy(void* Dst,void* Src,size_t Maxsize) ; void cpp_cpy(void* Dst,void* Src,size_t Maxsize) ; void sse_copy1(void *p1, void *p2, size_t n); void sse_copy2(void *p1, void *p2, size_t n); void sse_copy3(void *p1, void *p2, size_t n); void mov_cpy0(void* Dst,void* Src,size_t Maxsize); int main() { u_int a[4] = {(1U<<24) + (2U<<16) + (3U<<8) + 4}; u_int *p = a; u_int * p1 = (u_int*)_aligned_malloc(N, 16); u_int * p2 = (u_int*)_aligned_malloc(N, 16); int x; clock_t k1, k2; x=M; k1 = clock(); while(x--) movsb_cpy1(p1,p2,N); k2 = clock(); printf("movsb_cpy1: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); x=M; k1 = clock(); while(x--) movsd_cpy(p1,p2,N); k2 = clock(); printf("movsd_cpy: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); x=M; k1 = clock(); while(x--) sse_copy1(p1,p2,N); k2 = clock(); printf("sse_copy1: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); x=M; k1 = clock(); while(x--) sse_copy2(p1,p2,N); k2 = clock(); printf("sse_copy2: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); x=M; k1 = clock(); while(x--) sse_copy3(p1,p2,N); k2 = clock(); printf("sse_copy3: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); x=M; k1 = clock(); while(x--) memcpy(p1,p2,N); k2 = clock(); printf("memcpy: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); x=M; k1 = clock(); while(x--) cpp_cpy(p1,p2,N); k2 = clock(); printf("cpp_cpy: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); x=M; k1 = clock(); while(x--) mov_cpy0(p1,p2,N); k2 = clock(); printf("mov_cpy0: %f\n", (double)(k2 - k1) / CLOCKS_PER_SEC); _aligned_free(p1); _aligned_free(p2); return 0; } void sse_copy3(void *p1, void *p2, size_t n) { __asm { mov esi, p1 mov edi, p2 mov ecx, n shr ecx, 7 LOOP1: prefetchnta [p1+128] prefetchnta [p1+160] prefetchnta [p1+192] prefetchnta [p1+124] movdqa xmm0, [esi] movdqa xmm1, [esi+16] movdqa 
xmm2, [esi+32] movdqa xmm3, [esi+48] movdqa xmm4, [esi+64] movdqa xmm5, [esi+80] movdqa xmm6, [esi+96] movdqa xmm7, [esi+112] movntdq [edi], xmm0 movntdq [edi+16], xmm1 movntdq [edi+32], xmm2 movntdq [edi+48], xmm3 movntdq [edi+64], xmm4 movntdq [edi+80], xmm5 movntdq [edi+96], xmm6 movntdq [edi+112], xmm7 add esi, 128 add edi,128 sub ecx,1 jnz LOOP1 } } void sse_copy1(void *p1, void *p2, size_t n) { __asm { mov esi, p1 mov edi, p2 mov ecx, n shr ecx, 7 LOOP1: prefetchnta [p1+128] prefetchnta [p1+160] prefetchnta [p1+192] prefetchnta [p1+124] movups xmm0, [esi] movups xmm1, [esi+16] movups xmm2, [esi+32] movups xmm3, [esi+48] movups xmm4, [esi+64] movups xmm5, [esi+80] movups xmm6, [esi+96] movups xmm7, [esi+112] movups [edi], xmm0 movups [edi+16], xmm1 movups [edi+32], xmm2 movups [edi+48], xmm3 movups [edi+64], xmm4 movups [edi+80], xmm5 movups [edi+96], xmm6 movups [edi+112], xmm7 add esi, 128 add edi,128 sub ecx,1 jnz LOOP1 END: } } void sse_copy2(void *p1, void *p2, size_t n) { __asm { mov esi, p1 mov edi, p2 mov ecx, n shr ecx, 7 LOOP1: movdqa xmm0, [esi] movdqa xmm1, [esi+16] movdqa xmm2, [esi+32] movdqa xmm3, [esi+48] movdqa xmm4, [esi+64] movdqa xmm5, [esi+80] movdqa xmm6, [esi+96] movdqa xmm7, [esi+112] movntdq [edi], xmm0 movntdq [edi+16], xmm1 movntdq [edi+32], xmm2 movntdq [edi+48], xmm3 movntdq [edi+64], xmm4 movntdq [edi+80], xmm5 movntdq [edi+96], xmm6 movntdq [edi+112], xmm7 add esi, 128 add edi,128 sub ecx,1 jnz LOOP1 END: } } void movsb_cpy1(void* Dst,void* Src,size_t Maxsize) { __asm { mov esi,[Src] mov edi,[Dst] mov ecx, [Maxsize] rep movsb } } void mov_cpy0(void* Dst,void* Src,size_t Maxsize) { __asm { mov esi,Src mov edi,Dst mov ecx, Maxsize L: mov al, byte ptr[esi] mov byte ptr[edi], al sub ecx, 1 jnz L } } void movsd_cpy(void* Dst,void* Src,size_t Maxsize) { __asm { mov esi,[Src] mov edi,[Dst] mov ecx, [Maxsize] shr ecx,2 rep movsd } } // void cpp_cpy(void* Dst,void* Src,size_t Maxsize) { char *p1 = (char*)Dst; char *p2 = (char*)Src; 
while(Maxsize--) *p1 = *p2; }
movsb_cpy1: 14.485000 // 使用了rep的两个函数性能都不错
movsd_cpy: 14.797000
sse_copy1: 16.390000 // 在这里,prefetch好像没有发挥作用
sse_copy2: 10.313000 // movdqa 与 movntdq 比 movups 快得多
sse_copy3: 10.343000
memcpy: 14.469000 // 标准函数其实也是优化过的
cpp_cpy: 108.656000 // c++逐个字节拷贝
mov_cpy0: 109.563000 // 汇编逐个字节拷贝,如果逐个int拷贝,就变成了26s,差不多4倍
可见,
【1】rep很不错,可以学会使用
【2】sse需要内存对齐,很重要