最近想用 SSE 指令实现 memcpy,在网上搜了一下,发现大部分实现都有错误,有些没有考虑数据长度不是 64 字节整数倍的情况。因此我特意写了一个程序:当要拷贝的数据长度不是 64 的整数倍时,超出的部分单独拷贝。源码如下,希望大家喜欢,欢迎测速!
/*
 * new_memcpy - copy len bytes from src to dst.
 *
 * The bulk of the data is moved in 64-byte blocks: eight MMX loads
 * (mm0..mm7) followed by eight non-temporal stores (movntq), with a
 * prefetchnta issued 512 bytes ahead of the read pointer. The remaining
 * len % 64 bytes are then copied one byte at a time.
 *
 * dst  destination buffer, at least len bytes
 * src  source buffer, at least len bytes
 * len  number of bytes to copy; values <= 0 copy nothing
 *
 * Written for the MSVC 32-bit x86 inline assembler. The CPU must support
 * MMX plus the prefetchnta / movntq / sfence instructions. The copy always
 * runs from low to high addresses.
 */
void new_memcpy(void* dst, void* src, int len)
{
    /* Nothing to copy. This also keeps a negative len from being turned
       into a huge block count by the logical shift below. */
    if (len <= 0)
        return;

_asm
{
    push esi
    push edi
    mov esi, [src]              ; esi = read pointer
    mov edi, [dst]              ; edi = write pointer
    mov ecx, [len]
    shr ecx, 6                  ; ecx = number of whole 64-byte blocks
    jz copy_tail                ; len < 64: no full block, go straight to the tail
copyloop:
    prefetchnta [esi + 512]     ; fetch ahead by 512 bytes
    movq mm0, [esi]
    movq mm1, [esi + 8]
    movq mm2, [esi + 16]
    movq mm3, [esi + 24]
    movq mm4, [esi + 32]
    movq mm5, [esi + 40]
    movq mm6, [esi + 48]
    movq mm7, [esi + 56]
    movntq [edi], mm0           ; all eight stores are non-temporal
    movntq [edi + 8], mm1
    movntq [edi + 16], mm2
    movntq [edi + 24], mm3
    movntq [edi + 32], mm4
    movntq [edi + 40], mm5
    movntq [edi + 48], mm6
    movntq [edi + 56], mm7
    add esi, 64
    add edi, 64
    sub ecx, 1                  ; one block done
    jnz copyloop
copy_tail:
    mov ecx, [len]
    and ecx, 63                 ; ecx = leftover bytes (len % 64); sets ZF
    jz copy_end
copy_loop2:
    mov dl, byte ptr [esi]      ; copy the tail one byte at a time
    mov byte ptr [edi], dl
    inc esi
    inc edi
    dec ecx
    jnz copy_loop2
copy_end:
    sfence                      ; order the non-temporal stores before returning
    emms                        ; leave MMX state so x87 code can run afterwards
    pop edi
    pop esi
}
}