memcpy 是经常用到的函数,但它的实际实现并不像我们通常以为的那样一个字节一个字节地拷贝。
memcpy 是 C 标准库提供的函数,而不是 C 语言本身的语法;在某些不能使用标准库的场合,需要自己写代码实现。
1. 最简单实现方式
逐字节拷贝的实现
/*
 * Copy nLen bytes from pSrc to pTag, one byte at a time, front to back.
 *
 * pTag  destination buffer, at least nLen bytes long
 * pSrc  source buffer, at least nLen bytes long
 * nLen  number of bytes to copy; zero or a negative value copies nothing
 *
 * Overlapping buffers get no special handling.
 */
void Mem_Cpy(void *pTag,const void *pSrc,int nLen)
{
    unsigned char *out = (unsigned char *)pTag;
    const unsigned char *in = (const unsigned char *)pSrc;
    int remaining = nLen;

    while (remaining > 0)
    {
        *out++ = *in++;
        remaining--;
    }
}
这个实现在功能上没有问题,但效率一般:总共拷贝 2048*10000 字节(约 20MB),大约需要 42ms。
2. 4字节拷贝替代
因为在 32 位系统上,一次复制 4 字节(一个机器字)的效率要高于一次复制 1 字节。需要注意:按字访问通常要求源地址和目的地址都按 4 字节对齐。
#include <stdint.h> /* uintptr_t, used for the alignment check below */

/*
 * Copy nLen bytes from pSrc to pTag, moving one unsigned int per step
 * where possible and finishing the remainder byte by byte.
 *
 * pTag  destination buffer, at least nLen bytes long
 * pSrc  source buffer, at least nLen bytes long
 * nLen  number of bytes to copy; zero or a negative value copies nothing
 *
 * The word-sized loop is used only when both pointers are aligned for
 * unsigned int; otherwise the whole copy is done bytewise, because a
 * misaligned word access is undefined behaviour and faults on some CPUs.
 * Overlapping buffers get no special handling.
 *
 * NOTE: reading and writing caller memory through unsigned int * relies on
 * the compiler tolerating this kind of type punning (strict aliasing).
 */
void Mem_Cpy(void *pTag,const void *pSrc,int nLen)
{
    unsigned char *dst_bytes = (unsigned char *)pTag;
    const unsigned char *src_bytes = (const unsigned char *)pSrc;
    const int word_size = (int)sizeof(unsigned int);
    int copied = 0;

    if (nLen >= word_size
        && ((uintptr_t)pTag % sizeof(unsigned int)) == 0
        && ((uintptr_t)pSrc % sizeof(unsigned int)) == 0)
    {
        unsigned int *dst_words = (unsigned int *)pTag;
        const unsigned int *src_words = (const unsigned int *)pSrc;
        const int word_count = nLen / word_size;

        for (int w = 0; w < word_count; w++)
        {
            dst_words[w] = src_words[w];
        }
        copied = word_count * word_size;
    }

    /* Tail bytes, or the entire buffer when the word path was skipped. */
    for (; copied < nLen; copied++)
    {
        dst_bytes[copied] = src_bytes[copied];
    }
}
实测:同样拷贝 2048*10000 字节,只需 14ms,约为原先的 1/3,说明效率提高了不少。
3. 是否还存在更高效率?
减少循环次数,每次循环尽量多拷贝一些:一次循环拷贝 16 字节,即 4 个 4 字节的字(也就是循环展开)。
#include <stdint.h> /* uintptr_t, used for the alignment check below */

/*
 * BYTE_COPY_BUF(dst_bp, src_bp, nbytes)
 *
 * Copy nbytes bytes from src_bp to dst_bp, one byte at a time.
 * Each argument is parenthesised and evaluated exactly once, so pointer
 * expressions such as `p + 1` keep their own element size before being
 * converted to a byte pointer. nbytes is converted to int; zero or a
 * negative count copies nothing.
 */
#define BYTE_COPY_BUF(dst_bp, src_bp, nbytes) \
do { \
    unsigned char *bcb_dst_ = (unsigned char *)(dst_bp); \
    const unsigned char *bcb_src_ = (const unsigned char *)(src_bp); \
    const int bcb_len_ = (int)(nbytes); \
    for (int bcb_i_ = 0; bcb_i_ < bcb_len_; bcb_i_++) { \
        bcb_dst_[bcb_i_] = bcb_src_[bcb_i_]; \
    } \
} while (0)

/*
 * Copy nLen bytes from pSrc to pTag using a loop unrolled to four
 * unsigned int stores per iteration, then finish the remainder bytewise.
 *
 * pTag  destination buffer, at least nLen bytes long
 * pSrc  source buffer, at least nLen bytes long
 * nLen  number of bytes to copy; zero or a negative value copies nothing
 *
 * The unrolled word loop is used only when at least one full chunk is
 * available and both pointers are aligned for unsigned int; otherwise the
 * whole copy is done bytewise, because a misaligned word access is
 * undefined behaviour and faults on some CPUs.
 * Overlapping buffers get no special handling.
 *
 * NOTE: reading and writing caller memory through unsigned int * relies on
 * the compiler tolerating this kind of type punning (strict aliasing).
 */
void Mem_Cpy(void *pTag,const void *pSrc,int nLen)
{
    enum { WORDS_PER_CHUNK = 4 };

    unsigned char *dst_bytes = (unsigned char *)pTag;
    const unsigned char *src_bytes = (const unsigned char *)pSrc;
    const int chunk_bytes = WORDS_PER_CHUNK * (int)sizeof(unsigned int);
    int copied = 0;

    if (nLen >= chunk_bytes
        && ((uintptr_t)pTag % sizeof(unsigned int)) == 0
        && ((uintptr_t)pSrc % sizeof(unsigned int)) == 0)
    {
        unsigned int *dst_words = (unsigned int *)pTag;
        const unsigned int *src_words = (const unsigned int *)pSrc;
        const int chunk_count = nLen / chunk_bytes;

        for (int c = 0; c < chunk_count; c++)
        {
            const int w = c * WORDS_PER_CHUNK;

            dst_words[w]     = src_words[w];
            dst_words[w + 1] = src_words[w + 1];
            dst_words[w + 2] = src_words[w + 2];
            dst_words[w + 3] = src_words[w + 3];
        }
        copied = chunk_count * chunk_bytes;
    }

    /* Tail bytes, or the entire buffer when the word path was skipped. */
    if (nLen > copied)
    {
        BYTE_COPY_BUF(dst_bytes + copied, src_bytes + copied, nLen - copied);
    }
}
实测:同样拷贝 2048*10000 字节,只需 8ms,效率又提高了不少。
当然也可以把每次循环拷贝的字节数改成 32、64 等,还能再提高一点效率,但提升已经不太明显。
显然,这种循环展开的方法也是有极限的。
4. 利用汇编代替赋值
/*
 * WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)
 *
 * Copy (nbytes) / 4 longwords from src_bp to dst_bp with the x86 string
 * instruction `rep movsl`, then store the leftover byte count
 * (nbytes) % 4 into nbytes_left. Those trailing bytes are NOT copied here.
 *
 * dst_bp, src_bp  operands tied to the "D" and "S" register constraints,
 *                 used as both input and output; they must be modifiable
 *                 lvalues and are overwritten by the asm statement (with
 *                 `rep movs` they end up just past the copied words).
 * nbytes_left     lvalue that receives (nbytes) % 4.
 * nbytes          total byte count; evaluated twice, so it must have no
 *                 side effects.
 *
 * `cld` forces a forward copy. The "memory" clobber tells the compiler that
 * the asm reads and writes memory it cannot see. __d0 only absorbs the
 * final value of the count register and is otherwise unused.
 *
 * This is GCC extended-asm syntax with x86 register constraints, so it is
 * not usable on other architectures.
 * NOTE(review): the plain `asm` keyword is a GNU extension; strict ISO modes
 * (e.g. -std=c11) need `__asm__` instead - confirm the build flags.
 */
#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
do \
{ \
int __d0; \
asm volatile(/* Clear the direction flag, so copying goes forward. */ \
"cld\n" \
/* Copy longwords. */ \
"rep\n" \
"movsl" : \
"=D" (dst_bp), "=S" (src_bp), "=c" (__d0) : \
"0" (dst_bp), "1" (src_bp), "2" ((nbytes) / 4) : \
"memory"); \
(nbytes_left) = (nbytes) % 4; \
} while (0)
没有实测:上面是 x86 专用的内联汇编(cld / rep movsl),在 ARM 上无法使用,与实际需求相距较远。