在上一章《写出高性能的C代码---深入理解编译器和硬件平台》中,我写道:如果有标准的库函数可以调用,就不要自己写,绝大多数的人水平远远达不到编写标准库大牛的水平。
这一章我便把memcpy这个标准库函数拉出来,阐述这个观点。
如果观众老爷们自己动手写一个memcpy函数,会是什么样的呢?
首先我们来看下memcpy的描述
void *memcpy(void *dest, const void *src, size_t n);
DESCRIPTION
The memcpy() function copies n bytes from memory area src to memory area dest. The memory areas must not overlap. Use memmove(3)
if the memory areas do overlap.
RETURN VALUE
The memcpy() function returns a pointer to dest.
一般C初学者会写成下面这个版本:
/*
 * memcpy - naive byte-by-byte copy (the "beginner" version).
 *
 * Copies n bytes from src to dest and returns the original dest pointer.
 * The two areas must not overlap. Copying zero bytes is a no-op.
 *
 * Note: when a function named memcpy is compiled with an optimising
 * compiler, build it freestanding (e.g. GCC -ffreestanding/-fno-builtin),
 * otherwise the copy loop may be turned back into a call to memcpy itself.
 */
void *memcpy(void *dest, const void *src, size_t n)
{
    typedef unsigned char u8;

    u8 *d = (u8 *)dest;
    const u8 *s = (const u8 *)src;  /* keep const: src is read-only */
    size_t i;                       /* same type as n: no signed/unsigned mix */

    for (i = 0; i < n; i++)
    {
        *d = *s;
        d++;
        s++;
    }
    return dest;
}
C写的比较多之后则是下面这个版本:
/*
 * memcpy - compact byte-by-byte copy.
 *
 * Copies n bytes from src to dest and returns the original dest pointer.
 * The two areas must not overlap.
 *
 * The copy walks two local byte cursors instead of incrementing the
 * void * parameters (arithmetic on void * is a compiler extension), so
 * dest is still intact for the return and no "dest - n" fix-up is needed.
 * That fix-up would be wrong anyway: n has wrapped past zero when the
 * loop ends.
 */
void *memcpy(void *dest, const void *src, size_t n)
{
    typedef unsigned char u8;

    u8 *d = (u8 *)dest;
    const u8 *s = (const u8 *)src;

    while (n-- > 0)
    {
        *d++ = *s++;
    }
    return dest;
}
emmmm,以上都是停留在代码的逻辑上面。
而开始对底层有些了解的人会写出下面这个版本:
/*
 * memcpy - copy using the widest unit that both pointers are aligned to.
 *
 * Both pointers 4-byte aligned -> copy 32-bit words, then the byte tail.
 * Both pointers 2-byte aligned -> copy 16-bit halfwords, then the tail.
 * Otherwise                    -> copy byte by byte.
 *
 * Returns the original dest pointer. The areas must not overlap.
 *
 * Assumes unsigned int / unsigned short are the 32-/16-bit types of the
 * target (sizes are taken with sizeof, so the arithmetic stays consistent
 * if they are not). The wide accesses go through casted pointers, which
 * is the usual low-level idiom but formally relies on the compiler not
 * enforcing strict aliasing here.
 */
void *memcpy(void *dest, const void *src, size_t n)
{
    typedef unsigned char u8;
    typedef unsigned short u16;
    typedef unsigned int u32;

    u8 *d8 = (u8 *)dest;
    const u8 *s8 = (const u8 *)src;
    size_t cnt;

    /* Parenthesise the mask: "==" binds tighter than "&". */
    if ((((size_t)dest & (sizeof(u32) - 1)) == 0) &&
        (((size_t)src & (sizeof(u32) - 1)) == 0))
    {
        u32 *d32 = (u32 *)dest;
        const u32 *s32 = (const u32 *)src;

        cnt = n / sizeof(u32);
        n -= cnt * sizeof(u32);
        while (cnt-- > 0)
        {
            *d32++ = *s32++;
        }
        /* Continue the tail with byte pointers, not by ++ on a word pointer. */
        d8 = (u8 *)d32;
        s8 = (const u8 *)s32;
    }
    else if ((((size_t)dest & (sizeof(u16) - 1)) == 0) &&
             (((size_t)src & (sizeof(u16) - 1)) == 0))
    {
        u16 *d16 = (u16 *)dest;
        const u16 *s16 = (const u16 *)src;

        cnt = n / sizeof(u16);
        n -= cnt * sizeof(u16);
        while (cnt-- > 0)
        {
            *d16++ = *s16++;
        }
        d8 = (u8 *)d16;
        s8 = (const u8 *)s16;
    }

    /* Remaining bytes (everything, when neither alignment test matched). */
    while (n-- > 0)
    {
        *d8++ = *s8++;
    }
    return dest;
}
上述代码利用了“对于密集的数据访问类操作,尽量使用与CPU数据总线位宽相同的局部变量”这一理论:当目标地址和起始地址都按位宽(32位,即4字节)对齐时,每次搬移32位;当目标地址和起始地址都只按位宽的一半(16位,即2字节)对齐时,每次搬移16位;否则每次搬移8位。上述代码还有明显的改进空间:当数据量较大时,可以先把目标地址和起始地址处理到与位宽对齐,再以32位为单位搬移。
/*
 * memcpy - word-at-a-time copy that also handles a misaligned source.
 *
 * Steps:
 *   1. Fewer than 4 bytes: byte copy only.
 *   2. Byte-copy until dest is 4-byte aligned (at most 3 bytes).
 *   3. If src is now 4-byte aligned too, copy whole 32-bit words.
 *      Otherwise round src down to a word boundary, read aligned words,
 *      and build every destination word from two neighbouring source
 *      words with a shift/OR (the assembly listing in this article does
 *      the same in its _memcpy_src1/2/3_loop variants; here the shift
 *      amount is a variable, so one loop covers all three cases).
 *   4. Byte-copy the remaining 0..3 bytes.
 *
 * Returns the original dest pointer. The areas must not overlap.
 *
 * Assumptions:
 *   - u32 (unsigned int) is exactly 4 bytes; checked at compile time.
 *   - Little-endian byte order: ">>" drops the lowest-addressed bytes.
 *   - In the misaligned-source path the aligned word loads may touch up
 *     to 3 bytes just outside [src, src + n). They never leave an aligned
 *     word that also contains a valid source byte.
 *   - Wide accesses go through casted pointers (no strict-aliasing
 *     enforcement assumed).
 */
void *memcpy(void *dest, const void *src, size_t n)
{
    typedef unsigned char u8;
    typedef unsigned int u32;
    typedef char u32_must_be_4_bytes[(sizeof(u32) == 4) ? 1 : -1];

    u8 *d8 = (u8 *)dest;
    const u8 *s8 = (const u8 *)src;

    (void)sizeof(u32_must_be_4_bytes);

    if (n >= 4)
    {
        /* Step 2: bring dest up to a 4-byte boundary. n >= 4 covers the 3 bytes. */
        while (((size_t)d8 & 0x03) != 0)
        {
            *d8++ = *s8++;
            n--;
        }

        if (n >= 4)
        {
            /* Step 3: dest is aligned here; look at where src ended up. */
            size_t align = (size_t)s8 & 0x03;
            u32 *d32 = (u32 *)d8;
            const u32 *s32 = (const u32 *)(s8 - align);

            if (align == 0)
            {
                while (n >= 4)
                {
                    *d32++ = *s32++;
                    n -= 4;
                }
            }
            else
            {
                unsigned int rshift = (unsigned int)(8 * align); /* 8, 16 or 24 */
                unsigned int lshift = 32 - rshift;               /* 24, 16 or 8 */
                u32 cur = *s32;

                while (n >= 4)
                {
                    u32 low = cur >> rshift;     /* tail of the current word */
                    cur = *(++s32);              /* next aligned source word */
                    *d32++ = low | (cur << lshift);
                    n -= 4;
                }
            }

            /* s32 is word aligned; add the offset back to get the real src. */
            d8 = (u8 *)d32;
            s8 = (const u8 *)s32 + align;
        }
    }

    /* Step 4: the last 0..3 bytes (or everything when n < 4). */
    while (n > 0)
    {
        *d8++ = *s8++;
        n--;
    }
    return dest;
}
上述流程为:先把dest开头未对齐到32位边界的部分按字节赋值;然后把src地址向下取整到32位边界,每次取相邻两个32位字,通过移位和或操作合成一个32位字的最终值,写入dest;最后按字节拷贝剩余不足4字节的部分。差不多就是这个意思,感兴趣的可以钻研下面的标准库汇编:
// Library memcpy listing, ARM 32-bit, conditional-execution (pre-UAL) mnemonics.
// Registers as used by the code below:
//   r0 = destination cursor (advanced by every post-indexed store)
//   r1 = source cursor      (advanced by every post-indexed load)
//   r2 = remaining byte count; on the paths that pre-subtract it may be
//        negative, but its low bits still equal the remaining count
//   r3, r12 = scratch; r4, r14 = extra scratch in the word-aligned path
//             (saved/restored on the stack there)
memcpy
__rt_memcpy // entry: r0 = dest, r1 = src, r2 = size
cmp r2,#3 // compare size with 3
bls _memcpy_lastbytes // size <= 3 (unsigned): go straight to the byte tail
ands r12,r0,#3 // r12 = dest & 3; Z set when dest is word aligned
beq _memcpy_dest_aligned
ldrb r3,[r1],#1 // dest misaligned: 1st byte is always copied
cmp r12,#2 // flags select how many more bytes are copied
add r2,r2,r12 // with the "sub #4" below: size -= (4 - (dest & 3))
ldrlsb r12,[r1],#1 // (dest & 3) <= 2: load a 2nd byte (r12 is reused as data)
strb r3,[r0],#1 // store 1st byte
ldrccb r3,[r1],#1 // (dest & 3) < 2: load a 3rd byte
sub r2,r2,#4
strlsb r12,[r0],#1 // store 2nd byte, same condition as its load
strccb r3,[r0],#1 // store 3rd byte, same condition as its load
_memcpy_dest_aligned
ands r3,r1,#3 // r3 = src & 3
beq __rt_memcpy_w // src word aligned as well: plain word copy
subs r2,r2,#4 // pre-subtract one word; carry clear = fewer than 4 bytes left
bcc _memcpy_lastbytes // low two bits of r2 are unchanged by the -4
ldr r12,[r1,-r3]! // r1 -= r3 (round down to a word boundary), load that word
cmp r3,#2
beq _memcpy_src2_loop // src & 3 == 2
bhi _memcpy_src3_loop // src & 3 == 3
_memcpy_src1_loop // src & 3 == 1
mov r3,r12,lsr #8 // bits 31:8 of the current word -> bits 23:0
ldr r12,[r1,#4]! // r1 += 4, load the next aligned word
subs r2,r2,#4 // one more word accounted for; carry feeds the bcs below
orr r3,r3,r12,lsl #24 // bits 7:0 of the next word -> bits 31:24
str r3,[r0],#4 // store the merged word
bcs _memcpy_src1_loop // repeat while the subs did not borrow
add r1,r1,#1 // undo the round-down: r1 points at the next unread byte
b _memcpy_lastbytes
_memcpy_src2_loop // same scheme with a 16/16 bit split
mov r3,r12,lsr #16
ldr r12,[r1,#4]!
subs r2,r2,#4
orr r3,r3,r12,lsl #16
str r3,[r0],#4
bcs _memcpy_src2_loop
add r1,r1,#2 // undo the round-down
b _memcpy_lastbytes
_memcpy_src3_loop // same scheme with an 8/24 bit split
mov r3,r12,lsr #24
ldr r12,[r1,#4]!
subs r2,r2,#4
orr r3,r3,r12,lsl #8
str r3,[r0],#4
bcs _memcpy_src3_loop
add r1,r1,#3 // undo the round-down
b _memcpy_lastbytes
__rt_memcpy_w // both pointers word aligned
stmfd r13!,{r4,r14} // push r4 and r14 so they can be used as data registers
subs r2,r2,#0x20 // pre-subtract 32
bcc _memcpy_small // fewer than 32 bytes left
_memcpy_aligned_loop // 32 bytes per iteration, as two 16-byte load/store pairs
ldmcsia r1!,{r3,r4,r12,r14}
stmcsia r0!,{r3,r4,r12,r14}
ldmcsia r1!,{r3,r4,r12,r14}
stmcsia r0!,{r3,r4,r12,r14}
subcss r2,r2,#0x20 // subtract another 32 and update the carry
bcs _memcpy_aligned_loop // repeat while the subtraction did not borrow
_memcpy_small // remaining count is in the low 5 bits of r2
movs r12,r2,lsl #28 // C = bit 4 of r2 (16 bytes), N = bit 3 (8 bytes)
ldmcsia r1!,{r3,r4,r12,r14} // bit 4 set: copy 16 bytes
stmcsia r0!,{r3,r4,r12,r14}
ldmmiia r1!,{r3,r4} // bit 3 set: copy 8 bytes
stmmiia r0!,{r3,r4}
ldmfd r13!,{r4,r14} // pop r4 and r14 (the return address)
movs r12,r2,lsl #30 // C = bit 2 of r2 (4 bytes), Z = bits 1:0 both zero
ldrcs r3,[r1],#4 // bit 2 set: copy one word
strcs r3,[r0],#4
moveq pc,r14 // no bytes left: return
_memcpy_lastbytes // copy the final 0..3 bytes, selected by bits 1:0 of r2
movs r2,r2,lsl #31 // r2 <<= 31, setting flags: N = old bit 0, C = old bit 1
ldrmib r2,[r1],#1 // bit 0 set: load one byte, then r1 += 1
ldrcsb r3,[r1],#1 // bit 1 set: load a byte
ldrcsb r12,[r1],#1 // bit 1 set: load a second byte
strmib r2,[r0],#1 // store the bit-0 byte
strcsb r3,[r0],#1 // store the first bit-1 byte
strcsb r12,[r0],#1 // store the second bit-1 byte
mov pc,r14 // return to caller. NOTE(review): r0 was advanced by the stores above, so this listing does not leave the original dest in r0 -- confirm whether a wrapper restores it