偶然看到一个说法:小内存的拷贝,使用等号直接赋值比memcpy快得多。结合自己搜集到的资料,整理成此文。
事实:strcpy等函数是逐字节拷贝的,而memcpy是按照机器字长逐字进行拷贝的,一个字等于4(32位机)或8(64位机)个字节。CPU存取一个字节和存取一个字一样,都是在一条指令、一个内存周期内完成的。显然,按字拷贝效率更高。
先给出一个程序:
#include <stdio.h>
#define TESTSIZE 128
/*
 * Wrapper around a TESTSIZE-byte buffer. Arrays cannot be assigned with `=`
 * in C, but structs can, so wrapping the buffer lets the whole block be
 * copied with a single assignment.
 */
struct node {
char buf[TESTSIZE];
};
/*
 * Minimal demo: copy src into dst with one struct assignment so that the
 * machine code generated for the assignment can be inspected with objdump.
 */
int main()
{
char src[TESTSIZE] = {0};
char dst[TESTSIZE];
/* View both char arrays as struct node and copy all TESTSIZE bytes with `=`. */
*(struct node*)dst = *(struct node*)src;
}
编译:gcc -g -o test test.c
获得汇编:objdump -S test
可以看到有这么一些汇编,对应的是等号赋值操作:
*(struct node*)dst = *(struct node*)src;
4004b6: 48 8d 85 00 ff ff ff lea 0xffffffffffffff00(%rbp),%rax
4004bd: 48 8d 55 80 lea 0xffffffffffffff80(%rbp),%rdx
4004c1: 48 8b 0a mov (%rdx),%rcx
4004c4: 48 89 08 mov %rcx,(%rax)
4004c7: 48 8b 4a 08 mov 0x8(%rdx),%rcx
4004cb: 48 89 48 08 mov %rcx,0x8(%rax)
4004cf: 48 8b 4a 10 mov 0x10(%rdx),%rcx
4004d3: 48 89 48 10 mov %rcx,0x10(%rax)
4004d7: 48 8b 4a 18 mov 0x18(%rdx),%rcx
4004db: 48 89 48 18 mov %rcx,0x18(%rax)
4004df: 48 8b 4a 20 mov 0x20(%rdx),%rcx
4004e3: 48 89 48 20 mov %rcx,0x20(%rax)
4004e7: 48 8b 4a 28 mov 0x28(%rdx),%rcx
4004eb: 48 89 48 28 mov %rcx,0x28(%rax)
4004ef: 48 8b 4a 30 mov 0x30(%rdx),%rcx
4004f3: 48 89 48 30 mov %rcx,0x30(%rax)
4004f7: 48 8b 4a 38 mov 0x38(%rdx),%rcx
4004fb: 48 89 48 38 mov %rcx,0x38(%rax)
4004ff: 48 8b 4a 40 mov 0x40(%rdx),%rcx
400503: 48 89 48 40 mov %rcx,0x40(%rax)
400507: 48 8b 4a 48 mov 0x48(%rdx),%rcx
40050b: 48 89 48 48 mov %rcx,0x48(%rax)
40050f: 48 8b 4a 50 mov 0x50(%rdx),%rcx
400513: 48 89 48 50 mov %rcx,0x50(%rax)
400517: 48 8b 4a 58 mov 0x58(%rdx),%rcx
40051b: 48 89 48 58 mov %rcx,0x58(%rax)
40051f: 48 8b 4a 60 mov 0x60(%rdx),%rcx
400523: 48 89 48 60 mov %rcx,0x60(%rax)
400527: 48 8b 4a 68 mov 0x68(%rdx),%rcx
40052b: 48 89 48 68 mov %rcx,0x68(%rax)
40052f: 48 8b 4a 70 mov 0x70(%rdx),%rcx
400533: 48 89 48 70 mov %rcx,0x70(%rax)
400537: 48 8b 52 78 mov 0x78(%rdx),%rdx
40053b: 48 89 50 78 mov %rdx,0x78(%rax)
获得libc的memcpy汇编代码:objdump -S /lib/libc.so.6
00973a30 <memcpy>:
973a30: 8b 4c 24 0c mov 0xc(%esp),%ecx
973a34: 89 f8 mov %edi,%eax
973a36: 8b 7c 24 04 mov 0x4(%esp),%edi
973a3a: 89 f2 mov %esi,%edx
973a3c: 8b 74 24 08 mov 0x8(%esp),%esi
973a40: fc cld
973a41: d1 e9 shr %ecx
973a43: 73 01 jae 973a46 <memcpy+0x16>
973a45: a4 movsb %ds:(%esi),%es:(%edi)
973a46: d1 e9 shr %ecx
973a48: 73 02 jae 973a4c <memcpy+0x1c>
973a4a: 66 a5 movsw %ds:(%esi),%es:(%edi)
973a4c: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
973a4e: 89 c7 mov %eax,%edi
973a50: 89 d6 mov %edx,%esi
973a52: 8b 44 24 04 mov 0x4(%esp),%eax
973a56: c3 ret
973a57: 90 nop
对比两段汇编可以看到:两者都是按字拷贝,但“等号赋值”被编译器翻译成了一连串的MOV指令,而memcpy则是一个循环(rep movsl)。在循环方式下,每一次MOV过后,需要:1、判断是否拷贝完成;2、跳转以便继续拷贝。
综上所述,“等号赋值”之所以比memcpy快,就是因为它省略了CPU对于判断与跳转的处理,消除了分支对CPU流水线的影响。而这一切都是通过适度展开内存拷贝的循环来实现的。下面写一个基于结构体赋值的mymemcpy,与库函数memcpy做对比测试:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/time.h>
#define LEN 0x20000
#define MYM 1
#define LIBM 0
char *dst;
char *src;
/*
 * Copy unit used by mymemcpy(): one struct assignment moves
 * sizeof(DATA_SIZE) bytes (16 ints) at a time.
 */
typedef struct memcpy_data_size
{
    int a[16];
}DATA_SIZE, *P_DATA_SIZE;

/*
 * mymemcpy - copy `size` bytes from `from` to `to` using struct assignment.
 *
 * All whole DATA_SIZE-sized blocks are copied with `=`; the tail that does
 * not fill a block is then copied byte by byte.
 *
 * Preconditions:
 *   - the two regions must not overlap;
 *   - both pointers are dereferenced as DATA_SIZE, so they must be suitably
 *     aligned for it (memory returned by malloc() is).
 *
 * Returns `to`.
 */
void *mymemcpy(void *to, const void *from, size_t size)
{
    P_DATA_SIZE dst = (P_DATA_SIZE)to;
    const DATA_SIZE *src = (const DATA_SIZE *)from;
    /*
     * Number of whole blocks and of trailing bytes. Both are kept as size_t
     * and without any "- 1" adjustment so that every whole block, including
     * the last one, is copied and large sizes are not truncated.
     */
    size_t new_len = size / sizeof(DATA_SIZE);
    size_t remain = size % sizeof(DATA_SIZE);
    unsigned char *tail_dst;
    const unsigned char *tail_src;
    size_t i;

#if 0
    /* Alternative: block loop unrolled by two. Disabled; kept for reference. */
    while (new_len >= 2)
    {
        *dst++ = *src++;
        *dst++ = *src++;
        new_len = new_len - 2;
    }
    if (new_len == 1)
    {
        *dst++ = *src++;
    }
#else
    while (new_len > 0)
    {
        *dst++ = *src++;
        new_len--;
    }
#endif

    /* dst/src now point just past the last whole block: copy the tail. */
    tail_dst = (unsigned char *)dst;
    tail_src = (const unsigned char *)src;
    for (i = 0; i < remain; i++)
    {
        tail_dst[i] = tail_src[i];
    }
    return to;
}
/* Print the command-line help and terminate with a failure status. */
static void usage(void)
{
    printf("you should run it as : ./run 1(or 0)\n");
    printf("1: run my memcpy\n");
    printf("0: run lib memcpy\n");
    exit(1);
}

/*
 * Benchmark driver: copies LEN bytes from src to dst once, using either
 * mymemcpy() (argument "1") or the library memcpy() (argument "0"), and
 * prints the elapsed wall-clock time of the copy in microseconds.
 *
 * Returns 0 on success; exits with status 1 on a usage error or when an
 * allocation fails.
 */
int main(int argc, char const* argv[])
{
    long type;
    char *parse_end;
    struct timeval start, end;
    long diff;

    if (argc != 2) {
        usage();
    }

    /* strtol instead of atoi so that non-numeric input is rejected
     * rather than silently treated as 0. */
    type = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' ||
        (MYM != type && LIBM != type)) {
        usage();
    }

    dst = malloc(sizeof(char)*LEN);
    if (NULL == dst) {
        perror("dst malloc");
        exit(1);
    }
    src = malloc(sizeof(char)*LEN);
    if (NULL == src) {
        perror("src malloc");
        free(dst);
        exit(1);
    }

    /*
     * Write to both buffers before timing: this gives src defined contents
     * and keeps first-touch costs of freshly allocated memory out of the
     * measured interval.
     */
    memset(src, 0x5a, LEN);
    memset(dst, 0, LEN);

    /* Time the copy only; parsing, allocation and printing are excluded. */
    gettimeofday(&start, NULL);
    if (MYM == type) {
        mymemcpy(dst, src, LEN);
    }
    else {
        memcpy(dst, src, LEN);
    }
    gettimeofday(&end, NULL);

    if (MYM == type) {
        printf("my memcpy:\n");
    }
    else {
        printf("lib memcpy:\n");
    }

    free(dst);
    free(src);

    /* Signed arithmetic: the tv_usec difference may be negative. */
    diff = 1000000L*(end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec);
    printf("run time is %ld us\n",diff);
    return 0;
}
被 #if 0 屏蔽掉的几行代码本来是用来做循环展开的,可测试结果并没发现有什么好处,故先屏蔽掉。测试脚本 run.sh 如下:
#!/bin/sh
# Run the benchmark five times with mymemcpy (argument 1),
# then five times with the library memcpy (argument 0).
for mode in 1 0; do
    for round in 1 2 3 4 5; do
        ./timememcpy "$mode"
    done
done
运行该脚本,得结果如下:
[root@SPA c]# ./run.sh
my memcpy:
run time is 435 us
my memcpy:
run time is 237 us
my memcpy:
run time is 249 us
my memcpy:
run time is 304 us
my memcpy:
run time is 300 us
lib memcpy:
run time is 262 us
lib memcpy:
run time is 222 us
lib memcpy:
run time is 335 us
lib memcpy:
run time is 281 us
lib memcpy:
run time is 247 us
#!/bin/sh
# Same benchmark with the order swapped: five runs of the library
# memcpy (argument 0) first, then five runs of mymemcpy (argument 1).
for mode in 0 1; do
    for round in 1 2 3 4 5; do
        ./timememcpy "$mode"
    done
done
将两组测试的先后顺序对调后再次运行,得结果:
[root@SPA c]# ./run.sh
lib memcpy:
run time is 479 us
lib memcpy:
run time is 461 us
lib memcpy:
run time is 512 us
lib memcpy:
run time is 405 us
lib memcpy:
run time is 365 us
my memcpy:
run time is 399 us
my memcpy:
run time is 314 us
my memcpy:
run time is 309 us
my memcpy:
run time is 510 us
my memcpy:
run time is 324 us