本文尝试自己实现内存拷贝函数,对各种大小的拷贝进行性能测试,并与Linux系统C库提供的memcpy进行对比(memcpy是C库函数,并非系统调用),旨在深入理解内存访问方式,并对内存访问在程序执行时间中所占的比重有一定的认识。
测试环境为:
* 64位linux
* Intel(R) Xeon(R) 8 Core 2GHZ
* cache大小8192KB,cache对齐字节数64字节,一次缓存分组数:8
* gcc-4.1.2,编译参数-O2
几种实现方式:
1. 无任何优化
/*
 * Baseline copy routine: moves one byte per iteration, no unrolling and
 * no alignment handling.
 *
 * dest - destination buffer (must hold at least len bytes)
 * src  - source buffer (must hold at least len bytes)
 * len  - number of bytes to copy; 0 copies nothing
 *
 * Returns dest. Bytes are copied in ascending address order and there is
 * no special handling for overlapping regions.
 */
void* mymemcpy_naive(void* dest, const void* src, size_t len)
{
    char *out = (char *)dest;
    const char *in = (const char *)src;
    size_t i;

    for (i = 0; i < len; ++i) {
        out[i] = in[i];
    }
    return dest;
}
2. 以64/32/16/8字节单位进行传输
/*
 * Copy len bytes from src to dest, moving data in progressively smaller
 * chunks (64, 32, 16, 8, 4 bytes) and finishing with single bytes.
 *
 * dest - destination buffer (must hold at least len bytes)
 * src  - source buffer (must hold at least len bytes)
 * len  - number of bytes to copy
 *
 * Returns dest.
 *
 * The COPYn macros are defined elsewhere. COPY64 loops while len >= 64,
 * advances destc/srcc and decrements the local `len`; the smaller
 * variants are presumably analogous (their definitions are not shown).
 * Only the source pointer is aligned by the prologue; the destination
 * may still be unaligned.
 */
void* mymemcpy_64(void* dest, const void* src, size_t len)
{
    char* destc = (char*)dest;
    const char* srcc = (const char*)src;

    // Copy single bytes until the source pointer is 64-byte aligned.
    // The mask test needs its own parentheses: `!=` binds tighter than `&`,
    // so `x & 0x3F != 0` would evaluate as `x & 1` and only align to 2 bytes.
    while ((((size_t)srcc) & 0x3F) != 0 && len > 0) {
        *destc++ = *srcc++;
        --len;
    }

    // Bulk copy, largest chunk size first.
    COPY64(destc, srcc);
    COPY32(destc, srcc);
    COPY16(destc, srcc);
    COPY8(destc, srcc);
    COPY4(destc, srcc);

    // Remaining tail bytes.
    while (len-- > 0) {
        *destc++ = *srcc++;
    }
    return dest;
}
其中COPY64宏定义如下:
/*
 * COPY64(destc, srcc): copy whole 64-byte chunks while at least 64 bytes
 * remain.
 *
 * destc - modifiable char* lvalue; advanced by 64 per chunk copied
 * srcc  - modifiable const char* lvalue; advanced by 64 per chunk copied
 *
 * Implicitly uses a variable named `len` from the expansion site: it is
 * the remaining byte count and is decremented by 64 per chunk.
 *
 * Each chunk is read into eight temporaries first and only then written,
 * so all loads of a chunk are issued before its stores.
 *
 * Assumes unsigned long is 8 bytes wide (true on LP64 targets such as
 * 64-bit Linux) and that the platform tolerates unaligned word accesses.
 *
 * Arguments are parenthesized and the body is wrapped in do { } while (0)
 * so the macro behaves like a single statement, including in if/else.
 */
#define COPY64(destc, srcc) \
do { \
    while (len >= 64) { \
        const unsigned long q1 = *(const unsigned long *)(srcc); \
        const unsigned long q2 = *(const unsigned long *)((srcc) + 8); \
        const unsigned long q3 = *(const unsigned long *)((srcc) + 16); \
        const unsigned long q4 = *(const unsigned long *)((srcc) + 24); \
        const unsigned long q5 = *(const unsigned long *)((srcc) + 32); \
        const unsigned long q6 = *(const unsigned long *)((srcc) + 40); \
        const unsigned long q7 = *(const unsigned long *)((srcc) + 48); \
        const unsigned long q8 = *(const unsigned long *)((srcc) + 56); \
        *(unsigned long *)(destc) = q1; \
        *(unsigned long *)((destc) + 8) = q2; \
        *(unsigned long *)((destc) + 16) = q3; \
        *(unsigned long *)((destc) + 24) = q4; \
        *(unsigned long *)((destc) + 32) = q5; \
        *(unsigned long *)((destc) + 40) = q6; \
        *(unsigned long *)((destc) + 48) = q7; \
        *(unsigned long *)((destc) + 56) = q8; \
        (srcc) += 64; \
        (destc) += 64; \
        len -= 64; \
    } \
} while (0)
COPY32/COPY16/COPY8等类似COPY64定义。此处略。
3. 在传输中间加上memory barrier。代码同上,只是COPY64/COPY32/COPY16等换为COPY64B/COPY32B/COPY16B。其中COPY64B定义如下:
/*
 * COPY64B(destc, srcc): copy whole 64-byte chunks while at least 64 bytes
 * remain, with __memory_barrier() between the loads and the stores of
 * each chunk.
 *
 * destc - modifiable char* lvalue; advanced by 64 per chunk copied
 * srcc  - modifiable const char* lvalue; advanced by 64 per chunk copied
 *
 * Implicitly uses a variable named `len` from the expansion site: it is
 * the remaining byte count and is decremented by 64 per chunk.
 * __memory_barrier() must be defined at the expansion site.
 *
 * Assumes unsigned long is 8 bytes wide (true on LP64 targets such as
 * 64-bit Linux) and that the platform tolerates unaligned word accesses.
 *
 * Arguments are parenthesized and the body is wrapped in do { } while (0)
 * so the macro behaves like a single statement, including in if/else.
 */
#define COPY64B(destc, srcc) \
do { \
    while (len >= 64) { \
        const unsigned long q1 = *(const unsigned long *)(srcc); \
        const unsigned long q2 = *(const unsigned long *)((srcc) + 8); \
        const unsigned long q3 = *(const unsigned long *)((srcc) + 16); \
        const unsigned long q4 = *(const unsigned long *)((srcc) + 24); \
        const unsigned long q5 = *(const unsigned long *)((srcc) + 32); \
        const unsigned long q6 = *(const unsigned long *)((srcc) + 40); \
        const unsigned long q7 = *(const unsigned long *)((srcc) + 48); \
        const unsigned long q8 = *(const unsigned long *)((srcc) + 56); \
        __memory_barrier(); \
        *(unsigned long *)(destc) = q1; \
        *(unsigned long *)((destc) + 8) = q2; \
        *(unsigned long *)((destc) + 16) = q3; \
        *(unsigned long *)((destc) + 24) = q4; \
        *(unsigned long *)((destc) + 32) = q5; \
        *(unsigned long *)((destc) + 40) = q6; \
        *(unsigned long *)((destc) + 48) = q7; \
        *(unsigned long *)((destc) + 56) = q8; \
        (srcc) += 64; \
        (destc) += 64; \
        len -= 64; \
    } \
} while (0)
__memory_barrier()定义如下:
/*
 * Compiler-level memory barrier: an empty asm statement with a "memory"
 * clobber. It emits no instruction; it only tells the compiler that
 * memory may have changed, so memory accesses are not moved across it.
 * Spelled with __asm__/__volatile__ so it also compiles in strict ISO
 * modes (-std=c99, -std=c11), where the plain `asm` keyword is disabled.
 */
#define __memory_barrier() __asm__ __volatile__("" ::: "memory")
注:
* 加上memory barrier有什么效果?
实测-O2优化情况下,COPY64宏展开之后的循环体内部,会编译为以下代码:
mov 0x8(%r11),%rdx
mov 0x10(%r11),%rcx
sub $0x40,%rbp
mov 0x18(%r11),%rsi
mov 0x20(%r11),%rdi
mov 0x28(%r11),%r8
mov 0x30(%r11),%r9
mov 0x38(%r11),%r10
mov (%r11),%rax
add $0x40,%r11
mov %rdx,0x8(%rbx)
mov %rcx,0x10(%rbx)
mov %rsi,0x18(%rbx)
mov %rdi,0x20(%rbx)
mov %rax,(%rbx)
mov %r8,0x28(%rbx)
mov %r9,0x30(%rbx)
mov %r10,0x38(%rbx)
关于在mov指令之间夹杂的无关运算指令sub $0x40,%rbp和add $0x40,%r11,可以参考编译器为隐藏内存访问延时而调整指令顺序的优化资料:从读内存指令发出,到数据实际被取到寄存器、该寄存器可用,中间需要等待一定的时间,在现代CPU和内存上,这一延时在100-200ns之间;在此期间可以执行其它不依赖该数据的指令,否则只能等待内存读取完成。
关于mov (%r11),%rax这一句:编译器没有按照实际读取顺序从src+0开始访问,估计跟gcc优化中关于打乱指令顺序以增强寄存器耦合性有关。但是这一举动可能造成cache失效。加上memory barrier之后,编译的结果就是按顺序访问内存了。具体有没有影响请参见后面的测试数据。
4. 不使用中间变量直接赋值。代码同上,只是将COPY64B/COPY32B/COPY16B/COPY8B换成COPY64D/COPY32D/COPY16D/COPY8D,其中COPY64D定义如下:
/*
 * COPY64D(destc, srcc): copy whole 64-byte chunks while at least 64 bytes
 * remain, assigning each 8-byte word directly from source to destination
 * without named temporaries (loads and stores are interleaved in the
 * source text).
 *
 * destc - modifiable char* lvalue; advanced by 64 per chunk copied
 * srcc  - modifiable const char* lvalue; advanced by 64 per chunk copied
 *
 * Implicitly uses a variable named `len` from the expansion site: it is
 * the remaining byte count and is decremented by 64 per chunk.
 *
 * Assumes unsigned long is 8 bytes wide (true on LP64 targets such as
 * 64-bit Linux) and that the platform tolerates unaligned word accesses.
 *
 * Arguments are parenthesized and the body is wrapped in do { } while (0)
 * so the macro behaves like a single statement, including in if/else.
 */
#define COPY64D(destc, srcc) \
do { \
    while (len >= 64) { \
        *(unsigned long *)(destc) = *(const unsigned long *)(srcc); \
        *(unsigned long *)((destc) + 8) = *(const unsigned long *)((srcc) + 8); \
        *(unsigned long *)((destc) + 16) = *(const unsigned long *)((srcc) + 16); \
        *(unsigned long *)((destc) + 24) = *(const unsigned long *)((srcc) + 24); \
        *(unsigned long *)((destc) + 32) = *(const unsigned long *)((srcc) + 32); \
        *(unsigned long *)((destc) + 40) = *(const unsigned long *)((srcc) + 40); \
        *(unsigned long *)((destc) + 48) = *(const unsigned long *)((srcc) + 48); \
        *(unsigned long *)((destc) + 56) = *(const unsigned long *)((srcc) + 56); \
        (srcc) += 64; \
        (destc) += 64; \
        len -= 64; \
    } \
} while (0)
实测编译器并不能生成x86上的repz movsq之类的代码,因此这种方式展开之后也是需要借助中间寄存器的。但是区别是读和写交错进行了。具体效果请见后文数据。
函数名定义如下:
mymemcpy_ 传输字节 _ (d=直接传输 b=加memory barrier)
表头:一次性拷贝的字节数 memcpy(dest, src, N)中的N
数据:每秒钟拷贝字节数(MB/s) 这里MB是Mega BYTE
函数名 | 8 | 16 | 25 | 32 | 50 | 64 | 100 | 200 | 256 | 500 | 1000 | 1024 | 4096 | 16384 | 512000 | 1000000 |
sys_memcpy | 871.06 | 1755.35 | 3361.04 | 4028.17 | 3885.78 | 6162.04 | 6724.57 | 10101.9 | 12263.3 | 13648.7 | 14932.5 | 15035.8 | 7501.12 | 7889.67 | 3903.07 | 3465.7 |
mymemcpy_naive | 690.748 | 817.927 | 782.5 | 824.169 | 893.4 | 916.579 | 897.421 | 945.511 | 962.502 | 985.278 | 996.581 | 997.625 | 963.121 | 1003.89 | 1004.52 | 982.129 |
mymemcpy_8 | 1093.51 | 1776.38 | 2620.14 | 2898.56 | 3431.27 | 3960.09 | 5772.17 | 4684.11 | 6080.54 | 7483.41 | 5143.5 | 4991.39 | 5298.27 | 5359.06 | 5146.09 | 5234.19 |
mymemcpy_8d | 1076.36 | 1684.14 | 2294.36 | 2444.5 | 2730.97 | 3009.11 | 3940.89 | 4293.32 | 4495.53 | 4977.22 | 4923.46 | 4903.33 | 5265.59 | 5349.46 | 5137.51 | 5230.2 |
mymemcpy_16 | 1242.03 | 1984.16 | 2501.14 | 3342.26 | 4139.8 | 5070.09 | 5689.43 | 8239.53 | 7837.96 | 8841.32 | 10146.2 | 9234.83 | 8045.63 | 8053.07 | 5710.49 | 6204.91 |
mymemcpy_16b | 1242.68 | 1984.72 | 2800.19 | 3340.88 | 4149.63 | 5036.35 | 5685.04 | 8234.77 | 7721.36 | 8770.93 | 10139.4 | 9164.73 | 8047.52 | 8053.32 | 5625.9 | 4903.46 |
mymemcpy_16d | 1242.86 | 2018.99 | 3154.52 | 3286.88 | 3975.27 | 4969.42 | 5771.7 | 6985.4 | 7717.84 | 8761.11 | 7201.82 | 9201.3 | 8040.79 | 8055.22 | 5666.88 | 6200.88 |
mymemcpy_32 | 897.28 | 2308.46 | 2651.23 | 3231.68 | 4719.01 | 5385.69 | 7347.54 | 8392.39 | 9747.96 | 13096.5 | 13617.4 | 10656.8 | 10582.6 | 10573.9 | 6189.46 | 6284.59 |
mymemcpy_32b | 897.659 | 2306.73 | 2657.88 | 3229.03 | 4699.63 | 5387.46 | 7345.72 | 8389.51 | 9758.69 | 13125.7 | 13652 | 13868 | 10623 | 10585.1 | 6131.78 | 6252.79 |
mymemcpy_32d | 1000.06 | 2486 | 2966.07 | 3523.38 | 4925.87 | 6062.82 | 8417.91 | 10896.9 | 12216.2 | 13648.2 | 11190.6 | 14841.4 | 8045.32 | 8053.23 | 5710.89 | 6216.18 |
mymemcpy_64 | 863.286 | 1794.37 | 2295.35 | 3400.56 | 3882.92 | 6155.23 | 7481.61 | 11199.2 | 12606.5 | 13463.7 | 14755 | 14677.9 | 12453 | 10880.5 | 3705.59 | 3818.43 |
mymemcpy_64b | 897.59 | 1794.84 | 2294.51 | 3395.98 | 4040.51 | 6155.34 | 7471 | 11195.1 | 12610.5 | 13645.9 | 14828 | 14471.4 | 12513 | 12262 | 6199.8 | 6226.03 |
mymemcpy_64d | 1002 | 1583.03 | 2509.21 | 3800.17 | 4811.31 | 6250.39 | 8782.94 | 12382.3 | 14160.7 | 14873.9 | 15363.8 | 15538.1 | 8048.33 | 8054.37 | 5688.67 | 6213.36 |
结论:
* 不加优化的memcpy仅在拷贝数据较小时速度和其它函数差不多
* 数据量在cache可以完全容纳的大小之内的情况下,传输速度可以超过内存访问速度的理论最大值(本机内存访问速度理论值为8GB/s)
* 如果不考虑通用性,较大或较小内存的传输可以考虑用自制函数替代系统memory copy
* 指令的乱序可能对内存访问的优化造成一定影响