[转]内存拷贝的优化方法

最新推荐文章于 2023-07-29 09:30:00 发布

shines77

最新推荐文章于 2023-07-29 09:30:00 发布

阅读量2.2k

点赞数

分类专栏： VC 文章标签：优化 dst cache null 测试 linux

VC 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

转自：http://flier.cnblogs.com/archive/2004/07/08/22352.html

http://www.blogcn.com/user8/flier_lu/index.html?id=1577440&run=.0999083

让我们回过头来看看P4架构下的Cache结构。

The IA-32 Intel Architecture Software Developer's Manual, Volume 3: System Programming Guide

     Intel的系统变成手册中第十章介绍了IA32架构下的内存缓存控制。因为CPU速度和内存速度的巨大差距，CPU厂商通过在CPU中内置和外置多级缓存提高频繁使用数据的访问速度。一般来说，在CPU和内存之间存在L1, L2和L3三级缓存(还有几种TLB缓存在此不涉及)，每级缓存的速度有一个数量级左右的差别，容量也有较大差别(实际上跟$有关，呵呵)，而L1缓存更是细分为指令缓存和数据缓存，用于不同的目的。就P4和Xeon的处理器来说，L1指令缓存由Trace Cache取代，内置在NetBust微架构中；L1数据缓存和L2缓存则封装在CPU中，根据CPU档次不同，分别在8-16K和256-512K之间；而L3缓存只在Xeon处理器中实现，也是封装在CPU中，512K-1M左右。
     可以通过查看CPU信息的软件如CPUInfo查看当前机器的缓存信息，如我的系统为：
     P4 1.7G, 8K L1 Code Cache, 12K L1 Data Cache, 256K L2 Cache。

而缓存在实现上是若干行(slot or line)组成的，每行对应内存中的一个地址上的连续数据，由高速缓存管理器控制读写中的数据载入和命中。其原理这里不多罗嗦，有兴趣的朋友可以自行查看Intel手册。需要知道的就是每个slot的长度在P4以前是32字节，P4开始改成64字节。而对缓存行的操作都是完整进行的，哪怕只读一个字节也需要将整个缓存行(64字节)全部载入，后面的优化很大程度上基于这些原理。

就缓存的工作模式来说，P4支持的有六种之多，这里就不一一介绍了。对我们优化有影响的，实际上就是写内存时缓存的表现。最常见的WT(Write-through)写通模式在写数据到内存的同时更新数据到缓存中；而WB(Write-back)写回模式，则直接写到缓存中，暂不进行较慢的内存读写。这两种模式在操作频繁操作(每秒百万次这个级别)的内存变量处理上有较大性能差别。例如通过编写驱动模块操作MTRR强行打开WB模式，在Linux的网卡驱动中曾收到不错的效果，但对内存复制的优化帮助不大，因为我们需要的是完全跳过对缓存的操作，无论是缓存定位、载入还是写入。

好在P4提供了MOVNTQ指令，使用WC(Write-combining)模式，跳过缓存直接写内存。因为我们的写内存操作是纯粹的写，写入的数据一定时间内根本不会被使用，无论使用WT还是WB模式，都会有冗余的缓存操作。优化代码如下：

global _fast_memcpy7

% define param esp + 8 + 4

% define src param + 0

% define dst param + 4

% define len param + 8

_fast_memcpy7:

push esi

push edi

mov esi, [src] ; source array

mov edi, [dst] ; destination array

mov ecx, [len] ; number of QWORDS ( 8 bytes) assumes len / CACHEBLOCK is an integer

shr ecx, 3

lea esi, [esi + ecx * 8 ] ; end of source

lea edi, [edi + ecx * 8 ] ; end of destination

neg ecx ; use a negative offset as a combo pointer - and - loop - counter

.copyloop:

movq mm0, qword [esi + ecx * 8 ]

movq mm1, qword [esi + ecx * 8 + 8 ]

movq mm2, qword [esi + ecx * 8 + 16 ]

movq mm3, qword [esi + ecx * 8 + 24 ]

movq mm4, qword [esi + ecx * 8 + 32 ]

movq mm5, qword [esi + ecx * 8 + 40 ]

movq mm6, qword [esi + ecx * 8 + 48 ]

movq mm7, qword [esi + ecx * 8 + 56 ]

movntq qword [edi + ecx * 8 ], mm0

movntq qword [edi + ecx * 8 + 8 ], mm1

movntq qword [edi + ecx * 8 + 16 ], mm2

movntq qword [edi + ecx * 8 + 24 ], mm3

movntq qword [edi + ecx * 8 + 32 ], mm4

movntq qword [edi + ecx * 8 + 40 ], mm5

movntq qword [edi + ecx * 8 + 48 ], mm6

movntq qword [edi + ecx * 8 + 56 ], mm7

add ecx, 8

jnz .copyloop

sfence ; flush write buffer

emms

pop edi

pop esi

ret

写内存的movq指令全部改为movntq指令，并在复制操作完成后，调用sfence刷新写缓存，因为缓存中内容可能已经失效了。这样一来在写内存外的载入缓存操作，以及缓存本身的操作都被省去，大大减少了冗余内存操作。按AMD的说法能有60%的性能提升，我实测也有50%左右明显的性能提升。
movntq和sfence等指令可以参考Intel的指令手册：

The IA-32 Intel Architecture Software Developer's Manual, Volume 2A: Instruction Set Reference, A-M
The IA-32 Intel Architecture Software Developer's Manual, Volume 2B: Instruction Set Reference, N-Z

在优化完写内存后，同样可以通过对读内存的操作进行优化提升性能。虽然CPU在读取数据时，会有一个自动的预读优化，但在操作连续内存区域时显式要求CPU预读数据，还是可以明显地优化性能。

global _fast_memcpy8

% define param esp + 8 + 4

% define src param + 0

% define dst param + 4

% define len param + 8

_fast_memcpy8:

push esi

push edi

mov esi, [src] ; source array

mov edi, [dst] ; destination array

mov ecx, [len] ; number of QWORDS ( 8 bytes) assumes len / CACHEBLOCK is an integer

shr ecx, 3

lea esi, [esi + ecx * 8 ] ; end of source

lea edi, [edi + ecx * 8 ] ; end of destination

neg ecx ; use a negative offset as a combo pointer - and - loop - counter

.writeloop:

prefetchnta [esi + ecx * 8 + 512 ] ; fetch ahead by 512 bytes

movq mm0, qword [esi + ecx * 8 ]

movq mm1, qword [esi + ecx * 8 + 8 ]

movq mm2, qword [esi + ecx * 8 + 16 ]

movq mm3, qword [esi + ecx * 8 + 24 ]

movq mm4, qword [esi + ecx * 8 + 32 ]

movq mm5, qword [esi + ecx * 8 + 40 ]

movq mm6, qword [esi + ecx * 8 + 48 ]

movq mm7, qword [esi + ecx * 8 + 56 ]

movntq qword [edi + ecx * 8 ], mm0

movntq qword [edi + ecx * 8 + 8 ], mm1

movntq qword [edi + ecx * 8 + 16 ], mm2

movntq qword [edi + ecx * 8 + 24 ], mm3

movntq qword [edi + ecx * 8 + 32 ], mm4

movntq qword [edi + ecx * 8 + 40 ], mm5

movntq qword [edi + ecx * 8 + 48 ], mm6

movntq qword [edi + ecx * 8 + 56 ], mm7

add ecx, 8

jnz .writeloop

sfence ; flush write buffer

emms

pop edi

pop esi

ret

增加一个简单的prefetchnta指令，提示CPU在处理当前读取内存操作的同时，预读前面512字节处的一个缓存行64字节内容。这样一来又可以有10%左右的性能提升。
最后，对正在处理的内存，可以通过显式的内存读取操作，强制性要求其载入到缓存中，因为prefetchnta指令还只是一个提示，可以被CPU忽略。这样可以再次获得60%左右的性能提示，我实测没有这么高，但是也比较明显。

global _fast_memcpy9

% define param esp + 12 + 4

% define src param + 0

% define dst param + 4

% define len param + 8

% define CACHEBLOCK 400h

_fast_memcpy9:

push esi

push edi

push ebx

mov esi, [src] ; source array

mov edi, [dst] ; destination array

mov ecx, [len] ; number of QWORDS ( 8 bytes) assumes len / CACHEBLOCK is an integer

shr ecx, 3

lea esi, [esi + ecx * 8 ] ; end of source

lea edi, [edi + ecx * 8 ] ; end of destination

neg ecx ; use a negative offset as a combo pointer - and - loop - counter

.mainloop:

mov eax, CACHEBLOCK / 16 ; note: .prefetchloop is unrolled 2X

add ecx, CACHEBLOCK ; move up to end of block

.prefetchloop:

mov ebx, [esi + ecx * 8 - 64 ] ; read one address in this cache line...

mov ebx, [esi + ecx * 8 - 128 ] ; ... and one in the previous line

sub ecx, 16 ; 16 QWORDS = 2 64 - byte cache lines

dec eax

jnz .prefetchloop

mov eax, CACHEBLOCK / 8

.writeloop:

prefetchnta [esi + ecx * 8 + 512 ] ; fetch ahead by 512 bytes

movq mm0, qword [esi + ecx * 8 ]

movq mm1, qword [esi + ecx * 8 + 8 ]

movq mm2, qword [esi + ecx * 8 + 16 ]

movq mm3, qword [esi + ecx * 8 + 24 ]

movq mm4, qword [esi + ecx * 8 + 32 ]

movq mm5, qword [esi + ecx * 8 + 40 ]

movq mm6, qword [esi + ecx * 8 + 48 ]

movq mm7, qword [esi + ecx * 8 + 56 ]

movntq qword [edi + ecx * 8 ], mm0

movntq qword [edi + ecx * 8 + 8 ], mm1

movntq qword [edi + ecx * 8 + 16 ], mm2

movntq qword [edi + ecx * 8 + 24 ], mm3

movntq qword [edi + ecx * 8 + 32 ], mm4

movntq qword [edi + ecx * 8 + 40 ], mm5

movntq qword [edi + ecx * 8 + 48 ], mm6

movntq qword [edi + ecx * 8 + 56 ], mm7

add ecx, 8

dec eax

jnz .writeloop

or ecx, ecx ; assumes integer number of cacheblocks

jnz .mainloop

sfence ; flush write buffer

emms

pop ebx

pop edi

pop esi

ret

至此，一个完整的内存复制函数的优化流程就结束了，通过对缓存的了解和使用，一次又一次地超越自己，最终获得一个较为令人满意地结果。（号称300%性能提示，实测175%-200%，也算相当不错了）

在编写测试代码的时候需要注意两点：

一是计时精度的问题，需要使用高精度的物理计数器，避免误差。推荐使用rdtsc指令，然后根据CPU主频计算时间。CPU主频可以通过高精度计时器动态计算，我这儿偷懒直接从注册表里面读取了 :P
代码如下：

#ifdef WIN32
typedef __int64 uint64_t;
#else
#include <stdint.h>
#endif

   bool GetPentiumClockEstimateFromRegistry(uint64_t& frequency)
   {
     HKEY hKey;

frequency = 0;

LONG rc = ::RegOpenKeyEx(HKEY_LOCAL_MACHINE, "Hardware/Description/System/CentralProcessor/0", 0, KEY_READ, &hKey);

     if(rc == ERROR_SUCCESS)
     {
       DWORD cbBuffer = sizeof (DWORD);
       DWORD freq_mhz;

rc = ::RegQueryValueEx(hKey, "~MHz", NULL, NULL, (LPBYTE)(&freq_mhz), &cbBuffer);

if (rc == ERROR_SUCCESS)
frequency = freq_mhz * MEGA;

RegCloseKey (hKey);
}

return frequency > 0;
}

   void getTimeStamp(uint64_t& timeStamp)
   {
#ifdef WIN32
     __asm
     {
       push  edx
         push  ecx
         mov   ecx, timeStamp
         //_emit 0Fh // RDTSC
         //_emit 31h
         rdtsc
         mov   [ecx], eax
         mov   [ecx+4], edx
         pop   ecx
         pop   edx
     }
#else
     __asm__ __volatile__ ("rdtsc" : "=A" (timeStamp));
#endif
   }

二是测试内存复制的缓冲区的大小，如果缓冲区过小，第一次拷贝两个缓冲区时就会导致所有数据都被载入L2缓存中，得出比普通内存操作高一个数量级的数值。例如我的L2缓冲为256K，如果我用两个128K的缓冲区对着拷贝，无论循环多少次，速度都在普通内存复制的10倍左右。因此设置一个较大的值是必要的。

shines77

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
[转]内存拷贝的优化方法

转自：http://flier.cnblogs.com/archive/2004/07/08/22352.htmlhttp://www.blogcn.com/user8/flier_lu/index.html?id=1577440&run=.0999083让我们回过头来看看P4架构下的Cache结构。 The IA-32 Intel Architecture Softw
复制链接

扫一扫