xmemcpy改进版:利用movdqu速度快的特点,通过内联和长度常量化来提高小块内存memcpy的性能
xmemcpy来自github /progs/C/c_progs/memcpy.c ,不知道是不是原作者,这里进行了部分改进
------2016-2-28注意1:以下测试的缓冲区由于被反复读取,始终位于L1 cache中(类似于栈内存);如果数据总是位于超出cache的内存中,则受内存速度拖累,改进版与memcpy很难拉开差距,但仍有一定效果
------2016-2-28注意2:DEBUG下速度会很慢,除非关闭/GS或用 #pragma runtime_checks( "s", restore ) (此编译杂注对模板无效)
------2016-3-5 注意3:参看zmemcpy改进版,对debug模式有相当大的提高 http://blog.csdn.net/superzmy/article/details/50810343
预期结果:
All time to memcpy 80 * 100M is 0.248s in 3GHz (xmemcopy)
All time to memcpy 80 * 100M is 0.476s in 3GHz (xmemcpy)
All time to memcpy 80 * 100M is 0.778s in 3GHz (xmemcpy unknownSize)
All time to memcpy 80 * 100M is 0.232s in 3GHz (movdq)
All time to memcpy 80 * 100M is 0.257s in 3GHz (movdq unalign)
All time to memcpy 81 * 100M is 0.298s in 3GHz (movdq)
All time to memcpy 81 * 100M is 0.264s in 3GHz (movdq unalign)
All time to memcpy 400 * 100M is 1.334s in 3GHz (xmemcopy)
All time to memcpy 400 * 100M is 1.236s in 3GHz (xmemcopy unalign)
All time to memcpy 400 * 100M is 1.819s in 3GHz (xmemcpy)
All time to memcpy 400 * 100M is 3.051s in 3GHz (rep movs)
All time to memcpy 400 * 100M is 2.984s in 3GHz (rep movs unalign)
All time to memcpy 400 * 100M is 3.015s in 3GHz (rep movs handwrite asm)
All time to memcpy 401 * 100M is 3.093s in 3GHz (rep movs)
All time to memcpy 401 * 100M is 3.193s in 3GHz (rep movs handwrite asm)
All time to memcpy 80 * 100M is 1.216s in 3GHz (rep movs handwrite asm)
All time to memcpy 4000 * 100M is 15.254s in 3GHz (rep movs handwrite asm)
All time to memcpy 80 * 100M is 1.824s in 3GHz (call _memcpy)
All time to memcpy 81 * 100M is 1.828s in 3GHz (call _memcpy)
All time to memcpy 81 * 100M is 1.779s in 3GHz (call _memcpy unalign)
All time to memcpy 400 * 100M is 2.554s in 3GHz (call _memcpy)
All time to memcpy 401 * 100M is 2.777s in 3GHz (call _memcpy)
All time to memcpy 401 * 100M is 2.725s in 3GHz (call _memcpy unalign)
All time to memcpy 4000 * 100M is 14.379s in 3GHz (call _memcpy)
以上结果由下面的代码得到:VS2013编译,在E3 1230V2上运行
// ConsoleApplication3.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <windows.h>
#include <intrin.h>
#include <assert.h>
// Source buffers for the copy benchmarks. The number in each name is the
// array size in bytes; any bytes not covered by the initializer are zero.

// 80-byte buffer; only the first 36 characters are set explicitly.
char data80[80] = "abcdefghijklmnopqrstuvwxyz0123456789";
// 400-byte buffer; the initializer (10 x 36 + 39 characters plus the
// terminating NUL) fills the array exactly.
char data400[400] =
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"012345678901234567890123456789012345678";
// 4000-byte buffer; only the first 360 characters are set explicitly.
char data4000[4000] =
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
;
// 401-byte buffer (a length that is not a multiple of 4); only the first
// 360 characters are set explicitly.
char data401[401] =
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
;
// 81-byte buffer (a length that is not a multiple of 4); only the first
// 36 characters are set explicitly.
char data81[81] = "abcdefghijklmnopqrstuvwxyz0123456789";
// Small-copy helpers.
//
// Idea: when the byte count is a compile-time constant, assigning one
// fixed-size struct to another lets the compiler emit the copy inline instead
// of calling the memcpy runtime. com::xmemcopy<N>::copy is that fixed-size
// copy; xmemcpy() is the run-time-length front end, which dispatches through a
// table of the fixed-size routines for lengths up to com::_MAXSIZE_ * 10 and
// hands anything longer to ::memcpy.
namespace com
{
    // Largest byte count that has its own dedicated copy routine.
    const static size_t _MAXSIZE_ = 80;

    // Dispatch table indexed by byte count: g_base[n] copies exactly n bytes
    // (0 <= n <= _MAXSIZE_). It is defined and filled further down.
    extern void* (*g_base[_MAXSIZE_ + 1])(void *dest, const void *src);
}

// Run-time-length copy; defined after the table it dispatches through.
inline void *xmemcpy(void *dest, const void *src, size_t len);

namespace com
{
    // Payload type: `size` ints that are copied with one plain assignment.
    template <size_t size>
    struct xmemcpy_t
    {
        int data[size];
    };

    // Payload for "no whole int to copy" (fewer than sizeof(int) bytes).
    template <>
    struct xmemcpy_t<0>
    {
    };

    // Copy of exactly `size` bytes, with `size` fixed at compile time.
    template <size_t size>
    class xmemcopy
    {
    public:
        // Copies `size` bytes from src to dest and returns dest.
        // NOTE(review): the bulk of the data is accessed through an int-based
        // struct regardless of how dest/src are aligned; this presumably relies
        // on the target tolerating unaligned int access (as x86 does) - confirm
        // before building for another architecture.
        inline static void * copy(void *dest, const void *src)
        {
            char *out = static_cast<char *>(dest);
            const char *in = static_cast<const char *>(src);

            if (size > _MAXSIZE_)
            {
                // Oversized request: full _MAXSIZE_ chunks first, then the rest.
                for (size_t chunks = size / _MAXSIZE_; chunks != 0; --chunks)
                {
                    xmemcopy<_MAXSIZE_>::copy(out, in);
                    out += _MAXSIZE_;
                    in += _MAXSIZE_;
                }
                if (size % _MAXSIZE_ != 0)
                    xmemcopy<size % _MAXSIZE_>::copy(out, in);
                return dest;
            }

            // All whole ints move in a single struct assignment ...
            using word_block = xmemcpy_t<((size - 1) % _MAXSIZE_ + 1) / sizeof(int)>;
            *static_cast<word_block *>(dest) = *static_cast<const word_block *>(src);

            // ... and the 0-3 trailing bytes follow one at a time, last byte first.
            for (size_t back = 1; back <= size % sizeof(int); ++back)
                out[size - back] = in[size - back];
            return dest;
        }
    };

    // Zero-byte copy: nothing to move, dest is returned untouched.
    template <>
    class xmemcopy<0>
    {
    public:
        static void * copy(void *dest, const void *src)
        {
            (void)src;
            return dest;
        }
    };

    // Storage for the dispatch table declared above; entries are assigned by
    // init<>() when g_monitor is constructed.
    void* (*g_base[_MAXSIZE_ + 1])(void *dest, const void *src);

    // Stores the fixed-size routines for lengths len, len-1, ..., 0 into g_base
    // (compile-time recursion that ends at the init<0> specialization).
    template <size_t len>
    void init()
    {
        g_base[len] = &xmemcopy<len>::copy;
        init<len - 1>();
    }

    template <>
    void init<0>()
    {
        g_base[0] = &xmemcopy<0>::copy;
    }

    // Its constructor fills the whole table.
    struct xmem_monitor
    {
        xmem_monitor() { init<_MAXSIZE_>(); }
    };

    // File-local instance whose construction populates g_base. Calls to
    // xmemcpy() made before this object has been constructed would go through
    // unset table entries.
    static xmem_monitor g_monitor;
}

// Copies len bytes from src to dest and returns dest.
//   len <= _MAXSIZE_        : one call through the dispatch table
//   len <= _MAXSIZE_ * 10   : _MAXSIZE_-byte chunks, then one table call for
//                             the final 1.._MAXSIZE_ bytes
//   otherwise               : ::memcpy
inline void *xmemcpy(void *dest, const void *src, size_t len)
{
    const size_t step = com::_MAXSIZE_;

    if (len > step * 10)
        return ::memcpy(dest, src, len);
    if (len <= step)
        return com::g_base[len](dest, src);

    char *out = static_cast<char *>(dest);
    const char *in = static_cast<const char *>(src);
    size_t left = len;
    while (left > step)
    {
        com::xmemcopy<com::_MAXSIZE_>::copy(out, in);
        out += step;
        in += step;
        left -= step;
    }
    // Here 1 <= left <= _MAXSIZE_, so the table lookup is in range.
    com::g_base[left](out, in);
    return dest;
}
// Benchmark driver.
//
// After a short self-check of com::xmemcopy, it times Count repetitions of
// several copy strategies (fixed-size xmemcopy, run-time xmemcpy, memcpy with
// constant and with volatile lengths, hand-written "rep movs") for 80/81/400/
// 401/4000-byte sources and prints one line per measurement.
//
// Elapsed time is the difference of two __rdtsc() readings divided by 3e9, i.e.
// the printed seconds assume a 3 GHz time-stamp counter.
//
// In every measurement block the local reference named `dest` is actually the
// SOURCE of the copy; the destination is pb (start of buffer) or pb1 (buffer + 1).
//
// NOTE(review): the printf calls pass size_t values for "%d"; that presumably
// only works because this is a 32-bit build (the inline x86 asm and the
// pointer-to-int casts below point the same way) - confirm before porting.
int _tmain(int argc, _TCHAR* argv[])
{
// Restrict the process to the CPU(s) selected by affinity mask 2 (bit 1 set),
// presumably so that all time-stamp readings come from the same core.
SetProcessAffinityMask(GetCurrentProcess(), 2);
// Zero-initialized destination buffer shared by all tests.
char buffer[10000] = {};
// Self-check: copy 2*_MAXSIZE_, 2*_MAXSIZE_+1 and 400 bytes with the fixed-size
// template and raise a breakpoint (int 3) if the result differs from the source.
com::xmemcopy<com::_MAXSIZE_ * 2>::copy(buffer, data400);
if (memcmp(buffer, data400, com::_MAXSIZE_ * 2))
__asm int 3;
com::xmemcopy<com::_MAXSIZE_ * 2 + 1>::copy(buffer, data400);
if (memcmp(buffer, data400, com::_MAXSIZE_ * 2 + 1))
__asm int 3;
com::xmemcopy<400>::copy(buffer, data400);
if(memcmp(buffer, data400, 400))
__asm int 3;
// Destination pointers: pb is the start of buffer, pb1 is one byte further
// (asserted below to be misaligned). They are volatile, presumably to keep the
// optimizer from hoisting or removing the timed copies.
char* volatile pb = buffer;
char* volatile pb1 = buffer + 1;
// Despite their names these hold 80 and 81 (sizeof data80 / data81); being
// volatile, the lengths are not compile-time constants for the calls below.
size_t volatile size40 = sizeof(data80);
size_t volatile size41 = sizeof(data81);
// Alignment expectations of the benchmark (checked only when assert is active).
assert((int)pb % 4 == 0);
assert((int)pb1 % 4 == 1);
assert((int)data80 % 8 == 0);
assert((int)data400 % 8 == 0);
assert((int)data4000 % 8 == 0);
// Untimed warm-up: touch every source buffer and the destination a few times.
for (int i = 0; i < 10; ++i)
{
memcpy(pb, data80, size40);
memcpy(pb, data81, size41);
memcpy(pb, data400, sizeof(data400));
memcpy(pb, data401, sizeof(data401));
memcpy(pb, data4000, sizeof(data4000));
}
printf("\n");
// Number of repetitions per measurement (printed as Count / 1000000 "M").
enum { Count = 100000000 };
#if(1)
// 80 bytes, fixed-size template copy.
{
auto& dest = data80;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
com::xmemcopy<sizeof(dest)>::copy(pb, dest);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 80 bytes, xmemcpy() called with a constant length.
{
auto& dest = data80;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
xmemcpy(pb, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 80 bytes, xmemcpy() with the length read from a volatile variable.
{
auto& dest = data80;
size_t volatile size = sizeof(dest);
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
xmemcpy(pb, dest, size);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy unknownSize)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 80 bytes, memcpy with a constant length. The "(movdq)" label suggests the
// author expects the compiler to expand this inline - not verifiable from here.
{
auto& dest = data80;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// Same as above, but into the misaligned destination pb1.
{
auto& dest = data80;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb1, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 81 bytes, memcpy with a constant length.
{
auto& dest = data81;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 81 bytes, memcpy with a constant length into the misaligned destination.
{
auto& dest = data81;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb1, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// ---- 400-byte copies with the fixed-size template ----
// Aligned destination.
{
auto& dest = data400;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
com::xmemcopy<sizeof(dest)>::copy(pb, dest);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// Misaligned destination.
{
auto& dest = data400;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
com::xmemcopy<sizeof(dest)>::copy(pb1, dest);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
#endif
// Clear the first 400 destination bytes before the remaining measurements.
memset(pb, 0, 400);
// 400 bytes, xmemcpy() called with a constant length.
{
auto& dest = data400;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
xmemcpy(pb, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 400 bytes, memcpy with a constant length. The "(rep movs)" label suggests
// the author expects the compiler to emit a rep movs sequence - not verifiable
// from here.
{
auto& dest = data400;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// Same as above, but into the misaligned destination pb1.
{
auto& dest = data400;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb1, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 400 bytes, hand-written inline asm: edi = destination, esi = source,
// ecx = size / 4 dwords, then "rep movs" copies dword by dword.
{
auto& dest = data400;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
{
__asm
{
mov edi, dword ptr[pb];
mov ecx, size data400 / 4;
mov esi, dest;
rep movs dword ptr es : [edi], dword ptr[esi];
}
}
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 401 bytes, memcpy with a constant length.
{
auto& dest = data401;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, sizeof(dest));
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 401 bytes, hand-written inline asm: size / 4 dwords via "rep movs", followed
// by a single byte move for the one remaining byte.
{
auto& dest = data401;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
{
__asm
{
mov edi, dword ptr[pb];
mov ecx, size data401 / 4;
mov esi, dest;
rep movs dword ptr es : [edi], dword ptr[esi];
movs byte ptr es : [edi], byte ptr[esi]
}
}
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 80 bytes, hand-written "rep movs" of size / 4 dwords.
{
auto& dest = data80;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
{
__asm
{
mov edi, dword ptr[pb];
mov ecx, size data80 / 4;
mov esi, dest;
rep movs dword ptr es : [edi], dword ptr[esi];
}
}
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// 4000 bytes, hand-written "rep movs" of size / 4 dwords.
{
auto& dest = data4000;
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
{
__asm
{
mov edi, dword ptr[pb];
mov ecx, size data4000 / 4;
mov esi, dest;
rep movs dword ptr es : [edi], dword ptr[esi];
}
}
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
}
// ---- memcpy with the length read from a volatile variable ----
// The "(call _memcpy)" label suggests these are meant to measure a real call
// into the memcpy runtime rather than an inline expansion.
// 80 bytes.
{
auto& dest = data80;
size_t volatile size = sizeof(dest);
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, size);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
}
// 81 bytes.
{
auto& dest = data81;
size_t volatile size = sizeof(dest);
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, size);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
}
// 81 bytes, misaligned destination.
{
auto& dest = data81;
size_t volatile size = sizeof(dest);
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb1, dest, size);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy unalign)\n", size, Count / 1000000, t / 3000000000.0);
}
// 400 bytes.
{
auto& dest = data400;
size_t volatile size = sizeof(dest);
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, size);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
}
// 401 bytes.
{
auto& dest = data401;
size_t volatile size = sizeof(dest);
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, size);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
}
// 401 bytes, misaligned destination.
{
auto& dest = data401;
size_t volatile size = sizeof(dest);
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb1, dest, size);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy unalign)\n", size, Count / 1000000, t / 3000000000.0);
}
// 4000 bytes.
{
auto& dest = data4000;
size_t volatile size = sizeof(dest);
__int64 t = __rdtsc();
for (int i = 0; i < Count; ++i)
memcpy(pb, dest, size);
t = __rdtsc() - t;
printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
}
return 0;
}