#include <stdio.h>
#include <stdlib.h>
/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n) \
{ \
register unsigned long int dummy; \
__asm__ __volatile__( \
"rep; movsb" \
: "=&D" (to), "=&S" (from), "=&c" (dummy) \
: "0" (to), "1" (from), "2" (n) \
: "memory"); \
}
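/* Note: because "to" and "from" are also output operands ("=&D"/"=&S"),
 * rep movsb leaves the advanced addresses in them, i.e. small_memcpy()
 * moves both pointers past the copied bytes as a side effect;
 * big_memcpy() below relies on this after its alignment step. */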
/* linux kernel __memcpy (from: /include/asm/string.h) */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
if (n < 4) {
small_memcpy(to, from, n);
}
else
__asm__ __volatile__(
"rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return to;
}
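/* Instruction selection per CPU feature:
 *   EMMS     - clear the MMX state when done (femms is the faster 3DNow! form)
 *   PREFETCH - prefetchnta (SSE) or prefetch (3DNow!) pulls upcoming source
 *              data toward the CPU ahead of the copy loop
 *   MOVNTQ   - non-temporal store that bypasses the cache on SSE-capable CPUs,
 *              plain movq otherwise
 *   MIN_LEN  - copies shorter than this skip the MMX loop entirely */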
#ifdef HAVE_3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH "prefetch"
#else
#define PREFETCH "/nop"
#endif
#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif
#undef MIN_LEN
#ifdef HAVE_MMX1
#define MIN_LEN 0x800 /* 2K blocks */
#else
#define MIN_LEN 0x40 /* 64-byte blocks */
#endif
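/* big_memcpy: align the destination to an 8-byte boundary, stream 64-byte
 * chunks through the eight MMX registers, then copy the remaining tail
 * with small_memcpy(). */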
static void * big_memcpy(void * to, const void * from, size_t len)
{
void *retval;
size_t i;
retval = to;
if(len >= MIN_LEN)
{
register unsigned long int delta;
/* Align destination to an MMREG_SIZE (8-byte) boundary */
delta = ((unsigned long int)to)&7;
if(delta)
{
delta=8-delta;
len -= delta;
small_memcpy(to, from, delta);
}
i = len >> 6; /**//* len/64 */
len &= 63;
/*
 * This algorithm is most effective when the code reads and writes
 * blocks whose size matches the cache line size. The cache line size
 * is processor-dependent, but it is at least 32 bytes on any processor.
 * Ideally the number of load/store instructions would be a multiple of
 * the number of the processor's decoders, but that is not always possible.
 */
for(; i>0; i--)
{
__asm__ __volatile__ (
PREFETCH" 320(%0)\n"
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n"
"movq 32(%0), %%mm4\n"
"movq 40(%0), %%mm5\n"
"movq 48(%0), %%mm6\n"
"movq 56(%0), %%mm7\n"
MOVNTQ" %%mm0, (%1)\n"
MOVNTQ" %%mm1, 8(%1)\n"
MOVNTQ" %%mm2, 16(%1)\n"
MOVNTQ" %%mm3, 24(%1)\n"
MOVNTQ" %%mm4, 32(%1)\n"
MOVNTQ" %%mm5, 40(%1)\n"
MOVNTQ" %%mm6, 48(%1)\n"
MOVNTQ" %%mm7, 56(%1)\n"
:: "r" (from), "r" (to) : "memory");
from = (const unsigned char *)from + 64;
to = (unsigned char *)to + 64;
}
#ifdef HAVE_MMX2
/* since movntq is weakly-ordered, an "sfence"
 * is needed to become ordered again. */
__asm__ __volatile__ ("sfence":::"memory");
#endif
/* empty the MMX state so the FPU can be used again */
__asm__ __volatile__ (EMMS:::"memory");
}
/*
 * Now do the tail of the block
 */
if(len) small_memcpy(to, from, len);
return retval;
}
char src[ 1270 ];
char dest[ 1270 ];
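/*
 * A minimal sanity check one could add to the test harness below: it copies
 * n bytes with big_memcpy() and compares the result against src byte by byte.
 * This helper is not part of the original code; the name check_copy and its
 * use (e.g. calling "if (!check_copy(1100)) printf("mismatch!\n");" from
 * main) are illustrative assumptions.
 */
static int check_copy(size_t n)
{
    size_t i;
    big_memcpy(dest, src, n);   /* copy n bytes with the MMX path */
    for (i = 0; i < n; i++)     /* verify against the source      */
        if (dest[i] != src[i])
            return 0;           /* mismatch                       */
    return 1;                   /* all n bytes copied correctly   */
}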
void init_src()
{
int i;
for(i = 0; i < 1260; i++)
src[i] = i+1;
}
void output( int beg, int end)
{
int i;
for(i = beg-1; i < end; i++)
printf(" %d, ", dest[i]);
printf(" ");
}
void test_last16()
{
// __memcpy(dest, src, 15);
big_memcpy(dest, src, 1100);
output(1, 1100);
}
int main()
{
init_src();
test_last16();
return 0;
}