Compiled and tested with GCC 4.4.6
# gcc fast_memcpy.c -o fast_memcpy
#include <stdio.h>
#include <stdint.h>
#include <stddef.h> /* size_t */

/**
 * Copy 16 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov16(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu %%xmm0, (%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "memory");
}
/**
 * Copy 32 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov32(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu 16(%[src]), %%xmm1\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  "movdqu %%xmm1, 16(%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "xmm1", "memory");
}
/**
 * Copy 48 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov48(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu 16(%[src]), %%xmm1\n\t"
                  "movdqu 32(%[src]), %%xmm2\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  "movdqu %%xmm1, 16(%[dst])\n\t"
                  "movdqu %%xmm2, 32(%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "xmm1", "xmm2", "memory");
}
/**
 * Copy 64 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov64(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu 16(%[src]), %%xmm1\n\t"
                  "movdqu 32(%[src]), %%xmm2\n\t"
                  "movdqu 48(%[src]), %%xmm3\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  "movdqu %%xmm1, 16(%[dst])\n\t"
                  "movdqu %%xmm2, 32(%[dst])\n\t"
                  "movdqu %%xmm3, 48(%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "xmm1", "xmm2", "xmm3", "memory");
}
/**
 * Copy 128 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov128(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu 16(%[src]), %%xmm1\n\t"
                  "movdqu 32(%[src]), %%xmm2\n\t"
                  "movdqu 48(%[src]), %%xmm3\n\t"
                  "movdqu 64(%[src]), %%xmm4\n\t"
                  "movdqu 80(%[src]), %%xmm5\n\t"
                  "movdqu 96(%[src]), %%xmm6\n\t"
                  "movdqu 112(%[src]), %%xmm7\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  "movdqu %%xmm1, 16(%[dst])\n\t"
                  "movdqu %%xmm2, 32(%[dst])\n\t"
                  "movdqu %%xmm3, 48(%[dst])\n\t"
                  "movdqu %%xmm4, 64(%[dst])\n\t"
                  "movdqu %%xmm5, 80(%[dst])\n\t"
                  "movdqu %%xmm6, 96(%[dst])\n\t"
                  "movdqu %%xmm7, 112(%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "xmm1", "xmm2", "xmm3",
                    "xmm4", "xmm5", "xmm6", "xmm7", "memory");
}
/**
 * Copy 256 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov256(uint8_t *dst, const uint8_t *src)
{
    /*
     * There are 16 XMM registers, but this function does not use them
     * all, so that it can still be compiled as 32-bit code. The
     * performance increase was negligible when all 16 registers were
     * used.
     */
    mov128(dst, src);
    mov128(dst + 128, src + 128);
}
/**
 * Copy bytes from one location to another. The locations should not overlap.
 *
 * @param s1
 *   Pointer to the destination of the data.
 * @param s2
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   s1
 */
void *
fast_memcpy(void *s1, const void *s2, size_t n)
{
    uint8_t *dst = (uint8_t *)s1;
    const uint8_t *src = (const uint8_t *)s2;

    /* We can't copy < 16 bytes using XMM registers, so do it manually. */
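    /*
     * The low four bits of n select which power-of-two blocks to copy,
     * e.g. n = 13 (0b1101) takes the 1-, 4- and 8-byte branches:
     * 1 + 4 + 8 = 13 bytes.
     */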
    if (n < 16) {
        if (n & 0x01) {
            *dst = *src;
            dst += 1;
            src += 1;
        }
        if (n & 0x02) {
            *(uint16_t *)dst = *(const uint16_t *)src;
            dst += 2;
            src += 2;
        }
        if (n & 0x04) {
            *(uint32_t *)dst = *(const uint32_t *)src;
            dst += 4;
            src += 4;
        }
        if (n & 0x08) {
            *(uint64_t *)dst = *(const uint64_t *)src;
        }
        return s1; /* not dst: it has been advanced past the copied bytes */
    }
    /* Special fast cases for <= 128 bytes. */
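    /*
     * Each case copies the first and the last block of the given size;
     * the two copies may overlap in the middle, which is harmless.
     * E.g. n = 20: the first mov16 writes bytes 0-15, the second
     * writes bytes 4-19.
     */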
    if (n <= 32) {
        mov16(dst, src);
        mov16(dst - 16 + n, src - 16 + n);
        return s1;
    }
    if (n <= 64) {
        mov32(dst, src);
        mov32(dst - 32 + n, src - 32 + n);
        return s1;
    }
    if (n <= 128) {
        mov64(dst, src);
        mov64(dst - 64 + n, src - 64 + n);
        return s1;
    }

    /*
     * For large copies of more than 128 bytes. This combination of 256-,
     * 64- and 16-byte copies was found to be faster than also doing 128-
     * and 32-byte copies.
     */
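    /*
     * Worked example: n = 1000 runs the loop below three times
     * (768 bytes), leaving n = 232; the first switch then copies
     * 3 x 64 = 192 bytes, the second copies 2 x 16 = 32, and the
     * final overlapping mov16 handles the last 8.
     */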
    for (; n >= 256; n -= 256, dst += 256, src += 256) {
        mov256(dst, src);
    }

    /*
     * We split the remaining bytes (which will be less than 256) into
     * 64-byte (2^6) chunks.
     * Using incrementing integers in the case labels of a switch statement
     * encourages the compiler to use a jump table. To get incrementing
     * integers, we shift the 2 relevant bits to the LSB position to first
     * get decrementing integers, and then subtract.
     */
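    /*
     * E.g. n = 200: n >> 6 is 3, so 3 - 3 = 0 and case 0x00 falls
     * through all three mov64 calls, copying 192 bytes and leaving
     * n = 8 for the tail copy.
     */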
    switch (3 - (n >> 6)) {
    case 0x00:
        mov64(dst, src);
        n -= 64;
        dst += 64;
        src += 64; /* fallthrough */
    case 0x01:
        mov64(dst, src);
        n -= 64;
        dst += 64;
        src += 64; /* fallthrough */
    case 0x02:
        mov64(dst, src);
        n -= 64;
        dst += 64;
        src += 64; /* fallthrough */
    default:
        ;
    }

    /*
     * We split the remaining bytes (which will be less than 64) into
     * 16-byte (2^4) chunks, using the same switch structure as above.
     */
    switch (3 - (n >> 4)) {
    case 0x00:
        mov16(dst, src);
        n -= 16;
        dst += 16;
        src += 16; /* fallthrough */
    case 0x01:
        mov16(dst, src);
        n -= 16;
        dst += 16;
        src += 16; /* fallthrough */
    case 0x02:
        mov16(dst, src);
        n -= 16;
        dst += 16;
        src += 16; /* fallthrough */
    default:
        ;
    }
    /* Copy any remaining bytes, without going beyond the end of the buffers. */
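    /*
     * With 0 < n < 16 left, this re-copies up to 16 - n bytes that were
     * already written; since at least 128 bytes were copied before this
     * point, the read never starts before the buffer.
     */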
    if (n != 0) {
        mov16(dst - 16 + n, src - 16 + n);
    }
    return s1;
}
uint8_t src[1024] = {0};
uint8_t dst[1024] = {0};

int main(void)
{
    fast_memcpy(dst, src, 1024);
    return 0;
}
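
A quick sanity check, not part of the original code: fill src with a byte pattern and verify every copy length from 0 to 1024, checking both that the copied bytes match and that nothing past them was written. Swap this driver in for the main() above to run it (it needs <string.h> for memset/memcmp).

#include <string.h>

int main(void)
{
    size_t i, n;

    for (i = 0; i < sizeof(src); i++)
        src[i] = (uint8_t)i;

    for (n = 0; n <= sizeof(src); n++) {
        memset(dst, 0xAA, sizeof(dst));
        fast_memcpy(dst, src, n);
        /* The first n bytes must match the source... */
        if (memcmp(dst, src, n) != 0)
            return 1;
        /* ...and every byte past them must still hold the fill value. */
        for (i = n; i < sizeof(dst); i++)
            if (dst[i] != 0xAA)
                return 1;
    }
    return 0;
}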