大并发服务器内存转换的灵活运用,memcpy的思考

最新推荐文章于 2023-04-11 12:54:48 发布

lys07962000

最新推荐文章于 2023-04-11 12:54:48 发布

阅读量683

点赞数

分类专栏： C++/C 服务器开发

C++/C 同时被 2 个专栏收录

48 篇文章 0 订阅

订阅专栏

服务器开发

17 篇文章 1 订阅

订阅专栏

来自：http://blog.csdn.net/xiaofei_hah0000/article/details/8959167

在很多的网络开发中，经常会碰到一些内存转换，如下面的场景：

[cpp] view plain copy print ?

#define PACKAGE_PARSE_ERROR -1
#define PACKAGE_PARSE_OK 0

/*
 * Unpack four consecutive 4-byte integers from the start of buf into
 * *a, *b, *c and *d, in that order. The bytes are copied verbatim with
 * memcpy; no byte-order conversion is performed.
 *
 * Returns PACKAGE_PARSE_OK on success, or PACKAGE_PARSE_ERROR when buf is
 * a null pointer or buf_len is smaller than the 16 bytes that are read.
 * The four output pointers are not validated.
 */
int parse_package( int* a, int* b, int* c, int* d, char* buf, int buf_len )
{
        enum { FIELD_BYTES = 4, FIELD_COUNT = 4 };
        const char* cursor = buf;

        if( cursor && buf_len >= FIELD_BYTES * FIELD_COUNT ){
                /* Walk the buffer one field at a time. */
                memcpy( a, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( b, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( c, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( d, cursor, FIELD_BYTES );
                return PACKAGE_PARSE_OK;
        }

        return PACKAGE_PARSE_ERROR;
}

#define PACKAGE_PARSE_ERROR -1
#define PACKAGE_PARSE_OK 0

/*
 * Unpack four consecutive 4-byte integers from the start of buf into
 * *a, *b, *c and *d, in that order. The bytes are copied verbatim with
 * memcpy; no byte-order conversion is performed.
 *
 * Returns PACKAGE_PARSE_OK on success, or PACKAGE_PARSE_ERROR when buf is
 * a null pointer or buf_len is smaller than the 16 bytes that are read.
 * The four output pointers are not validated.
 */
int parse_package( int* a, int* b, int* c, int* d, char* buf, int buf_len )
{
        enum { FIELD_BYTES = 4, FIELD_COUNT = 4 };
        const char* cursor = buf;

        if( cursor && buf_len >= FIELD_BYTES * FIELD_COUNT ){
                /* Walk the buffer one field at a time. */
                memcpy( a, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( b, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( c, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( d, cursor, FIELD_BYTES );
                return PACKAGE_PARSE_OK;
        }

        return PACKAGE_PARSE_ERROR;
}

这是网络解包的过程中的一个调用，封包的过程则是逆过程。

像这样的应用其实完全可以用整型强制转换来代替，而且效率会至少提高一倍。

为了说明问题，我们举个简单的例子：

[cpp] view plain copy print ?

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
/*
 * Demo: load the same 4 bytes of buffer into s twice, first with memcpy
 * and then through a pointer cast, so the two forms can be compared in
 * the generated assembly.
 */
int main()
{
int s;
/* NOTE(review): buffer is never initialized, so the value read below is indeterminate. */
char buffer[4];
/* Byte-wise copy of the 4 bytes of buffer into s. */
memcpy(&s, buffer, 4 );
/* Reads the same bytes as one int through a cast.
 * NOTE(review): relies on buffer being suitably aligned for int and
 * sidesteps C's aliasing rules -- confirm this is acceptable for the target. */
s = *(int*)(buffer);
return 0;
}

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>

int main()      /* Demo: load the same 4 bytes into s twice, via memcpy and via a pointer cast. */
{
        int s;
        char buffer[4];         /* NOTE(review): never initialized, so the value read below is indeterminate */

        memcpy(&s, buffer, 4 ); /* byte-wise copy of the 4 bytes of buffer into s */
        s = *(int*)(buffer);    /* same bytes read as one int; NOTE(review): relies on int alignment of buffer and sidesteps C's aliasing rules -- confirm */
        return 0;
}

第10行和第11行的效果是一样的，10行采用的是内存复制，11行采用的是强制转换，为了方便比较，我们看一下汇编代码：

[cpp] view plain copy print ?

pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
leaq -16(%rbp), %rcx
leaq -4(%rbp), %rax
movl $4, %edx
movq %rcx, %rsi
movq %rax, %rdi
call memcpy
leaq -16(%rbp), %rax
movl (%rax), %eax
movl %eax, -4(%rbp)
movl $0, %eax
leave

        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        subq    $16, %rsp
        leaq    -16(%rbp), %rcx
        leaq    -4(%rbp), %rax
        movl    $4, %edx
        movq    %rcx, %rsi
        movq    %rax, %rdi
        call    memcpy
        leaq    -16(%rbp), %rax
        movl    (%rax), %eax
        movl    %eax, -4(%rbp)
        movl    $0, %eax
        leave

代码中可以看出，内存复制方法占用了7-12行，共6行，强制转换占用了13-15行，共3行，指令上少了一半。

深究一下其实还不止，因为第12行其实是一个函数调用，必然会有栈的迁移，所以强制转换的效率比内存复制起码高一倍。

再看看glibc 的memcpy函数实现：

[cpp] view plain copy print ?

/*
 * Generic memcpy excerpt: copies in up to three phases using macros that
 * are defined elsewhere (OP_T_THRES, OPSIZ, BYTE_COPY_FWD,
 * PAGE_COPY_FWD_MAYBE, WORD_COPY_FWD), then returns the original
 * destination pointer.
 */
void *memcpy (void *dstpp, const void *srcpp, size_t len )
{
/* The addresses are kept as integers; the macros below receive them by name. */
unsigned long int dstp = (long int) dstpp;
unsigned long int srcp = (long int) srcpp;
/* Only copies of at least OP_T_THRES bytes take the multi-phase path. */
if (len >= OP_T_THRES)
{
/* (-dstp) % OPSIZ is the distance from dstp up to the next multiple of
 * OPSIZ (for a power-of-two OPSIZ); those bytes are handled first. */
len -= (-dstp) % OPSIZ;
BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);
/* NOTE(review): macros not shown here; by their names they copy whole
 * pages and then whole words, leaving the remainder in len -- confirm. */
PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len);
WORD_COPY_FWD (dstp, srcp, len, len);
}
/* Whatever is left in len goes through the byte macro; for
 * len < OP_T_THRES this is the only copy step executed. */
BYTE_COPY_FWD (dstp, srcp, len);
return dstpp;
}

void *memcpy (void *dstpp, const void *srcpp, size_t len )  /* generic memcpy excerpt; returns dstpp */
{
  unsigned long int dstp = (long int) dstpp;  /* addresses kept as integers; the macros below receive them by name */
  unsigned long int srcp = (long int) srcpp;

  if (len >= OP_T_THRES)  /* only copies of at least OP_T_THRES bytes take the multi-phase path */
    {
      len -= (-dstp) % OPSIZ;  /* (-dstp) % OPSIZ: distance from dstp to the next multiple of OPSIZ (power-of-two OPSIZ) */
      BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);  /* those leading bytes go through the byte macro */
      PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len);  /* NOTE(review): macro not shown; by its name, whole-page copy when possible -- confirm */
      WORD_COPY_FWD (dstp, srcp, len, len);  /* NOTE(review): macro not shown; by its name, word-sized copy leaving the remainder in len -- confirm */
    }

  BYTE_COPY_FWD (dstp, srcp, len);  /* remaining len bytes; the only copy step when len < OP_T_THRES */

  return dstpp;
}

9-11行分别是三种处理方法，是否执行取决于 len 与 OP_T_THRES 的比较，一般 OP_T_THRES 是8或16。对于 len 小于 OP_T_THRES 的内存复制，glibc 采用的是逐字节复制的方式，即遍历每个字节，每个字节都要经过“内存--寄存器--内存”这个过程，CPU指令上可以说凭空多了一倍。

从上面的分析可以看出，强制转换是节省了很大的运算时间，效率上至少提高一倍。不要小看这样的提升，在每秒几万并发的情况下，尤其每个并发都存在解包和封包的过程，这样的处理可以给我们带来相当大的性能提升。

开头中提到的解包过程，我们可以巧妙地运用强制转换，下面列出两种方法：

[cpp] view plain copy print ?

/*
 * Unpack four consecutive 4-byte integers from the start of buf into
 * *a, *b, *c and *d, in that order. The bytes are copied verbatim with
 * memcpy; no byte-order conversion is performed.
 *
 * Returns PACKAGE_PARSE_OK on success, or PACKAGE_PARSE_ERROR when buf is
 * a null pointer or buf_len is smaller than the 16 bytes that are read.
 * The four output pointers are not validated.
 */
int parse_package( int* a, int* b, int* c, int* d, char* buf, int buf_len )
{
        enum { FIELD_BYTES = 4, FIELD_COUNT = 4 };
        const char* cursor = buf;

        if( cursor && buf_len >= FIELD_BYTES * FIELD_COUNT ){
                /* Walk the buffer one field at a time. */
                memcpy( a, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( b, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( c, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( d, cursor, FIELD_BYTES );
                return PACKAGE_PARSE_OK;
        }

        return PACKAGE_PARSE_ERROR;
}

/*
 * Unpack four consecutive 4-byte integers from the start of buf into
 * *a, *b, *c and *d, in that order. The bytes are copied verbatim with
 * memcpy; no byte-order conversion is performed.
 *
 * Returns PACKAGE_PARSE_OK on success, or PACKAGE_PARSE_ERROR when buf is
 * a null pointer or buf_len is smaller than the 16 bytes that are read.
 * The four output pointers are not validated.
 */
int parse_package( int* a, int* b, int* c, int* d, char* buf, int buf_len )
{
        enum { FIELD_BYTES = 4, FIELD_COUNT = 4 };
        const char* cursor = buf;

        if( cursor && buf_len >= FIELD_BYTES * FIELD_COUNT ){
                /* Walk the buffer one field at a time. */
                memcpy( a, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( b, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( c, cursor, FIELD_BYTES );
                cursor += FIELD_BYTES;
                memcpy( d, cursor, FIELD_BYTES );
                return PACKAGE_PARSE_OK;
        }

        return PACKAGE_PARSE_ERROR;
}

[cpp] view plain copy print ?

/*
 * Cast-based variant of parse_package: reads four consecutive ints from
 * the start of buf through an int pointer and stores them in *a, *b, *c
 * and *d, in that order. No byte-order conversion is performed.
 *
 * Returns PACKAGE_PARSE_OK on success, or PACKAGE_PARSE_ERROR when buf is
 * a null pointer or buf_len is smaller than 16. The four output pointers
 * are not validated.
 *
 * Precondition: buf must be suitably aligned for int and its bytes must
 * be legitimately readable as int objects. Otherwise the loads below are
 * undefined behavior (C alignment and effective-type rules).
 */
int parse_package2( int* a, int* b, int* c, int* d, char* buf, int buf_len )
{
        const int* ibuf;

        if( !buf || buf_len < 16 ){
                return PACKAGE_PARSE_ERROR;
        }

        /* The cast must be explicit: char* does not convert to int*
         * implicitly, and newer compilers reject the bare assignment. */
        ibuf = (const int*)buf;
        *a = ibuf[0];
        *b = ibuf[1];
        *c = ibuf[2];
        *d = ibuf[3];

        return PACKAGE_PARSE_OK;
}

/*
 * Cast-based variant of parse_package: reads four consecutive ints from
 * the start of buf through an int pointer and stores them in *a, *b, *c
 * and *d, in that order. No byte-order conversion is performed.
 *
 * Returns PACKAGE_PARSE_OK on success, or PACKAGE_PARSE_ERROR when buf is
 * a null pointer or buf_len is smaller than 16. The four output pointers
 * are not validated.
 *
 * Precondition: buf must be suitably aligned for int and its bytes must
 * be legitimately readable as int objects. Otherwise the loads below are
 * undefined behavior (C alignment and effective-type rules).
 */
int parse_package2( int* a, int* b, int* c, int* d, char* buf, int buf_len )
{
        const int* ibuf;

        if( !buf || buf_len < 16 ){
                return PACKAGE_PARSE_ERROR;
        }

        /* The cast must be explicit: char* does not convert to int*
         * implicitly, and newer compilers reject the bare assignment. */
        ibuf = (const int*)buf;
        *a = ibuf[0];
        *b = ibuf[1];
        *c = ibuf[2];
        *d = ibuf[3];

        return PACKAGE_PARSE_OK;
}

parse_package汇编代码：

[cpp] view plain copy print ?

parse_package:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $48, %rsp
movq %rdi, -8(%rbp)
movq %rsi, -16(%rbp)
movq %rdx, -24(%rbp)
movq %rcx, -32(%rbp)
movq %r8, -40(%rbp)
movl %r9d, -44(%rbp)
cmpq $0, -40(%rbp)
je .L2
cmpl $15, -44(%rbp)
jg .L3
.L2:
movl $-1, %eax
jmp .L4
.L3:
movq -40(%rbp), %rcx
movq -8(%rbp), %rax
movl $4, %edx
movq %rcx, %rsi
movq %rax, %rdi
call memcpy
movq -40(%rbp), %rax
leaq 4(%rax), %rcx
movq -16(%rbp), %rax
movl $4, %edx
movq %rcx, %rsi
movq %rax, %rdi
call memcpy
movq -40(%rbp), %rax
leaq 8(%rax), %rcx
movq -24(%rbp), %rax
movl $4, %edx
movq %rcx, %rsi
movq %rax, %rdi
call memcpy
movq -40(%rbp), %rax
leaq 12(%rax), %rcx
movq -32(%rbp), %rax
movl $4, %edx
movq %rcx, %rsi
movq %rax, %rdi
call memcpy
movl $0, %eax

parse_package:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        subq    $48, %rsp
        movq    %rdi, -8(%rbp)
        movq    %rsi, -16(%rbp)
        movq    %rdx, -24(%rbp)
        movq    %rcx, -32(%rbp)
        movq    %r8, -40(%rbp)
        movl    %r9d, -44(%rbp)
        cmpq    $0, -40(%rbp)
        je      .L2
        cmpl    $15, -44(%rbp)
        jg      .L3
.L2:
        movl    $-1, %eax
        jmp     .L4
.L3:
        movq    -40(%rbp), %rcx
        movq    -8(%rbp), %rax
        movl    $4, %edx
        movq    %rcx, %rsi
        movq    %rax, %rdi
        call    memcpy
        movq    -40(%rbp), %rax
        leaq    4(%rax), %rcx
        movq    -16(%rbp), %rax
        movl    $4, %edx
        movq    %rcx, %rsi
        movq    %rax, %rdi
        call    memcpy
        movq    -40(%rbp), %rax
        leaq    8(%rax), %rcx
        movq    -24(%rbp), %rax
        movl    $4, %edx
        movq    %rcx, %rsi
        movq    %rax, %rdi
        call    memcpy
        movq    -40(%rbp), %rax
        leaq    12(%rax), %rcx
        movq    -32(%rbp), %rax
        movl    $4, %edx
        movq    %rcx, %rsi
        movq    %rax, %rdi
        call    memcpy
        movl    $0, %eax

L3段是我们的主段落，对a的赋值：

24-28行都是在为 memcpy 准备参数（原文称“压栈”，从汇编可以看到参数实际是放进 %rsi、%rdi、%edx 寄存器），供 memcpy 函数内部取用，加上第29行的 call 一共是6条；memcpy 内部取参数的指令数>=3，取出并复制数据的指令数>=4，不算返回指令，总指令数>=6+3+4=13。

parse_package2汇编代码：

[cpp] view plain copy print ?

parse_package2:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movq %rcx, -48(%rbp)
movq %r8, -56(%rbp)
movl %r9d, -60(%rbp)
cmpq $0, -56(%rbp)
je .L7
cmpl $15, -60(%rbp)
jg .L8
.L7:
movl $-1, %eax
jmp .L9
.L8:
movq -56(%rbp), %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movl (%rax), %edx
movq -24(%rbp), %rax
movl %edx, (%rax)
movq -8(%rbp), %rax
addq $4, %rax
movl (%rax), %edx
movq -32(%rbp), %rax
movl %edx, (%rax)
movq -8(%rbp), %rax
addq $8, %rax
movl (%rax), %edx
movq -40(%rbp), %rax
movl %edx, (%rax)
movq -8(%rbp), %rax
addq $12, %rax
movl (%rax), %edx
movq -48(%rbp), %rax
movl %edx, (%rax)
movl $0, %eax

parse_package2:
.LFB1:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        movq    %rdi, -24(%rbp)
        movq    %rsi, -32(%rbp)
        movq    %rdx, -40(%rbp)
        movq    %rcx, -48(%rbp)
        movq    %r8, -56(%rbp)
        movl    %r9d, -60(%rbp)
        cmpq    $0, -56(%rbp)
        je      .L7     
        cmpl    $15, -60(%rbp)
        jg      .L8     
.L7:
        movl    $-1, %eax
        jmp     .L9     

.L8:
        movq    -56(%rbp), %rax
        movq    %rax, -8(%rbp)
        movq    -8(%rbp), %rax
        movl    (%rax), %edx
        movq    -24(%rbp), %rax
        movl    %edx, (%rax)
        movq    -8(%rbp), %rax
        addq    $4, %rax
        movl    (%rax), %edx
        movq    -32(%rbp), %rax
        movl    %edx, (%rax)
        movq    -8(%rbp), %rax
        addq    $8, %rax
        movl    (%rax), %edx
        movq    -40(%rbp), %rax
        movl    %edx, (%rax)
        movq    -8(%rbp), %rax
        addq    $12, %rax
        movl    (%rax), %edx
        movq    -48(%rbp), %rax
        movl    %edx, (%rax)
        movl    $0, %eax

L8是主段落，对a的赋值：

26-29行，一共4行解决。

这个例子中，强制转换（parse_package2）对 a 赋值只用了4条指令，而内存复制（parse_package）至少需要13条，CPU指令数不到后者的三分之一，按指令数估算性能至少可以提高2倍。

因此，我们的开发中应该尽量减少对内存复制的使用，而应该采用强制转换，现在64位服务器上，我们甚至可以用8个字节的long，就像下面这样：

[cpp] view plain copy print ?

/* Fragment: load 8 bytes of buffer into lv, first with memcpy, then through a cast. */
long lv;
/* NOTE(review): buffer is not initialized anywhere in this fragment. */
char buffer[ 8 ];
/* Copies 8 bytes into lv, so long must be at least 8 bytes wide on the
 * target -- NOTE(review): true on LP64 systems, not on every 64-bit ABI; confirm. */
memcpy( &lv, buffer, 8 );
/* Same load through a pointer cast; NOTE(review): needs buffer aligned
 * for long and sidesteps C's aliasing rules -- confirm. */
lv = *(long*)(buffer);

long lv;                        /* destination for both loads below */
char buffer[ 8 ];               /* NOTE(review): not initialized anywhere in this fragment */

memcpy( &lv, buffer, 8 );       /* copies 8 bytes, so long must be at least 8 bytes wide; NOTE(review): true on LP64, not on every 64-bit ABI -- confirm */
lv = *(long*)(buffer);          /* same load through a cast; NOTE(review): needs buffer aligned for long and sidesteps C's aliasing rules -- confirm */

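这样就能更好地利用CPU的多字节指令提高性能。需要注意两点：第一，上文的汇编把所有变量都存放在栈上，并且真实调用了 memcpy，看起来是未开启优化时的编译结果；开启优化后，编译器通常会把这种固定长度的小块 memcpy 直接展开成一两条 mov 指令，两种写法的实际差距应以实测为准。第二，*(long*)(buffer) 这类强制转换要求 buffer 的地址满足 long 的对齐要求，并且不符合 C 标准的别名（strict aliasing）规则，属于未定义行为，在对齐要求严格的平台或较高优化级别下可能出错，使用前需要确认目标平台和编译选项。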

lys07962000

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
大并发服务器内存转换的灵活运用,memcpy的思考

来自：http://blog.csdn.net/xiaofei_hah0000/article/details/8959167
复制链接

扫一扫

专栏目录

大并发服务器内存转换的灵活运用,memcpy的思考

“相关推荐”对你有帮助么？