Nginx内存池实现源码分析

概述

nginx内存分配将内存需求分成了两种:(1)大块内存(2)小块内存。判定依据是申请的内存大小是否超过pool的max值,而max取pool可用空间与NGX_MAX_ALLOC_FROM_POOL(约一页大小)两者中的较小者。
对于大块内存,单独利用malloc来申请,并且使用单向链表管理起来
对于小块内存,则从已有的pool数据区中划分出一部分出来,这里的内存划分出去后没有特殊的结构来保存,而是等到申请对象生命周期结束后一起释放。小块内存的存储方式非常类似于sk_buff,通过last、end指针来表示多少内存已经被分配出去。

内存池存储结构

数据结构设计(src/core/ngx_palloc.h)

点击(此处)折叠或打开

  1. typedef struct {
  2.     u_char *last; /*已用空间的结尾*/
  3.     u_char *end; /*可用空间的结尾*/
  4.     ngx_pool_t *next; /*下一个可用的pool*/
  5.     ngx_uint_t failed; /*本pool分配内存失败次数*/
  6. } ngx_pool_data_t;


  7. struct ngx_pool_s {
  8.     ngx_pool_data_t d; /*pool内部信息,定义见上面*/
  9.     size_t max; /*可分配的最大空间*/
  10.     ngx_pool_t *current;/*当前pool*/
  11.     ngx_chain_t *chain; /*链表管理的buf*/
  12.     ngx_pool_large_t *large; /*大块内存单向链表头指针*/
  13.     ngx_pool_cleanup_t *cleanup; /*空间释放回调函数*/
  14.     ngx_log_t *log; /*日志句柄*/
  15. };

内存分配总流程


图1:nginx内存池分配流程示意图

图2: nginx内存池存储结构示意图


创建内存池

接口定义在 src/core/ngx_palloc.c 中:

点击(此处)折叠或打开

  1. ngx_pool_t *
  2. ngx_create_pool(size_t size, ngx_log_t *log)
  3. {
  4.     ngx_pool_t *p;

  5.     p = ngx_memalign(NGX_POOL_ALIGNMENT, size, log); /*可以简单理解成分配了size大小的内存*/
  6.     if (p == NULL) {
  7.         return NULL;
  8.     }

  9.     p->d.last = (u_char *) p + sizeof(ngx_pool_t); /*已经保留了pool头部的空间*/
  10.     p->d.end = (u_char *) p + size; /*end指向分配全部空间的末尾*/
  11.     p->d.next = NULL;
  12.     p->d.failed = 0;

  13.     size = size - sizeof(ngx_pool_t); /*可供分配的空间大小,已经刨去了头部所占空间*/
  14.     p->max = (size < NGX_MAX_ALLOC_FROM_POOL) ? size : NGX_MAX_ALLOC_FROM_POOL;

  15.     p->current = p; /*current指向当前pool*/
  16.     p->chain = NULL;
  17.     p->large = NULL;
  18.     p->cleanup = NULL;
  19.     p->log = log;

  20.     return p;
  21. }

大块内存分配

大块内存的判定方法
step 1) 创建内存池时计算内存池分配内存的max值
点击( 此处 )折叠或打开
  1. size = size - sizeof(ngx_pool_t);
  2. /*pool的max取值为pool可用空间与NGX_MAX_ALLOC_FROM_POOL之间的较小值*/
  3. p->max = (size < NGX_MAX_ALLOC_FROM_POOL) ? size : NGX_MAX_ALLOC_FROM_POOL;
step 2)申请内存时根据申请内存的大小判定是否是大内存块

点击(此处)折叠或打开

  1. void *
  2. ngx_palloc(ngx_pool_t *pool, size_t size)
  3. {
  4.     u_char *m;
  5.     ngx_pool_t *p;

  6.     if (size <= pool->max) {

  7.         p = pool->current;

  8.         do {
  9.             m = ngx_align_ptr(p->d.last, NGX_ALIGNMENT);

  10.             if ((size_t) (p->d.end - m) >= size) {
  11.                 p->d.last = m + size;

  12.                 return m;
  13.             }

  14.             p = p->d.next;

  15.         } while (p);

  16.         return ngx_palloc_block(pool, size);
  17.     }

  18.     return ngx_palloc_large(pool, size); /*超过pool->max的判定为大内存,调用ngx_palloc_large申请内存*/
  19. }
ngx_palloc_large的实现

点击(此处)折叠或打开

  1. static void *
  2. ngx_palloc_large(ngx_pool_t *pool, size_t size)
  3. {
  4.     void *p;
  5.     ngx_uint_t n;
  6.     ngx_pool_large_t *large;

  7.     /*申请指定大小的内存*/
  8.     p = ngx_alloc(size, pool->log);
  9.     if (p == NULL) {
  10.         return NULL;
  11.     }

  12.     n = 0;

  13.     /*寻找大块内存的合适挂载位置*/
  14.     for (large = pool->large; large; large = large->next) {
  15.         if (large->alloc == NULL) {
  16.             large->alloc = p;
  17.             return p;
  18.         }

  19.         if (n++ > 3) {
  20.             break;
  21.         }
  22.     }

  23.     /*重新分配一个large节点,插入到pool的large链表头部*/
  24.     large = ngx_palloc(pool, sizeof(ngx_pool_large_t));
  25.     if (large == NULL) {
  26.         ngx_free(p);
  27.         return NULL;
  28.     }

  29.     large->alloc = p;
  30.     large->next = pool->large;
  31.     pool->large = large;

  32.     return p;
  33. }

大块内存释放

大块内存的释放需要用户显式调用ngx_free,函数的实现如下:

点击(此处)折叠或打开

  1. ngx_int_t
  2. ngx_pfree(ngx_pool_t *pool, void *p)
  3. {
  4.     ngx_pool_large_t *l;

  5.     /*遍历链表,通过地址比较确认是否是需要释放的大块内存*/
  6.     for (l = pool->large; l; l = l->next) {
  7.         if (p == l->alloc) {
  8.             ngx_log_debug1(NGX_LOG_DEBUG_ALLOC, pool->log, 0,
  9.                            "free: %p", l->alloc);
  10.             ngx_free(l->alloc); /*调用ngx_free释放大块内存*/
  11.             l->alloc = NULL; /*注意: 这里并未释放large结构体的内存,可备后用*/

  12.             return NGX_OK;
  13.         }
  14.     }

  15.     return NGX_DECLINED;
  16. }

他山之石: sk_buff

sk_buff的数据结构与操作

点击(此处)折叠或打开

  1. /**
  2.  *    struct sk_buff - socket buffer
  3.  *    @next: Next buffer in list
  4.  *    @prev: Previous buffer in list
  5.  *    @sk: Socket we are owned by
  6.  *    @tstamp: Time we arrived
  7.  *    @dev: Device we arrived on/are leaving by
  8.  *    @transport_header: Transport layer header
  9.  *    @network_header: Network layer header
  10.  *    @mac_header: Link layer header
  11.  *    @_skb_dst: destination entry
  12.  *    @sp: the security path, used for xfrm
  13.  *    @cb: Control buffer. Free for use by every layer. Put private vars here
  14.  *    @len: Length of actual data
  15.  *    @data_len: Data length
  16.  *    @mac_len: Length of link layer header
  17.  *    @hdr_len: writable header length of cloned skb
  18.  *    @csum: Checksum (must include start/offset pair)
  19.  *    @csum_start: Offset from skb->head where checksumming should start
  20.  *    @csum_offset: Offset from csum_start where checksum should be stored
  21.  *    @local_df: allow local fragmentation
  22.  *    @cloned: Head may be cloned (check refcnt to be sure)
  23.  *    @nohdr: Payload reference only, must not modify header
  24.  *    @pkt_type: Packet class
  25.  *    @fclone: skbuff clone status
  26.  *    @ip_summed: Driver fed us an IP checksum
  27.  *    @priority: Packet queueing priority
  28.  *    @users: User count - see {datagram,tcp}.c
  29.  *    @protocol: Packet protocol from driver
  30.  *    @truesize: Buffer size
  31.  *    @head: Head of buffer
  32.  *    @data: Data head pointer
  33.  *    @tail: Tail pointer
  34.  *    @end: End pointer
  35.  *    @destructor: Destruct function
  36.  *    @mark: Generic packet mark
  37.  *    @nfct: Associated connection, if any
  38.  *    @ipvs_property: skbuff is owned by ipvs
  39.  *    @peeked: this packet has been seen already, so stats have been
  40.  *        done for it, don't do them again
  41.  *    @nf_trace: netfilter packet trace flag
  42.  *    @nfctinfo: Relationship of this skb to the connection
  43.  *    @nfct_reasm: netfilter conntrack re-assembly pointer
  44.  *    @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  45.  *    @iif: ifindex of device we arrived on
  46.  *    @queue_mapping: Queue mapping for multiqueue devices
  47.  *    @tc_index: Traffic control index
  48.  *    @tc_verd: traffic control verdict
  49.  *    @ndisc_nodetype: router type (from link layer)
  50.  *    @dma_cookie: a cookie to one of several possible DMA operations
  51.  *        done by skb DMA functions
  52.  *    @secmark: security marking
  53.  *    @vlan_tci: vlan tag control information
  54.  */

  55. struct sk_buff {
  56.     /* These two members must be first. */
  57.     struct sk_buff        *next;
  58.     struct sk_buff        *prev;

  59.     struct sock        *sk;
  60.     ktime_t            tstamp;
  61.     struct net_device    *dev;

  62.     unsigned long        _skb_dst;
  63. #ifdef CONFIG_XFRM
  64.     struct    sec_path    *sp;
  65. #endif
  66.     /*
  67.      * This is the control buffer. It is free to use for every
  68.      * layer. Please put your private variables there. If you
  69.      * want to keep them across layers you have to do a skb_clone()
  70.      * first. This is owned by whoever has the skb queued ATM.
  71.      */
  72.     char            cb[48];

  73.     unsigned int        len,
  74.                 data_len;
  75.     __u16            mac_len,
  76.                 hdr_len;
  77.     union {
  78.         __wsum        csum;
  79.         struct {
  80.             __u16    csum_start;
  81.             __u16    csum_offset;
  82.         };
  83.     };
  84.     __u32            priority;
  85.     kmemcheck_bitfield_begin(flags1);
  86.     __u8            local_df:1,
  87.                 cloned:1,
  88.                 ip_summed:2,
  89.                 nohdr:1,
  90.                 nfctinfo:3;
  91.     __u8            pkt_type:3,
  92.                 fclone:2,
  93.                 ipvs_property:1,
  94.                 peeked:1,
  95.                 nf_trace:1;
  96.     __be16            protocol:16;
  97.     kmemcheck_bitfield_end(flags1);

  98.     void            (*destructor)(struct sk_buff *skb);
  99. #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
  100.     struct nf_conntrack    *nfct;
  101.     struct sk_buff        *nfct_reasm;
  102. #endif
  103. #ifdef CONFIG_BRIDGE_NETFILTER
  104.     struct nf_bridge_info    *nf_bridge;
  105. #endif

  106.     int            iif;
  107. #ifdef CONFIG_NET_SCHED
  108.     __u16            tc_index;    /* traffic control index */
  109. #ifdef CONFIG_NET_CLS_ACT
  110.     __u16            tc_verd;    /* traffic control verdict */
  111. #endif
  112. #endif

  113.     kmemcheck_bitfield_begin(flags2);
  114.     __u16            queue_mapping:16;
  115. #ifdef CONFIG_IPV6_NDISC_NODETYPE
  116.     __u8            ndisc_nodetype:2;
  117. #endif
  118.     kmemcheck_bitfield_end(flags2);

  119.     /* 0/14 bit hole */

  120. #ifdef CONFIG_NET_DMA
  121.     dma_cookie_t        dma_cookie;
  122. #endif
  123. #ifdef CONFIG_NETWORK_SECMARK
  124.     __u32            secmark;
  125. #endif

  126.     __u32            mark;

  127.     __u16            vlan_tci;

  128.     sk_buff_data_t        transport_header;
  129.     sk_buff_data_t        network_header;
  130.     sk_buff_data_t        mac_header;
  131.     /* These elements must be at the end, see alloc_skb() for details. */
  132.     sk_buff_data_t        tail;
  133.     sk_buff_data_t        end;
  134.     unsigned char        *head,
  135.                 *data;
  136.     unsigned int        truesize;
  137.     atomic_t        users;
  138. };

再来看看sk_buff的数据指针的作用(注:sk_buff的示意图来自网络):

图3: sk_buff的数据指针移动过程                            

sk_buff移动指针的操作有:

点击(此处)折叠或打开

  1. /*
  2.  *    Add data to an sk_buff
  3.  */
  4. extern unsigned char *skb_put(struct sk_buff *skb, unsigned int len);
  5. static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len)
  6. {
  7.     unsigned char *tmp = skb_tail_pointer(skb);
  8.     SKB_LINEAR_ASSERT(skb);
  9.     skb->tail += len;
  10.     skb->len += len;
  11.     return tmp;
  12. }

  13. extern unsigned char *skb_push(struct sk_buff *skb, unsigned int len);
  14. static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len)
  15. {
  16.     skb->data -= len;
  17.     skb->len += len;
  18.     return skb->data;
  19. }

  20. extern unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);
  21. static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len)
  22. {
  23.     skb->len -= len;
  24.     BUG_ON(skb->len < skb->data_len);
  25.     return skb->data += len;
  26. }

  27. extern unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta);

  28. static inline unsigned char *__pskb_pull(struct sk_buff *skb, unsigned int len)
  29. {
  30.     if (len > skb_headlen(skb) &&
  31.      !__pskb_pull_tail(skb, len - skb_headlen(skb)))
  32.         return NULL;
  33.     skb->len -= len;
  34.     return skb->data += len;
  35. }

  36. static inline unsigned char *pskb_pull(struct sk_buff *skb, unsigned int len)
  37. {
  38.     return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len);
  39. }

  40. static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
  41. {
  42.     if (likely(len <= skb_headlen(skb)))
  43.         return 1;
  44.     if (unlikely(len > skb->len))
  45.         return 0;
  46.     return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
  47. }

  48. /**
  49.  *    skb_headroom - bytes at buffer head
  50.  *    @skb: buffer to check
  51.  *
  52.  *    Return the number of bytes of free space at the head of an &sk_buff.
  53.  */
  54. static inline unsigned int skb_headroom(const struct sk_buff *skb)
  55. {
  56.     return skb->data - skb->head;
  57. }

  58. /**
  59.  *    skb_tailroom - bytes at buffer end
  60.  *    @skb: buffer to check
  61.  *
  62.  *    Return the number of bytes of free space at the tail of an sk_buff
  63.  */
  64. static inline int skb_tailroom(const struct sk_buff *skb)
  65. {
  66.     return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail;
  67. }

  68. /**
  69.  *    skb_reserve - adjust headroom
  70.  *    @skb: buffer to alter
  71.  *    @len: bytes to move
  72.  *
  73.  *    Increase the headroom of an empty &sk_buff by reducing the tail
  74.  *    room. This is only allowed for an empty buffer.
  75.  */
  76. static inline void skb_reserve(struct sk_buff *skb, int len)
  77. {
  78.     skb->data += len;
  79.     skb->tail += len;
  80. }

sk_buff的核心思想之一是预先分配好足够的内存来容纳网络数据包的头部和数据体、以及其他的填充字段,在网络收包或者发包的过程中,剥离包头、添加包头的操作可以通过移动指针(而非不断地拷贝到新的内存空间)实现,由于收发包是网络模块最频繁的操作,因此这样的优化效果极为重要。

nginx的小块内存分配也是通过last和end两个指针配合来完成内存分配的,与sk_buff的思想有一定的相通之处。

小块内存分配

小块内存分配的流程是: 首先从当前pool开始遍历所有的pools,寻找是否有足够的空间分配,如果有,则分配之(调整last指针,并返回调整前的last指针);否则需要重新分配一个block,连接到当前的pool链表中。


点击(此处)折叠或打开

  1. void *
  2. ngx_palloc(ngx_pool_t *pool, size_t size)
  3. {
  4.     u_char *m;
  5.     ngx_pool_t *p;

  6.     if (size <= pool->max) {

  7.         p = pool->current;

  8.         /*从current pool开始,遍历所有pool,查找这些pool中是否有足够的空间分配*/
  9.         do {
  10.             m = ngx_align_ptr(p->d.last, NGX_ALIGNMENT);

  11.             /*有足够的空间分配,则返回未分配空间的首地址,并调整last指针*/
  12.             if ((size_t) (p->d.end - m) >= size) {
  13.                 p->d.last = m + size;

  14.                 return m;
  15.             }

  16.             p = p->d.next;

  17.         } while (p);

  18.         /*未找到合适的pool,需要重新分配一个*/
  19.         return ngx_palloc_block(pool, size);
  20.     }

  21.     return ngx_palloc_large(pool, size);
  22. }
再看看ngx_palloc_block的实现

点击(此处)折叠或打开

  1. static void *
  2. ngx_palloc_block(ngx_pool_t *pool, size_t size)
  3. {
  4.     u_char *m;
  5.     size_t psize;
  6.     ngx_pool_t *p, *new, *current;

  7.     psize = (size_t) (pool->d.end - (u_char *) pool);

  8.     m = ngx_memalign(NGX_POOL_ALIGNMENT, psize, pool->log);/*可以简单理解成重新分配一块与原pool相同大小(psize)的内存*/
  9.     if (m == NULL) {
  10.         return NULL;
  11.     }

  12.     new = (ngx_pool_t *) m;

  13.     new->d.end = m + psize;
  14.     new->d.next = NULL;
  15.     new->d.failed = 0;

  16.     m += sizeof(ngx_pool_data_t);
  17.     m = ngx_align_ptr(m, NGX_ALIGNMENT);
  18.     new->d.last = m + size;

  19.     /*从pool的current指针开始,查找下一个合适的current*/
  20.     current = pool->current;

  21.     for (p = current; p->d.next; p = p->d.next) {
  22.         if (p->d.failed++ > 4) {
  23.                 current = p->d.next; /*分配失败次数超过4次,则表明当前的pool内存紧张,不适合继续充当第一个查找的pool,current后移*/
  24.         }
  25.     }/*请注意,这里一定会遍历到链表的末尾*/

  26.     /*将新分配的pool添加到链表的末尾*/
  27.     p->d.next = new;

  28.     pool->current = current ? current : new;

  29.     return m;
  30. }

与HAProxy的内存池管理简单对比

通过本文和前文《 HAProxy内存池实现源码分析》,可以知道nginx和haproxy的内存池管理各有千秋,我们可以根据自身应用的需求借鉴两者的方法。这里的对比不考虑两者代码的优雅性。

由于未对nginx和haproxy的源码全景进行细致的分析,下面的理解可能有错误或不准确之处。

nginx内存池策略的优点:
1)同时支持大块内存和零星内存的申请;
2)零星内存的申请效率较高;
3)对于变长的内存,基本不浪费碎片
缺点:
1)实现较为复杂
2)大块内存的复用程度不好,对于频繁使用的大块内存,系统调用开销仍然较大

haproxy内存池策略的优点:
1)大块内存的申请和释放效率高,复用率高
2)对于大块内存链表,不需要额外的空间管理(也就是不需要nginx的large结构体)
3)实现简单
缺点:
1)不支持小片内存从内存池中分配
2)由于pool定长并且每次分配一个chunk,内存浪费的碎片可能较多

参考文档

http://www.oschina.net/question/234345_42068
http://www.tbdata.org/archives/1390
http://simohayha.iteye.com/blog/545192

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值