应用进程使用TCP发送的数据会先放入发送缓存中,TCP的发送缓存是一个skb队列。这个队列存在的意义是:保证应用进程交付TCP的数据能够可靠地交付目的端。在收到对端的ACK之前,发送缓存中的数据不能删除。
10.2.1 使用缓存
对发送缓存的使用是从tcp_sendmsg函数开始的:
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017 size_t size)
1018 {
1019 struct iovec *iov;
1020 struct tcp_sock *tp = tcp_sk(sk);
1021 struct sk_buff *skb;
1022 int iovlen, flags, err, copied = 0;
1023 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1024 bool sg;
1025 long timeo;
...
1106 if (copy <= 0) {
1107 new_segment:
1108 /* Allocate new segment. If the interface is SG,
1109 * allocate skb fitting to single page.
1110 */
1111 if (!sk_stream_memory_free(sk)) //检查已占用内存是否达到限制
1112 goto wait_for_sndbuf;
1113
1114 skb = sk_stream_alloc_skb(sk,
1115 select_size(sk, sg),
1116 sk->sk_allocation); //申请内存
1117 if (!skb)
1118 goto wait_for_memory;
...
1133 skb_entail(sk, skb);
...
1149 } else { //使用skb的非线性区
1150 bool merge = true;
1151 int i = skb_shinfo(skb)->nr_frags;
1152 struct page_frag *pfrag = sk_page_frag(sk);
1153
1154 if (!sk_page_frag_refill(sk, pfrag))
1155 goto wait_for_memory;
...
1168 if (!sk_wmem_schedule(sk, copy))
1169 goto wait_for_memory;
1170
1171 err = skb_copy_to_page_nocache(sk, from, skb,
1172 pfrag->page,
1173 pfrag->offset,
1174 copy);
sk_stream_memory_free函数:
743 static inline bool sk_stream_memory_free(const struct sock *sk)
744 {
745 return sk->sk_wmem_queued < sk->sk_sndbuf; //room remains while bytes queued in the send queue stay below the send-buffer limit
746 }
当发送队列中占用缓存总数小于发送缓存大小时,则发送缓存尚有剩余空间。
sk_stream_alloc_skb函数用于申请skb:
754 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
755 {
756 struct sk_buff *skb;
757
758 /* The TCP header must be at least 32-bit aligned. */
759 size = ALIGN(size, 4);
760
761 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
762 if (skb) {
763 if (sk_wmem_schedule(sk, skb->truesize)) { //check whether skb->truesize bytes of memory may be accounted to this socket
764 skb_reserve(skb, sk->sk_prot->max_header);
765 /*
766 * Make sure that we have exactly size bytes
767 * available to the caller, no more, no less.
768 */
769 skb->reserved_tailroom = skb->end - skb->tail - size;
770 return skb;
771 }
772 __kfree_skb(skb); //accounting refused: give the skb back
773 } else { //allocation failed: memory is tight
774 sk->sk_prot->enter_memory_pressure(sk);//tcp_enter_memory_pressure: updates the Linux MIB and sets tcp_memory_pressure = 1
775 sk_stream_moderate_sndbuf(sk); //shrink the upper bound of the send buffer
776 }
777 return NULL;
778 }
sk_wmem_schedule:
1361 static inline bool sk_has_account(struct sock *sk)
1362 {
1363 /* return true if protocol supports memory accounting */
1364 return !!sk->sk_prot->memory_allocated;//for TCP this points to tcp_memory_allocated, the total memory used by all of TCP
1365 }
1366
1367 static inline bool sk_wmem_schedule(struct sock *sk, int size)
1368 {
1369 if (!sk_has_account(sk))//false for TCP, which does memory accounting
1370 return true;
1371 return size <= sk->sk_forward_alloc || //covered by the prepaid quota, or try to schedule more
1372 __sk_mem_schedule(sk, size, SK_MEM_SEND);
1373 }
__sk_mem_schedule用于增加预分配内存和已分配内存:
1923 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1924 {
1925 struct proto *prot = sk->sk_prot;
1926 int amt = sk_mem_pages(size); //round size up to whole SK_MEM_QUANTUM pages
1927 long allocated;
1928 int parent_status = UNDER_LIMIT;
1929
1930 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; //grow this socket's forward-allocated (prepaid) memory
1931
1932 allocated = sk_memory_allocated_add(sk, amt, &parent_status); //tcp_memory_allocated += amt
1933
1934 /* Under limit. */
1935 if (parent_status == UNDER_LIMIT &&
1936 allocated <= sk_prot_mem_limits(sk, 0)) { //sk_prot_mem_limits(sk, 0) equals the net.ipv4.tcp_mem[0] sysctl
1937 sk_leave_memory_pressure(sk);
1938 return 1;
1939 }
1940
1941 /* Under pressure. (we or our parents) */
1942 if ((parent_status > SOFT_LIMIT) ||
1943 allocated > sk_prot_mem_limits(sk, 1)) //sk_prot_mem_limits(sk, 1) equals the net.ipv4.tcp_mem[1] sysctl
1944 sk_enter_memory_pressure(sk); //sets tcp_memory_pressure to 1
1945
1946 /* Over hard limit (we or our parents) */
1947 if ((parent_status == OVER_LIMIT) ||
1948 (allocated > sk_prot_mem_limits(sk, 2)))//sk_prot_mem_limits(sk, 2) equals the net.ipv4.tcp_mem[2] sysctl
1949 goto suppress_allocation; //memory pressure is severe
1950
1951 /* guarantee minimum buffer size under pressure */
1952 if (kind == SK_MEM_RECV) {
1953 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])//prot->sysctl_rmem[0] equals the net.ipv4.tcp_rmem[0] sysctl
1954 return 1;
1955
1956 } else { /* SK_MEM_SEND */
1957 if (sk->sk_type == SOCK_STREAM) {
1958 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])//prot->sysctl_wmem[0] equals the net.ipv4.tcp_wmem[0] sysctl
1959 return 1;
1960 } else if (atomic_read(&sk->sk_wmem_alloc) <
1961 prot->sysctl_wmem[0])
1962 return 1;
1963 }
1964
1965 if (sk_has_memory_pressure(sk)) {
1966 int alloc;
1967
1968 if (!sk_under_memory_pressure(sk)) //tcp_memory_pressure == 0
1969 return 1;
1970 alloc = sk_sockets_allocated_read_positive(sk); //number of TCP sockets currently allocated
1971 if (sk_prot_mem_limits(sk, 2) > alloc *
1972 sk_mem_pages(sk->sk_wmem_queued +
1973 atomic_read(&sk->sk_rmem_alloc) +
1974 sk->sk_forward_alloc)) //allow if net.ipv4.tcp_mem[2] still exceeds this socket's footprint in pages (send queue + receive queue + remaining forward alloc) scaled by the number of sockets
1975 return 1;
1976 }
1977
1978 suppress_allocation:
1979
1980 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1981 sk_stream_moderate_sndbuf(sk); //shrink sk_sndbuf first
1982
1983 /* Fail only if socket is _under_ its sndbuf.
1984 * In this case we cannot block, so that we have to fail.
1985 */
1986 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) //succeed when the queue already reaches the (just moderated) sndbuf: further writes will then block on the sndbuf check, so this allocation is self-limiting; a socket still under its sndbuf cannot be made to block and must fail instead
1987 return 1;
1988 }
1989
1990 trace_sock_exceed_buf_limit(sk, prot, allocated);
1991 //allocation refused: undo the counters updated above
1992 /* Alas. Undo changes. */
1993 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1994
1995 sk_memory_allocated_sub(sk, amt);
1996
1997 return 0;
1998 }
skb_entail函数会将skb放入发送队列,并更新缓存信息:
596 static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
597 {
598 struct tcp_sock *tp = tcp_sk(sk);
599 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
600
601 skb->csum = 0;
602 tcb->seq = tcb->end_seq = tp->write_seq; //both sequence fields start at write_seq; end_seq grows as data is copied in
603 tcb->tcp_flags = TCPHDR_ACK;
604 tcb->sacked = 0;
605 skb_header_release(skb);
606 tcp_add_write_queue_tail(sk, skb); //append the skb to the send queue
607 sk->sk_wmem_queued += skb->truesize; //update sk_wmem_queued with the skb's full footprint
608 sk_mem_charge(sk, skb->truesize); //consume forward-allocated memory
609 if (tp->nonagle & TCP_NAGLE_PUSH)
610 tp->nonagle &= ~TCP_NAGLE_PUSH; //PUSH is a one-shot flag: clear it once consumed
611 }
sk_mem_charge函数会更新预分配缓存的值:
1401 static inline void sk_mem_charge(struct sock *sk, int size)
1402 {
1403 if (!sk_has_account(sk)) //no memory accounting for this protocol
1404 return;
1405 sk->sk_forward_alloc -= size; //pay for size bytes out of the prepaid quota
1406 }
在使用skb的非线性区之前,要先调用sk_page_frag_refill函数确保page frag中有可用空间(必要时申请新page):
1796 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1797 {
1798 int order;
1799
1800 if (pfrag->page) {
1801 if (atomic_read(&pfrag->page->_count) == 1) { //nobody else holds the page: reuse it from offset 0
1802 pfrag->offset = 0;
1803 return true;
1804 }
1805 if (pfrag->offset < pfrag->size) //page still has free space
1806 return true;
1807 put_page(pfrag->page); //page is full and shared: drop our reference
1808 }
1809
1810 /* We restrict high order allocations to users that can afford to wait */
1811 order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1812
1813 do {
1814 gfp_t gfp = sk->sk_allocation;
1815
1816 if (order)
1817 gfp |= __GFP_COMP | __GFP_NOWARN;
1818 pfrag->page = alloc_pages(gfp, order);
1819 if (likely(pfrag->page)) {
1820 pfrag->offset = 0;
1821 pfrag->size = PAGE_SIZE << order;
1822 return true;
1823 }
1824 } while (--order >= 0); //fall back to smaller orders on failure
1825 //page allocation failed entirely
1826 sk_enter_memory_pressure(sk);
1827 sk_stream_moderate_sndbuf(sk);
1828 return false;
1829 }
skb_copy_to_page_nocache函数将数据复制到非线性区的page中,并同时更新skb长度和缓存信息:
1832 static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from,
1833 struct sk_buff *skb,
1834 struct page *page,
1835 int off, int copy)
1836 {
1837 int err;
1838
1839 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off,
1840 copy, skb->len);
1841 if (err)
1842 return err;
1843
1844 skb->len += copy; //data landed in a page frag, so both len and data_len grow
1845 skb->data_len += copy;
1846 skb->truesize += copy;
1847 sk->sk_wmem_queued += copy; //account the copied bytes in the send-queue total
1848 sk_mem_charge(sk, copy); //consume forward-allocated memory
1849 return 0;
1850 }
tcp_transmit_skb函数在发送skb时会占用sk->sk_wmem_alloc缓存:
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
829 gfp_t gfp_mask)
830 {
...
890 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
891 tcp_wfree : sock_wfree; //skb释放时调用tcp_wfree或sock_wfree
892 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
...
10.2.2 释放缓存
tcp_transmit_skb发送出去的skb被释放时(网卡驱动在发送完毕数据后释放skb,或IP发送队列满导致丢包时)会调用tcp_wfree或sock_wfree函数,并更新sk->sk_wmem_alloc的数值:
791 void tcp_wfree(struct sk_buff *skb)
792 {
793 struct sock *sk = skb->sk;
794 struct tcp_sock *tp = tcp_sk(sk);
795
796 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && //socket was throttled; hand it to the per-CPU tasklet to resume transmission
797 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { //but only queue it once
798 unsigned long flags;
799 struct tsq_tasklet *tsq;
800
801 /* Keep a ref on socket.
802 * This last ref will be released in tcp_tasklet_func()
803 */
804 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); //drop all but one unit of the skb's charge
805
806 /* queue this socket to tasklet queue */
807 local_irq_save(flags);
808 tsq = &__get_cpu_var(tsq_tasklet);
809 list_add(&tp->tsq_node, &tsq->head);
810 tasklet_schedule(&tsq->tasklet);
811 local_irq_restore(flags);
812 } else {
813 sock_wfree(skb); //not throttled: release the charge immediately
814 }
815 }
sock_wfree函数:
1534 void sock_wfree(struct sk_buff *skb)
1535 {
1536 struct sock *sk = skb->sk;
1537 unsigned int len = skb->truesize;
1538
1539 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1540 /*
1541 * Keep a reference on sk_wmem_alloc, this will be released
1542 * after sk_write_space() call
1543 */
1544 atomic_sub(len - 1, &sk->sk_wmem_alloc); //release all but one unit so sk stays alive across the wakeup
1545 sk->sk_write_space(sk); //wake anyone waiting for write space
1546 len = 1; //the last unit is dropped below
1547 }
1548 /*
1549 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1550 * could not do because of in-flight packets
1551 */
1552 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1553 __sk_free(sk);
1554 }
收到对端的ACK后,tcp_ack函数会调用tcp_clean_rtx_queue释放发送缓存中的skb,tcp_clean_rtx_queue函数会调用sk_wmem_free_skb释放skb并更新内存信息:
1408 static inline void sk_mem_uncharge(struct sock *sk, int size)
1409 {
1410 if (!sk_has_account(sk)) //no memory accounting for this protocol
1411 return;
1412 sk->sk_forward_alloc += size; //return size bytes to the prepaid quota
1413 }
1414
1415 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
1416 {
1417 sock_set_flag(sk, SOCK_QUEUE_SHRUNK); //mark that the send queue shrank; tcp_check_space tests this flag
1418 sk->sk_wmem_queued -= skb->truesize;
1419 sk_mem_uncharge(sk, skb->truesize); //give the bytes back to sk_forward_alloc
1420 __kfree_skb(skb);
1421 }
收到ACK后TCP会调用tcp_data_snd_check函数尝试扩大发送缓存:
4688 static bool tcp_should_expand_sndbuf(const struct sock *sk)
4689 {
4690 const struct tcp_sock *tp = tcp_sk(sk);
4691
4692 /* If the user specified a specific send buffer setting, do
4693 * not modify it.
4694 */
4695 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) //SO_SNDBUF was set explicitly
4696 return false;
4697
4698 /* If we are under global TCP memory pressure, do not expand. */
4699 if (sk_under_memory_pressure(sk)) //tcp_memory_pressure is set
4700 return false;
4701
4702 /* If we are under soft global TCP memory pressure, do not expand. */
4703 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) //sk_prot_mem_limits(sk, 0) equals the net.ipv4.tcp_mem[0] sysctl
4704 return false;
4705
4706 /* If we filled the congestion window, do not expand. */
4707 if (tp->packets_out >= tp->snd_cwnd)
4708 return false;
4709
4710 return true;
4711 }
...
4719 static void tcp_new_space(struct sock *sk)
4720 {
4721 struct tcp_sock *tp = tcp_sk(sk);
4722
4723 if (tcp_should_expand_sndbuf(sk)) {
4724 int sndmem = SKB_TRUESIZE(max_t(u32, //true size of one maximum-sized segment's skb
4725 tp->rx_opt.mss_clamp,
4726 tp->mss_cache) +
4727 MAX_TCP_HEADER);
4728 int demanded = max_t(unsigned int, tp->snd_cwnd,
4729 tp->reordering + 1);
4730 sndmem *= 2 * demanded; //budget two full-sized skbs per demanded segment
4731 if (sndmem > sk->sk_sndbuf)
4732 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); //never beyond net.ipv4.tcp_wmem[2]
4733 tp->snd_cwnd_stamp = tcp_time_stamp;
4734 }
4735
4736 sk->sk_write_space(sk); //points to sk_stream_write_space, which notifies the process that send-buffer space is available
4737 }
4738
4739 static void tcp_check_space(struct sock *sk)
4740 {
4741 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { //send queue has shrunk (skbs were freed by ACKs)
4742 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4743 if (sk->sk_socket &&
4744 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) //a process is waiting for send-buffer space
4745 tcp_new_space(sk); //try to grow the send buffer and wake the writer
4746 }
4747 }
4748
4749 static inline void tcp_data_snd_check(struct sock *sk)
4750 {
4751 tcp_push_pending_frames(sk); //try to transmit data still queued
4752 tcp_check_space(sk); //then see whether send-buffer space freed up
4753 }
扩大发送缓存的条件:
(1)由于ACK确认了数据并删除了skb使得发送队列减小
(2)应用进程在向内核写入数据时由于内存不足而等待
(3)应用进程没有使用SO_SNDBUF socket选项设置snd_buf大小
(4)全局TCP缓存没有处于极度紧张状态
(5)全局TCP缓存没有处于相对紧张状态(全局已分配TCP内存小于net.ipv4.tcp_mem[0])
(6)TCP已经发送并在网络中的包数小于拥塞窗口