应用进程使用TCP发送的数据会先放入发送缓存中,TCP的发送缓存是一个skb队列。这个队列存在的意义是:保证应用进程交付TCP的数据能够可靠地交付目的端。在收到对端的ACK之前,发送缓存中的数据不能删除。
10.2.1 使用缓存
对发送缓存的使用是从tcp_sendmsg函数开始的:
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017 size_t size)
1018 {
1019 struct iovec *iov;
1020 struct tcp_sock *tp = tcp_sk(sk);
1021 struct sk_buff *skb;
1022 int iovlen, flags, err, copied = 0;
1023 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1024 bool sg;
1025 long timeo;
...
1106 if (copy <= 0) {
1107 new_segment:
1108 /* Allocate new segment. If the interface is SG,
1109 * allocate skb fitting to single page.
1110 */
1111 if (!sk_stream_memory_free(sk)) //检查已占用内存是否达到限制
1112 goto wait_for_sndbuf;
1113
1114 skb = sk_stream_alloc_skb(sk,
1115 select_size(sk, sg),
1116 sk->sk_allocation); //申请内存
1117 if (!skb)
1118 goto wait_for_memory;
...
1133 skb_entail(sk, skb);
...
1149 } else { //使用skb的非线性区
1150 bool merge = true;
1151 int i = skb_shinfo(skb)->nr_frags;
1152 struct page_frag *pfrag = sk_page_frag(sk);
1153
1154 if (!sk_page_frag_refill(sk, pfrag))
1155 goto wait_for_memory;
...
1168 if (!sk_wmem_schedule(sk, copy))
1169 goto wait_for_memory;
1170
1171 err = skb_copy_to_page_nocache(sk, from, skb,
1172 pfrag->page,
1173 pfrag->offset,
1174 copy);
sk_stream_memory_free函数:
743 static inline bool sk_stream_memory_free(const struct sock *sk)
744 {
745 return sk->sk_wmem_queued < sk->sk_sndbuf; //room remains while bytes queued in the send queue stay below the send-buffer limit
746 }
当发送队列中占用缓存总数小于发送缓存大小时,则发送缓存尚有剩余空间。
sk_stream_alloc_skb函数用于申请skb:
754 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
755 {
756 struct sk_buff *skb;
757
758 /* The TCP header must be at least 32-bit aligned. */
759 size = ALIGN(size, 4);
760
761 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
762 if (skb) {
763 if (sk_wmem_schedule(sk, skb->truesize)) { //check whether skb->truesize bytes of memory may be accounted to this socket
764 skb_reserve(skb, sk->sk_prot->max_header);
765 /*
766 * Make sure that we have exactly size bytes
767 * available to the caller, no more, no less.
768 */
769 skb->reserved_tailroom = skb->end - skb->tail - size;
770 return skb;
771 }
772 __kfree_skb(skb); //accounting refused: give the skb back
773 } else { //allocation failed: memory is tight
774 sk->sk_prot->enter_memory_pressure(sk);//tcp_enter_memory_pressure: updates the Linux MIB and sets tcp_memory_pressure = 1
775 sk_stream_moderate_sndbuf(sk); //shrink the upper bound of the send buffer
776 }
777 return NULL;
778 }
sk_wmem_schedule:
1361 static inline bool sk_has_account(struct sock *sk)
1362 {
1363 /* return true if protocol supports memory accounting */
1364 return !!sk->sk_prot->memory_allocated;//for TCP this points to tcp_memory_allocated, the total memory used by all of TCP
1365 }
1366
1367 static inline bool sk_wmem_schedule(struct sock *sk, int size)
1368 {
1369 if (!sk_has_account(sk))//false for TCP, which does memory accounting
1370 return true;
1371 return size <= sk->sk_forward_alloc || //covered by the prepaid quota, or try to schedule more
1372 __sk_mem_schedule(sk, size, SK_MEM_SEND);
1373 }
__sk_mem_schedule用于增加预分配内存和已分配内存:
1923 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1924 {
1925 struct proto *prot = sk->sk_prot;
1926 int amt = sk_mem_pages(size); //round size up to whole SK_MEM_QUANTUM pages
1927 long allocated;
1928 int parent_status = UNDER_LIMIT;
1929
1930 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; //grow this socket's forward-allocated (prepaid) memory
1931
1932 allocated = sk_memory_allocated_add(sk, amt, &parent_status); //tcp_memory_allocated += amt
1933
1934 /* Under limit. */
1935 if (parent_status == UNDER_LIMIT &&
1936 allocated <= sk_prot_mem_limits(sk, 0)) { //sk_prot_mem_limits(sk, 0) equals the net.ipv4.tcp_mem[0] sysctl
1937 sk_leave_memory_pressure(sk);
1938 return 1;
1939 }
1940
1941 /* Under pressure. (we or our parents) */
1942 if ((parent_status > SOFT_LIMIT) ||
1943 allocated > sk_prot_mem_limits(sk, 1)) //sk_prot_mem_limits(sk, 1) equals the net.ipv4.tcp_mem[1] sysctl
1944 sk_enter_memory_pressure(sk); //sets tcp_memory_pressure to 1
1945
1946 /* Over hard limit (we or our parents) */
1947 if ((parent_status == OVER_LIMIT) ||
1948 (allocated > sk_prot_mem_limits(sk, 2)))//sk_prot_mem_limits(sk, 2) equals the net.ipv4.tcp_mem[2] sysctl
1949 goto suppress_allocation; //memory pressure is severe
1950
1951 /* guarantee minimum buffer size under pressure */
1952 if (kind == SK_MEM_RECV) {
1953 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])//prot->sysctl_rmem[0] equals the net.ipv4.tcp_rmem[0] sysctl
1954 return 1;
1955
1956 } else { /* SK_MEM_SEND */
1957 if (sk->sk_type == SOCK_STREAM) {
1958 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])//prot->sysctl_wmem[0] equals the net.ipv4.tcp_wmem[0] sysctl
1959 return 1;
1960 } else if (atomic_read(&sk->sk_wmem_alloc) <
1961 prot->sysctl_wmem[0])
1962 return 1;
1963 }
1964
1965 if (sk_has_memory_pressure(sk)) {
1966 int alloc;
1967
1968 if (!sk_under_memory_pressure(sk)) //tcp_memory_pressure == 0
1969 return 1;
1970 alloc = sk_sockets_allocated_read_positive(sk); //number of TCP sockets currently allocated
1971 if (sk_prot_mem_limits(sk, 2) > alloc *
1972 sk_mem_pages(sk->sk_wmem_queued +
1973 atomic_read(&sk->sk_rmem_alloc) +
1974 sk->sk_forward_alloc)) //allow if net.ipv4.tcp_mem[2] still exceeds this socket's footprint in pages (send queue + receive queue + remaining forward alloc) scaled by the number of sockets
1975 return 1;
1976 }
1977
1978 suppress_allocation:
1979
1980 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1981 sk_stream_moderate_sndbuf(sk); //shrink sk_sndbuf first
1982
1983 /* Fail only if socket is _under_ its sndbuf.
1984 * In this case we cannot block, so that we have to fail.
1985 */
1986 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) //succeed when the queue already reaches the (just moderated) sndbuf: further writes will then block on the sndbuf check, so this allocation is self-limiting; a socket still under its sndbuf cannot be made to block and must fail instead
1987 return 1;
1988 }
1989
1990 trace_sock_exceed_buf_limit(sk, prot, allocated);
1991 //allocation refused: undo the counters updated above
1992 /* Alas. Undo changes. */
1993 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1994
1995 sk_memory_allocated_sub(sk, amt);
1996
1997 return 0;
1998 }
skb_entail函数会将skb放入发送队列,并更新缓存信息:
596 static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
597 {
598 struct tcp_sock *tp = tcp_sk(sk);
599 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
600
601 skb->csum = 0;
602 tcb->seq = tcb->end_seq = tp->write_seq; //both sequence fields start at write_seq; end_seq grows as data is copied in
603 tcb->tcp_flags = TCPHDR_ACK;
604 tcb->sacked = 0;
605 skb_header_release(skb);
606 tcp_add_write_queue_tail(sk, skb); //append the skb to the send queue
607 sk->sk_wmem_queued += skb->truesize; //update sk_wmem_queued with the skb's full footprint
608 sk_mem_charge(sk, skb->truesize); //consume forward-allocated memory
609 if (tp->nonagle & TCP_NAGLE_PUSH)
610 tp->nonagle &= ~TCP_NAGLE_PUSH; //PUSH is a one-shot flag: clear it once consumed
611 }
sk_mem_charge函数会更新预分配缓存的值:
1401 static inline void sk_mem_charge(struct sock *sk, int size)
1402 {
1403 if (!sk_has_account(sk)) //no memory accounting for this protocol
1404 return;
1405 sk->sk_forward_alloc -= size; //pay for size bytes out of the prepaid quota
1406 }
在使用skb的非线性区之前,要先调用sk_page_frag_refill函数确保page frag中有可用空间(必要时申请新page):
1796 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1797 {
1798 int order;
1799
1800 if (pfrag->page) {
1801 if (atomic_read(&pfrag->page->_count) == 1) { //nobody else holds the page: reuse it from offset 0
1802 pfrag->offset = 0;
1803 return true;
1804 }
1805 if (pfrag->offset < pfrag->size) //page still has free space
1806 return true;
1807 put_page(pfrag->page); //page is full and shared: drop our reference
1808 }
1809
1810 /* We restrict high order allocations to users that can afford to wait */
1811 order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1812
1813 do {
1814 gfp_t gfp = sk->sk_allocation;
1815
1816 if (order)
1817 gfp |= __GFP_COMP | __GFP_NOWARN;
1818 pfrag->page = alloc_pages(gfp, order);
1819 if (likely(pfrag->page)) {
1820 pfrag->offset = 0;
1821 pfrag->size = PAGE_SIZE << order;
1822 return true;
1823 }
1824 } while (--order >= 0); //fall back to smaller orders on failure
1825 //page allocation failed entirely
1826 sk_enter_memory_pressure(sk);
1827 sk_stream_moderate_sndbuf(sk);
1828 return false;
1829 }
skb_copy_to_page_nocache函数将数据复制到非线性区的page中,并同时更新skb长度和缓存信息:
1832 static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from,
1833 struct sk_buff *skb,
1834 struct page *page,
1835 int off, int copy)
1836 {
1837 int err;
1838
1839 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off,
1840 copy, skb->len);
1841 if (err)
1842 return err;
1843
1844 skb->len += copy; //data landed in a page frag, so both len and data_len grow
1845 skb->data_len += copy;
1846 skb->truesize += copy;
1847 sk->sk_wmem_queued += copy; //account the copied bytes in the send-queue total
1848 sk_mem_charge(sk, copy); //consume forward-allocated memory
1849 return 0;
1850 }
tcp_transmit_skb函数在发送skb时会占用sk->sk_wmem_alloc缓存:
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
829 gfp_t gfp_mask)
830 {
...
890 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
891 tcp_wfree : sock_wfree; //skb释放时调用tcp_wfree或sock_wfree
892 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
...
10.2.2 释放缓存
tcp_transmit_skb发送出去的skb被释放时(网卡驱动在发送完毕数据后释放skb,或IP发送队列满导致丢包时)会调用tcp_wfree或sock_wfree函数,并更新sk->sk_wmem_alloc的数值:
791 void tcp_wfree(struct sk_buff *skb)
792 {
793 struct sock *sk = skb->sk;
794 struct tcp_sock *tp = tcp_sk(sk);
795
796 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && //socket was throttled; hand it to the per-CPU tasklet to resume transmission
797 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { //but only queue it once
798 unsigned long flags;
799 struct tsq_tasklet *tsq;
800
801 /* Keep a ref on socket.
802 * This last ref will be released in tcp_tasklet_func()
803 */
804 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); //drop all but one unit of the skb's charge
805
806 /* queue this socket to tasklet queue */
807 local_irq_save(flags);
808 tsq = &__get_cpu_var(tsq_tasklet);
809 list_add(&tp->tsq_node, &tsq->head);
810 tasklet_schedule(&tsq->tasklet);
811 local_irq_restore(flags);
812 } else {
813 sock_wfree(skb); //not throttled: release the charge immediately
814 }
815 }
sock_wfree函数:
1534 void sock_wfree(struct sk_buff *skb)
1535 {
1536 struct sock *sk = skb->sk;
1537 unsigned int len = skb->truesize;
1538
1539 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1540 /*
1541 * Keep a reference on sk_wmem_alloc, this will be released
1542 * after sk_write_space() call
1543 */
1544 atomic_sub(len - 1, &sk->sk_wmem_alloc); //release all but one unit so sk stays alive across the wakeup
1545 sk->sk_write_space(sk); //wake anyone waiting for write space
1546 len = 1; //the last unit is dropped below
1547 }
1548 /*
1549 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1550 * could not do because of in-flight packets
1551 */
1552 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1553 __sk_free(sk);
1554 }
收到对端的ACK后,tcp_ack函数会调用tcp_clean_rtx_queue释放发送缓存中的skb,tcp_clean_rtx_queue函数会调用sk_wmem_free_skb释放skb并更新内存信息:
1408 static inline void sk_mem_uncharge(struct sock *sk, int size)
1409 {
1410 if (!sk_has_account(sk)) //no memory accounting for this protocol
1411 return;
1412 sk->sk_forward_alloc += size; //return size bytes to the prepaid quota
1413 }
1414
1415 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
1416 {
1417 sock_set_flag(sk, SOCK_QUEUE_SHRUNK); //mark that the send queue shrank; tcp_check_space tests this flag
1418 sk->sk_wmem_queued -= skb->truesize;
1419 sk_mem_uncharge(sk, skb->truesize); //give the bytes back to sk_forward_alloc
1420 __kfree_skb(skb);
1421 }
收到ACK后TCP会调用tcp_data_snd_check函数尝试扩大发送缓存:
4688 static bool tcp_should_expand_sndbuf(const struct sock *sk)
4689 {
4690 const struct tcp_sock *tp = tcp_sk(sk);
4691
4692 /* If the user specified a specific send buffer setting, do
4693 * not modify it.
4694 */
4695 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) //SO_SNDBUF was set explicitly
4696 return false;
4697
4698 /* If we are under global TCP memory pressure, do not expand. */
4699 if (sk_under_memory_pressure(sk)) //tcp_memory_pressure is set
4700 return false;
4701
4702 /* If we are under soft global TCP memory pressure, do not expand. */
4703 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) //sk_prot_mem_limits(sk, 0) equals the net.ipv4.tcp_mem[0] sysctl
4704 return false;
4705
4706 /* If we filled the congestion window, do not expand. */
4707 if (tp->packets_out >= tp->snd_cwnd)
4708 return false;
4709
4710 return true;
4711 }
...
4719 static void tcp_new_space(struct sock *sk)
4720 {
4721 struct tcp_sock *tp = tcp_sk(sk);
4722
4723 if (tcp_should_expand_sndbuf(sk)) {
4724 int sndmem = SKB_TRUESIZE(max_t(u32, //true size of one maximum-sized segment's skb
4725 tp->rx_opt.mss_clamp,
4726 tp->mss_cache) +
4727 MAX_TCP_HEADER);
4728 int demanded = max_t(unsigned int, tp->snd_cwnd,
4729 tp->reordering + 1);
4730 sndmem *= 2 * demanded; //budget two full-sized skbs per demanded segment
4731 if (sndmem > sk->sk_sndbuf)
4732 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); //never beyond net.ipv4.tcp_wmem[2]
4733 tp->snd_cwnd_stamp = tcp_time_stamp;
4734 }
4735
4736 sk->sk_write_space(sk); //points to sk_stream_write_space, which notifies the process that send-buffer space is available
4737 }
4738
4739 static void tcp_check_space(struct sock *sk)
4740 {
4741 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { //send queue has shrunk (skbs were freed by ACKs)
4742 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4743 if (sk->sk_socket &&
4744 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) //a process is waiting for send-buffer space
4745 tcp_new_space(sk); //try to grow the send buffer and wake the writer
4746 }
4747 }
4748
4749 static inline void tcp_data_snd_check(struct sock *sk)
4750 {
4751 tcp_push_pending_frames(sk); //try to transmit data still queued
4752 tcp_check_space(sk); //then see whether send-buffer space freed up
4753 }
扩大发送缓存的条件:
(1)由于ACK确认了数据并删除了skb使得发送队列减小
(2)应用进程在向内核写入数据时由于内存不足而等待
(3)应用进程没有使用SO_SNDBUF socket选项设置snd_buf大小
(4)全局TCP缓存没有处于极度紧张状态
(5)全局TCP缓存没有处于相对紧张状态(全局已分配TCP内存小于net.ipv4.tcp_mem[0])
(6)TCP已经发送并在网络中的包数小于拥塞窗口