TCP收到对端发送的数据后,通常不能立即交付应用进程。在应用进程取走数据之前,数据需要保存在接收缓存之中。如果应用进程取数据的速度比TCP从对端收数据的速度慢,则接收缓存中的数据会越来越多。因此在skb被放入接收缓存之前必须检查接收缓存能容纳的内存数,如果超出限制则必须丢弃skb。
10.3.1 缓存占用
tcp_rcv_established中会检查接收缓存的使用情况:
5076 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5077 const struct tcphdr *th, unsigned int len)
5078 {
5079 struct tcp_sock *tp = tcp_sk(sk);
...
5201 if (!eaten) {
...
5205 if ((int)skb->truesize > sk->sk_forward_alloc) //剩余空间无法容纳skb
5206 goto step5;//进入慢速路径
...
5221 /* Bulk data transfer: receiver */
5222 eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5223 &fragstolen);
...
5265 step5:
...
5275 tcp_data_queue(sk, skb);
...
tcp_queue_rcv函数:
4244 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4245 bool *fragstolen)
4246 {
4247 int eaten;
4248 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4249
4250 __skb_pull(skb, hdrlen);
4251 eaten = (tail &&
4252 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
4253 tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4254 if (!eaten) {
4255 __skb_queue_tail(&sk->sk_receive_queue, skb);
4256 skb_set_owner_r(skb, sk);
4257 }
4258 return eaten;
4259 }
skb_set_owner_r函数:
1995 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
1996 {
1997 skb_orphan(skb);
1998 skb->sk = sk;
1999 skb->destructor = sock_rfree;
2000 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
2001 sk_mem_charge(sk, skb->truesize); //sk->sk_forward_alloc -= size
2002 }
在tcp_data_queue函数中:
4300 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4301 {
4302 const struct tcphdr *th = tcp_hdr(skb);
4303 struct tcp_sock *tp = tcp_sk(sk);
4304 int eaten = -1;
4305 bool fragstolen = false;
...
4344 if (eaten <= 0) {
4345 queue_and_out:
4346 if (eaten < 0 &&
4347 tcp_try_rmem_schedule(sk, skb, skb->truesize))//检查是否可以占用接收缓存的skb->truesize大小的空间
4348 goto drop;
4349
4350 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4351 }
...
4415 tcp_data_queue_ofo(sk, skb);
4416 }
tcp_try_rmem_schedule会试着调整接收缓存空间来接收数据:
4061 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4062 unsigned int size)
4063 {
4064 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || //已分配内存超过限制
4065 !sk_rmem_schedule(sk, skb, size)) { //接收缓存无法容纳size大小的数据
4066
4067 if (tcp_prune_queue(sk) < 0) //整理接收队列
4068 return -1;
4069
4070 if (!sk_rmem_schedule(sk, skb, size)) { //再次检查缓存空间是否够用
4071 if (!tcp_prune_ofo_queue(sk)) //清空乱序队列,释放缓存空间
4072 return -1;
4073
4074 if (!sk_rmem_schedule(sk, skb, size)) //再次检查缓存空间是否够用
4075 return -1;
4076 }
4077 }
4078 return 0;
4079 }
sk_rmem_schedule用于检查缓存空间是否够用:
1375 static inline bool
1376 sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
1377 {
1378 if (!sk_has_account(sk))
1379 return true;
1380 return size<= sk->sk_forward_alloc || //剩余预分配内存够用
1381 __sk_mem_schedule(sk, size, SK_MEM_RECV) || //增加预分配内存和已分配内存
1382 skb_pfmemalloc(skb); //skb中的内存是用PFMEMALLOC方式申请的,这种方式申请的是紧急内存
1383 }
tcp_prune_queue和tcp_prune_ofo_queue分别用于整理接收队列和乱序队列:
4594 static bool tcp_prune_ofo_queue(struct sock *sk)
4595 {
4596 struct tcp_sock *tp = tcp_sk(sk);
4597 bool res = false;
4598
4599 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4600 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
4601 __skb_queue_purge(&tp->out_of_order_queue); //释放乱序队列中的所有数据
4602
4603 /* Reset SACK state. A conforming SACK implementation will
4604 * do the same at a timeout based retransmit. When a connection
4605 * is in a sad state like this, we care only about integrity
4606 * of the connection not performance.
4607 */
4608 if (tp->rx_opt.sack_ok)
4609 tcp_sack_reset(&tp->rx_opt);
4610 sk_mem_reclaim(sk); //更新缓存空间信息
4611 res = true;
4612 }
4613 return res;
4614 }
...
4623 static int tcp_prune_queue(struct sock *sk)
4624 {
4625 struct tcp_sock *tp = tcp_sk(sk);
4626
4627 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
4628
4629 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
4630
4631 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4632 tcp_clamp_window(sk); //试图缩小接收缓存大小并更新最大通告窗口大小
4633 else if (sk_under_memory_pressure(sk))
4634 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); //缩小最大通告窗口大小
4635
4636 tcp_collapse_ofo_queue(sk); //合并乱序队列中连续的数据块以节省空间
4637 if (!skb_queue_empty(&sk->sk_receive_queue))
4638 tcp_collapse(sk, &sk->sk_receive_queue,
4639 skb_peek(&sk->sk_receive_queue),
4640 NULL,
4641 tp->copied_seq, tp->rcv_nxt); //合并接收队列中未被读取的数据
4642 sk_mem_reclaim(sk); //更新缓存空间信息
4643
4644 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) //接收缓存够用了
4645 return 0;
4646
4647 /* Collapsing did not help, destructive actions follow.
4648 * This must not ever occur. */
4649
4650 tcp_prune_ofo_queue(sk); //清理乱序队列
...
4662 tp->pred_flags = 0; //内存紧张,禁用快速处理路径
4663 return -1;
4664 }
tcp_clamp_window用于更新最大通告窗口大小:
410 static void tcp_clamp_window(struct sock *sk)
411 {
412 struct tcp_sock *tp = tcp_sk(sk);
413 struct inet_connection_sock *icsk = inet_csk(sk);
414
415 icsk->icsk_ack.quick = 0;
416
417 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && //接收缓存大小小于最大接收缓存大小
418 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && //应用进程没有设置接收缓存大小
419 !sk_under_memory_pressure(sk) && //不处于内存压力之下
420 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { //全局已分配TCP内存小于最低限制
421 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
422 sysctl_tcp_rmem[2]);
423 }
424 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
425 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
426 }
当剩余预分配内存比较多时,sk_mem_reclaim函数会回收一部分预分配内存:
1385 static inline void sk_mem_reclaim(struct sock *sk)
1386 {
1387 if (!sk_has_account(sk))
1388 return;
1389 if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
1390 __sk_mem_reclaim(sk);
1391 }
__sk_mem_reclaim:
2005 void __sk_mem_reclaim(struct sock *sk)
2006 {
2007 sk_memory_allocated_sub(sk,
2008 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2009 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2010
2011 if (sk_under_memory_pressure(sk) &&
2012 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2013 sk_leave_memory_pressure(sk);
2014 }
TCP调用tcp_data_queue_ofo函数将skb放入乱序队列时,也会使用skb_set_owner_r函数更新接收缓存信息:
4121 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4122 {
4123 struct tcp_sock *tp = tcp_sk(sk);
4124 struct sk_buff *skb1;
4125 u32 seq, end_seq;
...
4239 end:
4240 if (skb)
4241 skb_set_owner_r(skb, sk);
4242 }
10.3.2 缓存释放
应用进程在tcp_recvmsg函数中将数据读取完毕后,接收缓存中的skb就会被释放。skb释放时会调用在skb_set_owner_r函数中设置的sock_rfree函数:
1560 void sock_rfree(struct sk_buff *skb)
1561 {
1562 struct sock *sk = skb->sk;
1563 unsigned int len = skb->truesize;
1564
1565 atomic_sub(len, &sk->sk_rmem_alloc);
1566 sk_mem_uncharge(sk, len); //释放预分配内存
1567 }
应用进程从接收缓冲区读出数据后,由tcp_rcv_space_adjust函数根据读取进度调整接收缓存的大小:
522 void tcp_rcv_space_adjust(struct sock *sk)
523 {
524 struct tcp_sock *tp = tcp_sk(sk);
525 int time;
526 int space;
527
528 if (tp->rcvq_space.time == 0)
529 goto new_measure;
530
531 time = tcp_time_stamp - tp->rcvq_space.time;
532 if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
533 return;
534
535 space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
536
537 space = max(tp->rcvq_space.space, space);
538
539 if (tp->rcvq_space.space != space) { //有新的数据被应用进程copy出去
540 int rcvmem;
541
542 tp->rcvq_space.space = space;
543
544 if (sysctl_tcp_moderate_rcvbuf &&
545 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
546 int new_clamp = space;
547
548 /* Receive space grows, normalize in order to
549 * take into account packet headers and sk_buff
550 * structure overhead.
551 */
552 space /= tp->advmss;
553 if (!space)
554 space = 1;
555 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
556 while (tcp_win_from_space(rcvmem) < tp->advmss)
557 rcvmem += 128;
558 space *= rcvmem;
559 space = min(space, sysctl_tcp_rmem[2]);
560 if (space > sk->sk_rcvbuf) {
561 sk->sk_rcvbuf = space;
562
563 /* Make the window clamp follow along. */
564 tp->window_clamp = new_clamp;
565 }
566 }
567 }
568
569 new_measure:
570 tp->rcvq_space.seq = tp->copied_seq;
571 tp->rcvq_space.time = tcp_time_stamp;
572 }
上面我们了解了接收队列和乱序队列的管理,接下来看看其它类型的接收队列(prequeue队列、异步等待队列、backlog队列)的缓存管理。
10.3.3 其它队列
(1)prequeue队列:
1919 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1920 {
1921 struct tcp_sock *tp = tcp_sk(sk);
1922
1923 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1924 return false;
1925
1926 if (skb->len <= tcp_hdrlen(skb) &&
1927 skb_queue_len(&tp->ucopy.prequeue) == 0)
1928 return false;
1929
1930 skb_dst_force(skb);
1931 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1932 tp->ucopy.memory += skb->truesize;
1933 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1934 struct sk_buff *skb1;
1935
1936 BUG_ON(sock_owned_by_user(sk));
1937
1938 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1939 sk_backlog_rcv(sk, skb1);
1940 NET_INC_STATS_BH(sock_net(sk),
1941 LINUX_MIB_TCPPREQUEUEDROPPED);
1942 }
1943
1944 tp->ucopy.memory = 0;
...
可见prequeue队列的缓存管理很简单:一旦累计的truesize超过sk_rcvbuf限制,就立即将队列中全部skb交由sk_backlog_rcv处理(即进入正常接收路径),并将prequeue的内存计数清零;数据并未被真正丢弃,只是不再留在prequeue中。
(2)异步等待队列:放入这个队列中的skb不纳入缓存管理。
(3)backlog队列:
1961 int tcp_v4_rcv(struct sk_buff *skb)
1962 {
...
2039 } else if (unlikely(sk_add_backlog(sk, skb,
2040 sk->sk_rcvbuf + sk->sk_sndbuf))) {
...
sk_add_backlog函数会检查发送缓存和接收缓存的和是否有可用空间:
768 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb,
769 unsigned int limit)
770 {
771 unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
772
773 return qsize > limit;
774 }
775
776 /* The per-socket spinlock must be held here. */
777 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
778 unsigned int limit)
779 {
780 if (sk_rcvqueues_full(sk, skb, limit))
781 return -ENOBUFS;
782
783 __sk_add_backlog(sk, skb);
784 sk->sk_backlog.len += skb->truesize;
785 return 0;
786 }