TCP/IP Test

/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
u32 prior_fack;

sack_state.first_sackt = 0;
sack_state.rate = &rs;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);

/* If the ack is older than previous acks
 * then we can probably ignore it.
 */
if (before(ack, prior_snd_una)) {
	/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
	if (before(ack, prior_snd_una - tp->max_window)) {
		if (!(flag & FLAG_NO_CHALLENGE_ACK))
			tcp_send_challenge_ack(sk, skb);
		return -1;
	}
	goto old_ack;
}

/* If the ack includes data we haven't sent yet, discard
 * this segment (RFC793 Section 3.9).
 */
if (after(ack, tp->snd_nxt))
	goto invalid_ack;

if (after(ack, prior_snd_una)) {
	flag |= FLAG_SND_UNA_ADVANCED;
	icsk->icsk_retransmits = 0;

#if IS_ENABLED(CONFIG_TLS_DEVICE)
	if (static_branch_unlikely(&clean_acked_data_enabled))
		if (icsk->icsk_clean_acked)
			icsk->icsk_clean_acked(sk, ack);
#endif
}

prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);

/* ts_recent update must be made after we are sure that the packet
 * is in window.
 */
if (flag & FLAG_UPDATE_TS_RECENT)
	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);

if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
	/* Window is constant, pure forward advance.
	 * No more checks are required.
	 * Note, we use the fact that SND.UNA>=SND.WL2.
	 */
	tcp_update_wl(tp, ack_seq);
	tcp_snd_una_update(tp, ack);
	flag |= FLAG_WIN_UPDATE;

	tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
	u32 ack_ev_flags = CA_ACK_SLOWPATH;

	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
		flag |= FLAG_DATA;
	else
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);

	flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);

	if (TCP_SKB_CB(skb)->sacked)
		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
						&sack_state);

	if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
		flag |= FLAG_ECE;
		ack_ev_flags |= CA_ACK_ECE;
	}

	if (flag & FLAG_WIN_UPDATE)
		ack_ev_flags |= CA_ACK_WIN_UPDATE;

	tcp_in_ack_event(sk, ack_ev_flags);
}

/* We passed data and got it acked, remove any soft error
 * log. Something worked...
 */
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
	goto no_queue;

/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);

tcp_rack_update_reo_wnd(sk, &rs);

if (tp->tlp_high_seq)
	tcp_process_tlp_ack(sk, ack, flag);
/* If needed, reset TLP/RTO timer; RACK may later override this. */
if (flag & FLAG_SET_XMIT_TIMER)
	tcp_set_xmit_timer(sk);

if (tcp_ack_is_dubious(sk, flag)) {
	is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
	tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
			      &rexmit);
}

if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
	sk_dst_confirm(sk);

delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost;			/* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;

no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
	tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
			      &rexmit);
	tcp_newly_delivered(sk, delivered, flag);
}
/* If this ack opens up a zero window, clear backoff.  It was
 * being used to time the probes, and is probably far higher than
 * it needs to be for normal retransmission.
 */
tcp_ack_probe(sk);

if (tp->tlp_high_seq)
	tcp_process_tlp_ack(sk, ack, flag);
return 1;

invalid_ack:
SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return -1;

old_ack:
/* If data was SACKed, tag it and see if we should send more data.
 * If data was DSACKed, see if we can undo a cwnd reduction.
 */
if (TCP_SKB_CB(skb)->sacked) {
	flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
					&sack_state);
	tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
			      &rexmit);
	tcp_newly_delivered(sk, delivered, flag);
	tcp_xmit_recovery(sk, rexmit);
}

SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;

}
/* Test helper: return 1 if this ACK claims data we have not sent yet
 * (i.e. ack_seq is after snd_nxt), 0 otherwise.
 */
static int tcp_ack_check_unaccess(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 ack = TCP_SKB_CB(skb)->ack_seq;

	if (after(ack, tp->snd_nxt))
		return 1;
	else
		return 0;
}
static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
				      bool syn, struct tcp_fastopen_cookie *foc,
				      bool exp_opt)
{
	/* Valid only in SYN or SYN-ACK with an even length. */
	if (!foc || !syn || len < 0 || (len & 1))
		return;

	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
	    len <= TCP_FASTOPEN_COOKIE_MAX)
		memcpy(foc->val, cookie, len);
	else if (len != 0)
		len = -1;
	foc->len = len;
	foc->exp = exp_opt;
}

static void smc_parse_options(const struct tcphdr *th,
			      struct tcp_options_received *opt_rx,
			      const unsigned char *ptr,
			      int opsize)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (th->syn && !(opsize & 1) &&
		    opsize >= TCPOLEN_EXP_SMC_BASE &&
		    get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
			opt_rx->smc_ok = 1;
	}
#endif
}

/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
 * the fast version below fails.
 */
void tcp_parse_options(const struct net *net,
		       const struct sk_buff *skb,
		       struct tcp_options_received *opt_rx, int estab,
		       struct tcp_fastopen_cookie *foc)
{
	const unsigned char *ptr;
	const struct tcphdr *th = tcp_hdr(skb);
	int length = (th->doff * 4) - sizeof(struct tcphdr);

	ptr = (const unsigned char *)(th + 1);
	opt_rx->saw_tstamp = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return;
		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2) /* "silly options" */
				return;
			if (opsize > length)
				return;	/* don't parse partial options */
			switch (opcode) {
			case TCPOPT_MSS:
				if (opsize == TCPOLEN_MSS && th->syn && !estab) {
					u16 in_mss = get_unaligned_be16(ptr);
					if (in_mss) {
						if (opt_rx->user_mss &&
						    opt_rx->user_mss < in_mss)
							in_mss = opt_rx->user_mss;
						opt_rx->mss_clamp = in_mss;
					}
				}
				break;
			case TCPOPT_WINDOW:
				if (opsize == TCPOLEN_WINDOW && th->syn &&
				    !estab && net->ipv4.sysctl_tcp_window_scaling) {
					__u8 snd_wscale = *(__u8 *)ptr;
					opt_rx->wscale_ok = 1;
					if (snd_wscale > TCP_MAX_WSCALE) {
						net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
								     __func__,
								     snd_wscale,
								     TCP_MAX_WSCALE);
						snd_wscale = TCP_MAX_WSCALE;
					}
					opt_rx->snd_wscale = snd_wscale;
				}
				break;
			case TCPOPT_TIMESTAMP:
				if ((opsize == TCPOLEN_TIMESTAMP) &&
				    ((estab && opt_rx->tstamp_ok) ||
				     (!estab && net->ipv4.sysctl_tcp_timestamps))) {
					opt_rx->saw_tstamp = 1;
					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
				}
				break;
			case TCPOPT_SACK_PERM:
				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
				    !estab && net->ipv4.sysctl_tcp_sack) {
					opt_rx->sack_ok = TCP_SACK_SEEN;
					tcp_sack_reset(opt_rx);
				}
				break;

			case TCPOPT_SACK:
				if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
				   opt_rx->sack_ok) {
					TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
				}
				break;
#ifdef CONFIG_TCP_MD5SIG
			case TCPOPT_MD5SIG:
				/*
				 * The MD5 Hash has already been
				 * checked (see tcp_v{4,6}_do_rcv()).
				 */
				break;
#endif
			case TCPOPT_FASTOPEN:
				tcp_parse_fastopen_option(
					opsize - TCPOLEN_FASTOPEN_BASE,
					ptr, th->syn, foc, false);
				break;

			case TCPOPT_EXP:
				/* Fast Open option shares code 254 using a
				 * 16 bits magic number.
				 */
				if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
				    get_unaligned_be16(ptr) ==
				    TCPOPT_FASTOPEN_MAGIC)
					tcp_parse_fastopen_option(opsize -
						TCPOLEN_EXP_FASTOPEN_BASE,
						ptr + 2, th->syn, foc, true);
				else
					smc_parse_options(th, opt_rx, ptr,
							  opsize);
				break;

			}
			ptr += opsize-2;
			length -= opsize;
		}
	}
}
EXPORT_SYMBOL(tcp_parse_options);
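
/* For reference, a hand-built sketch (illustrative only, not taken from the
 * kernel sources) of the kind/length/value layout that tcp_parse_options()
 * walks above: an MSS option, a window-scale option, a NOP pad and an
 * end-of-option-list byte.
 */
static const unsigned char example_tcp_opts[] = {
	2, 4, 0x05, 0xb4,	/* TCPOPT_MSS, TCPOLEN_MSS, MSS = 1460 */
	3, 3, 7,		/* TCPOPT_WINDOW, TCPOLEN_WINDOW, shift = 7 */
	1,			/* TCPOPT_NOP (padding) */
	0,			/* TCPOPT_EOL */
};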

static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
const __be32 *ptr = (const __be32 *)(th + 1);

if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
		  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
	tp->rx_opt.saw_tstamp = 1;
	++ptr;
	tp->rx_opt.rcv_tsval = ntohl(*ptr);
	++ptr;
	if (*ptr)
		tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
	else
		tp->rx_opt.rcv_tsecr = 0;
	return true;
}
return false;

}
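
/* Illustration (assumed canonical layout, per RFC 7323): the aligned
 * timestamp block matched above is the 12-byte sequence
 *
 *	0x01 0x01 0x08 0x0a | TSval (4 bytes) | TSecr (4 bytes)
 *
 * i.e. NOP, NOP, TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP, followed by the two
 * 32-bit timestamps in network byte order; the first word is exactly what
 * tcp_parse_aligned_timestamp() compares against.
 */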

/* Fast parse options. This hopes to only see timestamps.
 * If it is wrong it falls back on tcp_parse_options().
 */
static bool tcp_fast_parse_options(const struct net *net,
				   const struct sk_buff *skb,
				   const struct tcphdr *th, struct tcp_sock *tp)
{
	/* In the spirit of fast parsing, compare doff directly to constant
	 * values.  Because equality is used, short doff can be ignored here.
	 */
	if (th->doff == (sizeof(*th) / 4)) {
		tp->rx_opt.saw_tstamp = 0;
		return false;
	} else if (tp->rx_opt.tstamp_ok &&
		   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
		if (tcp_parse_aligned_timestamp(tp, th))
			return true;
	}

	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;

	return true;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * Parse MD5 Signature option
 */
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
	int length = (th->doff << 2) - sizeof(*th);
	const u8 *ptr = (const u8 *)(th + 1);

	/* If not enough data remaining, we can short cut */
	while (length >= TCPOLEN_MD5SIG) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return NULL;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return NULL;
			if (opcode == TCPOPT_MD5SIG)
				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
		}
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif

/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
 *
 * It is not fatal. If this ACK does not change critical state (seqs, window)
 * it can pass through stack. So, the following predicate verifies that
 * this segment is not used for anything but congestion avoidance or
 * fast retransmit. Moreover, we even are able to eliminate most of such
 * second order effects, if we apply some small "replay" window (~RTO)
 * to timestamp space.
 *
 * All these measures still do not guarantee that we reject wrapped ACKs
 * on networks with high bandwidth, when sequence space is recycled fastly,
 * but it guarantees that such events will be very rare and do not affect
 * connection seriously. This doesn't look nice, but alas, PAWS is really
 * buggy extension.
 *
 * [ Later note. Even worse! It is buggy for segments with data. RFC
 * states that events when retransmit arrives after original data are rare.
 * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
 * the biggest problem on large power networks even with minor reordering.
 * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
 * up to bandwidth of 18Gigabit/sec. 8) ]
 */

static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcphdr *th = tcp_hdr(skb);
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;

return (/* 1. Pure ACK with correct sequence number. */
	(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&

	/* 2. ... and duplicate ACK. */
	ack == tp->snd_una &&

	/* 3. ... and does not update window. */
	!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&

	/* 4. ... and sits in replay window. */
	(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);

}

static inline bool tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);

return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
       !tcp_disordered_ack(sk, skb);

}

/* Check segment sequence number for validity.
 *
 * Segment controls are considered valid, if the segment
 * fits to the window after truncation to the window. Acceptability
 * of data (and SYN, FIN, of course) is checked separately.
 * See tcp_data_queue(), for example.
 *
 * Also, controls (RST is main one) are accepted using RCV.WUP instead
 * of RCV.NXT. Peer still did not advance his SND.UNA when we
 * delayed ACK, so that his SND.UNA <= our RCV.WUP.
 * (borrowed from freebsd)
 */

static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
	return	!before(end_seq, tp->rcv_wup) &&
		!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
}
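
/* For context, the wraparound-safe sequence comparisons used throughout this
 * file are defined in include/net/tcp.h; a minimal sketch of their shape
 * (a signed 32-bit difference, so they stay correct across sequence wrap):
 *
 *	static inline bool before(__u32 seq1, __u32 seq2)
 *	{
 *		return (__s32)(seq1 - seq2) < 0;
 *	}
 *	#define after(seq2, seq1)	before(seq1, seq2)
 */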

/* When we get a reset we do this. */
void tcp_reset(struct sock *sk)
{
trace_tcp_receive_reset(sk);

/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
	sk->sk_err = ECONNREFUSED;
	break;
case TCP_CLOSE_WAIT:
	sk->sk_err = EPIPE;
	break;
case TCP_CLOSE:
	return;
default:
	sk->sk_err = ECONNRESET;
}
/* This barrier is coupled with smp_rmb() in tcp_poll() */
smp_wmb();

tcp_write_queue_purge(sk);
tcp_done(sk);

if (!sock_flag(sk, SOCK_DEAD))
	sk->sk_error_report(sk);

}

/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */
void tcp_fin(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	inet_csk_schedule_ack(sk);

	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(sk, SOCK_DONE);

	switch (sk->sk_state) {
	case TCP_SYN_RECV:
	case TCP_ESTABLISHED:
		/* Move to CLOSE_WAIT */
		tcp_set_state(sk, TCP_CLOSE_WAIT);
		inet_csk(sk)->icsk_ack.pingpong = 1;
		break;

	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
		/* Received a retransmission of the FIN, do
		 * nothing.
		 */
		break;
	case TCP_LAST_ACK:
		/* RFC793: Remain in the LAST-ACK state. */
		break;

	case TCP_FIN_WAIT1:
		/* This case occurs when a simultaneous close
		 * happens, we must ack the received FIN and
		 * enter the CLOSING state.
		 */
		tcp_send_ack(sk);
		tcp_set_state(sk, TCP_CLOSING);
		break;
	case TCP_FIN_WAIT2:
		/* Received a FIN - send ACK and enter TIME_WAIT. */
		tcp_send_ack(sk);
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
		break;
	default:
		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
		 * cases we should never reach this piece of code.
		 */
		pr_err("%s: Impossible, sk->sk_state=%d\n",
		       __func__, sk->sk_state);
		break;
	}

	/* It is possible, that we have something out-of-order after FIN.
	 * Probably, we should reset in this case. For now drop them.
	 */
	skb_rbtree_purge(&tp->out_of_order_queue);
	if (tcp_is_sack(tp))
		tcp_sack_reset(&tp->rx_opt);
	sk_mem_reclaim(sk);

	if (!sock_flag(sk, SOCK_DEAD)) {
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if (sk->sk_shutdown == SHUTDOWN_MASK ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
		else
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	}
}

static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				   u32 end_seq)
{
	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
		if (before(seq, sp->start_seq))
			sp->start_seq = seq;
		if (after(end_seq, sp->end_seq))
			sp->end_seq = end_seq;
		return true;
	}
	return false;
}
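
/* Worked example for tcp_sack_extend(): a block covering [100, 200) extended
 * with seq = 150, end_seq = 250 grows to [100, 250) and the function returns
 * true; with a disjoint range such as [300, 400) the block is left untouched
 * and false is returned.
 */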

static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);

if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
	int mib_idx;

	if (before(seq, tp->rcv_nxt))
		mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
	else
		mib_idx = LINUX_MIB_TCPDSACKOFOSENT;

	NET_INC_STATS(sock_net(sk), mib_idx);

	tp->rx_opt.dsack = 1;
	tp->duplicate_sack[0].start_seq = seq;
	tp->duplicate_sack[0].end_seq = end_seq;
}

}

static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);

if (!tp->rx_opt.dsack)
	tcp_dsack_set(sk, seq, end_seq);
else
	tcp_sack_extend(tp->duplicate_sack, seq, end_seq);

}

static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);

if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);

	if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
		u32 end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
			end_seq = tp->rcv_nxt;
		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
	}
}

tcp_send_ack(sk);

}

/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
	int this_sack;
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	struct tcp_sack_block *swalk = sp + 1;

	/* See if the recent change to the first SACK eats into
	 * or hits the sequence space of other SACK blocks, if so coalesce.
	 */
	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
			int i;

			/* Zap SWALK, by moving every further SACK up by one slot.
			 * Decrease num_sacks.
			 */
			tp->rx_opt.num_sacks--;
			for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				sp[i] = sp[i + 1];
			continue;
		}
		this_sack++, swalk++;
	}
}

static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sack_block *sp = &tp->selective_acks[0];
int cur_sacks = tp->rx_opt.num_sacks;
int this_sack;

if (!cur_sacks)
	goto new_sack;

for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
	if (tcp_sack_extend(sp, seq, end_seq)) {
		/* Rotate this_sack to the first one. */
		for (; this_sack > 0; this_sack--, sp--)
			swap(*sp, *(sp - 1));
		if (cur_sacks > 1)
			tcp_sack_maybe_coalesce(tp);
		return;
	}
}

/* Could not find an adjacent existing SACK, build a new one,
 * put it at the front, and shift everyone else down.  We
 * always know there is at least one SACK present already here.
 *
 * If the sack array is full, forget about the last one.
 */
if (this_sack >= TCP_NUM_SACKS) {
	if (tp->compressed_ack)
		tcp_send_ack(sk);
	this_sack--;
	tp->rx_opt.num_sacks--;
	sp--;
}
for (; this_sack > 0; this_sack--, sp--)
	*sp = *(sp - 1);

new_sack:
/* Build the new head SACK, and we’re done. */
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
}

/* RCV.NXT advances, some SACKs should be eaten. */

static void tcp_sack_remove(struct tcp_sock *tp)
{
struct tcp_sack_block *sp = &tp->selective_acks[0];
int num_sacks = tp->rx_opt.num_sacks;
int this_sack;

/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
	tp->rx_opt.num_sacks = 0;
	return;
}

for (this_sack = 0; this_sack < num_sacks;) {
	/* Check if the start of the sack is covered by RCV.NXT. */
	if (!before(tp->rcv_nxt, sp->start_seq)) {
		int i;

		/* RCV.NXT must cover all the block! */
		WARN_ON(before(tp->rcv_nxt, sp->end_seq));

		/* Zap this SACK, by moving forward any other SACKS. */
		for (i = this_sack+1; i < num_sacks; i++)
			tp->selective_acks[i-1] = tp->selective_acks[i];
		num_sacks--;
		continue;
	}
	this_sack++;
	sp++;
}
tp->rx_opt.num_sacks = num_sacks;

}

/**
 * tcp_try_coalesce - try to merge skb to prior one
 * @sk: socket
 * @dest: destination queue
 * @to: prior buffer
 * @from: buffer to add in queue
 * @fragstolen: pointer to boolean
 *
 * Before queueing skb @from after @to, try to merge them
 * to reduce overall memory use and queue lengths, if cost is small.
 * Packets in ofo or receive queues can stay a long time.
 * Better try to coalesce them right now to avoid future collapses.
 * Returns true if caller should free @from instead of queueing it
 */
static bool tcp_try_coalesce(struct sock *sk,
			     struct sk_buff *to,
			     struct sk_buff *from,
			     bool *fragstolen)
{
	int delta;

	*fragstolen = false;

	/* Its possible this segment overlaps with prior segment in queue */
	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
		return false;

#ifdef CONFIG_TLS_DEVICE
	if (from->decrypted != to->decrypted)
		return false;
#endif

	if (!skb_try_coalesce(to, from, fragstolen, &delta))
		return false;

	atomic_add(delta, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, delta);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
	TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;

	if (TCP_SKB_CB(from)->has_rxtstamp) {
		TCP_SKB_CB(to)->has_rxtstamp = true;
		to->tstamp = from->tstamp;
	}

	return true;
}

static bool tcp_ooo_try_coalesce(struct sock *sk,
struct sk_buff *to,
struct sk_buff *from,
bool *fragstolen)
{
bool res = tcp_try_coalesce(sk, to, from, fragstolen);

/* In case tcp_drop() is called later, update to->gso_segs */
if (res) {
	u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
		       max_t(u16, 1, skb_shinfo(from)->gso_segs);

	skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
}
return res;

}

static void tcp_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
__kfree_skb(skb);
}

/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 */
static void tcp_ofo_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 dsack_high = tp->rcv_nxt;
	bool fin, fragstolen, eaten;
	struct sk_buff *skb, *tail;
	struct rb_node *p;

	p = rb_first(&tp->out_of_order_queue);
	while (p) {
		skb = rb_to_skb(p);
		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
			break;

		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
			__u32 dsack = dsack_high;
			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				dsack_high = TCP_SKB_CB(skb)->end_seq;
			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
		}
		p = rb_next(p);
		rb_erase(&skb->rbnode, &tp->out_of_order_queue);

		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
			SOCK_DEBUG(sk, "ofo packet was already received\n");
			tcp_drop(sk, skb);
			continue;
		}
		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (!eaten)
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		else
			kfree_skb_partial(skb, fragstolen);

		if (unlikely(fin)) {
			tcp_fin(sk);
			/* tcp_fin() purges tp->out_of_order_queue,
			 * so we must end this loop right now.
			 */
			break;
		}
	}
}

static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);

static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, skb, size)) {

	if (tcp_prune_queue(sk) < 0)
		return -1;

	while (!sk_rmem_schedule(sk, skb, size)) {
		if (!tcp_prune_ofo_queue(sk))
			return -1;
	}
}
return 0;

}

static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node **p, *parent;
struct sk_buff *skb1;
u32 seq, end_seq;
bool fragstolen;

tcp_ecn_check_ce(sk, skb);

if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
	tcp_drop(sk, skb);
	return;
}

/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);

NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
	   tp->rcv_nxt, seq, end_seq);

p = &tp->out_of_order_queue.rb_node;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
	/* Initial out of order segment, build 1 SACK. */
	if (tcp_is_sack(tp)) {
		tp->rx_opt.num_sacks = 1;
		tp->selective_acks[0].start_seq = seq;
		tp->selective_acks[0].end_seq = end_seq;
	}
	rb_link_node(&skb->rbnode, NULL, p);
	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
	tp->ooo_last_skb = skb;
	goto end;
}

/* In the typical case, we are adding an skb to the end of the list.
 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
 */
if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
			 skb, &fragstolen)) {
coalesce_done:
	tcp_grow_window(sk, skb);
	kfree_skb_partial(skb, fragstolen);
	skb = NULL;
	goto add_sack;
}
/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
	parent = &tp->ooo_last_skb->rbnode;
	p = &parent->rb_right;
	goto insert;
}

/* Find place to insert this segment. Handle overlaps on the way. */
parent = NULL;
while (*p) {
	parent = *p;
	skb1 = rb_to_skb(parent);
	if (before(seq, TCP_SKB_CB(skb1)->seq)) {
		p = &parent->rb_left;
		continue;
	}
	if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
			/* All the bits are present. Drop. */
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPOFOMERGE);
			tcp_drop(sk, skb);
			skb = NULL;
			tcp_dsack_set(sk, seq, end_seq);
			goto add_sack;
		}
		if (after(seq, TCP_SKB_CB(skb1)->seq)) {
			/* Partial overlap. */
			tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
		} else {
			/* skb's seq == skb1's seq and skb covers skb1.
			 * Replace skb1 with skb.
			 */
			rb_replace_node(&skb1->rbnode, &skb->rbnode,
					&tp->out_of_order_queue);
			tcp_dsack_extend(sk,
					 TCP_SKB_CB(skb1)->seq,
					 TCP_SKB_CB(skb1)->end_seq);
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPOFOMERGE);
			tcp_drop(sk, skb1);
			goto merge_right;
		}
	} else if (tcp_ooo_try_coalesce(sk, skb1,
					skb, &fragstolen)) {
		goto coalesce_done;
	}
	p = &parent->rb_right;
}

insert:
/* Insert segment into RB tree. */
rb_link_node(&skb->rbnode, parent, p);
rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);

merge_right:
/* Remove other segments covered by skb. */
while ((skb1 = skb_rb_next(skb)) != NULL) {
	if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
		break;
	if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				 end_seq);
		break;
	}
	rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
			 TCP_SKB_CB(skb1)->end_seq);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
	tcp_drop(sk, skb1);
}
/* If there is no skb after us, we are the last_skb ! */
if (!skb1)
	tp->ooo_last_skb = skb;

add_sack:
if (tcp_is_sack(tp))
	tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
if (skb) {
	tcp_grow_window(sk, skb);
	skb_condense(skb);
	skb_set_owner_r(skb, sk);
}
}

static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);

__skb_pull(skb, hdrlen);
eaten = (tail &&
	 tcp_try_coalesce(sk, tail,
			  skb, fragstolen)) ? 1 : 0;
tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
if (!eaten) {
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	skb_set_owner_r(skb, sk);
}
return eaten;

}

int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_buff *skb;
int err = -ENOMEM;
int data_len = 0;
bool fragstolen;

if (size == 0)
	return 0;

if (size > PAGE_SIZE) {
	int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);

	data_len = npages << PAGE_SHIFT;
	size = data_len + (size & ~PAGE_MASK);
}
skb = alloc_skb_with_frags(size - data_len, data_len,
			   PAGE_ALLOC_COSTLY_ORDER,
			   &err, sk->sk_allocation);
if (!skb)
	goto err;

skb_put(skb, size - data_len);
skb->data_len = data_len;
skb->len = size;

if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
	goto err_free;
}

err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
if (err)
	goto err_free;

TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;

if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
	WARN_ON_ONCE(fragstolen); /* should not happen */
	__kfree_skb(skb);
}
return size;

err_free:
kfree_skb(skb);
err:
return err;

}

void tcp_data_ready(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
int avail = tp->rcv_nxt - tp->copied_seq;

if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
	return;

sk->sk_data_ready(sk);

}

static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool fragstolen;
int eaten;

if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
	__kfree_skb(skb);
	return;
}
skb_dst_drop(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);

tcp_ecn_accept_cwr(sk, skb);

tp->rx_opt.dsack = 0;

/*  Queue data for delivery to the user.
 *  Packets in sequence go to the receive queue.
 *  Out of sequence packets to the out_of_order_queue.
 */
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
	if (tcp_receive_window(tp) == 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
		goto out_of_window;
	}

	/* Ok. In sequence. In window. */

queue_and_out:
	if (skb_queue_len(&sk->sk_receive_queue) == 0)
		sk_forced_mem_schedule(sk, skb->truesize);
	else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
		goto drop;
	}

	eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
	if (skb->len)
		tcp_event_data_recv(sk, skb);
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
		tcp_fin(sk);

	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		tcp_ofo_queue(sk);

		/* RFC5681. 4.2. SHOULD send immediate ACK, when
		 * gap in queue is filled.
		 */
		if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
	}

	if (tp->rx_opt.num_sacks)
		tcp_sack_remove(tp);

	tcp_fast_path_check(sk);

	if (eaten > 0)
		kfree_skb_partial(skb, fragstolen);
	if (!sock_flag(sk, SOCK_DEAD))
		tcp_data_ready(sk);
	return;
}

if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
	/* A retransmit, 2nd most common case.  Force an immediate ack. */
	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

out_of_window:
	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
	inet_csk_schedule_ack(sk);
drop:
	tcp_drop(sk, skb);
	return;
}

/* Out of window. F.e. zero window probe. */
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
	goto out_of_window;

if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
	/* Partial packet, seq < rcv_next < end_seq */
	SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
		   TCP_SKB_CB(skb)->end_seq);

	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

	/* If window is closed, drop tail of packet. But after
	 * remembering D-SACK for its head made in previous line.
	 */
	if (!tcp_receive_window(tp)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
		goto out_of_window;
	}
	goto queue_and_out;
}

tcp_data_queue_ofo(sk, skb);

}

static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
{
if (list)
return !skb_queue_is_last(list, skb) ? skb->next : NULL;

return skb_rb_next(skb);

}

static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
struct sk_buff_head *list,
struct rb_root *root)
{
struct sk_buff *next = tcp_skb_next(skb, list);

if (list)
	__skb_unlink(skb, list);
else
	rb_erase(&skb->rbnode, root);

__kfree_skb(skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);

return next;

}

/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct sk_buff *skb1;

while (*p) {
	parent = *p;
	skb1 = rb_to_skb(parent);
	if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
		p = &parent->rb_left;
	else
		p = &parent->rb_right;
}
rb_link_node(&skb->rbnode, parent, p);
rb_insert_color(&skb->rbnode, root);

}

/* Collapse contiguous sequence of skbs head..tail with
 * sequence numbers start..end.
 *
 * If tail is NULL, this means until the end of the queue.
 *
 * Segments with FIN/SYN are not collapsed (only because this
 * simplifies code)
 */
static void
tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
	     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
	struct sk_buff *skb = head, *n;
	struct sk_buff_head tmp;
	bool end_of_skbs;

	/* First, check that queue is collapsible and find
	 * the point where collapsing can be useful.
	 */
restart:
	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
		n = tcp_skb_next(skb, list);

		/* No new bits? It is possible on ofo queue. */
		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
			skb = tcp_collapse_one(sk, skb, list, root);
			if (!skb)
				break;
			goto restart;
		}

		/* The first skb to collapse is:
		 * - not SYN/FIN and
		 * - bloated or contains data before "start" or
		 *   overlaps to the next one.
		 */
		if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
		    (tcp_win_from_space(sk, skb->truesize) > skb->len ||
		     before(TCP_SKB_CB(skb)->seq, start))) {
			end_of_skbs = false;
			break;
		}

		if (n && n != tail &&
		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
			end_of_skbs = false;
			break;
		}

		/* Decided to skip this, advance start seq. */
		start = TCP_SKB_CB(skb)->end_seq;
	}
	if (end_of_skbs ||
	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
		return;

	__skb_queue_head_init(&tmp);

	while (before(start, end)) {
		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
		struct sk_buff *nskb;

		nskb = alloc_skb(copy, GFP_ATOMIC);
		if (!nskb)
			break;

		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
#ifdef CONFIG_TLS_DEVICE
		nskb->decrypted = skb->decrypted;
#endif
		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
		if (list)
			__skb_queue_before(list, skb, nskb);
		else
			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
		skb_set_owner_r(nskb, sk);

		/* Copy data, releasing collapsed skbs. */
		while (copy > 0) {
			int offset = start - TCP_SKB_CB(skb)->seq;
			int size = TCP_SKB_CB(skb)->end_seq - start;

			BUG_ON(offset < 0);
			if (size > 0) {
				size = min(copy, size);
				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
					BUG();
				TCP_SKB_CB(nskb)->end_seq += size;
				copy -= size;
				start += size;
			}
			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				skb = tcp_collapse_one(sk, skb, list, root);
				if (!skb ||
				    skb == tail ||
				    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
					goto end;
#ifdef CONFIG_TLS_DEVICE
				if (skb->decrypted != nskb->decrypted)
					goto end;
#endif
			}
		}
	}
end:
	skb_queue_walk_safe(&tmp, skb, n)
		tcp_rbtree_insert(root, skb);
}

/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
 * and tcp_collapse() them until all the queue is collapsed.
 */
static void tcp_collapse_ofo_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 range_truesize, sum_tiny = 0;
	struct sk_buff *skb, *head;
	u32 start, end;

	skb = skb_rb_first(&tp->out_of_order_queue);
new_range:
	if (!skb) {
		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
		return;
	}
	start = TCP_SKB_CB(skb)->seq;
	end = TCP_SKB_CB(skb)->end_seq;
	range_truesize = skb->truesize;

	for (head = skb;;) {
		skb = skb_rb_next(skb);

		/* Range is terminated when we see a gap or when
		 * we are at the queue end.
		 */
		if (!skb ||
		    after(TCP_SKB_CB(skb)->seq, end) ||
		    before(TCP_SKB_CB(skb)->end_seq, start)) {
			/* Do not attempt collapsing tiny skbs */
			if (range_truesize != head->truesize ||
			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
					     head, skb, start, end);
			} else {
				sum_tiny += range_truesize;
				if (sum_tiny > sk->sk_rcvbuf >> 3)
					return;
			}
			goto new_range;
		}

		range_truesize += skb->truesize;
		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
			start = TCP_SKB_CB(skb)->seq;
		if (after(TCP_SKB_CB(skb)->end_seq, end))
			end = TCP_SKB_CB(skb)->end_seq;
	}
}

/*
 * Clean the out-of-order queue to make room.
 * We drop high sequences packets to :
 * 1) Let a chance for holes to be filled.
 * 2) not add too big latencies if thousands of packets sit there.
 *    (But if application shrinks SO_RCVBUF, we could still end up
 *     freeing whole queue here)
 * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
 *
 * Return true if queue has shrunk.
 */
static bool tcp_prune_ofo_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct rb_node *node, *prev;
	int goal;

	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
		return false;

	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
	goal = sk->sk_rcvbuf >> 3;
	node = &tp->ooo_last_skb->rbnode;
	do {
		prev = rb_prev(node);
		rb_erase(node, &tp->out_of_order_queue);
		goal -= rb_to_skb(node)->truesize;
		tcp_drop(sk, rb_to_skb(node));
		if (!prev || goal <= 0) {
			sk_mem_reclaim(sk);
			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
			    !tcp_under_memory_pressure(sk))
				break;
			goal = sk->sk_rcvbuf >> 3;
		}
		node = prev;
	} while (node);
	tp->ooo_last_skb = rb_to_skb(prev);

	/* Reset SACK state.  A conforming SACK implementation will
	 * do the same at a timeout based retransmit.  When a connection
	 * is in a sad state like this, we care only about integrity
	 * of the connection not performance.
	 */
	if (tp->rx_opt.sack_ok)
		tcp_sack_reset(&tp->rx_opt);
	return true;
}

/* Reduce allocated memory if we can, trying to get
 * the socket within its memory limits again.
 *
 * Return less than zero if we should start dropping frames
 * until the socket owning process reads some of the data
 * to stabilize the situation.
 */
static int tcp_prune_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		tcp_clamp_window(sk);
	else if (tcp_under_memory_pressure(sk))
		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

	tcp_collapse_ofo_queue(sk);
	if (!skb_queue_empty(&sk->sk_receive_queue))
		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
			     skb_peek(&sk->sk_receive_queue),
			     NULL,
			     tp->copied_seq, tp->rcv_nxt);
	sk_mem_reclaim(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

	/* Collapsing did not help, destructive actions follow.
	 * This must not ever occur. */

	tcp_prune_ofo_queue(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

	/* If we are really being abused, tell the caller to silently
	 * drop receive data on the floor.  It will get retransmitted
	 * and hopefully then we'll have sufficient space.
	 */
	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);

	/* Massive buffer overcommit. */
	tp->pred_flags = 0;
	return -1;
}

static bool tcp_should_expand_sndbuf(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);

/* If the user specified a specific send buffer setting, do
 * not modify it.
 */
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
	return false;

/* If we are under global TCP memory pressure, do not expand.  */
if (tcp_under_memory_pressure(sk))
	return false;

/* If we are under soft global TCP memory pressure, do not expand.  */
if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
	return false;

/* If we filled the congestion window, do not expand.  */
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
	return false;

return true;

}

/* When incoming ACK allowed to free some skb from write_queue,
 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
 * on the exit from tcp input handler.
 *
 * PROBLEM: sndbuf expansion does not work well with largesend.
 */
static void tcp_new_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_should_expand_sndbuf(sk)) {
		tcp_sndbuf_expand(sk);
		tp->snd_cwnd_stamp = tcp_jiffies32;
	}

	sk->sk_write_space(sk);
}

static void tcp_check_space(struct sock *sk)
{
	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
		/* pairs with tcp_poll() */
		smp_mb();
		if (sk->sk_socket &&
		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
			tcp_new_space(sk);
			if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
				tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
		}
	}
}

static inline void tcp_data_snd_check(struct sock *sk)
{
tcp_push_pending_frames(sk);
tcp_check_space(sk);
}

/*
 * Check if sending an ack is needed.
 */
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long rtt, delay;

	    /* More than one full frame received... */
	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
	     /* ... and right edge of window advances far enough.
	      * (tcp_recvmsg() will send ACK otherwise).
	      * If application uses SO_RCVLOWAT, we want send ack now if
	      * we have not received enough bytes to satisfy the condition.
	      */
	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
	    /* We ACK each frame or... */
	    tcp_in_quickack_mode(sk) ||
	    /* Protocol state mandates a one-time immediate ACK */
	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
send_now:
		tcp_send_ack(sk);
		return;
	}

	if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		tcp_send_delayed_ack(sk);
		return;
	}

	if (!tcp_is_sack(tp) ||
	    tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
		goto send_now;
	tp->compressed_ack++;

	if (hrtimer_is_queued(&tp->compressed_ack_timer))
		return;

	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */

	rtt = tp->rcv_rtt_est.rtt_us;
	if (tp->srtt_us && tp->srtt_us < rtt)
		rtt = tp->srtt_us;

	delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
		      rtt * (NSEC_PER_USEC >> 3)/20);
	sock_hold(sk);
	hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED_SOFT);
}

static inline void tcp_ack_snd_check(struct sock *sk)
{
	if (!inet_csk_ack_scheduled(sk)) {
		/* We sent a data segment already. */
		return;
	}
	__tcp_ack_snd_check(sk, 1);
}

/*
 *	This routine is only called when we have urgent data
 *	signaled. Its the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 *	For 1003.1g we should support a new option TCP_STDURG to permit
 *	either form (or just set the sysctl tcp_stdurg).
 */

static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);

if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
	ptr--;
ptr += ntohl(th->seq);

/* Ignore urgent data that we've already seen and read. */
if (after(tp->copied_seq, ptr))
	return;

/* Do not replay urg ptr.
 *
 * NOTE: interesting situation not covered by specs.
 * Misbehaving sender may send urg ptr, pointing to segment,
 * which we already have in ofo queue. We are not able to fetch
 * such data and will stay in TCP_URG_NOTYET until will be eaten
 * by recvmsg(). Seems, we are not obliged to handle such wicked
 * situations. But it is worth to think about possibility of some
 * DoSes using some hypothetical application level deadlock.
 */
if (before(ptr, tp->rcv_nxt))
	return;

/* Do we already have a newer (or duplicate) urgent pointer? */
if (tp->urg_data && !after(ptr, tp->urg_seq))
	return;

/* Tell the world about our new urgent pointer. */
sk_send_sigurg(sk);

/* We may be adding urgent data when the last byte read was
 * urgent. To do this requires some care. We cannot just ignore
 * tp->copied_seq since we would read the last urgent byte again
 * as data, nor can we alter copied_seq until this data arrives
 * or we break the semantics of SIOCATMARK (and thus sockatmark())
 *
 * NOTE. Double Dutch. Rendering to plain English: author of comment
 * above did something sort of 	send("A", MSG_OOB); send("B", MSG_OOB);
 * and expect that both A and B disappear from stream. This is _wrong_.
 * Though this happens in BSD with high probability, this is occasional.
 * Any application relying on this is buggy. Note also, that fix "works"
 * only in this artificial test. Insert some normal data between A and B and we will
 * decline of BSD again. Verdict: it is better to remove to trap
 * buggy users.
 */
if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
	tp->copied_seq++;
	if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
		__skb_unlink(skb, &sk->sk_receive_queue);
		__kfree_skb(skb);
	}
}

tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;

/* Disable header prediction. */
tp->pred_flags = 0;

}

/* This is the ‘fast’ part of urgent handling. */
static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);

/* Check if we get a new urgent pointer - normally not. */
if (th->urg)
	tcp_check_urg(sk, th);

/* Do we wait for any urgent data? - normally not... */
if (tp->urg_data == TCP_URG_NOTYET) {
	u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
		  th->syn;

	/* Is the urgent pointer pointing into this packet? */
	if (ptr < skb->len) {
		u8 tmp;
		if (skb_copy_bits(skb, ptr, &tmp, 1))
			BUG();
		tp->urg_data = TCP_URG_VALID | tmp;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_data_ready(sk);
	}
}

}

/* Accept RST for rcv_nxt - 1 after a FIN.
 * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
 * FIN is sent followed by a RST packet. The RST is sent with the same
 * sequence number as the FIN, and thus according to RFC 5961 a challenge
 * ACK should be sent. However, Mac OSX rate limits replies to challenge
 * ACKs on the closed socket. In addition middleboxes can drop either the
 * challenge ACK or a subsequent RST.
 */
static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
			(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
					       TCPF_CLOSING));
}

/* Does PAWS and seqno based validation of an incoming segment, flags will
 * play significant role here.
 */
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
				  const struct tcphdr *th, int syn_inerr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool rst_seq_match = false;

	/* RFC1323: H1. Apply PAWS check first. */
	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
	    tp->rx_opt.saw_tstamp &&
	    tcp_paws_discard(sk, skb)) {
		if (!th->rst) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
			if (!tcp_oow_rate_limited(sock_net(sk), skb,
						  LINUX_MIB_TCPACKSKIPPEDPAWS,
						  &tp->last_oow_ack_time))
				tcp_send_dupack(sk, skb);
			goto discard;
		}
		/* Reset is accepted even if it did not pass PAWS. */
	}

	/* Test hook: the caller flagged an ACK for data we never sent
	 * (ack_err_flag is set in tcp_rcv_established() below); answer
	 * it with a duplicate ACK and drop the segment.
	 */
	if (ack_err_flag) {
		tcp_send_dupack(sk, skb);
		ack_err_flag = 0;
		goto discard;
	}

	/* Step 1: check sequence number */
	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
		/* RFC793, page 37: "In all states except SYN-SENT, all reset
		 * (RST) segments are validated by checking their SEQ-fields."
		 * And page 69: "If an incoming segment is not acceptable,
		 * an acknowledgment should be sent in reply (unless the RST
		 * bit is set, if so drop the segment and return)".
		 */
		if (!th->rst) {
			if (th->syn)
				goto syn_challenge;
			if (!tcp_oow_rate_limited(sock_net(sk), skb,
						  LINUX_MIB_TCPACKSKIPPEDSEQ,
						  &tp->last_oow_ack_time))
				tcp_send_dupack(sk, skb);
		} else if (tcp_reset_check(sk, skb)) {
			tcp_reset(sk);
		}
		goto discard;
	}

	/* Step 2: check RST bit */
	if (th->rst) {
		/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
		 * FIN and SACK too if available):
		 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
		 * the right-most SACK block,
		 * then
		 *     RESET the connection
		 * else
		 *     Send a challenge ACK
		 */
		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
		    tcp_reset_check(sk, skb)) {
			rst_seq_match = true;
		} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
			struct tcp_sack_block *sp = &tp->selective_acks[0];
			int max_sack = sp[0].end_seq;
			int this_sack;

			for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
			     ++this_sack) {
				max_sack = after(sp[this_sack].end_seq,
						 max_sack) ?
					sp[this_sack].end_seq : max_sack;
			}

			if (TCP_SKB_CB(skb)->seq == max_sack)
				rst_seq_match = true;
		}

		if (rst_seq_match)
			tcp_reset(sk);
		else {
			/* Disable TFO if RST is out-of-order
			 * and no data has been received
			 * for current active TFO socket
			 */
			if (tp->syn_fastopen && !tp->data_segs_in &&
			    sk->sk_state == TCP_ESTABLISHED)
				tcp_fastopen_active_disable(sk);
			tcp_send_challenge_ack(sk, skb);
		}
		goto discard;
	}

	/* step 3: check security and precedence [ignored] */

	/* step 4: Check for a SYN
	 * RFC 5961 4.2 : Send a challenge ack
	 */
	if (th->syn) {
syn_challenge:
		if (syn_inerr)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
		tcp_send_challenge_ack(sk, skb);
		goto discard;
	}

	return true;

discard:
	tcp_drop(sk, skb);
	return false;
}

/*
 *	TCP receive function for the ESTABLISHED state.
 *
 *	It is split into a fast path and a slow path. The fast path is
 *	disabled when:
 *	- A zero window was announced from us - zero window probing
 *	  is only handled properly in the slow path.
 *	- Out of order segments arrived.
 *	- Urgent data is expected.
 *	- There is no buffer space left
 *	- Unexpected TCP flags/window values/header lengths are received
 *	  (detected by checking the TCP header against pred_flags)
 *	- Data is sent in both directions. Fast path only supports pure senders
 *	  or pure receivers (this means either the sequence number or the ack
 *	  value must stay constant)
 *	- Unexpected TCP option.
 *
 *	When these conditions are not satisfied it drops into a standard
 *	receive procedure patterned after RFC793 to handle all cases.
 *	The first three cases are guaranteed by proper pred_flags setting,
 *	the rest is checked inline. Fast processing is turned on in
 *	tcp_data_queue when everything is OK.
 */
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = (const struct tcphdr *)skb->data;
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int len = skb->len;

	/* TCP congestion window tracking */
	trace_tcp_probe(sk, skb);

	tcp_mstamp_refresh(tp);
	if (unlikely(!sk->sk_rx_dst))
		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
	/*
	 *	Header prediction.
	 *	The code loosely follows the one in the famous
	 *	"30 instruction TCP receive" Van Jacobson mail.
	 *
	 *	Van's trick is to deposit buffers into socket queue
	 *	on a device interrupt, to call tcp_recv function
	 *	on the receive process context and checksum and copy
	 *	the buffer to user space. smart...
	 *
	 *	Our current scheme is not silly either but we take the
	 *	extra cost of the net_bh soft interrupt processing...
	 *	We do checksum and copy also but from device to kernel.
	 */

	tp->rx_opt.saw_tstamp = 0;

	/*	pred_flags is 0xS?10 << 16 + snd_wnd
	 *	if header_prediction is to be made
	 *	'S' will always be tp->tcp_header_len >> 2
	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
	 *	turn it off (when there are holes in the receive
	 *	space for instance)
	 *	PSH flag is ignored.
	 */
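
	/* For reference, a sketch of how the prediction word is built when the
	 * fast path is (re)enabled, roughly what __tcp_fast_path_on() in
	 * include/net/tcp.h does (shown here for illustration only):
	 *
	 *	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
	 *			       ntohl(TCP_FLAG_ACK) |
	 *			       snd_wnd);
	 */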

	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
		int tcp_header_len = tp->tcp_header_len;

    /* Timestamp header prediction: tcp_header_len
     * is automatically equal to th->doff*4 due to pred_flags
     * match.
     */
    
    /* Check timestamp */
    if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
    	/* No? Slow path! */
    	if (!tcp_parse_aligned_timestamp(tp, th))
    		goto slow_path;
    
    	/* If PAWS failed, check it more carefully in slow path */
    	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
    		goto slow_path;
    
    	/* DO NOT update ts_recent here, if checksum fails
    	 * and timestamp was corrupted part, it will result
    	 * in a hung connection since we will drop all
    	 * future packets due to the PAWS test.
    	 */
    }
    
    if (len <= tcp_header_len) {
    	/* Bulk data transfer: sender */
    	if (len == tcp_header_len) {
    		/* Predicted packet is in window by definition.
    		 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
    		 * Hence, check seq<=rcv_wup reduces to:
    		 */
    		if (tcp_header_len ==
    		    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
    		    tp->rcv_nxt == tp->rcv_wup)
    			tcp_store_ts_recent(tp);
    
    		/* We know that such packets are checksummed
    		 * on entry.
    		 */
    		tcp_ack(sk, skb, 0);
    		__kfree_skb(skb);
    		tcp_data_snd_check(sk);
    		/* When receiving pure ack in fast path, update
    		 * last ts ecr directly instead of calling
    		 * tcp_rcv_rtt_measure_ts()
    		 */
    		tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
    		return;
    	} else { /* Header too small */
    		TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
    		goto discard;
    	}
    } else {
    	int eaten = 0;
    	bool fragstolen = false;
    
    	if (tcp_checksum_complete(skb))
    		goto csum_error;
    
    	if ((int)skb->truesize > sk->sk_forward_alloc)
    		goto step5;
    
    	/* Predicted packet is in window by definition.
    	 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
    	 * Hence, check seq<=rcv_wup reduces to:
    	 */
    	if (tcp_header_len ==
    	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
    	    tp->rcv_nxt == tp->rcv_wup)
    		tcp_store_ts_recent(tp);
    
    	tcp_rcv_rtt_measure_ts(sk, skb);
    
    	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
    
    	/* Bulk data transfer: receiver */
    	eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
    			      &fragstolen);
    
    	tcp_event_data_recv(sk, skb);
    
    	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
    		/* Well, only one small jumplet in fast path... */
    		tcp_ack(sk, skb, FLAG_DATA);
    		tcp_data_snd_check(sk);
    		if (!inet_csk_ack_scheduled(sk))
    			goto no_ack;
    	}
    
    	__tcp_ack_snd_check(sk, 0);
    

no_ack:
	if (eaten)
		kfree_skb_partial(skb, fragstolen);
	tcp_data_ready(sk);
	return;
	}
}

slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
	goto csum_error;

if (!th->ack && !th->rst && !th->syn)
	goto discard;

/*
 *	Standard slow path.
 */
/* MMMMM case 2 test: flag an ACK that acknowledges data we have not
 * yet sent (an unacceptable "big" ACK) before the normal validation.
 */
if (tcp_ack_check_unaccess(sk, skb))
	ack_err_flag = 1;
if (!tcp_validate_incoming(sk, skb, th, 1)) {
	ack_err_flag = 0;
	return;
}

step5:
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;

tcp_rcv_rtt_measure_ts(sk, skb);

/* Process urgent data. */
tcp_urg(sk, skb, th);

/* step 7: process the segment text */
tcp_data_queue(sk, skb);

tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
return;

csum_error:
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);

discard:
tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);
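
The "0xS?10 << 16 + snd_wnd" encoding in the pred_flags comment above is easier to see with concrete numbers. The standalone sketch below (not part of the kernel source; build_pred_flags() is a made-up helper name) packs the word the same way __tcp_fast_path_on() does later in this file: the data offset in the top nibble, the ACK bit in the flags byte, and the expected window in the low 16 bits.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Illustration only (not kernel code): pack the header-prediction word
 * the way the "0xS?10 << 16 + snd_wnd" comment describes. 'S' is the
 * data offset (tcp_header_len >> 2) in the top nibble, 0x10 in the
 * flags byte is ACK, and the low 16 bits hold the expected window.
 */
static uint32_t build_pred_flags(uint32_t tcp_header_len, uint16_t snd_wnd)
{
	return htonl((tcp_header_len << 26) |	/* doff nibble */
		     (0x10u << 16) |		/* ACK flag */
		     snd_wnd);			/* expected window */
}

int main(void)
{
	/* 20-byte base header + 12-byte aligned timestamp option, window 0xffff */
	uint32_t pf = build_pred_flags(32, 0xffff);

	printf("pred_flags = 0x%08x\n", (unsigned)ntohl(pf));	/* 0x8010ffff */
	return 0;
}

Running it prints 0x8010ffff, i.e. S = 8 for the 32-byte header, ? = 0, the 0x10 ACK nibble, and the unscaled window, which is exactly what the fast-path comparison against tcp_flag_word(th) expects.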

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);

tcp_set_state(sk, TCP_ESTABLISHED);
icsk->icsk_ack.lrcvtime = tcp_jiffies32;

if (skb) {
	icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
	security_inet_conn_established(sk, skb);
	sk_mark_napi_id(sk, skb);
}

tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);

/* Prevent spurious tcp_cwnd_restart() on first data
 * packet.
 */
tp->lsndtime = tcp_jiffies32;

if (sock_flag(sk, SOCK_KEEPOPEN))
	inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

if (!tp->rx_opt.snd_wscale)
	__tcp_fast_path_on(tp, tp->snd_wnd);
else
	tp->pred_flags = 0;

}
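
tcp_finish_connect() only arms the keepalive timer when SOCK_KEEPOPEN is set on the socket. As a reminder of how that flag and the idle time normally get there from userspace, here is a minimal sketch using the standard SO_KEEPALIVE / TCP_KEEPIDLE / TCP_KEEPINTVL / TCP_KEEPCNT socket options; enable_keepalive() is an illustrative name and error handling is omitted.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Sketch: request keepalive so that tcp_finish_connect() arms the
 * keepalive timer once the socket reaches ESTABLISHED. Values are
 * examples only; error handling omitted.
 */
static void enable_keepalive(int fd)
{
	int on = 1;
	int idle = 60;		/* seconds idle before the first probe */
	int intvl = 10;		/* seconds between probes */
	int cnt = 5;		/* unanswered probes before the socket is dropped */

	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
}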

static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
struct tcp_fastopen_cookie *cookie)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;

if (mss == tp->rx_opt.user_mss) {
	struct tcp_options_received opt;

	/* Get original SYNACK MSS value if user MSS sets mss_clamp */
	tcp_clear_options(&opt);
	opt.user_mss = opt.mss_clamp = 0;
	tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
	mss = opt.mss_clamp;
}

if (!tp->syn_fastopen) {
	/* Ignore an unsolicited cookie */
	cookie->len = -1;
} else if (tp->total_retrans) {
	/* SYN timed out and the SYN-ACK neither has a cookie nor
	 * acknowledges data. Presumably the remote received only
	 * the retransmitted (regular) SYNs: either the original
	 * SYN-data or the corresponding SYN-ACK was dropped.
	 */
	syn_drop = (cookie->len < 0 && data);
} else if (cookie->len < 0 && !tp->syn_data) {
	/* We requested a cookie but didn't get it. If we did not use
	 * the (old) exp opt format then try so next time (try_exp=1).
	 * Otherwise we go back to use the RFC7413 opt (try_exp=2).
	 */
	try_exp = tp->syn_fastopen_exp ? 2 : 1;
}

tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);

if (data) { /* Retransmit unacked data in SYN */
	skb_rbtree_walk_from(data) {
		if (__tcp_retransmit_skb(sk, data, 1))
			break;
	}
	tcp_rearm_rto(sk);
	NET_INC_STATS(sock_net(sk),
			LINUX_MIB_TCPFASTOPENACTIVEFAIL);
	return true;
}
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked) {
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
	/* SYN-data is counted as two separate packets in tcp_ack() */
	if (tp->delivered > 1)
		--tp->delivered;
}

tcp_fastopen_add_skb(sk, synack);

return false;

}
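
tcp_rcv_fastopen_synack() runs on the active side when the connection was opened with data in the SYN (tp->syn_data / tp->syn_fastopen). A minimal userspace sketch of how that path is typically exercised follows; it assumes the client bit of net.ipv4.tcp_fastopen is enabled, and fastopen_connect_send() is an illustrative helper name, not a kernel or libc API.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000		/* fallback for older headers */
#endif

/* Sketch: sendto() with MSG_FASTOPEN performs the connect and, when a
 * Fast Open cookie is cached, carries 'payload' in the SYN, which is
 * what sets tp->syn_data on the client side. 'dst' must already be a
 * filled-in peer address; no error handling.
 */
static ssize_t fastopen_connect_send(int fd, const struct sockaddr_in *dst,
				     const char *payload)
{
	return sendto(fd, payload, strlen(payload), MSG_FASTOPEN,
		      (const struct sockaddr *)dst, sizeof(*dst));
}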

static void smc_check_reset_syn(struct tcp_sock *tp)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (tp->syn_smc && !tp->rx_opt.smc_ok)
			tp->syn_smc = 0;
	}
#endif
}

static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
bool fastopen_fail;

tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
	tp->rx_opt.rcv_tsecr -= tp->tsoffset;

if (th->ack) {
	/* rfc793:
	 * "If the state is SYN-SENT then
	 *    first check the ACK bit
	 *      If the ACK bit is set
	 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
	 *        a reset (unless the RST bit is set, if so drop
	 *        the segment and return)"
	 */
	if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
	    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
		goto reset_and_undo;

	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
	    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
		     tcp_time_stamp(tp))) {
		NET_INC_STATS(sock_net(sk),
				LINUX_MIB_PAWSACTIVEREJECTED);
		goto reset_and_undo;
	}

	/* Now ACK is acceptable.
	 *
	 * "If the RST bit is set
	 *    If the ACK was acceptable then signal the user "error:
	 *    connection reset", drop the segment, enter CLOSED state,
	 *    delete TCB, and return."
	 */

	if (th->rst) {
		tcp_reset(sk);
		goto discard;
	}

	/* rfc793:
	 *   "fifth, if neither of the SYN or RST bits is set then
	 *    drop the segment and return."
	 *
	 *    See note below!
	 *                                        --ANK(990513)
	 */
	if (!th->syn)
		goto discard_and_undo;

	/* rfc793:
	 *   "If the SYN bit is on ...
	 *    are acceptable then ...
	 *    (our SYN has been ACKed), change the connection
	 *    state to ESTABLISHED..."
	 */

	tcp_ecn_rcv_synack(tp, th);

	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
	tcp_ack(sk, skb, FLAG_SLOWPATH);

	/* Ok.. it's good. Set up sequence numbers and
	 * move to established.
	 */
	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

	/* RFC1323: The window in SYN & SYN/ACK segments is
	 * never scaled.
	 */
	tp->snd_wnd = ntohs(th->window);

	if (!tp->rx_opt.wscale_ok) {
		tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
		tp->window_clamp = min(tp->window_clamp, 65535U);
	}

	if (tp->rx_opt.saw_tstamp) {
		tp->rx_opt.tstamp_ok	   = 1;
		tp->tcp_header_len =
			sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		tp->advmss	    -= TCPOLEN_TSTAMP_ALIGNED;
		tcp_store_ts_recent(tp);
	} else {
		tp->tcp_header_len = sizeof(struct tcphdr);
	}

	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
	tcp_initialize_rcv_mss(sk);

	/* Remember, tcp_poll() does not lock socket!
	 * Change state from SYN-SENT only after copied_seq
	 * is initialized. */
	tp->copied_seq = tp->rcv_nxt;

	smc_check_reset_syn(tp);

	smp_mb();

	tcp_finish_connect(sk, skb);

	fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
			tcp_rcv_fastopen_synack(sk, skb, &foc);

	if (!sock_flag(sk, SOCK_DEAD)) {
		sk->sk_state_change(sk);
		sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
	}
	if (fastopen_fail)
		return -1;
	if (sk->sk_write_pending ||
	    icsk->icsk_accept_queue.rskq_defer_accept ||
	    icsk->icsk_ack.pingpong) {
		/* Save one ACK. Data will be ready after
		 * several ticks, if write_pending is set.
		 *
		 * It may be deleted, but with this feature tcpdumps
		 * look so _wonderfully_ clever, that I was not able
		 * to stand against the temptation 8)     --ANK
		 */
		inet_csk_schedule_ack(sk);
		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
					  TCP_DELACK_MAX, TCP_RTO_MAX);

discard:
		tcp_drop(sk, skb);
		return 0;
	} else {
		tcp_send_ack(sk);
	}
	return -1;
}

/* No ACK in the segment */

if (th->rst) {
	/* rfc793:
	 * "If the RST bit is set
	 *
	 *      Otherwise (no ACK) drop the segment and return."
	 */

	goto discard_and_undo;
}

/* PAWS check. */
if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
    tcp_paws_reject(&tp->rx_opt, 0))
	goto discard_and_undo;

if (th->syn) {
	/* We see SYN without ACK. It is attempt of
	 * simultaneous connect with crossed SYNs.
	 * Particularly, it can be connect to self.
	 */
	tcp_set_state(sk, TCP_SYN_RECV);

	if (tp->rx_opt.saw_tstamp) {
		tp->rx_opt.tstamp_ok = 1;
		tcp_store_ts_recent(tp);
		tp->tcp_header_len =
			sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
	} else {
		tp->tcp_header_len = sizeof(struct tcphdr);
	}

	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
	tp->copied_seq = tp->rcv_nxt;
	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

	/* RFC1323: The window in SYN & SYN/ACK segments is
	 * never scaled.
	 */
	tp->snd_wnd    = ntohs(th->window);
	tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
	tp->max_window = tp->snd_wnd;

	tcp_ecn_rcv_syn(tp, th);

	tcp_mtup_init(sk);
	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
	tcp_initialize_rcv_mss(sk);

	tcp_send_synack(sk);

#if 0
	/* Note, we could accept data and URG from this segment.
	 * There are no obstacles to make this (except that we must
	 * either change tcp_recvmsg() to prevent it from returning data
	 * before 3WHS completes per RFC793, or employ TCP Fast Open).
	 *
	 * However, if we ignore data in ACKless segments sometimes,
	 * we have no reasons to accept it sometimes.
	 * Also, seems the code doing it in step6 of tcp_rcv_state_process
	 * is not flawless. So, discard packet for sanity.
	 * Uncomment this return to process the data.
	 */
	return -1;
#else
	goto discard;
#endif
}

/* "fifth, if neither of the SYN or RST bits is set then
 * drop the segment and return."
 */

discard_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
goto discard;

reset_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
return 1;
}
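
The SYN-SENT ACK checks above ("If SEG.ACK =< ISS, or SEG.ACK > SND.NXT") rely on the kernel's wrap-safe before()/after() sequence comparisons. The standalone sketch below mirrors that trick with hypothetical seq_before()/seq_after() helpers to show that the ordering still comes out right when the 32-bit sequence space wraps.

#include <stdio.h>
#include <stdint.h>

/* Wrap-safe sequence comparison in the style of the kernel's
 * before()/after(): subtract modulo 2^32 and look at the sign,
 * so the ordering survives a sequence-number wrap.
 */
static int seq_before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}
#define seq_after(seq2, seq1)	seq_before(seq1, seq2)

int main(void)
{
	uint32_t iss = 0xfffffff0u;	/* our ISS, close to wrap-around */
	uint32_t ack = iss + 100;	/* peer's SEG.ACK, wrapped to 0x54 */

	/* SYN-SENT acceptability wants ISS < SEG.ACK <= SND.NXT */
	printf("seq_after(ack, iss)  = %d\n", seq_after(ack, iss));	/* 1 */
	printf("seq_before(ack, iss) = %d\n", seq_before(ack, iss));	/* 0 */
	return 0;
}

Even though ack is numerically smaller than iss after the wrap, the signed difference says it is "after", which is what keeps the acceptability test working near the wrap point.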

/*
 * This function implements the receiving procedure of RFC 793 for
 * all states except ESTABLISHED and TIME_WAIT.
 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 * address independent.
 */

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;
bool acceptable;

switch (sk->sk_state) {
case TCP_CLOSE:
	goto discard;

case TCP_LISTEN:
	if (th->ack)
		return 1;

	if (th->rst)
		goto discard;

	if (th->syn) {
		if (th->fin)
			goto discard;
		/* It is possible that we process SYN packets from backlog,
		 * so we need to make sure to disable BH and RCU right there.
		 */
		rcu_read_lock();
		local_bh_disable();
		acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
		local_bh_enable();
		rcu_read_unlock();

		if (!acceptable)
			return 1;
		consume_skb(skb);
		return 0;
	}
	goto discard;

case TCP_SYN_SENT:
	tp->rx_opt.saw_tstamp = 0;
	tcp_mstamp_refresh(tp);
	queued = tcp_rcv_synsent_state_process(sk, skb, th);
	if (queued >= 0)
		return queued;

	/* Do step6 onward by hand. */
	tcp_urg(sk, skb, th);
	__kfree_skb(skb);
	tcp_data_snd_check(sk);
	return 0;
}

tcp_mstamp_refresh(tp);
tp->rx_opt.saw_tstamp = 0;
req = tp->fastopen_rsk;
if (req) {
	bool req_stolen;

	WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
	    sk->sk_state != TCP_FIN_WAIT1);

	if (!tcp_check_req(sk, skb, req, true, &req_stolen))
		goto discard;
}

if (!th->ack && !th->rst && !th->syn)
	goto discard;

if (!tcp_validate_incoming(sk, skb, th, 0))
	return 0;

/* step 5: check the ACK field */
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
			      FLAG_UPDATE_TS_RECENT |
			      FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
	if (sk->sk_state == TCP_SYN_RECV)
		return 1;	/* send one RST */
	tcp_send_challenge_ack(sk, skb);
	goto discard;
}
switch (sk->sk_state) {
case TCP_SYN_RECV:
	tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
	if (!tp->srtt_us)
		tcp_synack_rtt_meas(sk, req);

	/* Once we leave TCP_SYN_RECV, we no longer need req
	 * so release it.
	 */
	if (req) {
		inet_csk(sk)->icsk_retransmits = 0;
		reqsk_fastopen_remove(sk, req, false);
		/* Re-arm the timer because data may have been sent out.
		 * This is similar to the regular data transmission case
		 * when new data has just been ack'ed.
		 *
		 * (TFO) - we could try to be more aggressive and
		 * retransmitting any data sooner based on when they
		 * are sent out.
		 */
		tcp_rearm_rto(sk);
	} else {
		tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
		tp->copied_seq = tp->rcv_nxt;
	}
	smp_mb();
	tcp_set_state(sk, TCP_ESTABLISHED);
	sk->sk_state_change(sk);

	/* Note, that this wakeup is only for marginal crossed SYN case.
	 * Passively open sockets are not waked up, because
	 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
	 */
	if (sk->sk_socket)
		sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
	tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

	if (tp->rx_opt.tstamp_ok)
		tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

	if (!inet_csk(sk)->icsk_ca_ops->cong_control)
		tcp_update_pacing_rate(sk);

	/* Prevent spurious tcp_cwnd_restart() on first data packet */
	tp->lsndtime = tcp_jiffies32;

	tcp_initialize_rcv_mss(sk);
	tcp_fast_path_on(tp);
	break;

case TCP_FIN_WAIT1: {
	int tmo;

	/* If we enter the TCP_FIN_WAIT1 state and we are a
	 * Fast Open socket and this is the first acceptable
	 * ACK we have received, this would have acknowledged
	 * our SYNACK so stop the SYNACK timer.
	 */
	if (req) {
		/* We no longer need the request sock. */
		reqsk_fastopen_remove(sk, req, false);
		tcp_rearm_rto(sk);
	}
	if (tp->snd_una != tp->write_seq)
		break;

	tcp_set_state(sk, TCP_FIN_WAIT2);
	sk->sk_shutdown |= SEND_SHUTDOWN;

	sk_dst_confirm(sk);

	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Wake up lingering close() */
		sk->sk_state_change(sk);
		break;
	}

	if (tp->linger2 < 0) {
		tcp_done(sk);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
		return 1;
	}
	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
	    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
		/* Receive out of order FIN after close() */
		if (tp->syn_fastopen && th->fin)
			tcp_fastopen_active_disable(sk);
		tcp_done(sk);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
		return 1;
	}

	tmo = tcp_fin_time(sk);
	if (tmo > TCP_TIMEWAIT_LEN) {
		inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
	} else if (th->fin || sock_owned_by_user(sk)) {
		/* Bad case. We could lose such FIN otherwise.
		 * It is not a big problem, but it looks confusing
		 * and not so rare event. We still can lose it now,
		 * if it spins in bh_lock_sock(), but it is really
		 * marginal case.
		 */
		inet_csk_reset_keepalive_timer(sk, tmo);
	} else {
		tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
		goto discard;
	}
	break;
}

case TCP_CLOSING:
	if (tp->snd_una == tp->write_seq) {
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
		goto discard;
	}
	break;

case TCP_LAST_ACK:
	if (tp->snd_una == tp->write_seq) {
		tcp_update_metrics(sk);
		tcp_done(sk);
		goto discard;
	}
	break;
}

/* step 6: check the URG bit */
tcp_urg(sk, skb, th);

/* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
		break;
	/* fall through */
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
	/* RFC 793 says to queue data in these states,
	 * RFC 1122 says we MUST send a reset.
	 * BSD 4.4 also does reset.
	 */
	if (sk->sk_shutdown & RCV_SHUTDOWN) {
		if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
		    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
			tcp_reset(sk);
			return 1;
		}
	}
	/* Fall through */
case TCP_ESTABLISHED:
	tcp_data_queue(sk, skb);
	queued = 1;
	break;
}

/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
	tcp_data_snd_check(sk);
	tcp_ack_snd_check(sk);
}

if (!queued) {
discard:
	tcp_drop(sk, skb);
}
return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
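
In the TCP_FIN_WAIT1 branch above, tp->linger2 decides whether the socket waits in FIN-WAIT-2 for tcp_fin_time() or is torn down immediately when it is negative. A short sketch of how that field is usually tuned per socket with the standard TCP_LINGER2 option (set_fin_wait2_timeout() is an illustrative name):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Sketch: tp->linger2 defaults to net.ipv4.tcp_fin_timeout but can be
 * overridden per socket with TCP_LINGER2. A negative value takes the
 * tp->linger2 < 0 path above and closes the socket instead of waiting
 * in FIN-WAIT-2. Error handling omitted.
 */
static void set_fin_wait2_timeout(int fd, int seconds)
{
	setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &seconds, sizeof(seconds));
}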

static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
struct inet_request_sock *ireq = inet_rsk(req);

if (family == AF_INET)
	net_dbg_ratelimited("drop open request from %pI4/%u\n",
			    &ireq->ir_rmt_addr, port);

#if IS_ENABLED(CONFIG_IPV6)
	else if (family == AF_INET6)
		net_dbg_ratelimited("drop open request from %pI6/%u\n",
				    &ireq->ir_v6_rmt_addr, port);
#endif
}

/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
 *
 * If we receive a SYN packet with these bits set, it means a
 * network is playing bad games with TOS bits. In order to
 * avoid possible false congestion notifications, we disable
 * TCP ECN negotiation.
 *
 * Exception: tcp_ca wants ECN. This is required for DCTCP
 * congestion control: Linux DCTCP asserts ECT on all packets,
 * including SYN, which is most optimal solution; however,
 * others, such as FreeBSD do not.
 */
static void tcp_ecn_create_request(struct request_sock *req,
				   const struct sk_buff *skb,
				   const struct sock *listen_sk,
				   const struct dst_entry *dst)
{
	const struct tcphdr *th = tcp_hdr(skb);
	const struct net *net = sock_net(listen_sk);
	bool th_ecn = th->ece && th->cwr;
	bool ect, ecn_ok;
	u32 ecn_ok_dst;

	if (!th_ecn)
		return;

	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;

	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
	    (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
	    tcp_bpf_ca_needs_ecn((struct sock *)req))
		inet_rsk(req)->ecn_ok = 1;
}
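
The ECT test in tcp_ecn_create_request() boils down to looking at the two ECN bits of the IP dsfield: if the SYN carries any ECT/CE codepoint, plain ECN negotiation is skipped unless the congestion control explicitly wants ECN. The standalone sketch below (is_not_ect() is a made-up stand-in for the kernel's INET_ECN_is_not_ect()) shows which ToS values the check treats as clean.

#include <stdio.h>
#include <stdint.h>

/* The two low bits of the IP ToS/dsfield carry the ECN codepoint:
 * 00 = Not-ECT, 01 = ECT(1), 10 = ECT(0), 11 = CE.
 * is_not_ect() mirrors the kernel's INET_ECN_is_not_ect() test.
 */
#define ECN_MASK	0x03u
#define ECN_NOT_ECT	0x00u

static int is_not_ect(uint8_t dsfield)
{
	return (dsfield & ECN_MASK) == ECN_NOT_ECT;
}

int main(void)
{
	printf("ToS 0x00 -> not ECT: %d\n", is_not_ect(0x00));	/* 1 */
	printf("ToS 0x02 -> not ECT: %d\n", is_not_ect(0x02));	/* 0, ECT(0) */
	printf("ToS 0x03 -> not ECT: %d\n", is_not_ect(0x03));	/* 0, CE */
	return 0;
}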

static void tcp_openreq_init(struct request_sock *req,
const struct tcp_options_received *rx_opt,
struct sk_buff *skb, const struct sock *sk)
{
struct inet_request_sock *ireq = inet_rsk(req);

req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
tcp_rsk(req)->snt_synack = tcp_clock_us();
tcp_rsk(req)->last_oow_ack_time = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);

#if IS_ENABLED(CONFIG_SMC)
ireq->smc_ok = rx_opt->smc_ok;
#endif
}

struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
struct sock *sk_listener,
bool attach_listener)
{
struct request_sock *req = reqsk_alloc(ops, sk_listener,
attach_listener);

if (req) {
	struct inet_request_sock *ireq = inet_rsk(req);

	ireq->ireq_opt = NULL;

#if IS_ENABLED(CONFIG_IPV6)
	ireq->pktopts = NULL;
#endif
	atomic64_set(&ireq->ir_cookie, 0);
	ireq->ireq_state = TCP_NEW_SYN_RECV;
	write_pnet(&ireq->ireq_net, sock_net(sk_listener));
	ireq->ireq_family = sk_listener->sk_family;
}

return req;

}
EXPORT_SYMBOL(inet_reqsk_alloc);

/*
 * Return true if a syncookie should be sent
 */
static bool tcp_syn_flood_action(const struct sock *sk,
				 const struct sk_buff *skb,
				 const char *proto)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct net *net = sock_net(sk);

#ifdef CONFIG_SYN_COOKIES
	if (net->ipv4.sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

if (!queue->synflood_warned &&
    net->ipv4.sysctl_tcp_syncookies != 2 &&
    xchg(&queue->synflood_warned, 1) == 0)
	net_info_ratelimited("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			     proto, ntohs(tcp_hdr(skb)->dest), msg);

return want_cookie;

}

static void tcp_reqsk_record_syn(const struct sock *sk,
struct request_sock *req,
const struct sk_buff *skb)
{
if (tcp_sk(sk)->save_syn) {
	u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
	u32 *copy;

	copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
	if (copy) {
		copy[0] = len;
		memcpy(&copy[1], skb_network_header(skb), len);
		req->saved_syn = copy;
	}
}

}
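
tcp_reqsk_record_syn() only copies the SYN headers when the listener opted in via save_syn. From userspace that normally means the TCP_SAVE_SYN / TCP_SAVED_SYN socket options (Linux 4.2+); the sketch below assumes headers that define them, and both helper names are illustrative.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Sketch: opt in on the listener so tcp_reqsk_record_syn() keeps the
 * SYN headers, then read them back on the accepted socket. Error
 * handling trimmed.
 */
static void save_syn_on_listener(int listen_fd)
{
	int one = 1;

	setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN, &one, sizeof(one));
}

static socklen_t read_saved_syn(int conn_fd, unsigned char *buf, socklen_t len)
{
	/* On success, buf holds the network + TCP headers of the SYN. */
	if (getsockopt(conn_fd, IPPROTO_TCP, TCP_SAVED_SYN, buf, &len) < 0)
		return 0;
	return len;
}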

int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
{
struct tcp_fastopen_cookie foc = { .len = -1 };
__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
bool want_cookie = false;
struct dst_entry *dst;
struct flowi fl;

/* TW buckets are converted to open requests without
 * limitations, they conserve resources and peer is
 * evidently real one.
 */
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
	want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
	if (!want_cookie)
		goto drop;
}

if (sk_acceptq_is_full(sk)) {
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
	goto drop;
}

req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
	goto drop;

tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;

tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
tmp_opt.user_mss  = tp->rx_opt.user_mss;
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
		  want_cookie ? NULL : &foc);

if (want_cookie && !tmp_opt.saw_tstamp)
	tcp_clear_options(&tmp_opt);

if (IS_ENABLED(CONFIG_SMC) && want_cookie)
	tmp_opt.smc_ok = 0;

tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

af_ops->init_req(req, sk, skb);

if (security_inet_conn_request(sk, skb, req))
	goto drop_and_free;

if (tmp_opt.tstamp_ok)
	tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);

dst = af_ops->route_req(sk, &fl, req);
if (!dst)
	goto drop_and_free;

if (!want_cookie && !isn) {
	/* Kill the following clause, if you dislike this way. */
	if (!net->ipv4.sysctl_tcp_syncookies &&
	    (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
	     (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
	    !tcp_peer_is_proven(req, dst)) {
		/* Without syncookies last quarter of
		 * backlog is filled with destinations,
		 * proven to be alive.
		 * It means that we continue to communicate
		 * to destinations, already remembered
		 * to the moment of synflood.
		 */
		pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
			    rsk_ops->family);
		goto drop_and_release;
	}

	isn = af_ops->init_seq(skb);
}

tcp_ecn_create_request(req, skb, sk, dst);

if (want_cookie) {
	isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
	req->cookie_ts = tmp_opt.tstamp_ok;
	if (!tmp_opt.tstamp_ok)
		inet_rsk(req)->ecn_ok = 0;
}

tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
	tcp_reqsk_record_syn(sk, req, skb);
	fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
}
if (fastopen_sk) {
	af_ops->send_synack(fastopen_sk, dst, &fl, req,
			    &foc, TCP_SYNACK_FASTOPEN);
	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
	sk->sk_data_ready(sk);
	bh_unlock_sock(fastopen_sk);
	sock_put(fastopen_sk);
} else {
	tcp_rsk(req)->tfo_listener = false;
	if (!want_cookie)
		inet_csk_reqsk_queue_hash_add(sk, req,
			tcp_timeout_init((struct sock *)req));
	af_ops->send_synack(sk, dst, &fl, req, &foc,
			    !want_cookie ? TCP_SYNACK_NORMAL :
					   TCP_SYNACK_COOKIE);
	if (want_cookie) {
		reqsk_free(req);
		return 0;
	}
}
reqsk_put(req);
return 0;

drop_and_release:
dst_release(dst);
drop_and_free:
reqsk_free(req);
drop:
tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_conn_request);
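
The tcp_try_fastopen() call in tcp_conn_request() is only reached for listeners that opted in. A minimal sketch of the usual server-side setup with the standard TCP_FASTOPEN option follows; enable_fastopen_listener() is an illustrative name, and the server bit of net.ipv4.tcp_fastopen must also be enabled.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Sketch: opt a listener into Fast Open so tcp_try_fastopen() can
 * create the child socket directly from the SYN. The value is the
 * maximum number of pending Fast Open requests; error handling omitted.
 */
static void enable_fastopen_listener(int listen_fd)
{
	int qlen = 16;

	setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
}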
