Tcp_Ip测试

蜗牛的编码旅程

于 2023-06-15 17:33:53 发布

阅读量182

点赞数

分类专栏： Linux内核学习文章标签： tcp/ip 网络协议网络

本文链接：https://blog.csdn.net/m0_37605465/article/details/131232655

版权

Linux内核学习专栏收录该内容

3 篇文章 0 订阅

订阅专栏

/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; / Flag to (re)transmit to recover losses */
u32 prior_fack;

sack_state.first_sackt = 0;
sack_state.rate = &rs;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);

/* If the ack is older than previous acks
 * then we can probably ignore it.
 */
if (before(ack, prior_snd_una)) {
	/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
	if (before(ack, prior_snd_una - tp->max_window)) {
		if (!(flag & FLAG_NO_CHALLENGE_ACK))
			tcp_send_challenge_ack(sk, skb);
		return -1;
	}
	goto old_ack;
}

/* If the ack includes data we haven't sent yet, discard
 * this segment (RFC793 Section 3.9).
 */
if (after(ack, tp->snd_nxt))
	goto invalid_ack;

if (after(ack, prior_snd_una)) {
	flag |= FLAG_SND_UNA_ADVANCED;
	icsk->icsk_retransmits = 0;

#if IS_ENABLED(CONFIG_TLS_DEVICE)
if (static_branch_unlikely(&clean_acked_data_enabled))
if (icsk->icsk_clean_acked)
icsk->icsk_clean_acked(sk, ack);
#endif
}

prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);

/* ts_recent update must be made after we are sure that the packet
 * is in window.
 */
if (flag & FLAG_UPDATE_TS_RECENT)
	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);

if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
	/* Window is constant, pure forward advance.
	 * No more checks are required.
	 * Note, we use the fact that SND.UNA>=SND.WL2.
	 */
	tcp_update_wl(tp, ack_seq);
	tcp_snd_una_update(tp, ack);
	flag |= FLAG_WIN_UPDATE;

	tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
	u32 ack_ev_flags = CA_ACK_SLOWPATH;

	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
		flag |= FLAG_DATA;
	else
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);

	flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);

	if (TCP_SKB_CB(skb)->sacked)
		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
						&sack_state);

	if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
		flag |= FLAG_ECE;
		ack_ev_flags |= CA_ACK_ECE;
	}

	if (flag & FLAG_WIN_UPDATE)
		ack_ev_flags |= CA_ACK_WIN_UPDATE;

	tcp_in_ack_event(sk, ack_ev_flags);
}

/* We passed data and got it acked, remove any soft error
 * log. Something worked...
 */
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
	goto no_queue;

/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);

tcp_rack_update_reo_wnd(sk, &rs);

if (tp->tlp_high_seq)
	tcp_process_tlp_ack(sk, ack, flag);
/* If needed, reset TLP/RTO timer; RACK may later override this. */
if (flag & FLAG_SET_XMIT_TIMER)
	tcp_set_xmit_timer(sk);

if (tcp_ack_is_dubious(sk, flag)) {
	is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
	tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
			      &rexmit);
}

if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
	sk_dst_confirm(sk);

delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost;			/* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;

no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. /
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
}
/ If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
tcp_ack_probe(sk);

if (tp->tlp_high_seq)
	tcp_process_tlp_ack(sk, ack, flag);
return 1;

invalid_ack:
SOCK_DEBUG(sk, “Ack %u after %u:%u\n”, ack, tp->snd_una, tp->snd_nxt);
return -1;

old_ack:
/* If data was SACKed, tag it and see if we should send more data.
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
tcp_xmit_recovery(sk, rexmit);
}

SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;

}
/* Tell whether the segment acknowledges data beyond tp->snd_nxt.
 *
 * Returns 1 when the ACK number of @skb is after snd_nxt, 0 otherwise.
 */
static int tcp_ack_check_unaccess(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const u32 acked = TCP_SKB_CB(skb)->ack_seq;

	return after(acked, tp->snd_nxt) ? 1 : 0;
}
/* Store a received Fast Open cookie option into @foc.
 *
 * @len:     cookie length in bytes (option size minus the option header)
 * @cookie:  start of the cookie bytes inside the TCP header
 * @syn:     whether the segment has SYN set
 * @foc:     destination cookie; nothing is done when it is NULL
 * @exp_opt: stored into foc->exp
 *
 * A length within [TCP_FASTOPEN_COOKIE_MIN, TCP_FASTOPEN_COOKIE_MAX] copies
 * the cookie; a length of 0 is recorded as is; any other length is recorded
 * as -1.
 */
static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
				      bool syn, struct tcp_fastopen_cookie *foc,
				      bool exp_opt)
{
	/* Valid only in SYN or SYN-ACK with an even length. */
	if (!foc || !syn || len < 0 || (len & 1))
		return;

	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
	    len <= TCP_FASTOPEN_COOKIE_MAX)
		memcpy(foc->val, cookie, len);
	else if (len != 0)
		len = -1;
	foc->len = len;
	foc->exp = exp_opt;
}

/* Set opt_rx->smc_ok when an experimental option carries the SMC magic.
 *
 * Only acts when CONFIG_SMC is enabled and the tcp_have_smc static branch is
 * on; the segment must be a SYN, @opsize must be even and at least
 * TCPOLEN_EXP_SMC_BASE, and the first 32 bits at @ptr must equal
 * TCPOPT_SMC_MAGIC.
 */
static void smc_parse_options(const struct tcphdr *th,
			      struct tcp_options_received *opt_rx,
			      const unsigned char *ptr,
			      int opsize)
{
#if IS_ENABLED(CONFIG_SMC)
	if (!static_branch_unlikely(&tcp_have_smc))
		return;

	/* The size checks come first so the 4-byte read below is covered. */
	if (!th->syn || (opsize & 1) || opsize < TCPOLEN_EXP_SMC_BASE)
		return;

	if (get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
		opt_rx->smc_ok = 1;
#endif
}

/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
 * the fast version below fails.
 *
 * @net:    namespace whose ipv4 sysctls gate window scaling, timestamps
 *          and SACK
 * @skb:    segment whose TCP header options are walked
 * @opt_rx: receives the parsed option state; saw_tstamp is cleared on entry
 * @estab:  non-zero suppresses the SYN-only options (MSS, window scale,
 *          SACK-permitted) and makes timestamps depend on opt_rx->tstamp_ok
 * @foc:    Fast Open cookie to fill; may be NULL, in which case the Fast
 *          Open helper does nothing
 *
 * Parsing stops silently at EOL, at a malformed option (size < 2), or at an
 * option that would run past the end of the option area.
 */
void tcp_parse_options(const struct net *net,
		       const struct sk_buff *skb,
		       struct tcp_options_received *opt_rx, int estab,
		       struct tcp_fastopen_cookie *foc)
{
	const unsigned char *ptr;
	const struct tcphdr *th = tcp_hdr(skb);
	int length = (th->doff * 4) - sizeof(struct tcphdr);

	ptr = (const unsigned char *)(th + 1);
	opt_rx->saw_tstamp = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return;
		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
			length--;
			continue;
		default:
			/* A kind byte with no room left for its length byte:
			 * reading opsize would step past the option area.
			 */
			if (length < 2)
				return;
			opsize = *ptr++;
			if (opsize < 2) /* "silly options" */
				return;
			if (opsize > length)
				return;	/* don't parse partial options */
			switch (opcode) {
			case TCPOPT_MSS:
				if (opsize == TCPOLEN_MSS && th->syn && !estab) {
					u16 in_mss = get_unaligned_be16(ptr);
					if (in_mss) {
						if (opt_rx->user_mss &&
						    opt_rx->user_mss < in_mss)
							in_mss = opt_rx->user_mss;
						opt_rx->mss_clamp = in_mss;
					}
				}
				break;
			case TCPOPT_WINDOW:
				if (opsize == TCPOLEN_WINDOW && th->syn &&
				    !estab && net->ipv4.sysctl_tcp_window_scaling) {
					__u8 snd_wscale = *(__u8 *)ptr;
					opt_rx->wscale_ok = 1;
					if (snd_wscale > TCP_MAX_WSCALE) {
						net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
								     __func__,
								     snd_wscale,
								     TCP_MAX_WSCALE);
						snd_wscale = TCP_MAX_WSCALE;
					}
					opt_rx->snd_wscale = snd_wscale;
				}
				break;
			case TCPOPT_TIMESTAMP:
				if ((opsize == TCPOLEN_TIMESTAMP) &&
				    ((estab && opt_rx->tstamp_ok) ||
				     (!estab && net->ipv4.sysctl_tcp_timestamps))) {
					opt_rx->saw_tstamp = 1;
					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
				}
				break;
			case TCPOPT_SACK_PERM:
				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
				    !estab && net->ipv4.sysctl_tcp_sack) {
					opt_rx->sack_ok = TCP_SACK_SEEN;
					tcp_sack_reset(opt_rx);
				}
				break;

			case TCPOPT_SACK:
				/* Only remember where the option starts
				 * (offset from the TCP header); the blocks
				 * themselves are not decoded here.
				 */
				if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
				   opt_rx->sack_ok) {
					TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
				}
				break;
#ifdef CONFIG_TCP_MD5SIG
			case TCPOPT_MD5SIG:
				/*
				 * The MD5 Hash has already been
				 * checked (see tcp_v{4,6}_do_rcv()).
				 */
				break;
#endif
			case TCPOPT_FASTOPEN:
				tcp_parse_fastopen_option(
					opsize - TCPOLEN_FASTOPEN_BASE,
					ptr, th->syn, foc, false);
				break;

			case TCPOPT_EXP:
				/* Fast Open option shares code 254 using a
				 * 16 bits magic number.
				 */
				if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
				    get_unaligned_be16(ptr) ==
				    TCPOPT_FASTOPEN_MAGIC)
					tcp_parse_fastopen_option(opsize -
						TCPOLEN_EXP_FASTOPEN_BASE,
						ptr + 2, th->syn, foc, true);
				else
					smc_parse_options(th, opt_rx, ptr,
							  opsize);
				break;

			}
			/* ptr already moved past kind and length bytes. */
			ptr += opsize-2;
			length -= opsize;
		}
	}
}
EXPORT_SYMBOL(tcp_parse_options);

/* Decode the option area when it is exactly NOP, NOP, TIMESTAMP.
 *
 * The first 32-bit word after the header is compared against the expected
 * NOP/NOP/kind/length pattern.  On a match, rcv_tsval and rcv_tsecr are
 * taken from the next two words (a non-zero echo value has tp->tsoffset
 * subtracted), saw_tstamp is set and true is returned.  Otherwise nothing is
 * modified and false is returned.
 */
static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
	const __be32 *opt = (const __be32 *)(th + 1);

	if (opt[0] != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
			    | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
		return false;

	tp->rx_opt.saw_tstamp = 1;
	tp->rx_opt.rcv_tsval = ntohl(opt[1]);
	tp->rx_opt.rcv_tsecr = opt[2] ? ntohl(opt[2]) - tp->tsoffset : 0;
	return true;
}

/* Fast parse options. This hopes to only see timestamps.

If it is wrong it falls back on tcp_parse_options().
*/
static bool tcp_fast_parse_options(const struct net *net,
const struct sk_buff *skb,
const struct tcphdr *th, struct tcp_sock tp)
{
/ In the spirit of fast parsing, compare doff directly to constant
- values. Because equality is used, short doff can be ignored here.
  */
  if (th->doff == (sizeof(*th) / 4)) {
  tp->rx_opt.saw_tstamp = 0;
  return false;
  } else if (tp->rx_opt.tstamp_ok &&
  th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
  if (tcp_parse_aligned_timestamp(tp, th))
  return true;
  }
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;

return true;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * Parse MD5 Signature option
 *
 * Walks the options of @th and returns a pointer to the payload of the
 * MD5SIG option when one with size TCPOLEN_MD5SIG is found.  Returns NULL
 * when EOL is reached, when an option is malformed or truncated, when the
 * MD5SIG option has a different size, or when no such option is present.
 */
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
	int length = (th->doff << 2) - sizeof(*th);
	const u8 *ptr = (const u8 *)(th + 1);

	/* If not enough data remaining, we can short cut */
	while (length >= TCPOLEN_MD5SIG) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return NULL;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return NULL;
			if (opcode == TCPOPT_MD5SIG)
				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
		}
		/* ptr already moved past kind and length bytes. */
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif

/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
 *
 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
 * it can pass through stack. So, the following predicate verifies that
 * this segment is not used for anything but congestion avoidance or
 * fast retransmit. Moreover, we even are able to eliminate most of such
 * second order effects, if we apply some small "replay" window (~RTO)
 * to timestamp space.
 *
 * All these measures still do not guarantee that we reject wrapped ACKs
 * on networks with high bandwidth, when sequence space is recycled fastly,
 * but it guarantees that such events will be very rare and do not affect
 * connection seriously. This doesn't look nice, but alas, PAWS is really
 * buggy extension.
 *
 * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
 * states that events when retransmit arrives after original data are rare.
 * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
 * the biggest problem on large power networks even with minor reordering.
 * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
 * up to bandwidth of 18Gigabit/sec. 8) ]
 *
 * Returns 1 when all four conditions below hold, 0 as soon as one fails.
 */
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
	u32 seq = TCP_SKB_CB(skb)->seq;
	u32 ack = TCP_SKB_CB(skb)->ack_seq;

	/* 1. Pure ACK with correct sequence number. */
	if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq || seq != tp->rcv_nxt)
		return 0;

	/* 2. ... and duplicate ACK. */
	if (ack != tp->snd_una)
		return 0;

	/* 3. ... and does not update window. */
	if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale))
		return 0;

	/* 4. ... and sits in replay window. */
	return (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ;
}

static inline bool tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);

return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
       !tcp_disordered_ack(sk, skb);

}

/* Check segment sequence number for validity.
 *
 * Segment controls are considered valid, if the segment
 * fits to the window after truncation to the window. Acceptability
 * of data (and SYN, FIN, of course) is checked separately.
 * See tcp_data_queue(), for example.
 *
 * Also, controls (RST is main one) are accepted using RCV.WUP instead
 * of RCV.NXT. Peer still did not advance his SND.UNA when we
 * delayed ACK, so that his SND.UNA <= our RCV.WUP.
 * (borrowed from freebsd)
 *
 * Returns true when @end_seq is not before rcv_wup and @seq is not after
 * rcv_nxt plus the current receive window.
 */
static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
	/* Segment ends before the left edge we accept controls from. */
	if (before(end_seq, tp->rcv_wup))
		return false;

	/* Segment starts beyond the right edge of the receive window. */
	if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
		return false;

	return true;
}

/* When we get a reset we do this.
 *
 * Picks the error reported on the socket from its current state, purges the
 * write queue, finishes the connection with tcp_done() and, unless the
 * socket is already dead, invokes its error-report callback.  A socket in
 * TCP_CLOSE is left untouched apart from the tracepoint.
 */
void tcp_reset(struct sock *sk)
{
	int err;

	trace_tcp_receive_reset(sk);

	/* We want the right error as BSD sees it (and indeed as we do). */
	switch (sk->sk_state) {
	case TCP_CLOSE:
		return;
	case TCP_SYN_SENT:
		err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		err = EPIPE;
		break;
	default:
		err = ECONNRESET;
		break;
	}
	sk->sk_err = err;

	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();

	tcp_write_queue_purge(sk);
	tcp_done(sk);

	if (sock_flag(sk, SOCK_DEAD))
		return;

	sk->sk_error_report(sk);
}

Process the FIN bit. This now behaves as it is supposed to work
and the FIN takes effect when it is validly part of sequence
space. Not before when we get holes.
If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
(and thence onto LAST-ACK and finally, CLOSE, we never enter
TIME-WAIT)
If we are in FINWAIT-1, a received FIN indicates simultaneous
close and we go into CLOSING (and later onto TIME-WAIT)
If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);

inet_csk_schedule_ack(sk);

sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);

switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
inet_csk(sk)->icsk_ack.pingpong = 1;
break;

case TCP_CLOSE_WAIT:
case TCP_CLOSING:
/* Received a retransmission of the FIN, do
* nothing.
/
break;
case TCP_LAST_ACK:
/ RFC793: Remain in the LAST-ACK state. */
break;

case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
/
tcp_send_ack(sk);
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/ Received a FIN – send ACK and enter TIME_WAIT. /
tcp_send_ack(sk);
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
break;
default:
/ Only TCP_LISTEN and TCP_CLOSE are left, in these
* cases we should never reach this piece of code.
*/
pr_err(“%s: Impossible, sk->sk_state=%d\n”,
func, sk->sk_state);
break;
}

/* It is possible, that we have something out-of-order after FIN.
- Probably, we should reset in this case. For now drop them.
  */
  skb_rbtree_purge(&tp->out_of_order_queue);
  if (tcp_is_sack(tp))
  tcp_sack_reset(&tp->rx_opt);
  sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
```
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
    sk->sk_state == TCP_CLOSE)
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
else
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
```
}
}

/* Grow SACK block @sp so that it also covers [@seq, @end_seq).
 *
 * The merge only happens when the new range overlaps or touches the block;
 * in that case the block's edges are widened as needed and true is returned.
 * Otherwise the block is left unchanged and false is returned.
 */
static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				  u32 end_seq)
{
	/* Range lies strictly beyond either edge: nothing to merge. */
	if (after(seq, sp->end_seq) || after(sp->start_seq, end_seq))
		return false;

	if (before(seq, sp->start_seq))
		sp->start_seq = seq;
	if (after(end_seq, sp->end_seq))
		sp->end_seq = end_seq;

	return true;
}

/* Record [@seq, @end_seq) as the D-SACK block to report.
 *
 * Does nothing unless SACK is in use on the socket and the tcp_dsack sysctl
 * is enabled.  Bumps the "old" or "ofo" D-SACK counter depending on whether
 * @seq is before rcv_nxt, then stores the range in duplicate_sack[0] and
 * sets rx_opt.dsack.
 */
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mib_idx;

	if (!tcp_is_sack(tp) || !sock_net(sk)->ipv4.sysctl_tcp_dsack)
		return;

	mib_idx = before(seq, tp->rcv_nxt) ? LINUX_MIB_TCPDSACKOLDSENT
					   : LINUX_MIB_TCPDSACKOFOSENT;
	NET_INC_STATS(sock_net(sk), mib_idx);

	tp->rx_opt.dsack = 1;
	tp->duplicate_sack[0].start_seq = seq;
	tp->duplicate_sack[0].end_seq = end_seq;
}

/* Add [@seq, @end_seq) to the pending D-SACK information.
 *
 * When a D-SACK block is already pending it is widened via
 * tcp_sack_extend(); otherwise a new one is started with tcp_dsack_set().
 */
static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->rx_opt.dsack)
		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
	else
		tcp_dsack_set(sk, seq, end_seq);
}

/* Answer a segment with an immediate ACK.
 *
 * When the segment carries sequence space (end_seq != seq) and starts before
 * rcv_nxt, the delayed-ACK-lost counter is bumped, quick-ACK mode is entered
 * and, if SACK and the tcp_dsack sysctl allow it, a D-SACK is recorded for
 * the part of the segment that lies before rcv_nxt.  An ACK is sent in all
 * cases.
 */
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const u32 seg_seq = TCP_SKB_CB(skb)->seq;
	const u32 seg_end = TCP_SKB_CB(skb)->end_seq;

	if (seg_end != seg_seq && before(seg_seq, tp->rcv_nxt)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);

		if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
			/* Clamp the reported range at rcv_nxt. */
			u32 dsack_end = after(seg_end, tp->rcv_nxt) ?
					tp->rcv_nxt : seg_end;

			tcp_dsack_set(sk, seg_seq, dsack_end);
		}
	}

	tcp_send_ack(sk);
}

/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 *
 * tcp_sack_maybe_coalesce() merges into the first SACK block every later
 * block that overlaps or touches it, compacting the array and lowering
 * rx_opt.num_sacks for each block absorbed.
 */
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
	int this_sack;
	struct tcp_sack_block *sp = &tp->selective_acks[0];
	struct tcp_sack_block *swalk = sp + 1;

	/* See if the recent change to the first SACK eats into
	 * or hits the sequence space of other SACK blocks, if so coalesce.
	 */
	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
			int i;

			/* Zap SWALK, by moving every further SACK up by one slot.
			 * Decrease num_sacks.
			 */
			tp->rx_opt.num_sacks--;
			for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				sp[i] = sp[i + 1];
			/* Re-test the same slot: it now holds the next block. */
			continue;
		}
		this_sack++, swalk++;
	}
}

/* Account for a newly queued out-of-order range [seq, end_seq) in the SACK
 * block array.
 *
 * If an existing block overlaps or touches the range it is extended, rotated
 * to slot 0 and, when more than one block exists, coalesced with the others.
 * Otherwise a new block is placed at slot 0 and the existing ones are shifted
 * down by one; when the array already holds TCP_NUM_SACKS blocks the last one
 * is dropped to make room.
 */
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sack_block *sp = &tp->selective_acks[0];
int cur_sacks = tp->rx_opt.num_sacks;
int this_sack;

/* No block yet: sp already points at slot 0, just fill it in. */
if (!cur_sacks)
	goto new_sack;

for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
	if (tcp_sack_extend(sp, seq, end_seq)) {
		/* Rotate this_sack to the first one. */
		for (; this_sack > 0; this_sack--, sp--)
			swap(*sp, *(sp - 1));
		if (cur_sacks > 1)
			tcp_sack_maybe_coalesce(tp);
		return;
	}
}

/* Could not find an adjacent existing SACK, build a new one,
 * put it at the front, and shift everyone else down.  We
 * always know there is at least one SACK present already here.
 *
 * If the sack array is full, forget about the last one.
 *
 * At this point this_sack == cur_sacks and sp points one past the
 * last used block.
 */
if (this_sack >= TCP_NUM_SACKS) {
	/* NOTE(review): presumably sends an ACK so pending compressed ACK
	 * state is flushed before the last block is forgotten - confirm.
	 */
	if (tp->compressed_ack)
		tcp_send_ack(sk);
	this_sack--;
	tp->rx_opt.num_sacks--;
	sp--;
}
/* Shift blocks down by one slot, walking from the end towards slot 0. */
for (; this_sack > 0; this_sack--, sp--)
	*sp = *(sp - 1);

new_sack:
/* Build the new head SACK, and we're done. */
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
}

/* RCV.NXT advances, some SACKs should be eaten.
 *
 * Drops every SACK block whose start is no longer ahead of rcv_nxt,
 * compacting the array in place, and stores the resulting count in
 * rx_opt.num_sacks.  When the out-of-order queue is empty all blocks are
 * cleared at once.
 */
static void tcp_sack_remove(struct tcp_sock *tp)
{
	int remaining = tp->rx_opt.num_sacks;
	int idx = 0;

	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		tp->rx_opt.num_sacks = 0;
		return;
	}

	while (idx < remaining) {
		struct tcp_sack_block *blk = &tp->selective_acks[idx];
		int k;

		/* Block still lies ahead of RCV.NXT: keep it, move on. */
		if (before(tp->rcv_nxt, blk->start_seq)) {
			idx++;
			continue;
		}

		/* RCV.NXT must cover all the block! */
		WARN_ON(before(tp->rcv_nxt, blk->end_seq));

		/* Zap this SACK, by moving forward any other SACKS.
		 * idx is not advanced: the slot now holds the next block.
		 */
		for (k = idx + 1; k < remaining; k++)
			tp->selective_acks[k - 1] = tp->selective_acks[k];
		remaining--;
	}

	tp->rx_opt.num_sacks = remaining;
}

/**
 * tcp_try_coalesce - try to merge skb to prior one
 * @sk: socket
 * @to: prior buffer
 * @from: buffer to add in queue
 * @fragstolen: pointer to boolean; reset to false on entry and then passed
 *              on to skb_try_coalesce()
 *
 * Before queueing skb @from after @to, try to merge them
 * to reduce overall memory use and queue lengths, if cost is small.
 * Packets in ofo or receive queues can stay a long time.
 * Better try to coalesce them right now to avoid future collapses.
 *
 * The merge is refused when @from does not start exactly at the end of @to,
 * when (with CONFIG_TLS_DEVICE) their decrypted flags differ, or when
 * skb_try_coalesce() fails.
 *
 * Returns true if caller should free @from instead of queueing it
 */
static bool tcp_try_coalesce(struct sock *sk,
struct sk_buff *to,
struct sk_buff *from,
bool *fragstolen)
{
int delta;

*fragstolen = false;

/* Its possible this segment overlaps with prior segment in queue */
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;

#ifdef CONFIG_TLS_DEVICE
if (from->decrypted != to->decrypted)
return false;
#endif

if (!skb_try_coalesce(to, from, fragstolen, &delta))
	return false;

/* Charge the socket for the size change reported by skb_try_coalesce(). */
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
/* @to now ends where @from ended and inherits its ack_seq and flags. */
TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;

/* Carry over the receive timestamp of @from when it has one. */
if (TCP_SKB_CB(from)->has_rxtstamp) {
	TCP_SKB_CB(to)->has_rxtstamp = true;
	to->tstamp = from->tstamp;
}

return true;

}

/* Coalesce @from into @to for the out-of-order queue.
 *
 * Same as tcp_try_coalesce(), but on success also accumulates the segment
 * counts of both buffers into to->gso_segs (each counted as at least 1, sum
 * capped at 0xFFFF).  Returns the result of tcp_try_coalesce().
 */
static bool tcp_ooo_try_coalesce(struct sock *sk,
				 struct sk_buff *to,
				 struct sk_buff *from,
				 bool *fragstolen)
{
	u32 segs;

	if (!tcp_try_coalesce(sk, to, from, fragstolen))
		return false;

	/* In case tcp_drop() is called later, update to->gso_segs */
	segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
	       max_t(u16, 1, skb_shinfo(from)->gso_segs);
	skb_shinfo(to)->gso_segs = min_t(u32, segs, 0xFFFF);

	return true;
}

/* Drop @skb: account it on @sk with sk_drops_add(), then release it with
 * __kfree_skb().
 */
static void tcp_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
__kfree_skb(skb);
}

/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 *
 * Walks the out-of-order tree from its first node while segments start at or
 * before rcv_nxt.  Each such segment is removed from the tree; it is dropped
 * when it carries nothing new, otherwise it is merged into the tail of the
 * receive queue or appended to it, and rcv_nxt is advanced.  Overlap with
 * already received data is reported through D-SACK.  Processing stops after
 * a segment carrying FIN.
 */
static void tcp_ofo_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 dsack_high = tp->rcv_nxt;
	bool fin, fragstolen, eaten;
	struct sk_buff *skb, *tail;
	struct rb_node *p;

	p = rb_first(&tp->out_of_order_queue);
	while (p) {
		skb = rb_to_skb(p);
		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
			break;

		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
			__u32 dsack = dsack_high;
			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				dsack_high = TCP_SKB_CB(skb)->end_seq;
			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
		}
		/* Step to the successor before the node is unlinked. */
		p = rb_next(p);
		rb_erase(&skb->rbnode, &tp->out_of_order_queue);

		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
			SOCK_DEBUG(sk, "ofo packet was already received\n");
			tcp_drop(sk, skb);
			continue;
		}
		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
		/* Read the FIN flag before skb may be freed below. */
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (!eaten)
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		else
			kfree_skb_partial(skb, fragstolen);

		if (unlikely(fin)) {
			tcp_fin(sk);
			/* tcp_fin() purges tp->out_of_order_queue,
			 * so we must end this loop right now.
			 */
			break;
		}
	}
}

static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);

/* Make sure @size bytes of receive memory can be charged for @skb.
 *
 * When the socket is within its receive buffer and sk_rmem_schedule()
 * succeeds, nothing else is done.  Otherwise the queues are pruned with
 * tcp_prune_queue(), and the out-of-order queue is pruned repeatedly until
 * the charge succeeds.
 *
 * Returns 0 on success, -1 when pruning fails or can free nothing more.
 */
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
				 unsigned int size)
{
	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
	    sk_rmem_schedule(sk, skb, size))
		return 0;

	if (tcp_prune_queue(sk) < 0)
		return -1;

	for (;;) {
		if (sk_rmem_schedule(sk, skb, size))
			break;
		if (!tcp_prune_ofo_queue(sk))
			return -1;
	}

	return 0;
}

/* Queue an out-of-order segment into tp->out_of_order_queue.
 *
 * The segment is dropped when receive memory cannot be scheduled for it.
 * Otherwise it is either coalesced into an existing buffer or inserted into
 * the rbtree at the position given by its sequence number; segments it fully
 * covers are removed and duplicates are reported through D-SACK.  The SACK
 * block array is updated when SACK is in use.  On return @skb has been
 * queued, merged or freed.
 */
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct rb_node **p, *parent;
	struct sk_buff *skb1;
	u32 seq, end_seq;
	bool fragstolen;

	tcp_ecn_check_ce(sk, skb);

	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
		tcp_drop(sk, skb);
		return;
	}

	/* Disable header prediction. */
	tp->pred_flags = 0;
	inet_csk_schedule_ack(sk);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
	seq = TCP_SKB_CB(skb)->seq;
	end_seq = TCP_SKB_CB(skb)->end_seq;
	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
		   tp->rcv_nxt, seq, end_seq);

	p = &tp->out_of_order_queue.rb_node;
	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		/* Initial out of order segment, build 1 SACK. */
		if (tcp_is_sack(tp)) {
			tp->rx_opt.num_sacks = 1;
			tp->selective_acks[0].start_seq = seq;
			tp->selective_acks[0].end_seq = end_seq;
		}
		rb_link_node(&skb->rbnode, NULL, p);
		rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
		tp->ooo_last_skb = skb;
		goto end;
	}

	/* In the typical case, we are adding an skb to the end of the list.
	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
	 */
	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
				 skb, &fragstolen)) {
coalesce_done:
		tcp_grow_window(sk, skb);
		kfree_skb_partial(skb, fragstolen);
		/* skb is gone; the 'end' section must not touch it. */
		skb = NULL;
		goto add_sack;
	}
	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
		parent = &tp->ooo_last_skb->rbnode;
		p = &parent->rb_right;
		goto insert;
	}

	/* Find place to insert this segment. Handle overlaps on the way. */
	parent = NULL;
	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
			p = &parent->rb_left;
			continue;
		}
		if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				/* All the bits are present. Drop. */
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPOFOMERGE);
				tcp_drop(sk, skb);
				skb = NULL;
				tcp_dsack_set(sk, seq, end_seq);
				goto add_sack;
			}
			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
				/* Partial overlap. */
				tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
			} else {
				/* skb's seq == skb1's seq and skb covers skb1.
				 * Replace skb1 with skb.
				 */
				rb_replace_node(&skb1->rbnode, &skb->rbnode,
						&tp->out_of_order_queue);
				tcp_dsack_extend(sk,
						 TCP_SKB_CB(skb1)->seq,
						 TCP_SKB_CB(skb1)->end_seq);
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPOFOMERGE);
				tcp_drop(sk, skb1);
				goto merge_right;
			}
		} else if (tcp_ooo_try_coalesce(sk, skb1,
						skb, &fragstolen)) {
			goto coalesce_done;
		}
		p = &parent->rb_right;
	}

insert:
	/* Insert segment into RB tree. */
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);

merge_right:
	/* Remove other segments covered by skb. */
	while ((skb1 = skb_rb_next(skb)) != NULL) {
		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
			break;
		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
					 end_seq);
			break;
		}
		rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				 TCP_SKB_CB(skb1)->end_seq);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
		tcp_drop(sk, skb1);
	}
	/* If there is no skb after us, we are the last_skb ! */
	if (!skb1)
		tp->ooo_last_skb = skb;

add_sack:
	if (tcp_is_sack(tp))
		tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
	if (skb) {
		tcp_grow_window(sk, skb);
		skb_condense(skb);
		skb_set_owner_r(skb, sk);
	}
}

/* Put an in-order segment on the socket receive queue.
 *
 * Strips @hdrlen bytes from the front of @skb, then tries to merge it into
 * the current tail of the receive queue; if that is not possible the skb is
 * appended and charged to the socket.  rcv_nxt is advanced to the end of the
 * segment in both cases.
 *
 * Returns 1 when @skb was merged into the tail (the caller then has to
 * release it), 0 when it was queued.
 */
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
				      bool *fragstolen)
{
	struct sk_buff *last = skb_peek_tail(&sk->sk_receive_queue);
	int merged = 0;

	__skb_pull(skb, hdrlen);

	if (last && tcp_try_coalesce(sk, last, skb, fragstolen))
		merged = 1;

	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);

	if (merged)
		return merged;

	__skb_queue_tail(&sk->sk_receive_queue, skb);
	skb_set_owner_r(skb, sk);
	return 0;
}

/* Copy data from @msg into a freshly allocated skb and queue it on the
 * receive queue of @sk as if it had arrived at rcv_nxt.
 *
 * Returns 0 when @size is 0, the number of bytes queued on success, or a
 * negative error: -ENOMEM by default (including when receive memory cannot
 * be scheduled), or whatever alloc_skb_with_frags() or
 * skb_copy_datagram_from_iter() report.
 */
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_buff *skb;
int err = -ENOMEM;
int data_len = 0;
bool fragstolen;

if (size == 0)
	return 0;

/* For requests larger than a page, put whole pages (at most
 * MAX_SKB_FRAGS of them) into page frags and keep only the sub-page
 * remainder in the linear area.  Note that this rewrites size, so it may
 * end up smaller than what the caller asked for.
 */
if (size > PAGE_SIZE) {
	int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);

	data_len = npages << PAGE_SHIFT;
	size = data_len + (size & ~PAGE_MASK);
}
skb = alloc_skb_with_frags(size - data_len, data_len,
			   PAGE_ALLOC_COSTLY_ORDER,
			   &err, sk->sk_allocation);
if (!skb)
	goto err;

/* Account for the linear part, then for the paged part. */
skb_put(skb, size - data_len);
skb->data_len = data_len;
skb->len = size;

if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
	goto err_free;
}

err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
if (err)
	goto err_free;

/* Place the data right at rcv_nxt in sequence space. */
TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;

/* A non-zero return means the skb was merged into the queue tail. */
if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
	WARN_ON_ONCE(fragstolen); /* should not happen */
	__kfree_skb(skb);
}
return size;

err_free:
kfree_skb(skb);
err:
return err;

}

/* Invoke the socket's sk_data_ready() callback when enough data is queued.
 *
 * The callback runs when the bytes between copied_seq and rcv_nxt reach
 * sk_rcvlowat, or when the socket has SOCK_DONE set.
 */
void tcp_data_ready(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int avail = tp->rcv_nxt - tp->copied_seq;

	if (avail >= sk->sk_rcvlowat || sock_flag(sk, SOCK_DONE))
		sk->sk_data_ready(sk);
}

/* Dispatch a received data segment.
 *
 * Segments without sequence space are freed at once.  A segment starting at
 * rcv_nxt (or overlapping it, after a D-SACK for the duplicate head has been
 * recorded) goes to the receive queue; a segment ending at or before rcv_nxt,
 * or one starting at or beyond the right edge of the receive window, is
 * dropped with an ACK scheduled; everything else is handed to
 * tcp_data_queue_ofo().
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool fragstolen;
int eaten;

/* Nothing to queue: seq == end_seq. */
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
	__kfree_skb(skb);
	return;
}
skb_dst_drop(skb);
/* Strip the TCP header (doff is counted in 32-bit words). */
__skb_pull(skb, tcp_hdr(skb)->doff * 4);

tcp_ecn_accept_cwr(sk, skb);

tp->rx_opt.dsack = 0;

/*  Queue data for delivery to the user.
 *  Packets in sequence go to the receive queue.
 *  Out of sequence packets to the out_of_order_queue.
 */
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
	if (tcp_receive_window(tp) == 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
		goto out_of_window;
	}

	/* Ok. In sequence. In window. */

/* Also entered from the partial-packet case further down. */
queue_and_out:
/* With an empty receive queue memory is scheduled unconditionally;
 * otherwise the segment is dropped when memory cannot be scheduled.
 */
if (skb_queue_len(&sk->sk_receive_queue) == 0)
sk_forced_mem_schedule(sk, skb->truesize);
else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto drop;
}

	eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
	if (skb->len)
		tcp_event_data_recv(sk, skb);
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
		tcp_fin(sk);

	/* rcv_nxt moved: see whether queued out-of-order data fits now. */
	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		tcp_ofo_queue(sk);

		/* RFC5681. 4.2. SHOULD send immediate ACK, when
		 * gap in queue is filled.
		 */
		if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
	}

	if (tp->rx_opt.num_sacks)
		tcp_sack_remove(tp);

	tcp_fast_path_check(sk);

	/* skb was merged into the queue tail: release it here. */
	if (eaten > 0)
		kfree_skb_partial(skb, fragstolen);
	if (!sock_flag(sk, SOCK_DEAD))
		tcp_data_ready(sk);
	return;
}

if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
	/* A retransmit, 2nd most common case.  Force an immediate ack. */
	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

/* Shared exit: enter quick-ACK mode, schedule an ACK, drop the segment. */
out_of_window:
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
/* Shared exit: drop the segment without touching ACK state. */
drop:
tcp_drop(sk, skb);
return;
}

/* Out of window. F.e. zero window probe. */
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
	goto out_of_window;

if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
	/* Partial packet, seq < rcv_next < end_seq */
	SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
		   TCP_SKB_CB(skb)->end_seq);

	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

	/* If window is closed, drop tail of packet. But after
	 * remembering D-SACK for its head made in previous line.
	 */
	if (!tcp_receive_window(tp)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
		goto out_of_window;
	}
	goto queue_and_out;
}

/* Starts after rcv_nxt but inside the window: out of order. */
tcp_data_queue_ofo(sk, skb);

}

/* Return the skb that follows @skb, or NULL when there is none.
 *
 * @list: the linear queue @skb sits on, or NULL when @skb is an rb-tree
 *        node, in which case skb_rb_next() supplies the successor.
 */
static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
{
	/* No linear queue given: walk the rb-tree instead. */
	if (!list)
		return skb_rb_next(skb);

	/* The last element of the queue has no successor. */
	if (skb_queue_is_last(list, skb))
		return NULL;

	return skb->next;
}

/* Remove @skb from its container, free it and account the event in
 * LINUX_MIB_TCPRCVCOLLAPSED.
 *
 * @list: linear queue holding @skb, or NULL when @skb lives in @root
 * @root: rb-tree holding @skb (only used when @list is NULL)
 *
 * Returns the skb that followed @skb (as reported by tcp_skb_next()),
 * or NULL if there was none.
 */
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
					struct sk_buff_head *list,
					struct rb_root *root)
{
	/* Fetch the successor before skb is unlinked and freed. */
	struct sk_buff *successor = tcp_skb_next(skb, list);

	if (!list)
		rb_erase(&skb->rbnode, root);
	else
		__skb_unlink(skb, list);

	__kfree_skb(skb);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);

	return successor;
}

/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq.
 *
 * An skb whose seq is not before() that of an existing node is placed
 * in that node's right subtree.
 */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	/* Descend until we reach the empty slot where the node belongs. */
	while (*link) {
		struct sk_buff *cur;

		parent = *link;
		cur = rb_to_skb(parent);
		link = before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(cur)->seq) ?
		       &parent->rb_left : &parent->rb_right;
	}

	rb_link_node(&skb->rbnode, parent, link);
	rb_insert_color(&skb->rbnode, root);
}

/* Collapse contiguous sequence of skbs head...tail with
 * sequence numbers start...end.
 *
 * If tail is NULL, this means until the end of the queue.
 *
 * Segments with FIN/SYN are not collapsed (only because this
 * simplifies code)
 *
 * @list: linear queue holding the skbs, or NULL when they live in @root
 * @root: rb-tree holding the skbs (used when @list is NULL)
 */
static void
tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
	     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
	struct sk_buff *skb = head, *n;
	struct sk_buff_head tmp;
	bool end_of_skbs;

	/* First, check that queue is collapsible and find
	 * the point where collapsing can be useful.
	 */
restart:
	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
		n = tcp_skb_next(skb, list);

		/* No new bits? It is possible on ofo queue. */
		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
			skb = tcp_collapse_one(sk, skb, list, root);
			if (!skb)
				break;
			goto restart;
		}

		/* The first skb to collapse is:
		 * - not SYN/FIN and
		 * - bloated or contains data before "start" or
		 *   overlaps to the next one.
		 */
		if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
		    (tcp_win_from_space(sk, skb->truesize) > skb->len ||
		     before(TCP_SKB_CB(skb)->seq, start))) {
			end_of_skbs = false;
			break;
		}

		if (n && n != tail &&
		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
			end_of_skbs = false;
			break;
		}

		/* Decided to skip this, advance start seq. */
		start = TCP_SKB_CB(skb)->end_seq;
	}
	if (end_of_skbs ||
	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
		return;

	/* New skbs destined for the rb-tree are parked here and inserted
	 * only once copying is finished (see the "end" label).
	 */
	__skb_queue_head_init(&tmp);

	while (before(start, end)) {
		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
		struct sk_buff *nskb;

		nskb = alloc_skb(copy, GFP_ATOMIC);
		if (!nskb)
			break;

		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
#ifdef CONFIG_TLS_DEVICE
		nskb->decrypted = skb->decrypted;
#endif
		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
		if (list)
			__skb_queue_before(list, skb, nskb);
		else
			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
		skb_set_owner_r(nskb, sk);

		/* Copy data, releasing collapsed skbs. */
		while (copy > 0) {
			int offset = start - TCP_SKB_CB(skb)->seq;
			int size = TCP_SKB_CB(skb)->end_seq - start;

			BUG_ON(offset < 0);
			if (size > 0) {
				size = min(copy, size);
				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
					BUG();
				TCP_SKB_CB(nskb)->end_seq += size;
				copy -= size;
				start += size;
			}
			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				skb = tcp_collapse_one(sk, skb, list, root);
				if (!skb ||
				    skb == tail ||
				    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
					goto end;
#ifdef CONFIG_TLS_DEVICE
				/* Do not mix decrypted and non-decrypted data
				 * in the same new skb.
				 */
				if (skb->decrypted != nskb->decrypted)
					goto end;
#endif
			}
		}
	}
end:
	skb_queue_walk_safe(&tmp, skb, n)
		tcp_rbtree_insert(root, skb);
}

/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs

and tcp_collapse() them until all the queue is collapsed.
*/
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 range_truesize, sum_tiny = 0;
struct sk_buff *skb, *head;
u32 start, end;

skb = skb_rb_first(&tp->out_of_order_queue);
new_range:
if (!skb) {
tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
return;
}
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
range_truesize = skb->truesize;

for (head = skb;😉 {
skb = skb_rb_next(skb);

 /* Range is terminated when we see a gap or when
  * we are at the queue end.
  */
 if (!skb ||
     after(TCP_SKB_CB(skb)->seq, end) ||
     before(TCP_SKB_CB(skb)->end_seq, start)) {
 	/* Do not attempt collapsing tiny skbs */
 	if (range_truesize != head->truesize ||
 	    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
 		tcp_collapse(sk, NULL, &tp->out_of_order_queue,
 			     head, skb, start, end);
 	} else {
 		sum_tiny += range_truesize;
 		if (sum_tiny > sk->sk_rcvbuf >> 3)
 			return;
 	}
 	goto new_range;
 }

 range_truesize += skb->truesize;
 if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
 	start = TCP_SKB_CB(skb)->seq;
 if (after(TCP_SKB_CB(skb)->end_seq, end))
 	end = TCP_SKB_CB(skb)->end_seq;

}
}

Clean the out-of-order queue to make room.
We drop high sequences packets to :
1. Let a chance for holes to be filled.
1. not add too big latencies if thousands of packets sit there.
(But if application shrinks SO_RCVBUF, we could still end up
```
freeing whole queue here)
```
1. Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
Return true if queue has shrunk.
*/
static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node *node, *prev;
int goal;

if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;

NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
goal = sk->sk_rcvbuf >> 3;
node = &tp->ooo_last_skb->rbnode;
do {
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
goal -= rb_to_skb(node)->truesize;
tcp_drop(sk, rb_to_skb(node));
if (!prev || goal <= 0) {
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
break;
goal = sk->sk_rcvbuf >> 3;
}
node = prev;
} while (node);
tp->ooo_last_skb = rb_to_skb(prev);

/* Reset SACK state. A conforming SACK implementation will
- do the same at a timeout based retransmit. When a connection
- is in a sad state like this, we care only about integrity
- of the connection not performance.
  */
  if (tp->rx_opt.sack_ok)
  tcp_sack_reset(&tp->rx_opt);
  return true;
  }

/* Reduce allocated memory if we can, trying to get
 * the socket within its memory limits again.
 *
 * Return less than zero if we should start dropping frames
 * until the socket owning process reads some of the data
 * to stabilize the situation.
 */
static int tcp_prune_queue(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		tcp_clamp_window(sk);
	else if (tcp_under_memory_pressure(sk))
		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

	/* Non-destructive step: collapse both queues, then re-check. */
	tcp_collapse_ofo_queue(sk);
	if (!skb_queue_empty(&sk->sk_receive_queue))
		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
			     skb_peek(&sk->sk_receive_queue),
			     NULL,
			     tp->copied_seq, tp->rcv_nxt);
	sk_mem_reclaim(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

	/* Collapsing did not help, destructive actions follow.
	 * This must not ever occur.
	 */
	tcp_prune_ofo_queue(sk);

	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
		return 0;

	/* If we are really being abused, tell the caller to silently
	 * drop receive data on the floor.  It will get retransmitted
	 * and hopefully then we'll have sufficient space.
	 */
	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);

	/* Massive buffer overcommit. */
	tp->pred_flags = 0;
	return -1;
}

/* Tell whether the send buffer may be grown for @sk.
 *
 * Returns false as soon as one of the vetoes below applies (they are
 * evaluated in order, short-circuiting), true otherwise.
 */
static bool tcp_should_expand_sndbuf(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	bool veto;

	veto =	/* The user specified a send buffer setting: do not modify it. */
		(sk->sk_userlocks & SOCK_SNDBUF_LOCK) ||
		/* Global TCP memory pressure. */
		tcp_under_memory_pressure(sk) ||
		/* Soft global TCP memory pressure. */
		sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0) ||
		/* The congestion window is already filled. */
		tcp_packets_in_flight(tp) >= tp->snd_cwnd;

	return !veto;
}

/* When incoming ACK allowed to free some skb from write_queue,

we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
on the exit from tcp input handler.
PROBLEM: sndbuf expansion does not work well with largesend.
*/
static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);

if (tcp_should_expand_sndbuf(sk)) {
tcp_sndbuf_expand(sk);
tp->snd_cwnd_stamp = tcp_jiffies32;
}

sk->sk_write_space(sk);
}

static void tcp_check_space(struct sock sk)
{
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
/ pairs with tcp_poll() */
smp_mb();
if (sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
tcp_new_space(sk);
if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
}

/* Send-side housekeeping run after an incoming segment was processed:
 * push any pending frames first, then let tcp_check_space() handle a
 * write queue that has shrunk.
 */
static inline void tcp_data_snd_check(struct sock *sk)
{
tcp_push_pending_frames(sk);
tcp_check_space(sk);
}

Check if sending an ack is needed.
*/
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned long rtt, delay;
```
 /* More than one full frame received... */
```
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* … and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise).
* If application uses SO_RCVLOWAT, we want send ack now if
* we have not received enough bytes to satisfy the condition.
/
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
/ We ACK each frame or… /
tcp_in_quickack_mode(sk) ||
/ Protocol state mandates a one-time immediate ACK */
inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
send_now:
tcp_send_ack(sk);
return;
}

if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_send_delayed_ack(sk);
return;
}

if (!tcp_is_sack(tp) ||
tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
goto send_now;
tp->compressed_ack++;

if (hrtimer_is_queued(&tp->compressed_ack_timer))
return;

/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */

rtt = tp->rcv_rtt_est.rtt_us;
if (tp->srtt_us && tp->srtt_us < rtt)
rtt = tp->srtt_us;

delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
rtt * (NSEC_PER_USEC >> 3)/20);
sock_hold(sk);
hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_SOFT);
}

/* Run the ACK decision logic only if an ACK is currently scheduled. */
static inline void tcp_ack_snd_check(struct sock *sk)
{
	if (!inet_csk_ack_scheduled(sk)) {
		/* We sent a data segment already. */
		return;
	}
	__tcp_ack_snd_check(sk, 1);
}

This routine is only called when we have urgent data
signaled. Its the ‘slow’ part of tcp_urg. It could be
moved inline now as tcp_urg is only called from one
place. We handle URGent data wrong. We have to - as
BSD still doesn’t use the correction from RFC961.
For 1003.1g we should support a new option TCP_STDURG to permit
either form (or just set the sysctl tcp_stdurg).
*/

static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);

if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
	ptr--;
ptr += ntohl(th->seq);

/* Ignore urgent data that we've already seen and read. */
if (after(tp->copied_seq, ptr))
	return;

/* Do not replay urg ptr.
 *
 * NOTE: interesting situation not covered by specs.
 * Misbehaving sender may send urg ptr, pointing to segment,
 * which we already have in ofo queue. We are not able to fetch
 * such data and will stay in TCP_URG_NOTYET until will be eaten
 * by recvmsg(). Seems, we are not obliged to handle such wicked
 * situations. But it is worth to think about possibility of some
 * DoSes using some hypothetical application level deadlock.
 */
if (before(ptr, tp->rcv_nxt))
	return;

/* Do we already have a newer (or duplicate) urgent pointer? */
if (tp->urg_data && !after(ptr, tp->urg_seq))
	return;

/* Tell the world about our new urgent pointer. */
sk_send_sigurg(sk);

/* We may be adding urgent data when the last byte read was
 * urgent. To do this requires some care. We cannot just ignore
 * tp->copied_seq since we would read the last urgent byte again
 * as data, nor can we alter copied_seq until this data arrives
 * or we break the semantics of SIOCATMARK (and thus sockatmark())
 *
 * NOTE. Double Dutch. Rendering to plain English: author of comment
 * above did something sort of 	send("A", MSG_OOB); send("B", MSG_OOB);
 * and expect that both A and B disappear from stream. This is _wrong_.
 * Though this happens in BSD with high probability, this is occasional.
 * Any application relying on this is buggy. Note also, that fix "works"
 * only in this artificial test. Insert some normal data between A and B and we will
 * decline of BSD again. Verdict: it is better to remove to trap
 * buggy users.
 */
if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
	tp->copied_seq++;
	if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
		__skb_unlink(skb, &sk->sk_receive_queue);
		__kfree_skb(skb);
	}
}

tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;

/* Disable header prediction. */
tp->pred_flags = 0;

}

/* This is the 'fast' part of urgent handling.
 *
 * Picks up a new urgent pointer if the header carries one, and, while an
 * urgent byte is still awaited (TCP_URG_NOTYET), extracts it from @skb
 * when the pointer falls inside this packet.
 */
static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 offset;
	u8 urg_byte;

	/* Check if we get a new urgent pointer - normally not. */
	if (th->urg)
		tcp_check_urg(sk, th);

	/* Do we wait for any urgent data? - normally not... */
	if (tp->urg_data != TCP_URG_NOTYET)
		return;

	offset = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
		 th->syn;

	/* Is the urgent pointer pointing into this packet? */
	if (offset >= skb->len)
		return;

	if (skb_copy_bits(skb, offset, &urg_byte, 1))
		BUG();
	tp->urg_data = TCP_URG_VALID | urg_byte;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
}

/* Accept RST for rcv_nxt - 1 after a FIN.
 *
 * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
 * FIN is sent followed by a RST packet. The RST is sent with the same
 * sequence number as the FIN, and thus according to RFC 5961 a challenge
 * ACK should be sent. However, Mac OSX rate limits replies to challenge
 * ACKs on the closed socket. In addition middleboxes can drop either the
 * challenge ACK or a subsequent RST.
 *
 * Returns true when @skb's sequence number equals rcv_nxt - 1 and the
 * socket is in CLOSE_WAIT, LAST_ACK or CLOSING.
 */
static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool seq_is_fin = TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1);
	bool fin_seen_state = (1 << sk->sk_state) &
			      (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING);

	return unlikely(seq_is_fin && fin_seen_state);
}

/* Does PAWS and seqno based validation of an incoming segment, flags will

play significant role here.
*/
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
bool rst_seq_match = false;

/* RFC1323: H1. Apply PAWS check first. /
if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
{
tcp_send_dupack(sk, skb);
}
goto discard;
}
/ Reset is accepted even if it did not pass PAWS. /
}
if(ack_err_flag)
{
tcp_send_dupack(sk, skb);
ack_err_flag = 0;
goto discard;
}
/ Step 1: check sequence number /
if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
/ RFC793, page 37: “In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields.”
* And page 69: “If an incoming segment is not acceptable,
* an acknowledgment should be sent in reply (unless the RST
* bit is set, if so drop the segment and return)”.
*/
if (!th->rst) {
if (th->syn)
goto syn_challenge;
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
} else if (tcp_reset_check(sk, skb)) {
tcp_reset(sk);
}
goto discard;
}

/* Step 2: check RST bit /
if (th->rst) {
/ RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
* FIN and SACK too if available):
* If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
* the right-most SACK block,
* then
* RESET the connection
* else
* Send a challenge ACK
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
tcp_reset_check(sk, skb)) {
rst_seq_match = true;
} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
struct tcp_sack_block *sp = &tp->selective_acks[0];
int max_sack = sp[0].end_seq;
int this_sack;
```
 	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
 	     ++this_sack) {
 		max_sack = after(sp[this_sack].end_seq,
 				 max_sack) ?
 			sp[this_sack].end_seq : max_sack;
 	}

 	if (TCP_SKB_CB(skb)->seq == max_sack)
 		rst_seq_match = true;
 }

 if (rst_seq_match)
 	tcp_reset(sk);
 else {
 	/* Disable TFO if RST is out-of-order
 	 * and no data has been received
 	 * for current active TFO socket
 	 */
 	if (tp->syn_fastopen && !tp->data_segs_in &&
 	    sk->sk_state == TCP_ESTABLISHED)
 		tcp_fastopen_active_disable(sk);
 	tcp_send_challenge_ack(sk, skb);
 }
 goto discard;
```
}

/* step 3: check security and precedence [ignored] */

/* step 4: Check for a SYN
- RFC 5961 4.2 : Send a challenge ack
  */
  if (th->syn) {
  syn_challenge:
  if (syn_inerr)
  TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
  NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
  tcp_send_challenge_ack(sk, skb);
  goto discard;
  }
return true;

discard:
tcp_drop(sk, skb);
return false;
}

TCP receive function for the ESTABLISHED state.
It is split into a fast path and a slow path. The fast path is
disabled when:
- A zero window was announced from us - zero window probing

   is only handled properly in the slow path.

- Out of order segments arrived.
- Urgent data is expected.
- There is no buffer space left
- Unexpected TCP flags/window values/header lengths are received
(detected by checking the TCP header against pred_flags)
- Data is sent in both directions. Fast path only supports pure senders
or pure receivers (this means either the sequence number or the ack
value must stay constant)
- Unexpected TCP option.
When these conditions are not satisfied it drops into a standard
receive procedure patterned after RFC793 to handle all cases.
The first three cases are guaranteed by proper pred_flags setting,
the rest is checked inline. Fast processing is turned on in

tcp_data_queue when everything is OK.
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock tp = tcp_sk(sk);
unsigned int len = skb->len;
/ TCP congestion window tracking */
trace_tcp_probe(sk, skb);

tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*

Header prediction.
The code loosely follows the one in the famous
“30 instruction TCP receive” Van Jacobson mail.
Van’s trick is to deposit buffers into socket queue
on a device interrupt, to call tcp_recv function
on the receive process context and checksum and copy
the buffer to user space. smart…
Our current scheme is not silly either but we take the
extra cost of the net_bh soft interrupt processing…
We do checksum and copy also but from device to kernel.
*/

tp->rx_opt.saw_tstamp = 0;

/* pred_flags is 0xS?10 << 16 + snd_wnd

if header_prediction is to be made
‘S’ will always be tp->tcp_header_len >> 2
‘?’ will be 0 for the fast path, otherwise pred_flags is 0 to
turn it off (when there are holes in the receive
space for instance)
PSH flag is ignored.
*/

if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;

/* Timestamp header prediction: tcp_header_len
 * is automatically equal to th->doff*4 due to pred_flags
 * match.
 */

/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
	/* No? Slow path! */
	if (!tcp_parse_aligned_timestamp(tp, th))
		goto slow_path;

	/* If PAWS failed, check it more carefully in slow path */
	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
		goto slow_path;

	/* DO NOT update ts_recent here, if checksum fails
	 * and timestamp was corrupted part, it will result
	 * in a hung connection since we will drop all
	 * future packets due to the PAWS test.
	 */
}

if (len <= tcp_header_len) {
	/* Bulk data transfer: sender */
	if (len == tcp_header_len) {
		/* Predicted packet is in window by definition.
		 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
		 * Hence, check seq<=rcv_wup reduces to:
		 */
		if (tcp_header_len ==
		    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
		    tp->rcv_nxt == tp->rcv_wup)
			tcp_store_ts_recent(tp);

		/* We know that such packets are checksummed
		 * on entry.
		 */
		tcp_ack(sk, skb, 0);
		__kfree_skb(skb);
		tcp_data_snd_check(sk);
		/* When receiving pure ack in fast path, update
		 * last ts ecr directly instead of calling
		 * tcp_rcv_rtt_measure_ts()
		 */
		tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
		return;
	} else { /* Header too small */
		TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		goto discard;
	}
} else {
	int eaten = 0;
	bool fragstolen = false;

	if (tcp_checksum_complete(skb))
		goto csum_error;

	if ((int)skb->truesize > sk->sk_forward_alloc)
		goto step5;

	/* Predicted packet is in window by definition.
	 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
	 * Hence, check seq<=rcv_wup reduces to:
	 */
	if (tcp_header_len ==
	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
	    tp->rcv_nxt == tp->rcv_wup)
		tcp_store_ts_recent(tp);

	tcp_rcv_rtt_measure_ts(sk, skb);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);

	/* Bulk data transfer: receiver */
	eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
			      &fragstolen);

	tcp_event_data_recv(sk, skb);

	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
		/* Well, only one small jumplet in fast path... */
		tcp_ack(sk, skb, FLAG_DATA);
		tcp_data_snd_check(sk);
		if (!inet_csk_ack_scheduled(sk))
			goto no_ack;
	}

	__tcp_ack_snd_check(sk, 0);

no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
tcp_data_ready(sk);
return;
}
}

slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;

if (!th->ack && !th->rst && !th->syn)
	goto discard;

/*
 *	Standard slow path.
 */
/*MMMMM case2 test deal big ack unaccess*/
if(tcp_ack_check_unaccess(sk,skb))
	ack_err_flag = 1;
if (!tcp_validate_incoming(sk, skb, th, 1))
{
	ack_err_flag = 0;
	return;
}

step5:
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;

tcp_rcv_rtt_measure_ts(sk, skb);

/* Process urgent data. */
tcp_urg(sk, skb, th);

/* step 7: process the segment text */
tcp_data_queue(sk, skb);

tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
return;

csum_error:
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);

discard:
tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);

tcp_set_state(sk, TCP_ESTABLISHED);
icsk->icsk_ack.lrcvtime = tcp_jiffies32;

if (skb) {
	icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
	security_inet_conn_established(sk, skb);
	sk_mark_napi_id(sk, skb);
}

tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);

/* Prevent spurious tcp_cwnd_restart() on first data
 * packet.
 */
tp->lsndtime = tcp_jiffies32;

if (sock_flag(sk, SOCK_KEEPOPEN))
	inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

if (!tp->rx_opt.snd_wscale)
	__tcp_fast_path_on(tp, tp->snd_wnd);
else
	tp->pred_flags = 0;

}

static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
struct tcp_fastopen_cookie *cookie)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;

if (mss == tp->rx_opt.user_mss) {
	struct tcp_options_received opt;

	/* Get original SYNACK MSS value if user MSS sets mss_clamp */
	tcp_clear_options(&opt);
	opt.user_mss = opt.mss_clamp = 0;
	tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
	mss = opt.mss_clamp;
}

if (!tp->syn_fastopen) {
	/* Ignore an unsolicited cookie */
	cookie->len = -1;
} else if (tp->total_retrans) {
	/* SYN timed out and the SYN-ACK neither has a cookie nor
	 * acknowledges data. Presumably the remote received only
	 * the retransmitted (regular) SYNs: either the original
	 * SYN-data or the corresponding SYN-ACK was dropped.
	 */
	syn_drop = (cookie->len < 0 && data);
} else if (cookie->len < 0 && !tp->syn_data) {
	/* We requested a cookie but didn't get it. If we did not use
	 * the (old) exp opt format then try so next time (try_exp=1).
	 * Otherwise we go back to use the RFC7413 opt (try_exp=2).
	 */
	try_exp = tp->syn_fastopen_exp ? 2 : 1;
}

tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);

if (data) { /* Retransmit unacked data in SYN */
	skb_rbtree_walk_from(data) {
		if (__tcp_retransmit_skb(sk, data, 1))
			break;
	}
	tcp_rearm_rto(sk);
	NET_INC_STATS(sock_net(sk),
			LINUX_MIB_TCPFASTOPENACTIVEFAIL);
	return true;
}
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked) {
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
	/* SYN-data is counted as two separate packets in tcp_ack() */
	if (tp->delivered > 1)
		--tp->delivered;
}

tcp_fastopen_add_skb(sk, synack);

return false;

}

/* Clear tp->syn_smc when it was set for our SYN but rx_opt.smc_ok is
 * not set.  Compiles to nothing unless CONFIG_SMC is enabled.
 */
static void smc_check_reset_syn(struct tcp_sock *tp)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc) &&
	    tp->syn_smc && !tp->rx_opt.smc_ok)
		tp->syn_smc = 0;
#endif
}

/* Handle a segment received while the socket is in SYN-SENT.
 *
 * Return values, as produced by the code below:
 *   0  - the segment was dropped here (tcp_drop());
 *   1  - unacceptable ACK (or failed timestamp echo check): received
 *        options were rolled back; @skb is left to the caller;
 *  -1  - the connection was established (or the #if 0 path was enabled);
 *        @skb is not freed here and the caller continues processing.
 */
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
					 const struct tcphdr *th)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_fastopen_cookie foc = { .len = -1 };
	int saved_clamp = tp->rx_opt.mss_clamp;
	bool fastopen_fail;

	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;

	if (th->ack) {
		/* rfc793:
		 * "If the state is SYN-SENT then
		 *    first check the ACK bit
		 *      If the ACK bit is set
		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
		 *        a reset (unless the RST bit is set, if so drop
		 *        the segment and return)"
		 */
		if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
			goto reset_and_undo;

		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
			     tcp_time_stamp(tp))) {
			NET_INC_STATS(sock_net(sk),
					LINUX_MIB_PAWSACTIVEREJECTED);
			goto reset_and_undo;
		}

		/* Now ACK is acceptable.
		 *
		 * "If the RST bit is set
		 *    If the ACK was acceptable then signal the user "error:
		 *    connection reset", drop the segment, enter CLOSED state,
		 *    delete TCB, and return."
		 */

		if (th->rst) {
			tcp_reset(sk);
			goto discard;
		}

		/* rfc793:
		 *   "fifth, if neither of the SYN or RST bits is set then
		 *    drop the segment and return."
		 *
		 *    See note below!
		 *                                        --ANK(990513)
		 */
		if (!th->syn)
			goto discard_and_undo;

		/* rfc793:
		 *   "If the SYN bit is on ...
		 *    are acceptable then ...
		 *    (our SYN has been ACKed), change the connection
		 *    state to ESTABLISHED..."
		 */

		tcp_ecn_rcv_synack(tp, th);

		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
		tcp_ack(sk, skb, FLAG_SLOWPATH);

		/* Ok.. it's good. Set up sequence numbers and
		 * move to established.
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		/* RFC1323: The window in SYN & SYN/ACK segments is
		 * never scaled.
		 */
		tp->snd_wnd = ntohs(th->window);

		if (!tp->rx_opt.wscale_ok) {
			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
			tp->window_clamp = min(tp->window_clamp, 65535U);
		}

		if (tp->rx_opt.saw_tstamp) {
			tp->rx_opt.tstamp_ok	   = 1;
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
			tp->advmss	    -= TCPOLEN_TSTAMP_ALIGNED;
			tcp_store_ts_recent(tp);
		} else {
			tp->tcp_header_len = sizeof(struct tcphdr);
		}

		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		/* Remember, tcp_poll() does not lock socket!
		 * Change state from SYN-SENT only after copied_seq
		 * is initialized. */
		tp->copied_seq = tp->rcv_nxt;

		smc_check_reset_syn(tp);

		smp_mb();

		tcp_finish_connect(sk, skb);

		fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
				tcp_rcv_fastopen_synack(sk, skb, &foc);

		if (!sock_flag(sk, SOCK_DEAD)) {
			sk->sk_state_change(sk);
			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
		}
		if (fastopen_fail)
			return -1;
		if (sk->sk_write_pending ||
		    icsk->icsk_accept_queue.rskq_defer_accept ||
		    icsk->icsk_ack.pingpong) {
			/* Save one ACK. Data will be ready after
			 * several ticks, if write_pending is set.
			 *
			 * It may be deleted, but with this feature tcpdumps
			 * look so _wonderfully_ clever, that I was not able
			 * to stand against the temptation 8)     --ANK
			 */
			inet_csk_schedule_ack(sk);
			tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  TCP_DELACK_MAX, TCP_RTO_MAX);

discard:
			tcp_drop(sk, skb);
			return 0;
		} else {
			tcp_send_ack(sk);
		}
		return -1;
	}

	/* No ACK in the segment */

	if (th->rst) {
		/* rfc793:
		 * "If the RST bit is set
		 *
		 *      Otherwise (no ACK) drop the segment and return."
		 */

		goto discard_and_undo;
	}

	/* PAWS check. */
	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
	    tcp_paws_reject(&tp->rx_opt, 0))
		goto discard_and_undo;

	if (th->syn) {
		/* We see SYN without ACK. It is attempt of
		 * simultaneous connect with crossed SYNs.
		 * Particularly, it can be connect to self.
		 */
		tcp_set_state(sk, TCP_SYN_RECV);

		if (tp->rx_opt.saw_tstamp) {
			tp->rx_opt.tstamp_ok = 1;
			tcp_store_ts_recent(tp);
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			tp->tcp_header_len = sizeof(struct tcphdr);
		}

		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
		tp->copied_seq = tp->rcv_nxt;
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		/* RFC1323: The window in SYN & SYN/ACK segments is
		 * never scaled.
		 */
		tp->snd_wnd    = ntohs(th->window);
		tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
		tp->max_window = tp->snd_wnd;

		tcp_ecn_rcv_syn(tp, th);

		tcp_mtup_init(sk);
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		tcp_send_synack(sk);
#if 0
		/* Note, we could accept data and URG from this segment.
		 * There are no obstacles to make this (except that we must
		 * either change tcp_recvmsg() to prevent it from returning data
		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
		 *
		 * However, if we ignore data in ACKless segments sometimes,
		 * we have no reasons to accept it sometimes.
		 * Also, seems the code doing it in step6 of tcp_rcv_state_process
		 * is not flawless. So, discard packet for sanity.
		 * Uncomment this return to process the data.
		 */
		return -1;
#else
		goto discard;
#endif
	}
	/* "fifth, if neither of the SYN or RST bits is set then
	 * drop the segment and return."
	 */

discard_and_undo:
	/* Roll back the options parsed from this segment. */
	tcp_clear_options(&tp->rx_opt);
	tp->rx_opt.mss_clamp = saved_clamp;
	goto discard;

reset_and_undo:
	tcp_clear_options(&tp->rx_opt);
	tp->rx_opt.mss_clamp = saved_clamp;
	return 1;
}

/*
 *	This function implements the receiving procedure of RFC 793 for
 *	all states except ESTABLISHED and TIME_WAIT.
 *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *	address independent.
 */

/* State-machine handling of a received segment, dispatched on
 * sk->sk_state.
 *
 * @sk:  socket the segment belongs to
 * @skb: the received segment
 *
 * Returns 0 on most paths, including those that drop the segment, and 1
 * on the paths that reject it outright (ACK received in LISTEN, refused
 * connection request, unacceptable ACK in SYN_RECV, abort-on-data in
 * FIN_WAIT1, data received after RCV_SHUTDOWN).  In SYN_SENT a
 * non-negative result of tcp_rcv_synsent_state_process() is returned
 * unchanged.
 * NOTE(review): a non-zero return presumably asks the caller to answer
 * with a reset (cf. the "send one RST" comment below) - confirm against
 * the callers.
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;	/* set to 1 once skb is handed to tcp_data_queue() */
bool acceptable;

/* CLOSE, LISTEN and SYN_SENT are handled completely inside this switch;
 * every case either returns or jumps to discard.
 */
switch (sk->sk_state) {
case TCP_CLOSE:
	goto discard;

case TCP_LISTEN:
	/* An ACK is never expected on a listening socket. */
	if (th->ack)
		return 1;

	if (th->rst)
		goto discard;

	if (th->syn) {
		/* SYN combined with FIN is dropped. */
		if (th->fin)
			goto discard;
		/* It is possible that we process SYN packets from backlog,
		 * so we need to make sure to disable BH and RCU right there.
		 */
		rcu_read_lock();
		local_bh_disable();
		acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
		local_bh_enable();
		rcu_read_unlock();

		/* A negative result from conn_request() is reported as 1. */
		if (!acceptable)
			return 1;
		consume_skb(skb);
		return 0;
	}
	goto discard;

case TCP_SYN_SENT:
	tp->rx_opt.saw_tstamp = 0;
	tcp_mstamp_refresh(tp);
	queued = tcp_rcv_synsent_state_process(sk, skb, th);
	/* A non-negative result is final and is passed straight back. */
	if (queued >= 0)
		return queued;

	/* Do step6 onward by hand. */
	tcp_urg(sk, skb, th);
	__kfree_skb(skb);
	tcp_data_snd_check(sk);
	return 0;
}

/* Remaining states share the validation and ACK processing below. */
tcp_mstamp_refresh(tp);
tp->rx_opt.saw_tstamp = 0;
req = tp->fastopen_rsk;
if (req) {
	bool req_stolen;

	/* A pending Fast Open request sock is only expected in these
	 * two states.
	 */
	WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
	    sk->sk_state != TCP_FIN_WAIT1);

	if (!tcp_check_req(sk, skb, req, true, &req_stolen))
		goto discard;
}

/* A segment carrying none of ACK, RST or SYN is dropped. */
if (!th->ack && !th->rst && !th->syn)
	goto discard;

/* NOTE(review): skb is not dropped here on failure; presumably
 * tcp_validate_incoming() has already disposed of it - confirm.
 */
if (!tcp_validate_incoming(sk, skb, th, 0))
	return 0;

/* step 5: check the ACK field */
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
			      FLAG_UPDATE_TS_RECENT |
			      FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
	if (sk->sk_state == TCP_SYN_RECV)
		return 1;	/* send one RST */
	/* tcp_ack() was told not to challenge (FLAG_NO_CHALLENGE_ACK),
	 * so the challenge ACK is sent from here instead.
	 */
	tcp_send_challenge_ack(sk, skb);
	goto discard;
}
/* The ACK was acceptable: perform the per-state transition, if any. */
switch (sk->sk_state) {
case TCP_SYN_RECV:
	tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
	/* Take an RTT sample from the SYN-ACK only if none exists yet. */
	if (!tp->srtt_us)
		tcp_synack_rtt_meas(sk, req);

	/* Once we leave TCP_SYN_RECV, we no longer need req
	 * so release it.
	 */
	if (req) {
		inet_csk(sk)->icsk_retransmits = 0;
		reqsk_fastopen_remove(sk, req, false);
		/* Re-arm the timer because data may have been sent out.
		 * This is similar to the regular data transmission case
		 * when new data has just been ack'ed.
		 *
		 * (TFO) - we could try to be more aggressive and
		 * retransmitting any data sooner based on when they
		 * are sent out.
		 */
		tcp_rearm_rto(sk);
	} else {
		/* No Fast Open request: initialise the transfer state now. */
		tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
		tp->copied_seq = tp->rcv_nxt;
	}
	smp_mb();
	tcp_set_state(sk, TCP_ESTABLISHED);
	sk->sk_state_change(sk);

	/* Note, that this wakeup is only for marginal crossed SYN case.
	 * Passively open sockets are not waked up, because
	 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
	 */
	if (sk->sk_socket)
		sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

	/* Seed the send-side state from this ACK; the advertised window
	 * is scaled by the negotiated snd_wscale.
	 */
	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
	tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

	/* With timestamps in use, the option's aligned length is taken
	 * out of the advertised MSS.
	 */
	if (tp->rx_opt.tstamp_ok)
		tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

	/* Only when the congestion-control module has no cong_control hook. */
	if (!inet_csk(sk)->icsk_ca_ops->cong_control)
		tcp_update_pacing_rate(sk);

	/* Prevent spurious tcp_cwnd_restart() on first data packet */
	tp->lsndtime = tcp_jiffies32;

	tcp_initialize_rcv_mss(sk);
	tcp_fast_path_on(tp);
	break;

case TCP_FIN_WAIT1: {
	int tmo;

	/* If we enter the TCP_FIN_WAIT1 state and we are a
	 * Fast Open socket and this is the first acceptable
	 * ACK we have received, this would have acknowledged
	 * our SYNACK so stop the SYNACK timer.
	 */
	if (req) {
		/* We no longer need the request sock. */
		reqsk_fastopen_remove(sk, req, false);
		tcp_rearm_rto(sk);
	}
	/* Stay in FIN_WAIT1 until snd_una has caught up with write_seq. */
	if (tp->snd_una != tp->write_seq)
		break;

	tcp_set_state(sk, TCP_FIN_WAIT2);
	sk->sk_shutdown |= SEND_SHUTDOWN;

	sk_dst_confirm(sk);

	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Wake up lingering close() */
		sk->sk_state_change(sk);
		break;
	}

	/* From here on the socket is flagged SOCK_DEAD. */
	if (tp->linger2 < 0) {
		/* Negative linger2: finish the socket immediately. */
		tcp_done(sk);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
		return 1;
	}
	/* Segment is non-empty and its payload (end_seq minus the FIN
	 * flag) extends beyond rcv_nxt.
	 */
	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
	    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
		/* Receive out of order FIN after close() */
		if (tp->syn_fastopen && th->fin)
			tcp_fastopen_active_disable(sk);
		tcp_done(sk);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
		return 1;
	}

	/* Choose how to wait out FIN_WAIT2 based on the FIN timeout. */
	tmo = tcp_fin_time(sk);
	if (tmo > TCP_TIMEWAIT_LEN) {
		/* Keepalive timer covers the part exceeding TCP_TIMEWAIT_LEN. */
		inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
	} else if (th->fin || sock_owned_by_user(sk)) {
		/* Bad case. We could lose such FIN otherwise.
		 * It is not a big problem, but it looks confusing
		 * and not so rare event. We still can lose it now,
		 * if it spins in bh_lock_sock(), but it is really
		 * marginal case.
		 */
		inet_csk_reset_keepalive_timer(sk, tmo);
	} else {
		tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
		goto discard;
	}
	break;
}

case TCP_CLOSING:
	/* snd_una has reached write_seq: enter TIME_WAIT. */
	if (tp->snd_una == tp->write_seq) {
		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
		goto discard;
	}
	break;

case TCP_LAST_ACK:
	/* snd_una has reached write_seq: save metrics and finish the socket. */
	if (tp->snd_una == tp->write_seq) {
		tcp_update_metrics(sk);
		tcp_done(sk);
		goto discard;
	}
	break;
}

/* step 6: check the URG bit */
tcp_urg(sk, skb, th);

/* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
	/* A segment starting at or beyond rcv_nxt is not queued in these
	 * states; queued stays 0, so it is dropped at the end.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
		break;
	/* fall through */
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
	/* RFC 793 says to queue data in these states,
	 * RFC 1122 says we MUST send a reset.
	 * BSD 4.4 also does reset.
	 */
	if (sk->sk_shutdown & RCV_SHUTDOWN) {
		/* New payload after the receive side was shut down. */
		if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
		    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
			tcp_reset(sk);
			return 1;
		}
	}
	/* Fall through */
case TCP_ESTABLISHED:
	/* Reachable when the SYN_RECV case above moved us to ESTABLISHED. */
	tcp_data_queue(sk, skb);
	queued = 1;
	break;
}

/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
	tcp_data_snd_check(sk);
	tcp_ack_snd_check(sk);
}

/* The discard label sits inside this block so that "goto discard" from
 * any point above drops the segment and returns 0.
 */
if (!queued) {

discard:
tcp_drop(sk, skb);
}
return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);

/* Emit a rate-limited debug message for a connection request that is
 * being dropped.
 *
 * @req:    request sock whose remote address is printed
 * @port:   remote port, already in host byte order (printed with %u)
 * @family: AF_INET or AF_INET6; any other value logs nothing
 *
 * The AF_INET6 branch exists only when CONFIG_IPV6 is enabled.
 */
static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	if (family == AF_INET)
		net_dbg_ratelimited("drop open request from %pI4/%u\n",
				    &ireq->ir_rmt_addr, port);
#if IS_ENABLED(CONFIG_IPV6)
	/* The format string must use plain ASCII double quotes. */
	else if (family == AF_INET6)
		net_dbg_ratelimited("drop open request from %pI6/%u\n",
				    &ireq->ir_v6_rmt_addr, port);
#endif
}

/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
 *
 * If we receive a SYN packet with these bits set, it means a
 * network is playing bad games with TOS bits. In order to
 * avoid possible false congestion notifications, we disable
 * TCP ECN negotiation.
 *
 * Exception: tcp_ca wants ECN. This is required for DCTCP
 * congestion control: Linux DCTCP asserts ECT on all packets,
 * including SYN, which is most optimal solution; however,
 * others, such as FreeBSD do not.
 *
 * On return inet_rsk(req)->ecn_ok is set to 1 when ECN is accepted for
 * this request and left untouched otherwise.
 */
static void tcp_ecn_create_request(struct request_sock *req,
				   const struct sk_buff *skb,
				   const struct sock *listen_sk,
				   const struct dst_entry *dst)
{
	const struct tcphdr *hdr = tcp_hdr(skb);
	const struct net *ns = sock_net(listen_sk);
	bool pkt_is_ect;
	bool ecn_allowed;
	bool accept_ecn;
	u32 dst_ecn;

	/* Only a SYN carrying both ECE and CWR is considered at all. */
	if (!hdr->ece || !hdr->cwr)
		return;

	pkt_is_ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
	dst_ecn = dst_feature(dst, DST_FEATURE_ECN_MASK);
	ecn_allowed = ns->ipv4.sysctl_tcp_ecn || dst_ecn;

	/* Evaluated left to right with short-circuiting: a clean (non-ECT)
	 * SYN with ECN permitted, or one of the congestion-control
	 * overrides described above.
	 */
	accept_ecn = (ecn_allowed && !pkt_is_ect) ||
		     tcp_ca_needs_ecn(listen_sk) ||
		     (dst_ecn & DST_FEATURE_ECN_CA) ||
		     tcp_bpf_ca_needs_ecn((struct sock *)req);

	if (accept_ecn)
		inet_rsk(req)->ecn_ok = 1;
}

static void tcp_openreq_init(struct request_sock *req,
const struct tcp_options_received *rx_opt,
struct sk_buff *skb, const struct sock *sk)
{
struct inet_request_sock *ireq = inet_rsk(req);

req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
tcp_rsk(req)->snt_synack = tcp_clock_us();
tcp_rsk(req)->last_oow_ack_time = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);

#if IS_ENABLED(CONFIG_SMC)
ireq->smc_ok = rx_opt->smc_ok;
#endif
}

/* Allocate a request sock through reqsk_alloc() and initialise its
 * inet-level fields.
 *
 * @ops:             request-sock operations, forwarded to reqsk_alloc()
 * @sk_listener:     socket supplying the network namespace and family
 * @attach_listener: forwarded unchanged to reqsk_alloc()
 *
 * Returns the new request sock, or NULL when reqsk_alloc() fails.
 */
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
				      struct sock *sk_listener,
				      bool attach_listener)
{
	struct inet_request_sock *ireq;
	struct request_sock *req;

	req = reqsk_alloc(ops, sk_listener, attach_listener);
	if (!req)
		return NULL;

	ireq = inet_rsk(req);
	ireq->ireq_opt = NULL;
#if IS_ENABLED(CONFIG_IPV6)
	ireq->pktopts = NULL;
#endif
	atomic64_set(&ireq->ir_cookie, 0);
	ireq->ireq_state = TCP_NEW_SYN_RECV;
	write_pnet(&ireq->ireq_net, sock_net(sk_listener));
	ireq->ireq_family = sk_listener->sk_family;

	return req;
}
EXPORT_SYMBOL(inet_reqsk_alloc);

Return true if a syncookie should be sent
*/
static bool tcp_syn_flood_action(const struct sock *sk,
const struct sk_buff *skb,
const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = “Dropping request”;
bool want_cookie = false;
struct net *net = sock_net(sk);

#ifdef CONFIG_SYN_COOKIES
if (net->ipv4.sysctl_tcp_syncookies) {
msg = “Sending cookies”;
want_cookie = true;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
} else
#endif
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

if (!queue->synflood_warned &&
    net->ipv4.sysctl_tcp_syncookies != 2 &&
    xchg(&queue->synflood_warned, 1) == 0)
	net_info_ratelimited("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			     proto, ntohs(tcp_hdr(skb)->dest), msg);

return want_cookie;

}

/* Keep a copy of the SYN's network and TCP headers on the request sock
 * when the socket has save_syn set.
 *
 * The buffer stored in req->saved_syn starts with one u32 holding the
 * number of header bytes, followed by the bytes themselves.  If the
 * atomic allocation fails, nothing is recorded.
 */
static void tcp_reqsk_record_syn(const struct sock *sk,
				 struct request_sock *req,
				 const struct sk_buff *skb)
{
	u32 hdr_bytes;
	u32 *buf;

	if (!tcp_sk(sk)->save_syn)
		return;

	hdr_bytes = skb_network_header_len(skb) + tcp_hdrlen(skb);

	/* One extra u32 in front for the length word. */
	buf = kmalloc(hdr_bytes + sizeof(u32), GFP_ATOMIC);
	if (!buf)
		return;

	buf[0] = hdr_bytes;
	memcpy(buf + 1, skb_network_header(skb), hdr_bytes);
	req->saved_syn = buf;
}

/* Handle a connection request (SYN) that arrived on socket @sk: allocate
 * and fill a request sock, then answer with a SYN-ACK in normal, Fast
 * Open or syncookie mode.
 *
 * @rsk_ops: request-sock operations handed to inet_reqsk_alloc(); its
 *           slab_name and family members are used for log messages
 * @af_ops:  address-family callbacks (init_req, init_ts_off, route_req,
 *           init_seq, send_synack) and the initial mss_clamp
 * @sk:      socket the request arrived on; its request and accept
 *           queues are checked
 * @skb:     the incoming SYN segment
 *
 * Always returns 0.  Every failure path ends in tcp_listendrop().
 */
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
{
/* len = -1 is the initial "no cookie" value handed to the option parser. */
struct tcp_fastopen_cookie foc = { .len = -1 };
/* Initial sequence number taken from the skb's tcp_tw_isn field; zero
 * means one still has to be chosen below.
 */
__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
bool want_cookie = false;
struct dst_entry *dst;
struct flowi fl;

/* TW buckets are converted to open requests without
 * limitations, they conserve resources and peer is
 * evidently real one.
 */
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
	/* Request queue full (or cookies forced): cookie or drop. */
	want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
	if (!want_cookie)
		goto drop;
}

if (sk_acceptq_is_full(sk)) {
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
	goto drop;
}

/* In cookie mode the request is allocated with attach_listener false. */
req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
	goto drop;

tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;

/* Parse the SYN's options into a scratch structure.  Fast Open cookie
 * parsing is skipped (NULL) when a syncookie will be sent.
 */
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
tmp_opt.user_mss  = tp->rx_opt.user_mss;
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
		  want_cookie ? NULL : &foc);

/* In cookie mode, parsed options are discarded if no timestamp was seen. */
if (want_cookie && !tmp_opt.saw_tstamp)
	tcp_clear_options(&tmp_opt);

if (IS_ENABLED(CONFIG_SMC) && want_cookie)
	tmp_opt.smc_ok = 0;

tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

af_ops->init_req(req, sk, skb);

/* A non-zero result from the security hook rejects the request. */
if (security_inet_conn_request(sk, skb, req))
	goto drop_and_free;

if (tmp_opt.tstamp_ok)
	tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);

dst = af_ops->route_req(sk, &fl, req);
if (!dst)
	goto drop_and_free;

/* From here on dst must be released on failure (drop_and_release). */
if (!want_cookie && !isn) {
	/* Kill the following clause, if you dislike this way. */
	if (!net->ipv4.sysctl_tcp_syncookies &&
	    (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
	     (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
	    !tcp_peer_is_proven(req, dst)) {
		/* Without syncookies last quarter of
		 * backlog is filled with destinations,
		 * proven to be alive.
		 * It means that we continue to communicate
		 * to destinations, already remembered
		 * to the moment of synflood.
		 */
		pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
			    rsk_ops->family);
		goto drop_and_release;
	}

	isn = af_ops->init_seq(skb);
}

tcp_ecn_create_request(req, skb, sk, dst);

if (want_cookie) {
	/* The cookie is the ISN; req->mss may be rewritten via the pointer. */
	isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
	req->cookie_ts = tmp_opt.tstamp_ok;
	/* ECN is turned off for cookie requests that lack timestamps. */
	if (!tmp_opt.tstamp_ok)
		inet_rsk(req)->ecn_ok = 0;
}

tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
	tcp_reqsk_record_syn(sk, req, skb);
	/* Non-NULL only when a Fast Open child socket was created. */
	fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
}
if (fastopen_sk) {
	af_ops->send_synack(fastopen_sk, dst, &fl, req,
			    &foc, TCP_SYNACK_FASTOPEN);
	/* Add the child socket directly into the accept queue */
	/* NOTE(review): any result of inet_csk_reqsk_queue_add() is
	 * ignored here - confirm it cannot fail in this tree.
	 */
	inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
	sk->sk_data_ready(sk);
	bh_unlock_sock(fastopen_sk);
	sock_put(fastopen_sk);
} else {
	tcp_rsk(req)->tfo_listener = false;
	/* Only non-cookie requests are hashed into the request queue. */
	if (!want_cookie)
		inet_csk_reqsk_queue_hash_add(sk, req,
			tcp_timeout_init((struct sock *)req));
	af_ops->send_synack(sk, dst, &fl, req, &foc,
			    !want_cookie ? TCP_SYNACK_NORMAL :
					   TCP_SYNACK_COOKIE);
	/* A cookie request was never hashed, so it is freed right after
	 * the SYN-ACK has been sent.
	 */
	if (want_cookie) {
		reqsk_free(req);
		return 0;
	}
}
/* Drop the reference held by this function. */
reqsk_put(req);
return 0;

/* Error unwinding: each label falls through to the next one. */
drop_and_release:
dst_release(dst);
drop_and_free:
reqsk_free(req);
drop:
tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_conn_request);