Table of Contents
1 Overview of TCP server SYN reception
2 TCP server SYN processing flow
2.1 TCP-layer packet input interface: tcp_v4_rcv
2.2 tcp_v4_do_rcv()
2.3 tcp_rcv_state_process()
2.4 The SYN request handler tcp_v4_conn_request() (core)
2.4.1 Is the SYN request (half-open) queue full: inet_csk_reqsk_queue_is_full
2.4.2 Is the accept (fully established) queue full: sk_acceptq_is_full
2.5 Allocating and initializing the connection request block
2.5.1 Allocating the connection request block: reqsk_alloc / inet_reqsk_alloc
2.5.2 Initializing the connection request block
2.6 Adding the request block to the SYN request queue: inet_csk_reqsk_queue_hash_add
1 Overview of TCP server SYN reception
- Look up the local socket in tcp_hashinfo using the 5-tuple of the incoming segment
- Check whether the local socket's request queues are full: both the SYN (half-open) queue and the accept (fully established) queue
- Send a SYN+ACK segment to the client
- Add the newly created request sock to the SYN queue and start the SYN+ACK retransmission timer (initial value 3s)
Note: on receiving the SYN, the Linux kernel stack does not move any socket to SYN_RECV. A new sock is created only after the client's ACK arrives; that sock is set to TCP_SYN_RECV, and tcp_rcv_state_process then moves it to TCP_ESTABLISHED.
2 TCP server SYN processing flow
tcp_v4_rcv
--tcp_v4_do_rcv
----tcp_rcv_state_process
------tcp_v4_conn_request
--------inet_csk_reqsk_queue_is_full
--------sk_acceptq_is_full
--------reqsk_alloc
--------tcp_v4_send_synack
--------inet_csk_reqsk_queue_hash_add
2.1 TCP-layer packet input interface: tcp_v4_rcv
- Validate the incoming TCP segment
- Look up the local socket in tcp_hashinfo using the 5-tuple
- Call tcp_v4_do_rcv() to process the packet
int tcp_v4_rcv(struct sk_buff *skb)
{
struct tcphdr *th;
struct iphdr *iph;
struct sock *sk;
int ret;
//Get the TCP header pointer
th = tcp_hdr(skb);
//Get the IP header pointer
iph = ip_hdr(skb);
//Look up in the TCP hash tables which socket should handle this segment
//(based on the tcp/ip header fields of the incoming segment).
//For an incoming SYN request, the socket found here is the listening socket.
sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr,
th->source, iph->daddr, th->dest, inet_iif(skb));
if (!sk)
goto no_tcp_socket;
process:
bh_lock_sock_nested(sk);
ret = 0;
//The branches below involve the three queues TCP uses on receive for performance reasons;
//not our focus here, go straight to tcp_v4_do_rcv()
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
//Call tcp_v4_do_rcv() to process the packet
ret = tcp_v4_do_rcv(sk, skb);
} else
sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
sock_put(sk);
return ret;
}
2.2 tcp_v4_do_rcv()
- Call tcp_v4_hnd_req to look for a request socket; for the first SYN nothing is found and sk itself is returned (a trimmed sketch of tcp_v4_hnd_req follows the code below)
- Call tcp_rcv_state_process to handle the SYN segment
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
if (sk->sk_state == TCP_LISTEN) {
//Return NULL: error
//nsk == sk: no new TCB was found, so this is a first-handshake SYN (the case covered in this note)
//nsk != sk: a new TCB was found, so this is a third-handshake ACK
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
//The ACK is handled by tcp_child_process
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
}
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
//A non-zero return means an unexpected packet was received; a RST is sent to the peer
rsk = sk;
goto reset;
}
return 0;
}
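For context, tcp_v4_hnd_req() searches the listening socket's SYN request queue for a request block matching the incoming segment; for the very first SYN nothing matches yet and the listening socket itself is returned. The following is only a trimmed sketch, not the full function (error paths and the TIME_WAIT lookup are omitted, and details may differ slightly between kernel versions):
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct request_sock **prev;
//Look for a pending connection request matching this segment's 4-tuple
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
iph->saddr, iph->daddr);
//Third-handshake ACK: validate it and create the child socket
if (req)
return tcp_check_req(sk, skb, req, prev);
//(lookup in the established/TIME_WAIT hash omitted here)
//First SYN: nothing matches yet, return the listening socket itself
return sk;
}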
2.3 tcp_rcv_state_process()
- Call tcp_v4_conn_request to handle the SYN connection request
/*
sk: the TCP socket that received this segment
skb: the input segment
th: pointer to the segment's TCP header
len: length of the segment
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
switch (sk->sk_state) {
case TCP_LISTEN:
//In LISTEN state this function only handles SYN segments; if the ACK bit is set, the packet is unexpected
//and returning 1 causes a RST to be sent to the peer
if (th->ack)
return 1;
//A RST segment is simply ignored
if (th->rst)
goto discard;
if (th->syn) {
//A SYN was received; hand it to tcp_v4_conn_request().
//The conn_request pointer is set when the transport control block is initialized, see tcp_v4_init_sock
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
/* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
* Solaris 2.1 gives you a protocol error. For now
* we just ignore it, that fits the spec precisely
* and avoids incompatibilities. It would be nice in
* future to drop through and process the data.
*
* Now that TTCP is starting to be used we ought to
* queue this data.
* But, this leaves one open to an easy denial of
* service attack, and SYN cookies can't defend
* against this problem. So, we drop the data
* in the interest of security over speed unless
* it's still in use.
*/
//The comment above discusses whether the first SYN may carry data; the current implementation does not accept such data
kfree_skb(skb);
return 0;
}
goto discard;
}
}
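For reference, the conn_request callback used above is wired up when tcp_v4_init_sock() sets icsk->icsk_af_ops = &ipv4_specific. A trimmed excerpt of that ops table (most callbacks omitted; exact members vary by kernel version):
struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
//Called from tcp_rcv_state_process() when a SYN arrives on a LISTEN socket
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
...
};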
2.4 The SYN request handler tcp_v4_conn_request() (core)
The main job of this function is to create a connection request block, i.e. a struct tcp_request_sock, and add it to the listening socket's SYN request queue (the half-open queue, listen_sock.syn_table). Its core steps are:
- Check whether the SYN request queue and the accept queue can still take this SYN; if not, drop the SYN segment (SYN cookies aside). No RST is sent back, so if the client retries and the server-side queues have freed up, the request can still be served;
- Allocate a struct tcp_request_sock connection request block;
- Parse the TCP options carried in the SYN segment (not analyzed here);
- Initialize the newly allocated request block from the received options;
- Generate the sequence number to be carried in the SYN+ACK, i.e. the server-side initial sequence number;
- Send the SYN+ACK segment to the client (see "TCP: server sending SYN+ACK");
- Add the request block to the listening socket's SYN request queue and start the SYN+ACK timeout timer.
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct inet_request_sock *ireq;
struct tcp_options_received tmp_opt;
struct request_sock *req;
//Record the source and destination addresses of the SYN segment
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
struct dst_entry *dst = NULL;
//SYN cookie related, ignored here
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif
//Drop SYN segments sent to broadcast or multicast addresses; TCP does not support them
if (((struct rtable *)skb->dst)->rt_flags &
(RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
//If the SYN request queue is full, drop the request (SYN cookies aside); the client will retransmit the SYN
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
//isn is non-zero only when the SYN hit a live TIME_WAIT bucket (see the kernel comment further down);
//such requests bypass the queue-full check
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
} else
#endif
goto drop;
}
//If the accept queue is full and the SYN request queue still holds more than one "young" request
//(one whose SYN+ACK has not yet been retransmitted), drop this new SYN.
//My understanding of the rationale: those young requests are likely to complete the three-way handshake soon
//and will then need a slot in the already-full accept queue; accepting a new SYN now would risk failing
//connections that have already finished the handshake because they cannot be added to the accept queue
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
//Allocate a struct tcp_request_sock object and set its rsk_ops to tcp_request_sock_ops;
//the functions in that ops table are called later during connection establishment
req = reqsk_alloc(&tcp_request_sock_ops);
if (!req)
goto drop;
#ifdef CONFIG_TCP_MD5SIG
tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif
//Parse the TCP options carried in the SYN; option handling is not covered here
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = 536;
tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
tcp_parse_options(skb, &tmp_opt, 0);
//SYN cookie related, ignored
if (want_cookie) {
tcp_clear_options(&tmp_opt);
tmp_opt.saw_tstamp = 0;
}
//Timestamp option handling
if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
/* Some OSes (unknown ones, but I see them on web server, which
* contains information interesting only for windows'
* users) do not send their stamp in SYN. It is easy case.
* We simply do not advertise TS support.
*/
tmp_opt.saw_tstamp = 0;
tmp_opt.tstamp_ok = 0;
}
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
//Initialize the connection request block from the fields and options of the SYN segment
tcp_openreq_init(req, &tmp_opt, skb);
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
//Record the local and remote addresses; saddr/daddr are the source/destination IPs of the incoming skb, hence the swapped assignment
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
//Save the IP options of the SYN segment into the request block
ireq->opt = tcp_v4_save_options(sk, skb);
if (!want_cookie)
TCP_ECN_create_request(req, tcp_hdr(skb));
//Generate the server-side initial sequence number, depending on the case
if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
syn_flood_warning(skb);
#endif
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
} else if (!isn) {
struct inet_peer *peer = NULL;
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
* state TIME-WAIT, and check against it before
* accepting new connection request.
*
* If "isn" is not zero, this request hit alive
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
TCP_PAWS_WINDOW) {
NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
dst_release(dst);
goto drop_and_free;
}
}
/* Kill the following clause, if you dislike this way. */
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
(!dst || !dst_metric(dst, RTAX_RTT))) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
* It means that we continue to communicate
* to destinations, already remembered
* to the moment of synflood.
*/
LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
"request from %u.%u.%u.%u/%u\n",
NIPQUAD(saddr),
ntohs(tcp_hdr(skb)->source));
dst_release(dst);
goto drop_and_free;
}
isn = tcp_v4_init_sequence(skb);
}
//Record the chosen initial sequence number in the request block
tcp_rsk(req)->snt_isn = isn;
//Send the SYN+ACK segment
if (tcp_v4_send_synack(sk, req, dst))
goto drop_and_free;
if (want_cookie) {
reqsk_free(req);
} else {
//Add the request block to the SYN request queue and start the SYN+ACK retransmission timer (initial value 3s)
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
return 0;
drop_and_free:
reqsk_free(req);
drop:
return 0;
}
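The 3-second initial timeout passed as TCP_TIMEOUT_INIT above comes from a constant in include/net/tcp.h; in kernels of this era it is defined as follows (later kernels lowered it to 1 second per RFC 6298):
//RFC 1122 initial RTO value: 3 seconds
#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ))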
The next two subsections look at the checks that decide whether the accept queue and the SYN request queue are full.
2.4.1 Is the SYN request (half-open) queue full: inet_csk_reqsk_queue_is_full
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
}
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
//If the number of queued SYN requests (qlen) has reached the maximum queue length (2^max_qlen_log,
//derived from nr_table_entries), the SYN request queue is considered full. A right shift is used here
//instead of a comparison: qlen >> max_qlen_log is non-zero exactly when qlen >= 2^max_qlen_log
return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}
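A small standalone demo of the shift trick. The limit below (max_qlen_log = 8, i.e. 256 entries) is only an example; in the kernel, max_qlen_log is computed in reqsk_queue_alloc() from the listen() backlog, roughly capped by the net.ipv4.tcp_max_syn_backlog sysctl and rounded up to a power of two:
#include <stdio.h>

int main(void)
{
unsigned int max_qlen_log = 8; /* example: queue limit = 1 << 8 = 256 */
unsigned int qlen;

for (qlen = 254; qlen <= 257; qlen++)
/* qlen >> max_qlen_log is non-zero exactly when qlen >= 256 */
printf("qlen=%u full=%u\n", qlen, qlen >> max_qlen_log);
return 0;
}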
2.4.2 Is the accept (fully established) queue full: sk_acceptq_is_full
static inline int sk_acceptq_is_full(struct sock *sk)
{
//Directly compare the number of sockets that have completed the three-way handshake with the allowed maximum.
//This is where the backlog argument of listen() (which is stored in sk_max_ack_backlog) takes effect
return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}
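A minimal user-space sketch for reference (port 8080 is arbitrary): the backlog value passed to listen() is what ends up in sk_max_ack_backlog (after being capped by the net.core.somaxconn sysctl), and that is the limit sk_acceptq_is_full() checks:
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
struct sockaddr_in addr;
int fd = socket(AF_INET, SOCK_STREAM, 0);

memset(&addr, 0, sizeof(addr));
addr.sin_family = AF_INET;
addr.sin_port = htons(8080);
addr.sin_addr.s_addr = htonl(INADDR_ANY);
bind(fd, (struct sockaddr *)&addr, sizeof(addr));

//backlog = 128 becomes sk_max_ack_backlog, the accept-queue limit
listen(fd, 128);

//accept() would drain the accept queue here
close(fd);
return 0;
}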
2.5 Allocating and initializing the connection request block
2.5.1 Allocating the connection request block: reqsk_alloc / inet_reqsk_alloc
static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
{
//Allocate a connection request block; what is actually allocated here is a struct tcp_request_sock
struct request_sock *req = kmem_cache_alloc(ops->slab, GFP_ATOMIC);
//Store the ops table in the request block's rsk_ops member
if (req != NULL)
req->rsk_ops = ops;
return req;
}
The ops passed to reqsk_alloc() is tcp_request_sock_ops, defined as follows:
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
//The object created from the slab is a struct tcp_request_sock
.obj_size = sizeof(struct tcp_request_sock),
.rtx_syn_ack = tcp_v4_send_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
};
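The reason obj_size is sizeof(struct tcp_request_sock) while reqsk_alloc() returns a struct request_sock * is that the structures are nested, with request_sock as the first member; the inet_rsk()/tcp_rsk() helpers are just casts. A trimmed excerpt of the layout (most fields omitted, names as in this era of kernels):
struct inet_request_sock {
struct request_sock req; /* must be the first member so the casts work */
__be32 loc_addr;
__be32 rmt_addr;
__be16 rmt_port;
/* option flags: tstamp_ok, sack_ok, wscale_ok, ecn_ok, acked, ... */
struct ip_options *opt;
};

struct tcp_request_sock {
struct inet_request_sock req; /* again the first member */
u32 rcv_isn; /* client's initial sequence number */
u32 snt_isn; /* our initial sequence number, set in tcp_v4_conn_request() */
};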
The slab cache ops->slab is created when the AF_INET protocol family is initialized:
struct proto tcp_prot = {
...
.rsk_prot = &tcp_request_sock_ops,
...
};
static int __init inet_init(void)
{
...
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
...
}
int proto_register(struct proto *prot, int alloc_slab)
{
...
prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
prot->rsk_prot->obj_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
...
}
2.5.2 Initializing the connection request block
Initialization of the request block depends on the TCP options carried in the SYN segment, so it runs after option parsing:
static inline void tcp_openreq_init(struct request_sock *req,
struct tcp_options_received *rx_opt,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; /* client's initial sequence number */
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->rmt_port = tcp_hdr(skb)->source; /* client's (remote) source port */
}
2.6 Adding the request block to the SYN request queue: inet_csk_reqsk_queue_hash_add
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
//Get the SYN request queue (struct listen_sock)
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
//Compute a hash from the peer IP address, peer port and the listener's random hash seed
const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
//Insert the request block into the SYN request queue and record the timeout in it
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
//Update the queue counters qlen and qlen_young, and start the SYN+ACK retransmission timer
inet_csk_reqsk_queue_added(sk, timeout);
}
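For reference, inet_synq_hash() folds the peer address, the peer port and the per-listener random seed into a bucket index. A sketch of its typical shape in this era of kernels (assuming jhash_2words() from <linux/jhash.h> and nr_table_entries being a power of two):
static u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
const u32 rnd, const u16 synq_hsize)
{
//Hash the remote address and port with the random seed, then mask down
//to a bucket index (synq_hsize is a power of two)
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
}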
static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
u32 hash, struct request_sock *req,
unsigned long timeout)
{
struct listen_sock *lopt = queue->listen_opt;
//Record the expiry time
req->expires = jiffies + timeout;
//Initialize the SYN+ACK retransmission count to 0
req->retrans = 0;
req->sk = NULL;
//Insert the new request block at the head of its hash bucket in the SYN request queue
req->dl_next = lopt->syn_table[hash];
write_lock(&queue->syn_wait_lock);
lopt->syn_table[hash] = req;
write_unlock(&queue->syn_wait_lock);
}
static inline void inet_csk_reqsk_queue_added(struct sock *sk,
const unsigned long timeout)
{
//Update the listen_sock counters. A return value of 0 means the SYN request queue was empty before,
//in which case the SYN+ACK retransmission timer needs to be armed (a listening socket reuses the keepalive timer slot for this)
if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0)
inet_csk_reset_keepalive_timer(sk, timeout);
}
static inline int reqsk_queue_added(struct request_sock_queue *queue)
{
struct listen_sock *lopt = queue->listen_opt;
const int prev_qlen = lopt->qlen;
//Update qlen and qlen_young
lopt->qlen_young++;
lopt->qlen++;
//Return the previous length of the SYN request queue
return prev_qlen;
}