socket源码分析之accept()
基于 kernel 3.10
之前有分析过TCP accept()
的实现,但是太过于沉浸于代码本身,没有结合应用去分析accept()
函数。
我们要解决如下几个问题:
1:accept()
函数的实现,包括从全队列中取出sock。
2:accept()
函数如何如何被唤醒
3:accept()
函数如何解决惊群
4:多个进程accept()
,优先唤醒哪个进程
accept()函数的实现
accept()
函数实现逻辑相对比较简单
如果没有完成建立的TCP会话,阻塞情况下,则阻塞,非阻塞情况下,则返回-EAGAIN
。
所以总结来说需要考虑这么几种情况:
1、当前全队列中有socket,则accept()
直接返回对应的fd。
2、如果当前全队列中没有socket,则如果当前socket是阻塞的,直接睡眠。
3、如果当前全队列中没有socket,如果非阻塞,就直接返回-EAGAIN。
4、如果是阻塞的listenfd,需要将当前进程挂在listenfd对应socket的等待队列里面,当前进程让出cpu,并且等待唤醒。
accept实现的调用链
sys_accept->sys_accept4->inet_accept->inet_csk_accept
其中 inet_csk_accept
是核心处理逻辑,其处理了上述1、3两种情况。
/*
* This will accept the next outstanding connection.
*/
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct sock *newsk;
struct request_sock *req;
int error;
lock_sock(sk);
/* We need to make sure that this socket is listening,
* and that it has something pending.
*/
//只有TCP_LISTEN状态的socket才能调用accept
error = -EINVAL;
if (sk->sk_state != TCP_LISTEN)
goto out_err;
/* Find already established connection */
//如果当前全队列中有已经三次握手建立起来后的连接,就不会进这个if,直接走到后面取全队列中的socket
if (reqsk_queue_empty(queue)) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
//非阻塞的socket,直接返回了
error = -EAGAIN;
if (!timeo)
goto out_err;
//阻塞的socket,调用 inet_csk_wait_for_connect ,下文会说
error = inet_csk_wait_for_connect(sk, timeo);
if (error)
goto out_err;
}
//走到这里,说明全队列中有socket,直接取出来
req = reqsk_queue_remove(queue);
newsk = req->sk;
sk_acceptq_removed(sk);
if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
spin_lock_bh(&queue->fastopenq->lock);
if (tcp_rsk(req)->listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
* NULL to signify that the child socket is taken
* so reqsk_fastopen_remove() will free the req
* when 3WHS finishes (or is aborted).
*/
req->sk = NULL;
req = NULL;
}
spin_unlock_bh(&queue->fastopenq->lock);
}
out:
release_sock(sk);
if (req)
__reqsk_free(req);
return newsk;
out_err:
newsk = NULL;
req = NULL;
*err = error;
goto out;
}
inet_csk_wait_for_connect
函数处理了2、4两种情况。
/*
* Wait for an incoming connection, avoid race conditions. This must be called
* with the socket locked.
*/
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
struct inet_connection_sock *icsk = inet_csk(sk);
DEFINE_WAIT(wait);
int err;
/*
* True wake-one mechanism for incoming connections: only
* one process gets woken up, not the 'whole herd'.
* Since we do not 'race & poll' for established sockets
* anymore, the common case will execute the loop only once.
*
* Subtle issue: "add_wait_queue_exclusive()" will be added
* after any current non-exclusive waiters, and we know that
* it will always _stay_ after any new non-exclusive waiters
* because all non-exclusive waiters are added at the
* beginning of the wait-queue. As such, it's ok to "drop"
* our exclusiveness temporarily when we get woken up without
* having to remove and re-insert us on the wait queue.
*/
for (;;) {
//prepare_to_wait_exclusive很重要,把 wait 挂到当前sk的等待队列里面。
prepare_to_wait_exclusive(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
release_sock(sk);
//icsk_accept_queue是全队列
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
timeo = schedule_timeout(timeo);//阻塞情况下,只有主动唤醒当前进程,才会继续执行。
lock_sock(sk);
err = 0;
//如果阻塞且非超时的情况从schedule_timeout返回,那么必然是全队列有值了。
if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
break;//这个break是所有程序必经之路
err = -EINVAL;
if (sk->sk_state != TCP_LISTEN)
break;
err = sock_intr_errno(timeo);
//有信号或者睡眠时间满了,则退出循环,否则接着睡。
if (signal_pending(current))
break;
err = -EAGAIN;
if (!timeo)
break;
}
finish_wait(sk_sleep(sk), &wait);
return err;
}
首先,为什么循环?这是历史原因,考虑有这么一种情况,就是睡眠时间没有睡满,那么 schedule_timeout
返回的值大于0,那么什么情况下,睡眠没有睡满呢?一种情况就是进程收到信号,另一种就是listenfd对应的socket的全队列有数据了,不考虑信号的情况,假设全队列有数据了,历史上,Linux的accept
是惊群的,全队列有值后,所有进程都唤醒,那么必然存在某些进程读取到了全队列socket,而某些没有读取到,这些没有读取到的进程,肯定是睡眠没睡满,所以需要接着睡。
但是本文分析的Linux内核版本是3.10,全队列有数据时,只会唤醒一个进程,故而,次for循环只会跑一次。
prepare_to_wait_exclusive
函数很重要,把当前上下文加到listenfd对应的socket等待队列里面,如果是多进程,那么listenfd对应的socket等待队列里面会有多个进程的上下文。
多进程 accept 如何处理惊群
多进程accept,不考虑resuseport,那么多进程accept只会出现在父子进程同时accept的情况,那么上文也说过,prepare_to_wait_exclusive
函数会被当前进程上下文加入到listenfd等待队列里面,所以父子进程的上下文都会加入到socket的等待队列里面。核心问题就是这么唤醒,我们可以相当,所谓的惊群,就是把等待队里里面的所有进程都唤醒。
我们此时来看看如何唤醒。
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
......
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
//当三次握手客户端的ack到来时,会走tcp_child_process这里
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
}
......
}
tcp_child_process
:
int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb)
{
int ret = 0;
int state = child->sk_state;
if (!sock_owned_by_user(child)) {
ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
skb->len);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
parent->sk_data_ready(parent, 0);//唤醒 在accept的进程,调用 sock_def_readable
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
__sk_add_backlog(child, skb);
}
bh_unlock_sock(child);
sock_put(child);
return ret;
}
parent->sk_data_ready
:
static void sock_def_readable(struct sock *sk, int len)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
//显然,我们在accept的时候调用了`prepare_to_wait_exclusive`加入了队列,故唤醒靠 wake_up_interruptible_sync_poll
if (wq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
注意,__wake_up_sync_key
的第三个参数是1
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
int wake_flags = WF_SYNC;
if (unlikely(!q))
return;
if (unlikely(!nr_exclusive))
wake_flags = 0;
spin_lock_irqsave(&q->lock, flags);
//mode是TASK_INTERRUPTIBLE nr_exclusive是1,
__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
spin_unlock_irqrestore(&q->lock, flags);
}
__wake_up_common
:
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
//prepare_to_wait_exclusive时候,flags是WQ_FLAG_EXCLUSIVE,入参nr_exclusive是1,所以只执行一次就break了。
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
所以多个进程accept
的时候,内核只会唤醒1个等待的进程,且唤醒的逻辑是FIFO。