这篇笔记来看看TCP对accept()系统调用的实现。
1. 内核入口
//As the kernel comment below also states, the accept() system call has to:
//1. create a new socket over which the server and the client communicate
//2. create a new fd through which the application later reads/writes that socket
/*
* For accept, we attempt to create a new socket, set up the link
* with the client, wake up the client, then return the new
* connected fd. We collect the address of the connector in kernel
* space and move it to user at the very end. This is unclean because
* we open the socket then return an error.
*
* 1003.1g adds the ability to recvmsg() to query connection pending
* status to recvmsg. We need to add that support in a way that's
* clean when we restructure accept also.
*
* Parameters:
*   fd             - descriptor of the listening socket
*   upeer_sockaddr - user-space buffer for the peer address; may be NULL,
*                    in which case no address is copied out
*   upeer_addrlen  - user-space length argument handed to move_addr_to_user()
*
* Returns the new descriptor on success, otherwise a negative error code
* (the lookup error, -ENFILE, the sock_alloc_fd()/sock_attach_fd() error,
* the security hook error, the protocol accept() error, -ECONNABORTED, or
* the move_addr_to_user() error).
*/
asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen)
{
struct socket *sock, *newsock;
struct file *newfile;
int err, len, newfd, fput_needed;
char address[MAX_SOCK_ADDR];
//Look up the struct socket that belongs to the listening fd; on failure
//err has been filled in by the lookup and is returned as-is
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
//Allocate the struct socket for the new (connected) socket
err = -ENFILE;
if (!(newsock = sock_alloc()))
goto out_put;
//The new socket has the same type and operations table as the listener
newsock->type = sock->type;
newsock->ops = sock->ops;
/*
* We don't need try_module_get here, as the listening socket (sock)
* has the protocol module (sock->ops->owner) held.
*/
__module_get(newsock->ops->owner);
//Allocate a file descriptor (and file) for the new socket
newfd = sock_alloc_fd(&newfile);
if (unlikely(newfd < 0)) {
err = newfd;
//No file exists yet, so only the socket itself has to be released
sock_release(newsock);
goto out_put;
}
//Associate the new file with the new socket
err = sock_attach_fd(newsock, newfile);
if (err < 0)
goto out_fd_simple;
//Security hook (the original notes attribute this to SELinux)
err = security_socket_accept(sock, newsock);
if (err)
goto out_fd;
//Call the accept() provided by the protocol family to take the connection;
//for the IPv4 family this is inet_accept()
err = sock->ops->accept(sock, newsock, sock->file->f_flags);
if (err < 0)
goto out_fd;
//If the caller asked for the client's address, fetch it with getname()
//into the kernel buffer and then copy it to user space.
//NOTE(review): the last getname() argument (2) presumably selects the
//peer address rather than the local one - confirm against getname().
if (upeer_sockaddr) {
if (newsock->ops->getname(newsock, (struct sockaddr *)address,
&len, 2) < 0) {
err = -ECONNABORTED;
goto out_fd;
}
err = move_addr_to_user(address, len, upeer_sockaddr,
upeer_addrlen);
if (err < 0)
goto out_fd;
}
//Bind newfd to newfile; from here on the success path returns newfd
/* File flags are not inherited via accept() unlike another OSes. */
fd_install(newfd, newfile);
err = newfd;
security_socket_post_accept(sock, newsock);
//Common exit: pairs with sockfd_lookup_light() above, using the
//fput_needed flag that the lookup filled in
out_put:
fput_light(sock->file, fput_needed);
out:
return err;
//Failure in sock_attach_fd(): socket, file and fd are undone one by one
out_fd_simple:
sock_release(newsock);
put_filp(newfile);
put_unused_fd(newfd);
goto out_put;
//Failure after the file was attached: only fput() and put_unused_fd() are
//called here.
//NOTE(review): presumably fput() on the attached file also releases
//newsock, which would explain the missing sock_release() - confirm.
out_fd:
fput(newfile);
put_unused_fd(newfd);
goto out_put;
}
2. IPv4协议族的accept
/*
* Accept a pending connection. The TCP layer now gives BSD semantics.
*/
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
struct sock *sk1 = sock->sk;
int err = -EINVAL;
//直接调用传输层的accept()回调,TCP为inet_csk_accept(),该回调需要返回
//新的通信套接字对应的TCB
struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
if (!sk2)
goto do_err;
lock_sock(sk2);
BUG_TRAP((1 << sk2->sk_state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
//将新的TCB和新的套接字结构sock关联起来
sock_graft(sk2, newsock);
//设置套接字结构中的状态为”已连接“
newsock->state = SS_CONNECTED;
err = 0;
release_sock(sk2);
do_err:
return err;
}
/*
 * Cross-link a struct sock with the struct socket that will own it.
 *
 * sk     - the sock being attached
 * parent - the struct socket it is attached to
 *
 * All three pointer updates and the security hook run while
 * sk->sk_callback_lock is write-held via write_lock_bh().
 */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
write_lock_bh(&sk->sk_callback_lock);
/* Point the sock's sk_sleep at the socket's wait member */
sk->sk_sleep = &parent->wait;
/* Link both directions: socket -> sock and sock -> socket */
parent->sk = sk;
sk->sk_socket = parent;
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
3. TCP的accept实现
/*
 * This will accept the next outstanding connection.
 *
 * sk    - the listening socket's sock
 * flags - only O_NONBLOCK is examined here, to choose the wait timeout
 * err   - written only on failure: -EINVAL if sk is not in TCP_LISTEN,
 *         -EAGAIN if the accept queue is empty and the timeout is zero,
 *         or whatever inet_csk_wait_for_connect() reports
 *
 * Returns the sock of the accepted connection, or NULL on failure.
 * sk is locked for the duration of the call.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sock *child;
	int rc;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	if (sk->sk_state != TCP_LISTEN) {
		rc = -EINVAL;
		goto fail;
	}

	/* Find already established connection */
	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
		/* The timeout depends on blocking mode; it is 0 when non-blocking. */
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non blocking socket don't sleep */
		if (!timeo) {
			rc = -EAGAIN;
			goto fail;
		}

		/* Sleep until the accept queue becomes non-empty. */
		rc = inet_csk_wait_for_connect(sk, timeo);
		if (rc)
			goto fail;
	}

	/*
	 * A connection is available on the accept queue at this point
	 * (possibly after having slept above); take it off the queue.
	 */
	child = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
	BUG_TRAP(child->sk_state != TCP_SYN_RECV);

	release_sock(sk);
	return child;

fail:
	*err = rc;
	release_sock(sk);
	return NULL;
}
3.1 获取通信套接字的TCB
在之前介绍服务器端三次握手过程的笔记中已经提到:三次握手完成后,已建立的连接(连接请求块及其关联的TCB)会被挂到监听套接字的accept接收队列上,等待accept()将其取走。下面的函数完成的正是这个"取走"的动作。
/*
 * Take the first request off the accept queue and return the sock
 * attached to it.
 *
 * queue  - accept queue of the listening socket
 * parent - the listening sock, whose accept backlog count is decremented
 *
 * The request block itself is freed before returning; only the sock it
 * referenced (req->sk) is handed back to the caller.
 */
static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue,
						 struct sock *parent)
{
	struct request_sock *head_req;
	struct sock *established;

	/* Unlink the first request from the accept queue. */
	head_req = reqsk_queue_remove(queue);

	/* Remember the sock associated with that request. */
	established = head_req->sk;
	BUG_TRAP(established != NULL);

	/* One fewer connection is waiting on the listener's accept queue. */
	sk_acceptq_removed(parent);

	/* The request block is no longer needed. */
	__reqsk_free(head_req);

	return established;
}
/*
 * Detach and return the first node of the accept queue (a plain
 * singly linked list pop). The queue is expected to be non-empty;
 * when the last node is removed the tail pointer is reset as well.
 */
static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue)
{
	struct request_sock *first = queue->rskq_accept_head;

	BUG_TRAP(first != NULL);

	/* Advance the head; an empty list must not keep a stale tail. */
	queue->rskq_accept_head = first->dl_next;
	if (!queue->rskq_accept_head)
		queue->rskq_accept_tail = NULL;

	return first;
}
/*
 * Account for one entry having been taken off the accept queue of sk
 * by decrementing its sk_ack_backlog counter.
 */
static inline void sk_acceptq_removed(struct sock *sk)
{
	--sk->sk_ack_backlog;
}