基于linux2.4.0分析,讲解服务器端通过socket bind listen、accept系统调用时,触发内核执行流程。
socket创建
我们编写服务器代码时,一般都是用socket函数建立服务器socket,然后用bind函数绑定地址和端口,再用listen创建socket的监听队列,最后在循环中调用accept接受客户端的连接请求,发送和接收数据。
s = socket(AF_INET, SOCK_STREAM, 0)
sys_socketcall=>sys_socket
主要作用是根据family找到对应的net_families[family]的create方法,即inet_family_ops->inet_create函数根据传入type即SOCK_STREAM设置socket->ops为inet_stream_ops,sock->prot为tcp_prot,调用sk->prot->init即tcp_v4_init_sock初始化sock,其中sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;非常重要,然后sock_map_fd创建对应的file、dentry、inode节点,绑定对应的文件系统的f_op为socket_file_ops,最终效果就是write等价于send,最后返回文件描述符,这个代码前面的两篇博客讲解过,这里捎带提下。
/*
 * sys_socket - create a socket object and map it to a file descriptor.
 *
 * Returns the new descriptor on success, or a negative errno from
 * sock_create()/sock_map_fd() on failure.
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
	struct socket *sock;
	int fd;

	fd = sock_create(family, type, protocol, &sock);
	if (fd < 0)
		return fd;

	fd = sock_map_fd(sock);
	if (fd < 0) {
		/* Could not attach a descriptor: drop the socket again. */
		sock_release(sock);
		return fd;
	}

	/* It may be already another descriptor 8) Not kernel problem. */
	return fd;
}
sys_bind
接下来看sys_bind函数做了什么:例如使用下面的代码
server.sin_family = AF_INET;
server.sin_port = htons(10000);
server.sin_addr.s_addr = INADDR_ANY;
if (bind(s, (struct sockaddr *)&server, sizeof(server)) < 0)
{
tcperror("Bind()");
exit(3);
}
参数fd为socket系统调用的返回参数,umyaddr为之前的server,addrlen为server的大小,进入内核后,首先根据fd找到对应的socket对象,如果不存在,报错,过程为,通过fd调用fget到当前进程的files的fd数组中找到对应的file对象,然后依次找到dentry=>inode=>socket对象,然后调用move_addr_to_kernel将用户态的地址拷贝到内核中,最后通过找到的socket结构,调用socket->ops->bind函数,进行地址绑定,返回到用户空间。
sys_socketcall=>sys_bind
/*
 * sys_bind - bind a socket descriptor to a local address.
 *
 * Looks up the socket behind @fd, copies the user-space address into a
 * kernel buffer, then dispatches to the family's bind method
 * (inet_bind for AF_INET).  Returns 0 or a negative errno.
 */
asmlinkage long sys_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
	struct socket *sock;
	char address[MAX_SOCK_ADDR];
	int err;

	sock = sockfd_lookup(fd, &err);
	if (sock == NULL)
		return err;	/* err was filled in by sockfd_lookup */

	err = move_addr_to_kernel(umyaddr, addrlen, address);
	if (err >= 0)
		err = sock->ops->bind(sock, (struct sockaddr *)address, addrlen);

	sockfd_put(sock);
	return err;
}
sys_socketcall=>sys_bind=>inet_bind
/*
 * inet_bind - bind an AF_INET socket to a local address and port.
 *
 * Reached via sock->ops->bind from sys_bind.  Validates the requested
 * address against the routing tables, records it in the sock, and asks the
 * transport protocol (tcp_v4_get_port for TCP) to reserve the port.
 * Returns 0 on success or a negative errno.
 */
static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
	struct sock *sk=sock->sk;
	unsigned short snum;
	int chk_addr_ret;
	int err;

	/* tcp_prot defines no bind hook, so TCP falls through here. */
	if(sk->prot->bind)
		return sk->prot->bind(sk, uaddr, addr_len);
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;
	chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);	/* classify the address via the routing tables */
	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed. It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 * is temporarily down)
	 */
	if (sysctl_ip_nonlocal_bind == 0 &&
	    sk->protinfo.af_inet.freebind == 0 &&
	    addr->sin_addr.s_addr != INADDR_ANY &&
	    chk_addr_ret != RTN_LOCAL &&
	    chk_addr_ret != RTN_MULTICAST &&	/* not a multicast address */
	    chk_addr_ret != RTN_BROADCAST)	/* not a broadcast address */
		return -EADDRNOTAVAIL;
	snum = ntohs(addr->sin_port);	/* requested port in host order (10000 in the example) */
	/* Privileged ports (< PROT_SOCK) require CAP_NET_BIND_SERVICE. */
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
		return -EACCES;
	/* We keep a pair of addresses. rcv_saddr is the one
	 * used by hash lookups, and saddr is used for transmit.
	 *
	 * In the BSD API these are the same except where it
	 * would be illegal to use them (multicast/broadcast) in
	 * which case the sending device address is used.
	 */
	lock_sock(sk);	/* sleeps if another context currently holds the sock lock */
	/* Check these errors (active socket, double bind). */
	err = -EINVAL;
	if ((sk->state != TCP_CLOSE) ||
	    (sk->num != 0))	/* already active, or a port is already bound */
		goto out;
	sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;	/* remember the bound address */
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		sk->saddr = 0;		/* Use device */
	/* Make sure we are allowed to bind here. */
	if (sk->prot->get_port(sk, snum) != 0) {	/* tcp_v4_get_port for TCP */
		sk->saddr = sk->rcv_saddr = 0;	/* roll the address back on failure */
		err = -EADDRINUSE;
		goto out;
	}
	if (sk->rcv_saddr)	/* an address is set: lock it against later changes */
		sk->userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)	/* the port was explicitly chosen: lock it too */
		sk->userlocks |= SOCK_BINDPORT_LOCK;
	sk->sport = htons(sk->num);	/* cache the port in network byte order */
	sk->daddr = 0;	/* no destination address yet */
	sk->dport = 0;	/* no destination port yet */
	sk_dst_reset(sk);	/* drop any cached route */
	err = 0;
out:
	release_sock(sk);	/* unlock and wake up anybody waiting on the sock lock */
	return err;
}
sys_socketcall=>sys_bind=>inet_bind=>tcp_v4_get_port
/*
 * tcp_v4_get_port - reserve a local port for @sk.
 *
 * If @snum is 0, an ephemeral port is searched within
 * sysctl_local_port_range; otherwise the requested port is looked up in the
 * bind hash table (tcp_bhash) and checked for conflicts with existing
 * owners.  On success the sock is linked onto the bucket's owner list.
 * Returns 0 on success, 1 on failure.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (snum == 0) {	/* no port requested: pick one from the local range */
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do { rover++;	/* scan for a port with no bind bucket at all */
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			for (tb = head->chain; tb; tb = tb->next)
				if (tb->port == rover)
					goto next;
			/* Free port found; NOTE: head->lock is deliberately kept held. */
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover;
		tb = NULL;
	} else {	/* explicit port (10000 in the example); tcp_bhash => tcp_hashinfo.__tcp_bhash */
		head = &tcp_bhash[tcp_bhashfn(snum)];	/* hash the port to its bucket chain */
		spin_lock(&head->lock);
		for (tb = head->chain; tb != NULL; tb = tb->next)
			if (tb->port == snum)	/* existing bucket; on a fresh bind none exists yet */
				break;
	}
	/* The port already has owners: check reuse rules for a conflict. */
	if (tb != NULL && tb->owners != NULL) {
		if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
			goto success;
		} else {
			struct sock *sk2 = tb->owners;
			int sk_reuse = sk->reuse;

			/* Walk all owners bound to this port on the same device. */
			for( ; sk2 != NULL; sk2 = sk2->bind_next) {
				if (sk != sk2 &&
				    sk->bound_dev_if == sk2->bound_dev_if) {
					if (!sk_reuse	||
					    !sk2->reuse	||
					    sk2->state == TCP_LISTEN) {
						if (!sk2->rcv_saddr	||
						    !sk->rcv_saddr	||
						    (sk2->rcv_saddr == sk->rcv_saddr))
							break;	/* address overlap: conflict */
					}
				}
			}
			/* If we found a conflict, fail. */
			ret = 1;
			if (sk2 != NULL)
				goto fail_unlock;
		}
	}
	ret = 1;
	if (tb == NULL &&
	    (tb = tcp_bucket_create(head, snum)) == NULL)	/* first binder: create the bucket for this port */
		goto fail_unlock;
	if (tb->owners == NULL) {
		/* Fresh bucket: record whether future binders may fast-reuse it.
		 * For a plain server bind the state is still TCP_CLOSE here. */
		if (sk->reuse && sk->state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
		tb->fastreuse = 0;
success:
	sk->num = snum;	/* store the port (host order) in the sock */
	if (sk->prev == NULL) {	/* not yet on a bucket: link sk into tb->owners */
		if ((sk->bind_next = tb->owners) != NULL)
			tb->owners->bind_pprev = &sk->bind_next;
		tb->owners = sk;
		sk->bind_pprev = &tb->owners;
		sk->prev = (struct sock *) tb;	/* prev doubles as a back-pointer to the bucket */
	} else {
		BUG_TRAP(sk->prev == (struct sock *) tb);
	}
	ret = 0;	/* success; falls through to release the bucket lock */
fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
sys_socketcall=>sys_bind=>inet_bind=>tcp_v4_get_port=>tcp_bucket_create
/*
 * tcp_bucket_create - allocate a bind bucket for port @snum and push it
 * onto the front of @head's hash chain.
 *
 * Returns the new bucket, or NULL if the slab allocation fails.  The
 * caller holds head->lock.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *bucket;

	bucket = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
	if (bucket == NULL)
		return NULL;

	bucket->port = snum;
	bucket->fastreuse = 0;
	bucket->owners = NULL;

	/* Splice the bucket in at the head of the chain. */
	bucket->next = head->chain;
	if (bucket->next != NULL)
		bucket->next->pprev = &bucket->next;
	head->chain = bucket;
	bucket->pprev = &head->chain;

	return bucket;
}
/* Convenience alias: tcp_bhash is the bind hash inside the global hashinfo. */
#define tcp_bhash (tcp_hashinfo.__tcp_bhash)

/*
 * Global TCP hash-table state.  The ehash (established) and bhash (bind)
 * arrays start out NULL; they are allocated at boot in tcp_init().
 * Note the old GNU "label:" initializer syntax used by the 2.4 tree.
 */
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	__tcp_ehash:          NULL,	/* established-connection hash, sized in tcp_init */
	__tcp_bhash:          NULL,	/* bind (local port) hash, sized in tcp_init */
	__tcp_bhash_size:     0,
	__tcp_ehash_size:     0,
	__tcp_listening_hash: { NULL, },	/* listening socks, filled by __tcp_v4_hash */
	__tcp_lhash_lock:     RW_LOCK_UNLOCKED,
	__tcp_lhash_users:    ATOMIC_INIT(0),
	__tcp_lhash_wait:
	  __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	__tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
};
tcp_bhash的创建是在tcp_init中实现的。
初始化操作:
/*
 * tcp_init - boot-time allocation of the TCP hash tables (excerpt; the
 * "......." markers stand for code elided by the article).
 */
void __init tcp_init(void)
{
	.......
	/* Scale the table goal with physical memory, then find the page order. */
	goal = num_physpages >> (23 - PAGE_SHIFT);

	for(order = 0; (1UL << order) < goal; order++)
		;
	/* Allocate the established hash; shrink the order until it fits. */
	do {
		tcp_ehash_size = (1UL << order) * PAGE_SIZE /
			sizeof(struct tcp_ehash_bucket);
		tcp_ehash_size >>= 1;
		while (tcp_ehash_size & (tcp_ehash_size-1))
			tcp_ehash_size--;	/* round down to a power of two */
		tcp_ehash = (struct tcp_ehash_bucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (tcp_ehash == NULL && --order > 0);

	if (!tcp_ehash)
		panic("Failed to allocate TCP established hash table\n");
	for (i = 0; i < (tcp_ehash_size<<1); i++) {
		tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
		tcp_ehash[i].chain = NULL;
	}

	do {	/* allocate the bind hash buckets the same way */
		tcp_bhash_size = (1UL << order) * PAGE_SIZE /
			sizeof(struct tcp_bind_hashbucket);
		if ((tcp_bhash_size > (64 * 1024)) && order > 0)
			continue;	/* cap the bind hash at 64K entries */
		tcp_bhash = (struct tcp_bind_hashbucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (tcp_bhash == NULL && --order >= 0);

	if (!tcp_bhash)
		panic("Failed to allocate TCP bind hash table\n");
	for (i = 0; i < tcp_bhash_size; i++) {
		tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
		tcp_bhash[i].chain = NULL;
	}
	......
}
绑定地址,其实就是检查下地址是否为特殊地址(多播地址或者广播地址),如果不是,保存到sk->rcv_saddr,然后检查端口是否指定,如果指定,到tcp_bhash哈希表中查找是否已经存在,如果不存在则创建并初始化,将端口保存到sk->num,设置tb->owners,并将端口的网络字节序(htons后的值)保存在sk->sport,在listen时,会再次调用tcp_v4_get_port进行检查,那时就直接返回。
sys_listen
listen:
指定监听队列长度(backlog)为1:
if (listen(s, 1) != 0)
{
tcperror("Listen()");
exit(4);
}
sys_socketcall=>sys_listen
/*
 * sys_listen - put a bound socket into the listening state.
 *
 * Finds the socket behind @fd, clamps the requested backlog to
 * SOMAXCONN, and dispatches to the family's listen method (inet_listen
 * for AF_INET).  Returns 0 or a negative errno.
 */
asmlinkage long sys_listen(int fd, int backlog)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (sock == NULL)
		return err;	/* err was filled in by sockfd_lookup */

	/* Backlogs larger than the system maximum are silently clamped. */
	if ((unsigned) backlog > SOMAXCONN)
		backlog = SOMAXCONN;

	err = sock->ops->listen(sock, backlog);	/* inet_listen */
	sockfd_put(sock);
	return err;
}
sys_socketcall=>sys_listen=>inet_listen
/*
 * inet_listen - move an AF_INET stream socket into TCP_LISTEN.
 *
 * Only valid on an unconnected SOCK_STREAM socket whose sock is in
 * TCP_CLOSE or TCP_LISTEN.  If the socket is already listening, only the
 * backlog is adjusted.  Returns 0 or a negative errno.
 */
int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char old_state;
	int err;

	lock_sock(sk);

	err = -EINVAL;
	/* After bind the state is still SS_UNCONNECTED, so this passes. */
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
		goto out;
	old_state = sk->state;
	if (!((1<<old_state)&(TCPF_CLOSE|TCPF_LISTEN)))
		goto out;

	/* Really, if the socket is already in listen state
	 * we can only allow the backlog to be adjusted.
	 */
	if (old_state != TCP_LISTEN) {
		err = tcp_listen_start(sk);	/* set up the SYN/accept queues */
		if (err)
			goto out;
	}
	sk->max_ack_backlog = backlog;	/* cap on the completed-connection queue */
	err = 0;

out:
	release_sock(sk);
	return err;
}
sys_socketcall=>sys_listen=>inet_listen=>tcp_listen_start
/*
 * Per-listening-socket state for connections still completing the
 * three-way handshake.
 */
struct tcp_listen_opt
{
	u8			max_qlen_log;	/* log_2 of maximal queued SYNs */
	int			qlen;		/* NOTE(review): presumably total pending requests — confirm */
	int			qlen_young;	/* NOTE(review): presumably not-yet-retransmitted requests — confirm */
	int			clock_hand;	/* NOTE(review): looks like a rotor for timer scans — confirm */
	/* Hash of pending open_requests; a request is added here when the
	 * first SYN arrives (tcp_v4_conn_request => tcp_v4_synq_add) and
	 * looked up again when the third handshake packet arrives. */
	struct open_request	*syn_table[TCP_SYNQ_HSIZE];
};
/*
 * tcp_listen_start - set up the listening machinery for @sk (excerpt; the
 * "......." markers stand for code elided by the article).
 *
 * Allocates the SYN queue (tcp_listen_opt), switches the sock to
 * TCP_LISTEN, re-validates the bound port and hashes the sock into the
 * listening table.  Returns 0 on success or -ENOMEM.
 */
int tcp_listen_start(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_listen_opt *lopt;

	sk->max_ack_backlog = 0;
	sk->ack_backlog = 0;
	tp->accept_queue = tp->accept_queue_tail = NULL;
	tp->syn_wait_lock = RW_LOCK_UNLOCKED;

	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);	/* the SYN-queue object */
	if (!lopt)
		return -ENOMEM;

	memset(lopt, 0, sizeof(struct tcp_listen_opt));
	/* Size the SYN hash: smallest power of two >= sysctl_max_syn_backlog. */
	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
		if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
			break;

	write_lock_bh(&tp->syn_wait_lock);
	tp->listen_opt = lopt;
	write_unlock_bh(&tp->syn_wait_lock);

	.......

	sk->state = TCP_LISTEN;	/* enter the listening state */
	if (sk->prot->get_port(sk, sk->num) == 0) {	/* re-check the port bound earlier */
		sk->sport = htons(sk->num);
		sk_dst_reset(sk);
		sk->prot->hash(sk);	/* tcp_v4_hash: link the sock into the listening hash */

		return 0;
	}
	.......
}
sys_socketcall=>sys_listen=>inet_listen=>tcp_listen_start=>tcp_v4_hash
/*
 * tcp_v4_hash - insert @sk into the appropriate TCP hash table, unless it
 * is closed.  Bottom halves are disabled around the insertion.
 */
static void tcp_v4_hash(struct sock *sk)
{
	if (sk->state == TCP_CLOSE)
		return;

	local_bh_disable();
	__tcp_v4_hash(sk);
	local_bh_enable();
}
sys_socketcall=>sys_listen=>inet_listen=>tcp_listen_start=>tcp_v4_hash=>__tcp_v4_hash
/* Convenience alias: the listening hash lives inside the global hashinfo. */
#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash)

/*
 * __tcp_v4_hash - link @sk into either the listening hash (TCP_LISTEN)
 * or the established hash (tcp_ehash), taking the matching lock.
 * Caller has bottom halves disabled.
 */
static __inline__ void __tcp_v4_hash(struct sock *sk)
{
	struct sock **skp;
	rwlock_t *lock;

	BUG_TRAP(sk->pprev==NULL);	/* must not already be hashed */
	if(sk->state == TCP_LISTEN) {	/* set to TCP_LISTEN by tcp_listen_start */
		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];	/* bucket chosen by local port */
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->hashent].lock;
		write_lock(lock);
	}
	/* Push-front onto the chosen chain. */
	if((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;
	*skp = sk;
	sk->pprev = skp;
	sock_prot_inc_use(sk->prot);
	write_unlock(lock);
	if (sk->state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);	/* wake readers waiting for the listening hash */
}
监听函数执行完成,将sk链入tcp_listening_hash的某个队列中。
sys_accept
accept操作
if ((ns = accept(s, (struct sockaddr *)&client, &namelen)) == -1)
{
tcperror("Accept()");
exit(5);
}
sys_socketcall=>sys_accept
/*
 * sys_accept - accept a connection on a listening socket.
 *
 * Allocates a fresh socket for the new connection, lets the protocol
 * (inet_accept => tcp_accept) attach an established sock to it, optionally
 * copies the peer address back to user space, and maps the new socket to a
 * file descriptor.  Returns the new fd or a negative errno.
 */
asmlinkage long sys_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen)
{
	struct socket *sock, *newsock;
	int err, len;
	char address[MAX_SOCK_ADDR];

	sock = sockfd_lookup(fd, &err);	/* the listening socket */
	if (!sock)
		goto out;

	err = -EMFILE;
	if (!(newsock = sock_alloc()))	/* socket object for the accepted connection */
		goto out_put;

	/* Inherit the listener's type and ops jump table (inet_stream_ops). */
	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/* inet_accept: block (or not, per f_flags) until a connection is ready. */
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_release;

	if (upeer_sockaddr) {
		if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 2)<0) {
			err = -ECONNABORTED;
			goto out_release;
		}
		err = move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen);
		if (err < 0)
			goto out_release;
	}

	/* File flags are not inherited via accept() unlike another OSes. */

	if ((err = sock_map_fd(newsock)) < 0)
		goto out_release;

out_put:
	sockfd_put(sock);
out:
	return err;

out_release:
	sock_release(newsock);
	goto out_put;
}
sys_socketcall=>sys_accept=>inet_accept
/*
 * inet_accept - AF_INET accept: pull an established sock off the
 * listener's accept queue (via tcp_accept) and graft it onto @newsock.
 * Returns 0 on success or a negative errno.
 */
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *listener = sock->sk;
	struct sock *child;
	int err = -EINVAL;

	child = listener->prot->accept(listener, flags, &err);	/* tcp_accept */
	if (child == NULL)
		return err;

	lock_sock(child);

	BUG_TRAP((1<<child->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE));

	sock_graft(child, newsock);	/* tie the new sock to the new socket */
	newsock->state = SS_CONNECTED;

	release_sock(child);
	return 0;
}
sys_socketcall=>sys_accept=>inet_accept=>tcp_accept
/*
 * tcp_accept - dequeue one established connection from the listener's
 * accept queue.
 *
 * Blocks (unless O_NONBLOCK) until a fully established connection is
 * available.  Returns the new sock, or NULL with *err set.
 */
struct sock *tcp_accept(struct sock *sk, int flags, int *err)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct open_request *req;
	struct sock *newsk;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->state != TCP_LISTEN)	/* accept only makes sense on a listening sock */
		goto out;

	/* Find already established connection */
	if (!tp->accept_queue) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out;

		error = wait_for_connect(sk, timeo);	/* sleep until the handshake completes */
		if (error)
			goto out;
	}

	req = tp->accept_queue;	/* at least one established connection is queued */
	if ((tp->accept_queue = req->dl_next) == NULL)
		tp->accept_queue_tail = NULL;

 	newsk = req->sk;	/* the child sock created during the handshake */
	tcp_acceptq_removed(sk);	/* account for the removed entry */
	tcp_openreq_fastfree(req);	/* the open_request is no longer needed */
	BUG_TRAP(newsk->state != TCP_SYN_RECV);
	release_sock(sk);
	return newsk;

out:
	release_sock(sk);
	*err = error;
	return NULL;
}
这个代码如何和tcp_listening_hash 关联上的呢?三次握手触发的,客户端执行connect操作,发送syn(第一次握手包)过来,设置socket状态
服务器接收到该报文后,最终会进入到tcp_v4_rcv,然后__tcp_v4_lookup根据四元组信息执行__tcp_v4_lookup_established到tcp_ehash已建立的连接中查找,此时失败,然后调用tcp_v4_lookup_listener到tcp_listening_hash查找,而这个在listen时已经创建好了,可以返回服务器sk,继续tcp_v4_do_rcv=>tcp_v4_hnd_req,在该函数中执行tcp_v4_search_req查询监听结构的lopt->syn_table是否存在,此时不存在返回空,__tcp_v4_lookup_established继续到已建立连接的哈希表中查找,失败,返回服务器sk,进入tcp_rcv_state_process执行ipv4_specific->conn_request即tcp_v4_conn_request,经过一系列检查并创建req后,执行tcp_v4_synq_add,将req保存至lopt->syn_table的某个队列中,与监听挂上钩,发送第二次握手包;
客户端处理后设置socket状态,发送第三次握手包过来;
服务器收到第三次握手包,同样进入tcp_v4_do_rcv函数然后执行tcp_v4_hnd_req,此时tcp_v4_search_req可以返回,然后执行tcp_check_req函数,执行tcp_v4_syn_recv_sock创建客户端sk,继承服务器的部分信息,比如处理函数跳转表tcp_prot,绑定的端口,设置状态state为TCP_SYN_RECV,然后执行__tcp_v4_hash,将客户端sk根据四元组链入tcp_ehash的某个队列中,再执行__tcp_inherit_port将客户端sk链入到服务器sk的owners中并链入tcp_bhash中。此时连接已经建立,将req从syn_table脱链,然后将客户端sk与req绑定,链入接收队列,并且增加已连接计数,然后进入tcp_child_process,设置为TCP_ESTABLISHED,然后执行服务器sk的data_ready函数即sock_def_readable唤醒服务器进程,接收请求。此时会将该连接从接收队列中脱链,在此之前将该连接挂入了tcp_ehash哈希表中了,后面会从这里获取到对应的连接。
这样整个服务器的socket创建到等待连接的过程基本就讲完了。