linux tcp 的 socket、bind、listen、accept原理分析

基于linux2.4.0分析,讲解服务器端通过socket bind listen、accept系统调用时,触发内核执行流程。

socket创建

我们编写服务器代码时,一般都是先用socket函数建立服务器socket,然后用bind函数绑定地址和端口,再用listen创建socket的监听队列,最后在循环中调用accept接受客户端的连接请求,进行数据的发送和接收。

s = socket(AF_INET, SOCK_STREAM, 0);

sys_socketcall=>sys_socket

主要作用是根据family找到对应的net_families[family]的create方法,即inet_family_ops->inet_create函数:根据传入的type即SOCK_STREAM设置socket->ops为inet_stream_ops,sock->prot为tcp_prot;调用sk->prot->init即tcp_v4_init_sock初始化sock,其中sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;非常重要。然后sock_map_fd创建对应的file、dentry、inode节点,绑定对应文件系统的f_op为socket_file_ops,最终效果就是write等价于send,最后返回文件描述符。这部分代码前面的两篇博客讲解过,这里捎带提一下。


/*
 * sys_socket - create a socket object and map it to a file descriptor.
 * @family:   protocol family (e.g. AF_INET)
 * @type:     socket type (e.g. SOCK_STREAM)
 * @protocol: protocol within the family, 0 for the default
 *
 * Creates the socket via sock_create() and attaches it to an fd via
 * sock_map_fd().  Returns the new fd on success or a negative errno;
 * the socket is released if the fd mapping fails.
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
	struct socket *sock;
	int err;

	err = sock_create(family, type, protocol, &sock);
	if (err < 0)
		return err;

	err = sock_map_fd(sock);
	if (err < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return err;
}

sys_bind

接下来看sys_bind函数做了什么:例如使用下面的代码

    server.sin_family = AF_INET;
    server.sin_port   = htons(10000);
    server.sin_addr.s_addr = INADDR_ANY;

    if (bind(s, (struct sockaddr *)&server, sizeof(server)) < 0)
    {
        tcperror("Bind()");
        exit(3);
    }

参数fd为socket系统调用的返回值,umyaddr为之前的server,addrlen为server的大小。进入内核后,首先根据fd找到对应的socket对象,如果不存在则报错。查找过程为:通过fd调用fget到当前进程files的fd数组中找到对应的file对象,然后依次找到dentry=>inode=>socket对象;然后调用move_addr_to_kernel将用户态的地址拷贝到内核中;最后通过找到的socket结构,调用socket->ops->bind函数进行地址绑定,返回到用户空间。

sys_socketcall=>sys_bind

/*
 * sys_bind - bind the socket referenced by @fd to a local address.
 * @umyaddr: user-space sockaddr to bind to
 * @addrlen: size of that address
 *
 * Looks up the socket, copies the address into the kernel and hands
 * off to the family's bind handler (inet_bind for AF_INET).
 * Returns 0 or a negative errno.
 */
asmlinkage long sys_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
	char address[MAX_SOCK_ADDR];
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (sock == NULL)
		return err;

	err = move_addr_to_kernel(umyaddr, addrlen, address);
	if (err >= 0)
		err = sock->ops->bind(sock, (struct sockaddr *)address, addrlen);
	sockfd_put(sock);
	return err;
}

sys_socketcall=>sys_bind=>inet_bind


/*
 * inet_bind - bind handler for AF_INET sockets (sock->ops->bind).
 * Validates the requested address against the routing table, records
 * the address in the sock, and delegates port reservation to
 * sk->prot->get_port (tcp_v4_get_port for TCP).
 * Returns 0 on success or a negative errno.
 */
static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
	struct sock *sk=sock->sk;
	unsigned short snum;
	int chk_addr_ret;
	int err;

	//tcp_prot has no bind hook, so TCP falls through to the generic path
	if(sk->prot->bind)
		return sk->prot->bind(sk, uaddr, addr_len);

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);//classify the address via the routing table

	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed.  It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 *  is temporarily down)
	 */
	if (sysctl_ip_nonlocal_bind == 0 && 
	    sk->protinfo.af_inet.freebind == 0 &&
	    addr->sin_addr.s_addr != INADDR_ANY &&
	    chk_addr_ret != RTN_LOCAL &&
	    chk_addr_ret != RTN_MULTICAST &&//multicast address?
	    chk_addr_ret != RTN_BROADCAST)//broadcast address?
		return -EADDRNOTAVAIL;

	snum = ntohs(addr->sin_port);//port number in host order (10000 in the example)
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))//privileged ports need CAP_NET_BIND_SERVICE
		return -EACCES;

	/*      We keep a pair of addresses. rcv_saddr is the one
	 *      used by hash lookups, and saddr is used for transmit.
	 *
	 *      In the BSD API these are the same except where it
	 *      would be illegal to use them (multicast/broadcast) in
	 *      which case the sending device address is used.
	 */
	lock_sock(sk);//take the sock lock; sleeps if another process holds it

	/* Check these errors (active socket, double bind). */
	err = -EINVAL;
	if ((sk->state != TCP_CLOSE)			||
	    (sk->num != 0))//reject if not closed or a port is already bound
		goto out;

	sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;//remember the bound address
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		sk->saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here. */
	if (sk->prot->get_port(sk, snum) != 0) {//ask the protocol to reserve the port
		sk->saddr = sk->rcv_saddr = 0;//on failure undo the address we just set
		err = -EADDRINUSE;
		goto out;
	}

	if (sk->rcv_saddr)//an explicit address was bound: lock it against later changes
		sk->userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)//an explicit port was bound: lock it as well
		sk->userlocks |= SOCK_BINDPORT_LOCK;
	sk->sport = htons(sk->num);//source port, network byte order
	sk->daddr = 0;//clear destination address
	sk->dport = 0;//clear destination port
	sk_dst_reset(sk);//drop any cached route
	err = 0;
out:
	release_sock(sk);//unlock and wake up waiters on the sock lock
	return err;
}

sys_socketcall=>sys_bind=>inet_bind=>tcp_v4_get_port

/*
 * tcp_v4_get_port - reserve a local TCP port for sk (sk->prot->get_port).
 * snum == 0 picks a free ephemeral port from the local range; otherwise
 * the requested port is checked in the bind hash table (tcp_bhash) for
 * conflicts, honouring SO_REUSEADDR.  On success sk is linked into the
 * port's tcp_bind_bucket owner list.  Returns 0 on success, 1 on failure.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (snum == 0) {//no port requested: search the local ephemeral range
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {	rover++;//advance the rover to the next candidate port
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			for (tb = head->chain; tb; tb = tb->next)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover;
		tb = NULL;
	} else {//explicit port (10000 in the example); tcp_bhash => tcp_hashinfo.__tcp_bhash
		head = &tcp_bhash[tcp_bhashfn(snum)];//hash the port into the bind table
		spin_lock(&head->lock);
		for (tb = head->chain; tb != NULL; tb = tb->next)
			if (tb->port == snum)//existing bucket for this port (none on a first bind)
				break;
	}
	if (tb != NULL && tb->owners != NULL) {
		if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
			goto success;
		} else {
			struct sock *sk2 = tb->owners;
			int sk_reuse = sk->reuse;

			for( ; sk2 != NULL; sk2 = sk2->bind_next) {
				if (sk != sk2 &&
				    sk->bound_dev_if == sk2->bound_dev_if) {
					if (!sk_reuse	||
					    !sk2->reuse	||
					    sk2->state == TCP_LISTEN) {
						if (!sk2->rcv_saddr	||
						    !sk->rcv_saddr	||
						    (sk2->rcv_saddr == sk->rcv_saddr))
							break;
					}
				}
			}
			/* If we found a conflict, fail. */
			ret = 1;
			if (sk2 != NULL)
				goto fail_unlock;
		}
	}
	ret = 1;
	if (tb == NULL &&
	    (tb = tcp_bucket_create(head, snum)) == NULL)//first bind: create the bucket for this port
			goto fail_unlock;
	if (tb->owners == NULL) {
		if (sk->reuse && sk->state != TCP_LISTEN)//SO_REUSEADDR set and not listening (state is TCP_CLOSE here)
			tb->fastreuse = 1;//allow the fast reuse path for later binds
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
		tb->fastreuse = 0;
success:
	sk->num = snum;//remember the port in the sock
	if (sk->prev == NULL) {//first time: link sk into the bucket's owner list
		if ((sk->bind_next = tb->owners) != NULL)
			tb->owners->bind_pprev = &sk->bind_next;
		tb->owners = sk;
		sk->bind_pprev = &tb->owners;
		sk->prev = (struct sock *) tb;
	} else {
		BUG_TRAP(sk->prev == (struct sock *) tb);
	}
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

sys_socketcall=>sys_bind=>inet_bind=>tcp_v4_get_port=>tcp_bucket_create

/*
 * tcp_bucket_create - allocate a tcp_bind_bucket for port @snum and
 * push it onto the front of @head's chain.  Called with head->lock
 * held and BHs disabled (hence SLAB_ATOMIC).
 * Returns the new bucket, or NULL if allocation fails.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb;

	tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
	if (tb == NULL)
		return NULL;

	/* Initialise the bucket and splice it in at the chain head. */
	tb->port = snum;
	tb->fastreuse = 0;
	tb->owners = NULL;
	tb->next = head->chain;
	if (tb->next != NULL)
		tb->next->pprev = &tb->next;
	head->chain = tb;
	tb->pprev = &head->chain;
	return tb;
}

/* The bind-hash table lives inside the global tcp_hashinfo. */
#define tcp_bhash    (tcp_hashinfo.__tcp_bhash)


/*
 * Global TCP hash state: the established hash (__tcp_ehash), the bind
 * hash (__tcp_bhash) and the listening hash.  The two dynamic tables
 * are allocated later, in tcp_init().  Uses GNU old-style "field:"
 * designated initializers.
 */
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
    __tcp_ehash:          NULL,
    __tcp_bhash:          NULL,
    __tcp_bhash_size:     0,
    __tcp_ehash_size:     0,
    __tcp_listening_hash: { NULL, },
    __tcp_lhash_lock:     RW_LOCK_UNLOCKED,
    __tcp_lhash_users:    ATOMIC_INIT(0),
    __tcp_lhash_wait:
      __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
    __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
};

tcp_bhash的创建在tcp_init中实现的

初始化操作:

/*
 * tcp_init - boot-time allocation of the TCP hash tables (excerpt;
 * "......." marks source elided by the article).  The established
 * hash is sized from physical memory, then the bind hash (tcp_bhash)
 * is allocated from the same order of pages.
 */
void __init tcp_init(void)
{
.......
	goal = num_physpages >> (23 - PAGE_SHIFT);

	for(order = 0; (1UL << order) < goal; order++)
		;
	do {
		tcp_ehash_size = (1UL << order) * PAGE_SIZE /
			sizeof(struct tcp_ehash_bucket);
		tcp_ehash_size >>= 1;
		while (tcp_ehash_size & (tcp_ehash_size-1))
			tcp_ehash_size--;
		tcp_ehash = (struct tcp_ehash_bucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (tcp_ehash == NULL && --order > 0);

	if (!tcp_ehash)
		panic("Failed to allocate TCP established hash table\n");
	for (i = 0; i < (tcp_ehash_size<<1); i++) {
		tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
		tcp_ehash[i].chain = NULL;
	}

	do {//allocate the bind-hash buckets (tcp_bhash)
		tcp_bhash_size = (1UL << order) * PAGE_SIZE /
			sizeof(struct tcp_bind_hashbucket);
		if ((tcp_bhash_size > (64 * 1024)) && order > 0)
			continue;
		tcp_bhash = (struct tcp_bind_hashbucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (tcp_bhash == NULL && --order >= 0);

	if (!tcp_bhash)
		panic("Failed to allocate TCP bind hash table\n");
	for (i = 0; i < tcp_bhash_size; i++) {
		tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
		tcp_bhash[i].chain = NULL;
	}
......
}

绑定地址,其实就是检查地址是否为特殊地址(多播地址或者广播地址),如果不是,保存到sk->rcv_saddr;然后检查端口是否指定,如果指定,到tcp_bhash哈希表中查找是否已经存在,如果不存在则创建并初始化,将端口保存到sk->num,设置tb->owners,并把端口的网络字节序保存在sk->sport。在listen时,会再次调用tcp_v4_get_port进行检查,那时就直接返回。

sys_listen

listen: 

指定监听队列长度(backlog)为1:

    if (listen(s, 1) != 0)
    {
        tcperror("Listen()");
        exit(4);
    }

sys_socketcall=>sys_listen

/*
 * sys_listen - put the socket referenced by @fd into listening state.
 * Clamps @backlog to SOMAXCONN and dispatches to the address family's
 * listen handler.  Returns 0 or a negative errno.
 */
asmlinkage long sys_listen(int fd, int backlog)
{
	struct socket *sock;
	int err;
	
	if ((sock = sockfd_lookup(fd, &err)) != NULL) {//look up the socket for this fd
		if ((unsigned) backlog > SOMAXCONN)//clamp the backlog to the maximum (128)
			backlog = SOMAXCONN;
		err=sock->ops->listen(sock, backlog);//inet_listen for AF_INET
		sockfd_put(sock);
	}
	return err;
}

sys_socketcall=>sys_listen=>inet_listen

/*
 * inet_listen - AF_INET listen handler (sock->ops->listen).
 * Moves a closed SOCK_STREAM socket into TCP_LISTEN via
 * tcp_listen_start(); if already listening, only the backlog is
 * adjusted.  Returns 0 or a negative errno.
 */
int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char old_state;
	int err;

	lock_sock(sk);

	err = -EINVAL;
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)//state is SS_UNCONNECTED here, so no jump
		goto out;

	old_state = sk->state;
	if (!((1<<old_state)&(TCPF_CLOSE|TCPF_LISTEN)))
		goto out;

	/* Really, if the socket is already in listen state
	 * we can only allow the backlog to be adjusted.
	 */
	if (old_state != TCP_LISTEN) {
		err = tcp_listen_start(sk);//set up the listening state
		if (err)
			goto out;
	}
	sk->max_ack_backlog = backlog;//size of the accept queue
	err = 0;

out:
	release_sock(sk);
	return err;
}

sys_socketcall=>sys_listen=>inet_listen=>tcp_listen_start

/*
 * Per-listening-socket SYN queue state.  open_request structures for
 * in-progress handshakes are hashed into syn_table and matched again
 * when the handshake completes (see tcp_v4_synq_add / tcp_v4_search_req
 * in the discussion below).
 */
struct tcp_listen_opt
{
	u8			max_qlen_log;	/* log_2 of maximal queued SYNs */
	int			qlen;
	int			qlen_young;
	int			clock_hand;
	struct open_request	*syn_table[TCP_SYNQ_HSIZE];//hash table of pending connection requests
};

/*
 * tcp_listen_start - build the listening state for sk (excerpt;
 * "......." marks source elided by the article).  Allocates the SYN
 * queue, moves the sock to TCP_LISTEN, re-checks the bound port and
 * hashes the sock into the listening table.  Returns 0 or -ENOMEM.
 */
int tcp_listen_start(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_listen_opt *lopt;

	sk->max_ack_backlog = 0;
	sk->ack_backlog = 0;
	tp->accept_queue = tp->accept_queue_tail = NULL;
	tp->syn_wait_lock = RW_LOCK_UNLOCKED;

	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);//allocate the SYN-queue bookkeeping
	if (!lopt)
		return -ENOMEM;

	memset(lopt, 0, sizeof(struct tcp_listen_opt));
	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
		if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
			break;

	write_lock_bh(&tp->syn_wait_lock);
	tp->listen_opt = lopt;
	write_unlock_bh(&tp->syn_wait_lock);
.......
	sk->state = TCP_LISTEN;//enter listening state
	if (sk->prot->get_port(sk, sk->num) == 0) {//re-validate the bound port
		sk->sport = htons(sk->num);

		sk_dst_reset(sk);
		sk->prot->hash(sk);//tcp_v4_hash links the sock into the listening hash

		return 0;
	}

.......
}

sys_socketcall=>sys_listen=>inet_listen=>tcp_listen_start=>tcp_v4_hash

/*
 * tcp_v4_hash - hash sk into the appropriate TCP lookup table
 * (sk->prot->hash).  Skips closed socks; BHs are disabled around the
 * actual insertion done by __tcp_v4_hash().
 */
static void tcp_v4_hash(struct sock *sk)
{
	if (sk->state == TCP_CLOSE)
		return;

	local_bh_disable();
	__tcp_v4_hash(sk);
	local_bh_enable();
}

sys_socketcall=>sys_listen=>inet_listen=>tcp_listen_start=>tcp_v4_hash=>__tcp_v4_hash

/* The listening hash lives inside the global tcp_hashinfo. */
#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash)

/*
 * __tcp_v4_hash - insert sk at the head of the proper lookup chain:
 * the listening hash when in TCP_LISTEN, otherwise the established
 * hash (tcp_ehash).  Caller must have BHs disabled.
 */
static __inline__ void __tcp_v4_hash(struct sock *sk)
{
	struct sock **skp;
	rwlock_t *lock;

	BUG_TRAP(sk->pprev==NULL);
	if(sk->state == TCP_LISTEN) {//state was set to TCP_LISTEN by tcp_listen_start
		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];//bucket chosen from the local port
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->hashent].lock;
		write_lock(lock);
	}
	if((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;
	*skp = sk;//link sk at the head of the chain
	sk->pprev = skp;
	sock_prot_inc_use(sk->prot);
	write_unlock(lock);
	if (sk->state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

监听函数执行完成,将sk链入tcp_listening_hash的某个队列中。

sys_accept

accept操作

    if ((ns = accept(s, (struct sockaddr *)&client, &namelen)) == -1)
    {
        tcperror("Accept()");
        exit(5);
    }

sys_socketcall=>sys_accept

/*
 * sys_accept - accept a connection on the listening socket @fd.
 * Allocates a fresh socket for the new connection, lets the family's
 * accept handler (inet_accept) fill it, optionally copies the peer
 * address to user space, and maps the new socket to a descriptor.
 * Returns the new fd or a negative errno.
 */
asmlinkage long sys_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen)
{
	struct socket *sock, *newsock;
	int err, len;
	char address[MAX_SOCK_ADDR];

	sock = sockfd_lookup(fd, &err);//look up the listening socket
	if (!sock)
		goto out;

	err = -EMFILE;
	if (!(newsock = sock_alloc())) //allocate the socket object for the incoming connection
		goto out_put;
    //inherit the server socket's type and ops jump table (inet_stream_ops)
	newsock->type = sock->type;
	newsock->ops = sock->ops;
    //inet_accept performs the actual accept of the client connection
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_release;

	if (upeer_sockaddr) {
		if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 2)<0) {
			err = -ECONNABORTED;
			goto out_release;
		}
		err = move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen);
		if (err < 0)
			goto out_release;
	}

	/* File flags are not inherited via accept() unlike another OSes. */

	if ((err = sock_map_fd(newsock)) < 0)
		goto out_release;

out_put:
	sockfd_put(sock);
out:
	return err;

out_release:
	sock_release(newsock);
	goto out_put;
}

sys_socketcall=>sys_accept=>inet_accept

/*
 * inet_accept - AF_INET accept handler (sock->ops->accept).
 * Pulls an established sock off the protocol's accept queue
 * (tcp_accept for TCP) and grafts it onto @newsock.
 * Returns 0 or a negative errno.
 */
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1 = sock->sk;
	struct sock *sk2;
	int err = -EINVAL;

	if((sk2 = sk1->prot->accept(sk1,flags,&err)) == NULL)//sk1->prot->accept is tcp_accept
		goto do_err;

	lock_sock(sk2);

	BUG_TRAP((1<<sk2->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE));

	sock_graft(sk2, newsock);//attach the accepted sock to the new socket object

	newsock->state = SS_CONNECTED;
	release_sock(sk2);
	return 0;

do_err:
	return err;
}

sys_socketcall=>sys_accept=>inet_accept=>tcp_accept

/*
 * tcp_accept - dequeue an established connection from the listener's
 * accept queue, blocking (subject to O_NONBLOCK / the receive timeout)
 * until one is available.  Returns the new sock, or NULL with *err set.
 */
struct sock *tcp_accept(struct sock *sk, int flags, int *err)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct open_request *req;
	struct sock *newsk;
	int error;

	lock_sock(sk); 

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->state != TCP_LISTEN)//fail unless the server sock is listening
		goto out;

	/* Find already established connection */
	if (!tp->accept_queue) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out;

		error = wait_for_connect(sk, timeo);
		if (error)
			goto out;
	}

	req = tp->accept_queue;//a connection is pending
	if ((tp->accept_queue = req->dl_next) == NULL)
		tp->accept_queue_tail = NULL;

 	newsk = req->sk;//the client-side sock created during the handshake
	tcp_acceptq_removed(sk);
	tcp_openreq_fastfree(req);
	BUG_TRAP(newsk->state != TCP_SYN_RECV);
	release_sock(sk);
	return newsk;

out:
	release_sock(sk);
	*err = error; 
	return NULL;
}

这个代码如何和tcp_listening_hash 关联上的呢?三次握手触发的,客户端执行connect操作,发送syn(第一次握手包)过来,设置socket状态

服务器接收到该报文后,最终会进入tcp_v4_rcv,然后__tcp_v4_lookup根据四元组信息执行__tcp_v4_lookup_established到tcp_ehash已建立的连接中查找,此时失败;接着调用tcp_v4_lookup_listener到tcp_listening_hash中查找,而这个在listen时已经创建好了,可以返回服务器sk。继续tcp_v4_do_rcv=>tcp_v4_hnd_req,在该函数中执行tcp_v4_search_req查询监听结构的lopt->syn_table中是否存在对应请求,此时不存在,返回空;__tcp_v4_lookup_established继续到已建立连接的哈希表中查找,失败,返回服务器sk。进入tcp_rcv_state_process执行ipv4_specific->conn_request即tcp_v4_conn_request,经过一系列检查并创建req后,执行tcp_v4_synq_add,将req保存至lopt->syn_table的某个队列中,与监听结构挂上钩,然后发送第二次握手包;

客户端处理后设置socket状态,发送第三次握手包过来;

服务器收到第三次握手包,同样进入tcp_v4_do_rcv函数然后执行tcp_v4_hnd_req,此时tcp_v4_search_req可以返回,然后执行tcp_check_req函数,执行tcp_v4_syn_recv_sock创建客户端sk,继承服务器的部分信息,比如处理函数跳转表tcp_prot,绑定的端口,设置状态state为TCP_SYN_RECV,然后执行__tcp_v4_hash,将客户端sk根据四元组链入tcp_ehash的某个队列中,再执行__tcp_inherit_port将客户端sk链入到服务器sk的owners中并链入tcp_bhash中。此时连接已经建立,将req从syn_table脱链,然后将客户端sk与req绑定,链入接收队列,并且增加已连接计数,然后进入tcp_child_process,设置为TCP_ESTABLISHED,然后执行服务器sk的data_ready函数即sock_def_readable唤醒服务器进程,接收请求。此时会将该连接从接收队列中脱链,在此之前将该连接挂入了tcp_ehash哈希表中了,后面会从这里获取到对应的连接。

这样整个服务器的socket创建到等待连接的过程基本就讲完了。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值