SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)

[bind]
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct socket *sock;
	int err, fput_needed;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
从文件fd中得到对应的socket:
[bind->sockfd_lookup_light]
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
	struct fd f = fdget(fd);
	struct socket *sock;

	*err = -EBADF;
	if (f.file) {
		sock = sock_from_file(f.file, err);
		if (likely(sock)) {
			*fput_needed = f.flags;
			return sock;
		}
		fdput(f);
	}
	return NULL;
}
调用fdget先从进程的文件列表中通过fd找到对应的文件,然后从中得到socket,文件的flag通过fput_needed返回:
[bind->sockfd_lookup_light->sock_from_file]
/*
 * Return the socket stored in @file, or NULL if @file is not a socket file.
 *
 * A file counts as a socket file when its f_op points at socket_file_ops.
 * Otherwise *@err is set to -ENOTSOCK; on success *@err is left untouched.
 */
struct socket *sock_from_file(struct file *file, int *err)
{
	if (file->f_op != &socket_file_ops) {
		*err = -ENOTSOCK;
		return NULL;
	}

	return file->private_data;	/* set in sock_map_fd */
}
在初始化文件时, file->private_data指向新建的socket
[bind]
	if (sock) {
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
将IP地址从用户空间copy到内核空间:
[bind->move_addr_to_kernel]
/**
 *	move_addr_to_kernel	-	bring a user-supplied socket address into the kernel
 *	@uaddr: source address, in user space
 *	@ulen: number of bytes the caller says @uaddr holds
 *	@kaddr: destination buffer, in kernel space
 *
 *	Returns -EINVAL when @ulen is negative or larger than
 *	struct sockaddr_storage, 0 without copying anything when @ulen is
 *	zero, -EFAULT when copy_from_user() reports a failure, and otherwise
 *	whatever audit_sockaddr() returns.
 */
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
{
	int err = 0;

	if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) {
		err = -EINVAL;
	} else if (ulen != 0) {
		if (copy_from_user(kaddr, uaddr, ulen))
			err = -EFAULT;
		else
			err = audit_sockaddr(ulen, kaddr);
	}

	return err;
}
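先检查地址长度(小于0或超过sizeof(struct sockaddr_storage)时返回-EINVAL,为0时不做拷贝直接返回0),再用copy_from_user将地址copy到内核空间,拷贝失败时返回-EFAULT。最后调用audit_sockaddr把该地址记录到审计(audit)上下文中,并把它的返回值作为结果返回。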
[bind]
		if (err >= 0) {
				err = sock->ops->bind(sock, (struct sockaddr *)&address, addrlen);
		}
	}
 	return err;
}
对TCP,这里的ops为inet_stream_ops,调用的是inet_bind。
[bind->inet_bind]
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct net *net = sock_net(sk);
	int chk_addr_ret;

	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

 	chk_addr_ret = __inet_dev_addr_type(net, NULL, addr->sin_addr.s_addr);
先对地址的长度检查,然后得到地址的类型:
[bind->inet_bind->__inet_dev_addr_type]
/*
 * Find address type as if only "dev" was present in the system. If
 * on_dev is NULL then all interfaces are taken into consideration.
 */
static inline unsigned int __inet_dev_addr_type(struct net *net,
						const struct net_device *dev,
						__be32 addr)
{
	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
		return RTN_BROADCAST;
	if (ipv4_is_multicast(addr))
		return RTN_MULTICAST;
如果地址(addr & htonl(0xff000000)) == htonl(0x00000000)或者addr == htonl(0xffffffff),类型为广播。如果(addr & htonl(0xf0000000)) == htonl(0xe0000000)类型为多播。
	struct flowi4		fl4 = { .daddr = addr };
	struct fib_result	res;
	unsigned int ret = RTN_BROADCAST;
	struct fib_table *local_table;

	local_table = fib_get_table(net, RT_TABLE_LOCAL);
	if (local_table) {
		ret = RTN_UNICAST;
		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
			if (!dev || dev == res.fi->fib_dev)
				ret = res.type;
		}
	}
	return ret;
}
如果在本地路由表中查找到,类型为路由表中的值。如果不存在本地路由表,类型为广播。否则类型为单播。
[bind->inet_bind]
	snum = ntohs(addr->sin_port);
	if (snum && snum < PROT_SOCK &&
	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
		goto out;

        /* Check these errors (active socket, double bind). */
	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
		goto out_release_sock;

	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		inet->inet_saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here. */
	if (sk->sk_prot->get_port(sk, snum)) {
		inet->inet_saddr = inet->inet_rcv_saddr = 0;
		err = -EADDRINUSE;
		goto out_release_sock;
	}
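得到端口号:如果端口号不为0且小于PROT_SOCK,而调用者又没有CAP_NET_BIND_SERVICE权限,出错。绑定时socket如果不处于关闭状态或是本地端口不为0(已绑定),出错。然后将inet_rcv_saddr和inet_saddr都设为要绑定的地址;如果上面得到的地址类型是多播或广播,则将inet_saddr清0。最后调用get_port,对TCP,调用inet_csk_get_port确定此端口是否可以被绑定;get_port返回非0时,将两个地址清0并返回-EADDRINUSE: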
[bind->inet_bind->inet_csk_get_port]
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	struct net *net = sock_net(sk);

	if (!snum) {
		int low, high;

again:
		inet_get_local_port_range(net, &low, &high);
当端口号为0时,先得到本地端口号的范围
[bind->inet_bind->inet_csk_get_port->inet_get_local_port_range]
/*
 * Report the local port range configured for @net: *@low receives
 * net->ipv4.sysctl_local_ports.range[0] and *@high receives range[1].
 */
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
	*low = net->ipv4.sysctl_local_ports.range[0];
	*high = net->ipv4.sysctl_local_ports.range[1];
}
net->ipv4.sysctl_local_ports在初始化sysctl模块时被设置,在函数ipv4_sysctl_init_net中,范围为:[ 32768, 61000 ]
[bind->inet_bind->inet_csk_get_port]
	int smallest_size = -1;
	if (!snum) {
 		int remaining, rover;
again:
                ...
                remaining = (high - low) + 1;
		rover = prandom_u32() % remaining + low;

 		smallest_size = -1;
 		do {
 			if (inet_is_reserved_local_port(rover))
 				goto next_nolock;
先在[low, high]范围内计算出一个随机的起始端口号rover,然后判断它是不是保留端口(是保留端口就直接跳过,试下一个端口):
[bind->inet_bind->inet_csk_get_port->inet_is_reserved_local_port]
extern unsigned long *sysctl_local_reserved_ports;

/*
 * Report whether bit @port is set in the sysctl_local_reserved_ports
 * bitmap. The value returned is whatever test_bit() yields.
 */
static inline int inet_is_reserved_local_port(int port)
{
	const int reserved = test_bit(port, sysctl_local_reserved_ports);

	return reserved;
}
sysctl_local_reserved_ports在inet_init中被初始化,是一个8192字节(65536位)的位图,每个端口号对应其中一位,用test_bit检查。被置位的端口是保留端口(不是已经被绑定的端口),自动选择端口时会跳过它们。
[bind->inet_bind->inet_csk_get_port]
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int smallest_size = -1, smallest_rover;
	kuid_t uid = sock_i_uid(sk);

	if (!snum) {
		int remaining, rover, low, high;
again:
		...
		smallest_size = -1;
		do {
			...
			head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)];
			inet_bind_bucket_for_each(tb, &head->chain)
				if (net_eq(ib_net(tb), net) && tb->port == rover) {
					/* 列表项的fastreuse被设置,sk的sk_reuse被设置且状态不是TCP_LISTEN 
					 * 或者列表项的fastreuseport大于0并且sk的sk_reuseport不为0并且两者的用户ID相同 
					 * 上面满足后,找到列表项的num_owners最小的一个
					 */
					if ( (    ( tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN ) 
					       || (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)) ) 
					  && ( tb->num_owners < smallest_size || smallest_size == -1) ) {
						smallest_size = tb->num_owners;
						smallest_rover = rover;
 		                                /* 列表的bsockets值大于本地端口的范围
                                                 * 在tb中无冲突,找到位置
                                                 */
						 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
							snum = smallest_rover;
 							goto tb_found;
 						}

                                           }
                                           if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
                                                 snum = rover;
                                                 goto tb_found;
                                           }
                                           goto next;
                                  }
                              break;
                      next:
                      next_nolock:
                              if (++rover > high)
                                   rover = low;
                } while (--remaining > 0)
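所有被绑定的端口都通过一个结构(inet_bind_bucket)放在hashinfo->bhash的哈希链表中。先根据rover算出所在的哈希桶,再对桶中的链表进行循环,如果列表项在同一个网络名字空间并且端口号相同,进一步判断。当tb->fastreuse大于0时,表示当中的socket的sk_reuse都不为0且状态都不是TCP_LISTEN;当tb->fastreuseport大于0时,表示当中的socket的sk_reuseport都不为0且UID与fastuid相同。满足重用条件时,记录下num_owners最小的那个端口(smallest_rover);如果已绑定的socket总数(bsockets)超过了本地端口范围的大小并且bind_conflict报告无冲突,就直接使用smallest_rover。否则只要bind_conflict报告无冲突,就使用当前的rover;有冲突则换下一个端口。如果链表中没有该端口对应的列表项,说明端口空闲,跳出循环。TCP时,icsk_af_ops为ipv4_specific,调用的接口为inet_csk_bind_conflict,它会遍历tb->owners中的socket逐个检查。remaining初始化为(high - low) + 1,即本地端口范围的大小,所以最多把范围内的每个端口检查一遍;rover超过high时回绕到low。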
[bind->inet_bind->inet_csk_get_port->bind_conflict]
/*
 * Decide whether binding @sk to the port represented by bucket @tb would
 * clash with a socket that already owns that port.
 *
 * @sk:    socket being bound
 * @tb:    bind bucket whose owners list is scanned
 * @relax: when true, the second (reuse-vs-reuse) check below is skipped
 *
 * Returns non-zero if the scan stopped on a conflicting owner, 0 otherwise.
 */
int inet_csk_bind_conflict(const struct sock *sk,const struct inet_bind_bucket *tb, bool relax)
{
	struct sock *sk2;
	int reuse = sk->sk_reuse;
	int reuseport = sk->sk_reuseport;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, &tb->owners) {
		/* Only a different socket can conflict, and only if the bound
		 * devices overlap: either socket has sk_bound_dev_if == 0, or
		 * both have the same sk_bound_dev_if.
		 */
		if (  sk != sk2 
		   && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			/* Sharing via sk_reuse is ruled out (one of the two has
 			 * sk_reuse == 0, or sk2 is in TCP_LISTEN) AND sharing via
 			 * sk_reuseport is ruled out (one of the two has
 			 * sk_reuseport == 0, or sk2 is not in TCP_TIME_WAIT and
 			 * the two sockets have different UIDs).
 			 */
 			if (   (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) 
			    && (!reuseport || !sk2->sk_reuseport || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) {
 				/* Receive addresses overlap: either one is zero,
				 * or both are equal. That is a conflict.
				 */
 				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
 			}
 			/* Strict mode only (!relax): both sockets have sk_reuse set
 			 * and sk2 is not in TCP_LISTEN.
 			 */
			if ( !relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) {
 				/* Receive addresses overlap: either one is zero,
				 * or both are equal. That is a conflict.
				 */
 				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
 			}
		}
 	}
	/* A non-NULL sk2 here is reported as a conflict. NOTE(review):
 	 * this presumes sk_for_each_bound leaves sk2 NULL when the list is
 	 * exhausted without a break - confirm against the macro definition.
 	 */
	return sk2 != NULL;
 }
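列表中的socket(sk2)与传进来的socket(sk)同时满足下面的条件时,认为有冲突:两者不是同一个socket;绑定的设备有重叠(任意一方的sk_bound_dev_if为0,或两者相同);接收地址有重叠(任意一方的sk_rcv_saddr为0,或两者相同);并且属于下面两种情况之一:(1)不能按sk_reuse共用(两者的sk_reuse有一个为0,或sk2的状态是TCP_LISTEN),同时也不能按sk_reuseport共用(两者的sk_reuseport有一个为0,或sk2的状态不是TCP_TIME_WAIT且两者UID不同);(2)relax为false,两者的sk_reuse都不为0且sk2的状态不是TCP_LISTEN。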
[bind->inet_bind->inet_csk_get_port]
		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0) {
			if (smallest_size != -1) {
				snum = smallest_rover;
				goto have_snum;
			}
			goto fail;
		}
		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover;
如果remaining大于0,说明是从循环中break出来的,找到了可用的端口rover。如果remaining小于等于0,说明整个端口范围都找过了:这时如果smallest_size不为-1,就将端口号设为刚才记录的smallest_rover,然后跳到have_snum进一步检查,否则就出错了。下面就是当snum不为0的情况:
} else {
have_snum:
		head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)];
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
这时比较简单,只要在列表中找同一网络名字空间下相同的端口号就可以了:找到则跳到tb_found,找不到则将tb设为NULL并跳到tb_not_found。
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		/* tb的fastreuse大于0并且sk的sk_reuse不为0且sk的状态不为TCP_LISTEN
		 * 或tb的fastreuseport大于0并且sk的sk_reuseport不为0且tb的fastuid和sk的UID相同
		 */
		if (   (     (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN) 
		          || (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)) ) 
		    && smallest_size == -1) {
			goto success;
		} else {
			ret = 1;
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
				/* sk的sk_reuse不为0且状态不为TCP_LISTEN;或tb的fastreuseport大于0并且sk的sk_reuseport不为0且tb的fastuid和sk的UID相同
				 */
				if (    (     (sk->sk_reuse && sk->sk_state != TCP_LISTEN) 
				           || (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) 
				     && smallest_size != -1 && --attempts >= 0) {
					spin_unlock(&head->lock);
					goto again;
				}

				goto fail_unlock;
			}
		}
	}
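如果tb->owners为空,说明这个端口目前没有socket占用,直接走到下面的tb_not_found。不为空时:如果sk的sk_reuse值为SK_FORCE_REUSE,直接成功;如果满足快速重用条件(fastreuse或fastreuseport)并且smallest_size为-1,也直接成功;否则调用bind_conflict(relax为true)检查冲突。有冲突时,如果sk允许重用、smallest_size不为-1(即自动选择端口时记录过可重用的端口)并且还有重试次数,就解锁后跳回again重新选择端口,否则失败。attempts初始为5,所以最多重试5次。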
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, net, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
		if (sk->sk_reuseport) {
			tb->fastreuseport = 1;
			tb->fastuid = uid;
		} else
			tb->fastreuseport = 0;
	} else {
		if (tb->fastreuse &&
		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
			tb->fastreuse = 0;
		if (tb->fastreuseport &&
		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
			tb->fastreuseport = 0;
	}
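当tb为NULL时,新建一个tb当作节点,创建失败则出错。如果tb->owners为空(sk是第一个使用者):当sk的状态不是TCP_LISTEN且sk_reuse不为0时,tb的fastreuse为1,否则为0;当sk的sk_reuseport不为0时,tb的fastreuseport为1且fastuid设为sk的UID,否则为0。如果tb->owners不为空,则只会把标志清0:sk的sk_reuse为0或状态是TCP_LISTEN时清fastreuse;sk的sk_reuseport为0或UID与fastuid不同时清fastreuseport。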
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, snum);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
最后,如果sk还没有关联到任何tb(icsk_bind_hash为空),调用inet_bind_hash将sk和端口号关联到tb上,然后返回0表示成功。
[bind->inet_bind->inet_csk_get_port->inet_bind_hash]
/*
 * Attach @sk to bind bucket @tb for local port @snum.
 *
 * Increments the bsockets counter of the hashinfo reached through
 * sk->sk_prot, stores @snum as the socket's inet_num, links the socket
 * onto tb->owners through its sk_bind_node, increments tb->num_owners
 * and records @tb as the socket's icsk_bind_hash.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum)
{
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;

	atomic_inc(&hinfo->bsockets);

	/* Record the local port on the socket itself. */
	inet_sk(sk)->inet_num = snum;

	/* Make the socket an owner of the bucket. */
	hlist_add_head(&sk->sk_bind_node, &tb->owners);
	tb->num_owners += 1;

	/* Back-pointer from the socket to its bucket. */
	inet_csk(sk)->icsk_bind_hash = tb;
}
设置socket的本地端口号为snum,sk通过sk_bind_node,加入到tb->owners中
[bind->inet_bind]
	if (inet->inet_rcv_saddr)
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	inet->inet_sport = htons(inet->inet_num);
	inet->inet_daddr = 0;
	inet->inet_dport = 0;
	sk_dst_reset(sk);
	err = 0;
out_release_sock:
	release_sock(sk);
out:
	return err;
}
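如果绑定了具体地址(inet_rcv_saddr不为0),设置SOCK_BINDADDR_LOCK;如果指定了端口号(snum不为0),设置SOCK_BINDPORT_LOCK。设置源端口号inet_sport为本地端口号inet_num(经htons转换),目的地址和目的端口清0,并调用sk_dst_reset重置sk中的路由。绑定成功时err为0;前面get_port返回非0时说明端口已被占用,err为-EADDRINUSE。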
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值