[bind]
[bind->sockfd_lookup_light]
[bind->sockfd_lookup_light->sock_from_file]
[bind]
[bind->move_addr_to_kernel]
[bind]
[bind->inet_bind]
[bind->inet_bind->__inet_dev_addr_type]
[bind->inet_bind]
[bind->inet_bind->inet_csk_get_port]
[bind->inet_bind->inet_csk_get_port->inet_get_local_port_range]
[bind->inet_bind->inet_csk_get_port]
[bind->inet_bind->inet_csk_get_port->inet_is_reserved_local_port]
[bind->inet_bind->inet_csk_get_port]
[bind->inet_bind->inet_csk_get_port->bind_conflict]
[bind->inet_bind->inet_csk_get_port]
[bind->inet_bind->inet_csk_get_port->inet_bind_hash]
[bind->inet_bind]
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
struct socket *sock;
int err, fput_needed;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
从文件fd中得到对应的socket:
[bind->sockfd_lookup_light]
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
struct fd f = fdget(fd);
struct socket *sock;
*err = -EBADF;
if (f.file) {
sock = sock_from_file(f.file, err);
if (likely(sock)) {
*fput_needed = f.flags;
return sock;
}
fdput(f);
}
return NULL;
}
调用fdget先从进程的文件列表中通过fd找到对应的文件,然后从中得到socket,文件的flag通过fput_needed返回:
[bind->sockfd_lookup_light->sock_from_file]
/*
 * Return the socket attached to @file.
 *
 * A file is treated as a socket only when its f_op is socket_file_ops;
 * in that case file->private_data is returned and *err is left
 * untouched. Otherwise *err is set to -ENOTSOCK and NULL is returned.
 */
struct socket *sock_from_file(struct file *file, int *err)
{
	if (file->f_op != &socket_file_ops) {
		*err = -ENOTSOCK;
		return NULL;
	}

	return file->private_data;	/* set in sock_map_fd */
}
在为socket创建对应的文件时(见上面代码注释中的sock_map_fd),file->private_data被设置为指向新建的socket,所以这里可以直接从文件中取回socket。
[bind]
if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, &address);
将IP地址从用户空间copy到内核空间:
[bind->move_addr_to_kernel]
/**
 * move_addr_to_kernel - copy a socket address from user to kernel space
 * @uaddr: source address in user space
 * @ulen: number of bytes to copy from @uaddr
 * @kaddr: destination buffer in kernel space
 *
 * Returns -EINVAL when @ulen is negative or larger than
 * struct sockaddr_storage, 0 when @ulen is zero (nothing is copied),
 * -EFAULT when copy_from_user() reports a failure, and otherwise the
 * return value of audit_sockaddr().
 */
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
{
	if (ulen < 0)
		return -EINVAL;
	if (ulen > sizeof(struct sockaddr_storage))
		return -EINVAL;

	/* An empty address is accepted without touching user memory. */
	if (!ulen)
		return 0;

	if (copy_from_user(kaddr, uaddr, ulen) != 0)
		return -EFAULT;

	return audit_sockaddr(ulen, kaddr);
}
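先对地址长度进行检测(长度为负或超过sizeof(struct sockaddr_storage)返回-EINVAL,长度为0直接返回0),然后将地址copy到内核空间(失败返回-EFAULT)。最后调用audit_sockaddr把该地址记录到审计(audit)上下文中,其返回值作为本函数的返回值。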
[bind]
if (err >= 0) {
err = sock->ops->bind(sock, (struct sockaddr *)&address, addrlen);
}
}
return err;
}
对TCP,这里的ops为inet_stream_ops,调用的是inet_bind。
[bind->inet_bind]
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct net *net = sock_net(sk);
int chk_addr_ret;
if (addr_len < sizeof(struct sockaddr_in))
goto out;
chk_addr_ret = __inet_dev_addr_type(net, NULL, addr->sin_addr.s_addr);
先对地址的长度检查,然后得到地址的类型:
[bind->inet_bind->__inet_dev_addr_type]
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
*/
static inline unsigned int __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
__be32 addr)
{
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
return RTN_BROADCAST;
if (ipv4_is_multicast(addr))
return RTN_MULTICAST;
如果地址(addr & htonl(0xff000000)) == htonl(0x00000000)或者addr == htonl(0xffffffff),类型为广播。如果(addr & htonl(0xf0000000)) == htonl(0xe0000000)类型为多播。
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
unsigned int ret = RTN_BROADCAST;
struct fib_table *local_table;
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
}
}
return ret;
}
如果在本地路由表中查找到,类型为路由表中的值。如果不存在本地路由表,类型为广播。否则类型为单播。
[bind->inet_bind]
snum = ntohs(addr->sin_port);
if (snum && snum < PROT_SOCK &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
/* Check these errors (active socket, double bind). */
if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
得到端口号并做权限检测:端口号不为0且小于PROT_SOCK时,需要CAP_NET_BIND_SERVICE权限,否则出错。绑定时socket如果不处于关闭状态或是本地端口不为0(已绑定),出错。根据上面的地址类型设置地址(类型为多播或广播时inet_saddr清0)。然后调用get_port,对TCP,调用inet_csk_get_port确定此端口是否可以被绑定,失败时错误码为-EADDRINUSE:
[bind->inet_bind->inet_csk_get_port]
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct net *net = sock_net(sk);
if (!snum) {
int low, high;
again:
inet_get_local_port_range(net, &low, &high);
当端口号为0时,先得到本地端口号的范围
[bind->inet_bind->inet_csk_get_port->inet_get_local_port_range]
/*
 * Report the local port range configured for @net.
 *
 * *low receives net->ipv4.sysctl_local_ports.range[0] and *high
 * receives range[1].
 * NOTE(review): no locking is visible in this excerpt - confirm whether
 * readers must synchronize with updates of the range.
 */
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
*low = net->ipv4.sysctl_local_ports.range[0];
*high = net->ipv4.sysctl_local_ports.range[1];
}
net->ipv4.sysctl_local_ports在初始化sysctl模块时被设置,在函数ipv4_sysctl_init_net中,范围为:[ 32768, 61000 ]
[bind->inet_bind->inet_csk_get_port]
int smallest_size = -1;
if (!snum) {
int remaining, rover;
again:
...
remaining = (high - low) + 1;
rover = prandom_u32() % remaining + low;
smallest_size = -1;
do {
if (inet_is_reserved_local_port(rover))
goto next_nolock;
先在端口范围内随机选出一个起始端口号,判断它是不是被保留的端口(保留的端口直接跳过):
[bind->inet_bind->inet_csk_get_port->inet_is_reserved_local_port]
/* Bitmap indexed by port number; defined elsewhere. */
extern unsigned long *sysctl_local_reserved_ports;
/*
 * Return the result of test_bit() for bit @port of
 * sysctl_local_reserved_ports, i.e. non-zero when that bit is set.
 * The port search in inet_csk_get_port() skips ports for which this
 * returns non-zero.
 */
static inline int inet_is_reserved_local_port(int port)
{
return test_bit(port, sysctl_local_reserved_ports);
}
sysctl_local_reserved_ports在inet_init中被初始化,是一个8192字节(65536位)的位图,每一位对应一个端口号,用来标记被保留、不参与自动分配的端口号(可通过sysctl的ip_local_reserved_ports配置),而不是已经被占用的端口号。
[bind->inet_bind->inet_csk_get_port]
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
int smallest_size = -1, smallest_rover;
kuid_t uid = sock_i_uid(sk);
if (!snum) {
int remaining, rover, low, high;
again:
...
smallest_size = -1;
do {
...
head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)];
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) {
/* 列表项的fastreuse被设置,sk的sk_reuse被设置且状态不是TCP_LISTEN
* 或者列表项的fastreuseport大于0并且sk的sk_reuseport不为0并且两者的用户ID相同
* 上面满足后,找到列表项的num_owners最小的一个
*/
if ( ( ( tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN )
|| (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)) )
&& ( tb->num_owners < smallest_size || smallest_size == -1) ) {
smallest_size = tb->num_owners;
smallest_rover = rover;
/* 列表的bsockets值大于本地端口的范围
* 在tb中无冲突,找到位置
*/
if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = smallest_rover;
goto tb_found;
}
}
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
goto tb_found;
}
goto next;
}
break;
next:
next_nolock:
if (++rover > high)
rover = low;
} while (--remaining > 0)
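所有被绑定的端口都通过一个结构(inet_bind_bucket)放在哈希表bhash的链表中。对rover所在的链表进行循环,如果某个tb在同一个网络名字空间并且端口号相同,进一步判断。当tb->fastreuse大于0时,表示其中的socket的sk_reuse都不为0且状态都不是TCP_LISTEN;当tb->fastreuseport大于0时,表示其中的socket的sk_reuseport都不为0且UID都等于fastuid。满足快速重用条件时,记录num_owners最小的端口(smallest_rover);如果已绑定的socket总数(hashinfo->bsockets)超过了本地端口范围的大小,并且该tb无冲突,就直接使用这个端口。否则只要bind_conflict检查无冲突,就使用当前的rover;有冲突则继续检查下一个端口。如果链表中没有端口号为rover的tb,说明该端口还没有被使用,通过break跳出循环。TCP时,icsk_af_ops为ipv4_specific,bind_conflict接口为inet_csk_bind_conflict,它会对tb->owners列表中的每个socket进行检查。remaining初始化为端口范围的大小(high - low) + 1,每检查一个端口减1,因此最多把范围内的每个端口检查一遍。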
[bind->inet_bind->inet_csk_get_port->bind_conflict]
/*
 * Check whether binding @sk to the port represented by @tb conflicts
 * with a socket that already owns that bucket.
 *
 * @sk:    socket that wants to bind
 * @tb:    bind bucket whose owners list is scanned
 * @relax: when false, an additional stricter check is applied to
 *         sockets that both have sk_reuse set (see below)
 *
 * Returns non-zero when the scan stopped early on a conflicting owner,
 * 0 when every owner was examined without finding a conflict.
 */
int inet_csk_bind_conflict(const struct sock *sk,const struct inet_bind_bucket *tb, bool relax)
{
struct sock *sk2;
int reuse = sk->sk_reuse;
int reuseport = sk->sk_reuseport;
kuid_t uid = sock_i_uid((struct sock *)sk);
/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners list belong to the same net - the
* one this bucket belongs to.
*/
sk_for_each_bound(sk2, &tb->owners) {
/* Only look at other sockets whose device binding can overlap:
* either socket has sk_bound_dev_if == 0, or both have the same value.
*/
if ( sk != sk2
&& (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
/* Sharing is not permitted by sk_reuse (one of the two has it
* cleared, or sk2 is in TCP_LISTEN) AND not permitted by
* sk_reuseport (one of the two has it cleared, or sk2 is not in
* TCP_TIME_WAIT and the two sockets have different UIDs).
*/
if ( (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN)
&& (!reuseport || !sk2->sk_reuseport || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) {
/* Receive addresses overlap (either is zero, or both are
* equal): conflict.
*/
if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
break;
}
/* Strict mode only (!relax): both sockets have sk_reuse set and
* sk2 is not in TCP_LISTEN.
*/
if ( !relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) {
/* Receive addresses overlap (either is zero, or both are
* equal): still treated as a conflict.
*/
if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
break;
}
}
}
/* sk2 is non-NULL only when the loop was left through a break, i.e. a
* conflicting owner was found (assumes sk_for_each_bound leaves sk2
* NULL once the list is exhausted - TODO confirm).
*/
return sk2 != NULL;
}
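对tb->owners列表中的每个socket(sk2)与传进来的sk进行比较,同时满足下面的条件就认为有冲突:两者不是同一个socket;绑定的设备可能重叠(任一方的sk_bound_dev_if为0,或两者相同);接收地址可能重叠(任一方的sk_rcv_saddr为0,或两者相同);并且重用选项不允许共享,即(两者的sk_reuse有一个为0,或sk2的状态是TCP_LISTEN)并且(两者的sk_reuseport有一个为0,或sk2的状态不是TCP_TIME_WAIT且两者UID不同)。另外当relax为false时,即使两者的sk_reuse都不为0且sk2的状态不是TCP_LISTEN,只要接收地址可能重叠,也算冲突。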
[bind->inet_bind->inet_csk_get_port]
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
* locks if this test triggers, because if 'remaining'
* drops to zero, we broke out of the do/while loop at
* the top level, not from the 'break;' statement.
*/
ret = 1;
if (remaining <= 0) {
if (smallest_size != -1) {
snum = smallest_rover;
goto have_snum;
}
goto fail;
}
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover;
如果remaining大于0,说明是通过break跳出循环的,rover就是找到的端口号。如果remaining小于等于0(范围内的端口都检查过了),并且smallest_size不为-1,就将端口号设为刚才记录的smallest_rover然后进一步检查,否则就出错了。下面就是当snum不为0的情况:
} else {
have_snum:
head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)];
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;
}
tb = NULL;
goto tb_not_found;
这时简单,只要在列表中找同一网络名字空间下相同的端口号就可以了。
tb_found:
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
/* tb的fastreuse大于0并且sk的sk_reuse不为0且sk的状态不为TCP_LISTEN
* 或tb的fastreuseport大于0并且sk的sk_reuseport不为0且tb的fastuid和sk的UID相同
*/
if ( ( (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN)
|| (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)) )
&& smallest_size == -1) {
goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
/* sk的sk_reuse不为0且状态不为TCP_LISTEN;或tb的fastreuseport大于0并且sk的sk_reuseport不为0且tb的fastuid和sk的UID相同
*/
if ( ( (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
|| (tb->fastreuseport > 0 && sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
&& smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
}
goto fail_unlock;
}
}
}
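如果tb->owners为空,不需要检查冲突,直接进入下面的tb_not_found。如果不为空:sk的sk_reuse值为SK_FORCE_REUSE时,直接成功;满足快速重用条件(fastreuse或fastreuseport)并且smallest_size为-1时,也直接成功;否则调用bind_conflict(relax为true)检查,有冲突时,如果端口是自动选择的(smallest_size不为-1)、sk允许重用并且重试次数attempts还没有用完,就回到again重新选择端口,否则失败。attempts为5,所以最多重试5次。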
tb_not_found:
ret = 1;
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, net, head, snum)) == NULL)
goto fail_unlock;
if (hlist_empty(&tb->owners)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1;
else
tb->fastreuse = 0;
if (sk->sk_reuseport) {
tb->fastreuseport = 1;
tb->fastuid = uid;
} else
tb->fastreuseport = 0;
} else {
if (tb->fastreuse &&
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
tb->fastreuse = 0;
if (tb->fastreuseport &&
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
tb->fastreuseport = 0;
}
当tb为NULL时,新建一个tb当作节点。如果tb->owners为空:当sk的状态不是TCP_LISTEN且sk_reuse不为0时,tb的fastreuse为1,否则为0;当sk的sk_reuseport不为0时,tb的fastreuseport为1且fastuid设为sk的UID,否则为0。如果tb->owners不为空,只要sk不满足相应的条件,就把fastreuse或fastreuseport清0。
success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;
fail_unlock:
spin_unlock(&head->lock);
fail:
local_bh_enable();
return ret;
}
最后将tb和端口号加入到列表中。
[bind->inet_bind->inet_csk_get_port->inet_bind_hash]
/*
 * Attach @sk to bind bucket @tb under local port @snum.
 *
 * Increments the hashinfo-wide bsockets counter, stores @snum as the
 * socket's inet_num, links the socket into tb->owners through
 * sk_bind_node, increments tb->num_owners and records @tb as the
 * socket's icsk_bind_hash.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum)
{
	atomic_inc(&sk->sk_prot->h.hashinfo->bsockets);

	inet_sk(sk)->inet_num = snum;

	hlist_add_head(&sk->sk_bind_node, &tb->owners);
	tb->num_owners += 1;

	inet_csk(sk)->icsk_bind_hash = tb;
}
设置socket的本地端口号为snum,sk通过sk_bind_node,加入到tb->owners中
[bind->inet_bind]
if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->inet_sport = htons(inet->inet_num);
inet->inet_daddr = 0;
inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
release_sock(sk);
out:
return err;
}
设置源端口号inet_sport为本地端口号inet_num(转换为网络字节序),目的地址和目的端口清0,并清除sk中缓存的路由。绑定成功时err为0;如果前面get_port返回非0,说明端口号已被绑定,err为-EADDRINUSE。