三、af_inet 协议簇的协议封装
接下来,函数调用之前已经注册的 inet_family_ops 的函数指针 create,也就是 inet_create() 函数。前面,可以说一个通用的 socket 已经创建好了,这里要完成与协议本身相关的一些创建 socket 的工作。这一部分的工作比较复杂,还是先来看看 af_inet.c 中的模块初始化时候,做了哪些与此相关的工作。
要引入的第一个数据结构是 struct inet_protosw,它封装了一个协议类型(如 SOCK_STREAM、SOCK_DGRAM 等)与 ip 协议中对应的传输层协议:
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2). */
int protocol; /* This is the L4 protocol number. */
struct proto *prot;
struct proto_ops *ops;
int capability; /* Which (if any) capability do
* we need to use this socket
* interface?
*/
char no_check; /* checksum on rcv/xmit/none? */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
#define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
type 是协议类型,对于 ipv4 而言,就是 SOCK_STREAM、SOCK_DGRAM 或者是 SOCK_RAW 之一。protocol 是传输层的协议号。 prot 用于描述一个具体的传输层协议,而ops 指向对应的当前协议类型的操作函数集。针对不同的协议类型,定义了不同的 ops:
/*
 * Socket-layer operations for PF_INET stream sockets. The SOCK_STREAM
 * entry of inetsw_array points at this table and inet_create() installs it
 * as sock->ops. Unlike the datagram and raw tables it supplies real
 * accept/listen handlers (inet_accept, inet_listen) and uses the
 * TCP-specific tcp_poll and tcp_sendpage.
 */
struct proto_ops inet_stream_ops = { .family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage
};
/*
 * Socket-layer operations for PF_INET datagram sockets; referenced by the
 * SOCK_DGRAM entry of inetsw_array. accept and listen are wired to the
 * sock_no_accept / sock_no_listen stubs, and polling goes through udp_poll.
 */
struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
};
/*
 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
 * udp_poll: the only entry that differs is .poll, which uses datagram_poll.
 * Referenced by the SOCK_RAW entry of inetsw_array.
 */
static struct proto_ops inet_sockraw_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
};
从各个函数指针的名称,我们就可以大约知道它们是做什么事的了。进一步可以看到,它们的函数指针指向的函数差不多都是相同的,只有一些细节上的区别,例如后面两种协议类型并不支持 listen(对应的指针是 sock_no_listen)。
socket() API 第二个参数是协议类型,第三个参数是该协议类型下的协议——不过对于 ipv4 而言,它们都是一一对应的。但是从抽象封装的角度看,数据结构的设计本身应该满足一个协议类型下边,可能存在多个不同的协议,即一对多的情况。而一一对应,仅是它们的特例:
*/
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.capability = -1,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT,
},
{ .type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.capability = -1,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.capability = CAP_NET_RAW,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
数组的每一个元素,就是支持的一种协议名称,例如 IPPROTO_TCP,但是由于 IPV4 本身协议类型跟协议是一一对应的,所以没有更多的 .type = SOCK_xxx 了。这样数组实现了对 PF_INET 协议族下支持的协议类型,以及协议类型下边的协议的封装,虽然事实上它们是一一对应的关系,不过理论上,完全可能存在一对多的情况。
数组内,封装的一个具体的协议,由 struct proto 结构来描述:
* socket layer -> transport layer interface
* transport -> network interface is defined by struct inet_proto
*/
struct proto {
void (*close)(struct sock *sk,
long timeout);
int (*connect)(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
int (*disconnect)(struct sock *sk, int flags);
struct sock * (*accept) (struct sock *sk, int flags, int *err);
int (*ioctl)(struct sock *sk, int cmd,
unsigned long arg);
int (*init)(struct sock *sk); int (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
int (*setsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int optlen);
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
int (*sendmsg)(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len);
int (*recvmsg)(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg,
size_t len, int noblock, int flags,
int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
struct sockaddr *uaddr, int addr_len);
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
/* Keeping track of sk's, looking them up, and port selection methods. */
void (*hash)(struct sock *sk);
void (*unhash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
/* Memory pressure */
void (*enter_memory_pressure)(void);
atomic_t *memory_allocated; /* Current allocated memory. */
atomic_t *sockets_allocated; /* Current number of sockets. */
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
* All the sk_stream_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
int *memory_pressure;
int *sysctl_mem;
int *sysctl_wmem;
int *sysctl_rmem;
int max_header;
kmem_cache_t *slab; unsigned int obj_size;
struct module *owner;
char name[32];
struct list_head node;
struct {
int inuse;
u8 __pad[SMP_CACHE_BYTES - sizeof(int)];
} stats[NR_CPUS];
};
以 TCP 协议为例,TCP 协议的 socket 操作函数都被封装在这里了。
/*
 * Transport-level protocol descriptor for TCP. It is the .prot of the
 * SOCK_STREAM entry in inetsw_array and is registered from inet_init via
 * proto_register(&tcp_prot, 1). obj_size is sizeof(struct tcp_sock), so
 * every object taken from this protocol's slab cache is sized for a full
 * tcp_sock rather than a bare struct sock.
 */
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = tcp_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.sendmsg = tcp_sendmsg,
.recvmsg = tcp_recvmsg,
.backlog_rcv = tcp_v4_do_rcv,
.hash = tcp_v4_hash,
.unhash = tcp_unhash,
.get_port = tcp_v4_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock), };
static int __init inet_init(void)
{
……
/* Register the socket-side information for inet_create. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
……
}
inetsw 是一个数组,其每一个元素,都是一个链表首部,前面一个循环初始化之。后一个循环就值得注意了,也就是函数 inet_register_protosw:
/*
 * Link a socket-type/protocol entry into the inetsw[] list selected by
 * p->type so that inet_create() can find it.
 *
 * p: entry to register.
 *
 * The entry is refused, with a KERN_ERR message, when p->type is not below
 * SOCK_MAX or when the list already contains an INET_PROTOSW_PERMANENT
 * entry with the same protocol number. Nothing is returned; failures are
 * reported only through printk().
 */
void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;
/* The list walk and the insertion below run under inetsw_lock. */
spin_lock_bh(&inetsw_lock);
if (p->type >= SOCK_MAX)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
answer = NULL;
/* Insertion point: stays at the list head unless a permanent entry is seen. */
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) { if (protocol == answer->protocol)
break;
last_perm = lh;
}
answer = NULL;
}
/* answer is non-NULL only when the loop broke on a permanent entry with the same protocol. */
if (answer)
goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);
/* NOTE(review): presumably waits for readers walking inetsw[] with list_for_each_rcu (as inet_create does) - confirm. */
synchronize_net();
return;
out_permanent:
printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
protocol);
goto out;
out_illegal:
printk(KERN_ERR
"Ignoring attempt to register invalid socket type %d.\n",
p->type);
goto out;
}
这个函数完成的工作,就是把 inetsw_array 数组中,相同的协议类型下边的协议,加入到 inetsw 对应的协议类型的链表中去。因为事实上是一对一的关系,所以这个函数的执行过程要简单得多:注册时对应的链表还是空的,list_for_each 的循环体一次也不会执行,answer 保持为 NULL,所以不存在覆盖和追加的情况,直接调用 list_add_rcu(&p->list, last_perm); 把协议类型节点(struct inet_protosw 类型的数组的某个元素)添加为链表(链表首部本身是一个数组,数组索引是协议对应的协议类型的值)的第一个成员。
来做一个假设,如果 SOCK_STREAM 协议类型下边还有另一个协议,IPPROTO_123,那么
inetsw_array数组中就会多出一个元素:
{
.type = SOCK_STREAM,
.protocol = IPPROTO_123,
.prot = &123_prot,
……
},
这样,当遍历 inetsw_array,再次进入 inet_register_protosw 函数后,因为 SOCK_STREAM 类型下已经注册了 IPPROTO_TCP,所以,
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) {
if (protocol == answer->protocol) /* 已经注册了相同协议号,退出循环,因为没有置 answer 为 NULL,所以后面会直接退出函数 */
break;
last_perm = lh; /* 移动位置指针,指向链表中最后一个元素 */
}
answer = NULL;
}
在这个循环中,answer 会指向之前注册的 TCP 的链表节点,然后根据该节点的标志处理:如果它带有 INET_PROTOSW_PERMANENT 标志(且协议号不同),则 last_perm 移动到这个节点,也就是 TCP,循环结束后 123 被追加到 TCP 之后;如果已有节点没有 PERMANENT 标志(例如只有 INET_PROTOSW_REUSE),位置指针 last_perm 不会移动,新节点会被插到它的前面,查找时优先匹配到新节点,也就是之前注册的元素被“覆盖”了。
OK,绕了这么大一圈子,了解了协议的封装及链表的注册。现在回到 inet_create 中来:
/*
* Create an inet socket.
*/
static int inet_create(struct socket *sock, int protocol)
{
struct sock *sk;
struct list_head *p;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int err;
answer = NULL;
rcu_read_lock();
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
answer = NULL;
}
这个循环,根据 socket(2) 调用的 protocol,把之前在链表中注册的协议节点找出来。一个问题是,因为一一对应关系的存在,用户态调用 socket(2) 的时候,常常第三个参数直接就置 0 了,也就是这里 protocol 为 0。那内核又如何处理这一默认值呢?此时 protocol != answer->protocol,第一个分支不成立,而是被 if (IPPROTO_IP == protocol) 所匹配(IPPROTO_IP 的值就是 0)。这样,protocol 被置为链表中第一个协议的协议号,循环随即结束,answer 自然也指向链表中的第一个注册节点。以刚才的例子,SOCK_STREAM 下同时注册了 TCP 和 123,那么这里默认就取 TCP 了。当然,把 123 在 inetsw_array 数组中的位置调前,那么就默认取 123 了。
err = -ESOCKTNOSUPPORT;
if (!answer)
goto out_rcu_unlock;
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
err = -EPROTONOSUPPORT;
if (!protocol)
goto out_rcu_unlock;
/* 找到了组织,将创建的 socket 的 ops 函数指针集,指向协议类型的。例如创建的是SOCK_STREAM,那么就指向了 inet_stream_ops */
sock->ops = answer->ops;
/* answer_prot 指针指向了当前要创建的 socket 的协议类型下边的协议,如上例,它就是IPPROTO_TCP 的 tcp_prot 结构 */
answer_prot = answer->prot;
answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
BUG_TRAP(answer_prot->slab != NULL);
接下来一个重要的工作,就是为 socket 分配一个sock,并初始化它:
err = -ENOBUFS;
sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
if (sk == NULL)
goto out;
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = 1;
inet = inet_sk(sk);
if (SOCK_RAW == sock->type) {
inet->num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
if (ipv4_config.no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->id = 0;
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
#ifdef INET_REFCNT_DEBUG
atomic_inc(&inet_sock_nr);
#endif
if (inet->num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->sport = htons(inet->num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
虽然 create 的代码就到这儿了,不过要说清楚 sk 的分配,还得费上大力气。每一个 Socket 套接字,都有一个对应的 struct socket 结构来描述(内核中一般使用名称为 sock),但是同时又有一个 struct sock 结构(内核中一般使用名称为 sk)。两者之间是一一对应的关系。在后面的 sock_init_data 函数中,可以看到
sk->sk_socket = sock;
sock->sk = sk;
这样的代码。
socket 结构和 sock 结构实际上是同一个事物的两个方面。不妨说,socket 结构是面向进程和系统调用界面的侧面,而 sock 结构则是面向底层驱动程序的侧面。设计者把 socket 套接字中,与文件系统关系比较密切的那一部分放在 socket 结构中,而把与通信关系比较密切的那一部分,则单独成为一个数据结构,那就是 sock 结构。由于这两部分逻辑上本来就是一体的,所以要通过指针互相指向对方,形成一对一的关系。
再暂时回到 inet_init 中来,初始化工作中,有如下代码:
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
这里为每个 protocol 都调用了 proto_register 函数,其重要功能之一,就是根据协议的 obj_size 成员的大小,为协议创建高速缓存:
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
int proto_register(struct proto *prot, int alloc_slab)
{
int rc = -ENOBUFS;
if (alloc_slab) {
/* 可以看到,函数最重要的功能就是根据 prot 的obj_size 成员的大小,为协议创建高速缓存 */
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
if (prot->slab == NULL) {
printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
prot->name);
goto out;
}
}
/* 顺便看到它的另一个重要的功能,是维护一个以 proto_list 为首的链表 */
write_lock(&proto_list_lock);
list_add(&prot->node, &proto_list);
write_unlock(&proto_list_lock);
rc = 0;
out:
return rc;
}
这里要注意的是:prot->obj_size 的大小,它并非仅仅是一个 sk 的大小!!!以 TCP 为例:.obj_size = sizeof(struct tcp_sock)。稍后再来分析这个。
回到 inet_create()函数中来,其调用 sk_alloc()分配一个 sk:
sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
{
struct sock *sk = NULL;
kmem_cache_t *slab = prot->slab;
if (slab != NULL)
sk = kmem_cache_alloc(slab, priority);
else
sk = kmalloc(prot->obj_size, priority);
if (sk) {
if (zero_it) {
memset(sk, 0, prot->obj_size);
sk->sk_family = family;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
……
在之前创建的高速缓存中申请分配一个 slab 缓存项。并清零。然后设置协议族、并把 sk 中的 sk_prot与对应的协议关联起来。
接下来,函数调用之前已经注册的 inet_family_ops 的函数指针 create,也就是 inet_create() 函数。前面,可以说一个通用的 socket 已经创建好了,这里要完成与协议本身相关的一些创建 socket 的工作。这一部分的工作比较复杂,还是先来看看 af_inet.c 中的模块初始化时候,做了哪些与此相关的工作。
要引入的第一个数据结构是 struct inet_protosw,它封装了一个协议类型(如 SOCK_STREAM、SOCK_DGRAM 等)与 ip 协议中对应的传输层协议:
/*
 * This is used to register socket interfaces for IP protocols.
 *
 * One entry ties a socket type (2nd argument of socket(2)) and an L4
 * protocol number to the transport-level struct proto and the socket-level
 * struct proto_ops that implement it. Entries are linked through `list`
 * into the inetsw[] list indexed by type (see inet_register_protosw()).
 */
struct inet_protosw {struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2). */
int protocol; /* This is the L4 protocol number. */
/* Transport protocol; inet_create() passes it to sk_alloc(). */
struct proto *prot;
/* inet_create() installs this as sock->ops. */
struct proto_ops *ops;
int capability; /* Which (if any) capability do
* we need to use this socket
* interface? inet_create() only calls
* capable() when this is > 0.
*/
char no_check; /* checksum on rcv/xmit/none? */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
#define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
type 是协议类型,对于 ipv4 而言,就是 SOCK_STREAM、SOCK_DGRAM 或者是 SOCK_RAW 之一。protocol 是传输层的协议号。 prot 用于描述一个具体的传输层协议,而ops 指向对应的当前协议类型的操作函数集。针对不同的协议类型,定义了不同的 ops:
/*
 * Socket-layer operations for PF_INET stream sockets. The SOCK_STREAM
 * entry of inetsw_array points at this table and inet_create() installs it
 * as sock->ops. Unlike the datagram and raw tables it supplies real
 * accept/listen handlers (inet_accept, inet_listen) and uses the
 * TCP-specific tcp_poll and tcp_sendpage.
 */
struct proto_ops inet_stream_ops = { .family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage
};
/*
 * Socket-layer operations for PF_INET datagram sockets; referenced by the
 * SOCK_DGRAM entry of inetsw_array. accept and listen are wired to the
 * sock_no_accept / sock_no_listen stubs, and polling goes through udp_poll.
 */
struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
};
/*
 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
 * udp_poll: the only entry that differs is .poll, which uses datagram_poll.
 * Referenced by the SOCK_RAW entry of inetsw_array.
 */
static struct proto_ops inet_sockraw_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
};
从各个函数指针的名称,我们就可以大约知道它们是做什么事的了。进一步可以看到,它们的函数指针指向的函数差不多都是相同的,只有一些细节上的区别,例如后面两种协议类型并不支持 listen(对应的指针是 sock_no_listen)。
socket() API 第二个参数是协议类型,第三个参数是该协议类型下的协议——不过对于 ipv4 而言,它们都是一一对应的。但是从抽象封装的角度看,数据结构的设计本身应该满足一个协议类型下边,可能存在多个不同的协议,即一对多的情况。而一一对应,仅是它们的特例:
/*
 * Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * Each element maps one (socket type, protocol) pair to its transport
 * struct proto and socket-level struct proto_ops. A capability of -1 means
 * no capability is required: inet_create() only calls capable() when the
 * value is greater than zero.
 */
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.capability = -1,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT,
},
{ .type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.capability = -1,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.capability = CAP_NET_RAW,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
数组的每一个元素,就是支持的一种协议名称,例如 IPPROTO_TCP,但是由于 IPV4 本身协议类型跟协议是一一对应的,所以没有更多的 .type = SOCK_xxx 了。这样数组实现了对 PF_INET 协议族下支持的协议类型,以及协议类型下边的协议的封装,虽然事实上它们是一一对应的关系,不过理论上,完全可能存在一对多的情况。
数组内,封装的一个具体的协议,由 struct proto 结构来描述:
/*
 * Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 * transport -> network interface is defined by struct inet_proto
 *
 * One instance describes a transport protocol (e.g. tcp_prot): its
 * per-socket operations, memory-accounting hooks, and the slab cache
 * from which its socks are allocated.
 */
struct proto {
void (*close)(struct sock *sk,
long timeout);
int (*connect)(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
int (*disconnect)(struct sock *sk, int flags);
struct sock * (*accept) (struct sock *sk, int flags, int *err);
int (*ioctl)(struct sock *sk, int cmd,
unsigned long arg);
/* init is optional: inet_create() calls it only when non-NULL. */
int (*init)(struct sock *sk); int (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
int (*setsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int optlen);
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
int (*sendmsg)(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len);
int (*recvmsg)(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg,
size_t len, int noblock, int flags,
int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
struct sockaddr *uaddr, int addr_len);
/* inet_create() copies this into sk->sk_backlog_rcv. */
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
/* Keeping track of sk's, looking them up, and port selection methods. */
void (*hash)(struct sock *sk);
void (*unhash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
/* Memory pressure */
void (*enter_memory_pressure)(void);
atomic_t *memory_allocated; /* Current allocated memory. */
atomic_t *sockets_allocated; /* Current number of sockets. */
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
* All the sk_stream_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
int *memory_pressure;
int *sysctl_mem;
int *sysctl_wmem;
int *sysctl_rmem;
int max_header;
/*
* slab is created by proto_register() with objects of obj_size bytes;
* sk_alloc() allocates from it and zeroes obj_size bytes.
*/
kmem_cache_t *slab; unsigned int obj_size;
struct module *owner;
/* Also used as the slab cache name in proto_register(). */
char name[32];
/* Link in proto_list (see proto_register()). */
struct list_head node;
struct {
int inuse;
u8 __pad[SMP_CACHE_BYTES - sizeof(int)];
} stats[NR_CPUS];
};
以 TCP 协议为例,TCP 协议的 socket 操作函数都被封装在这里了。
/*
 * Transport-level protocol descriptor for TCP. It is the .prot of the
 * SOCK_STREAM entry in inetsw_array and is registered from inet_init via
 * proto_register(&tcp_prot, 1). obj_size is sizeof(struct tcp_sock), so
 * every object taken from this protocol's slab cache is sized for a full
 * tcp_sock rather than a bare struct sock.
 */
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = tcp_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.sendmsg = tcp_sendmsg,
.recvmsg = tcp_recvmsg,
.backlog_rcv = tcp_v4_do_rcv,
.hash = tcp_v4_hash,
.unhash = tcp_unhash,
.get_port = tcp_v4_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock), };
四、分配 struct sock(sk)
浏览到这里,看完了 PF_INET 的协议簇、协议类型和协议(也就是 socket调用的三个参数)的封装关系,它们通过了两个数据结构 inet_protosw、struct proto 来描述,被一个数组 inetsw_array所封装。接下来看它的初始化工作:
static struct list_head inetsw[SOCK_MAX];
static int __init inet_init(void)
{
……
/* Register the socket-side information for inet_create. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
……
}
inetsw 是一个数组,其每一个元素,都是一个链表首部,前面一个循环初始化之。后一个循环就值得注意了,也就是函数 inet_register_protosw:
/*
 * Link a socket-type/protocol entry into the inetsw[] list selected by
 * p->type so that inet_create() can find it.
 *
 * p: entry to register.
 *
 * The entry is refused, with a KERN_ERR message, when p->type is not below
 * SOCK_MAX or when the list already contains an INET_PROTOSW_PERMANENT
 * entry with the same protocol number. Nothing is returned; failures are
 * reported only through printk().
 */
void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;
/* The list walk and the insertion below run under inetsw_lock. */
spin_lock_bh(&inetsw_lock);
if (p->type >= SOCK_MAX)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
answer = NULL;
/* Insertion point: stays at the list head unless a permanent entry is seen. */
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) { if (protocol == answer->protocol)
break;
last_perm = lh;
}
answer = NULL;
}
/* answer is non-NULL only when the loop broke on a permanent entry with the same protocol. */
if (answer)
goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);
/* NOTE(review): presumably waits for readers walking inetsw[] with list_for_each_rcu (as inet_create does) - confirm. */
synchronize_net();
return;
out_permanent:
printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
protocol);
goto out;
out_illegal:
printk(KERN_ERR
"Ignoring attempt to register invalid socket type %d.\n",
p->type);
goto out;
}
这个函数完成的工作,就是把 inetsw_array 数组中,相同的协议类型下边的协议,加入到 inetsw 对应的协议类型的链表中去。因为事实上是一对一的关系,所以这个函数的执行过程要简单得多:注册时对应的链表还是空的,list_for_each 的循环体一次也不会执行,answer 保持为 NULL,所以不存在覆盖和追加的情况,直接调用 list_add_rcu(&p->list, last_perm); 把协议类型节点(struct inet_protosw 类型的数组的某个元素)添加为链表(链表首部本身是一个数组,数组索引是协议对应的协议类型的值)的第一个成员。
来做一个假设,如果 SOCK_STREAM 协议类型下边还有另一个协议,IPPROTO_123,那么
inetsw_array数组中就会多出一个元素:
{
.type = SOCK_STREAM,.protocol = IPPROTO_123,
.prot = &123_prot,
……
},
这样,当遍历 inetsw_array,再次进入 inet_register_protosw 函数后,因为 SOCK_STREAM 类型下已经注册了 IPPROTO_TCP,所以,
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) {
if (protocol == answer->protocol) /* 已经注册了相同协议号,退出循环,因为没有置 answer 为 NULL,所以后面会直接退出函数 */
break;
last_perm = lh; /* 移动位置指针,指向链表中最后一个元素 */
}
answer = NULL;
}
在这个循环中,answer 会指向之前注册的 TCP 的链表节点,然后根据该节点的标志处理:如果它带有 INET_PROTOSW_PERMANENT 标志(且协议号不同),则 last_perm 移动到这个节点,也就是 TCP,循环结束后 123 被追加到 TCP 之后;如果已有节点没有 PERMANENT 标志(例如只有 INET_PROTOSW_REUSE),位置指针 last_perm 不会移动,新节点会被插到它的前面,查找时优先匹配到新节点,也就是之前注册的元素被“覆盖”了。
OK,绕了这么大一圈子,了解了协议的封装及链表的注册。现在回到 inet_create 中来:
/*
* Create an inet socket.
*/
static int inet_create(struct socket *sock, int protocol)
{
struct sock *sk;
struct list_head *p;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int err;
sock->state = SS_UNCONNECTED;
answer = NULL;
rcu_read_lock();
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
answer = NULL;
}
这个循环,根据 socket(2) 调用的 protocol,把之前在链表中注册的协议节点找出来。一个问题是,因为一一对应关系的存在,用户态调用 socket(2) 的时候,常常第三个参数直接就置 0 了,也就是这里 protocol 为 0。那内核又如何处理这一默认值呢?此时 protocol != answer->protocol,第一个分支不成立,而是被 if (IPPROTO_IP == protocol) 所匹配(IPPROTO_IP 的值就是 0)。这样,protocol 被置为链表中第一个协议的协议号,循环随即结束,answer 自然也指向链表中的第一个注册节点。以刚才的例子,SOCK_STREAM 下同时注册了 TCP 和 123,那么这里默认就取 TCP 了。当然,把 123 在 inetsw_array 数组中的位置调前,那么就默认取 123 了。
err = -ESOCKTNOSUPPORT;
if (!answer)
goto out_rcu_unlock;
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
err = -EPROTONOSUPPORT;
if (!protocol)
goto out_rcu_unlock;
/* 找到了组织,将创建的 socket 的 ops 函数指针集,指向协议类型的。例如创建的是SOCK_STREAM,那么就指向了 inet_stream_ops */
sock->ops = answer->ops;
/* answer_prot 指针指向了当前要创建的 socket 的协议类型下边的协议,如上例,它就是IPPROTO_TCP 的 tcp_prot 结构 */
answer_prot = answer->prot;
answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
BUG_TRAP(answer_prot->slab != NULL);
接下来一个重要的工作,就是为 socket 分配一个sock,并初始化它:
err = -ENOBUFS;
sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
if (sk == NULL)
goto out;
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = 1;
inet = inet_sk(sk);
if (SOCK_RAW == sock->type) {
inet->num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
if (ipv4_config.no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->id = 0;
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
#ifdef INET_REFCNT_DEBUG
atomic_inc(&inet_sock_nr);
#endif
if (inet->num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->sport = htons(inet->num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
虽然 create 的代码就到这儿了,不过要说清楚 sk 的分配,还得费上大力气。每一个 Socket 套接字,都有一个对应的 struct socket 结构来描述(内核中一般使用名称为 sock),但是同时又有一个 struct sock 结构(内核中一般使用名称为 sk)。两者之间是一一对应的关系。在后面的 sock_init_data 函数中,可以看到
sk->sk_socket = sock;
sock->sk = sk;
这样的代码。
socket 结构和 sock 结构实际上是同一个事物的两个方面。不妨说,socket 结构是面向进程和系统调用界面的侧面,而 sock 结构则是面向底层驱动程序的侧面。设计者把 socket 套接字中,与文件系统关系比较密切的那一部分放在 socket 结构中,而把与通信关系比较密切的那一部分,则单独成为一个数据结构,那就是 sock 结构。由于这两部分逻辑上本来就是一体的,所以要通过指针互相指向对方,形成一对一的关系。
再暂时回到 inet_init 中来,初始化工作中,有如下代码:
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
这里为每个 protocol 都调用了 proto_register 函数,其重要功能之一,就是根据协议的 obj_size 成员的大小,为协议创建高速缓存:
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
int proto_register(struct proto *prot, int alloc_slab)
{
int rc = -ENOBUFS;
if (alloc_slab) {
/* 可以看到,函数最重要的功能就是根据 prot 的obj_size 成员的大小,为协议创建高速缓存 */
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
if (prot->slab == NULL) {
printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
prot->name);
goto out;
}
}
/* 顺便看到它的另一个重要的功能,是维护一个以 proto_list 为首的链表 */
write_lock(&proto_list_lock);
list_add(&prot->node, &proto_list);
write_unlock(&proto_list_lock);
rc = 0;
out:
return rc;
}
这里要注意的是:prot->obj_size 的大小,它并非仅仅是一个 sk 的大小!!!以 TCP 为例:.obj_size = sizeof(struct tcp_sock)。稍后再来分析这个。
回到 inet_create()函数中来,其调用 sk_alloc()分配一个 sk:
sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
{
struct sock *sk = NULL;
kmem_cache_t *slab = prot->slab;
if (slab != NULL)
sk = kmem_cache_alloc(slab, priority);
else
sk = kmalloc(prot->obj_size, priority);
if (sk) {
if (zero_it) {
memset(sk, 0, prot->obj_size);
sk->sk_family = family;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
……
在之前创建的高速缓存中申请分配一个 slab 缓存项。并清零。然后设置协议族、并把 sk 中的 sk_prot与对应的协议关联起来。