1. 前言
2. 流程图
2.1 Tcp/Ip 协议层
2.2 代码内部分层结构
3. inet_init源码分析
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
if (!sysctl_local_reserved_ports)
goto out;
//tcp协议注册(传输层)
rc = proto_register(&tcp_prot, 1); //第二个参数为1,表示在高速缓存内部分配空间
if (rc)
goto out_free_reserved_ports;
//udp协议注册(传输层)
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
//raw原始协议注册(传输层)
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
//icmp协议注册(传输层)
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops);
#ifdef CONFIG_SYSCTL
ip_static_sysctl_init();
#endif
tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
/*
* Add all the base protocols.
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
/*
* Set the ARP module up
*/
arp_init();
/*
* Set the IP module up
*/
ip_init();
tcp_v4_init();
/* Setup TCP slab cache for open requests. */
tcp_init();
/* Setup UDP memory threshold */
udp_init();
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
ping_init();
/*
* Set the ICMP layer up
*/
if (icmp_init() < 0)
panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
ipfrag_init();
dev_add_pack(&ip_packet_type);
rc = 0;
out:
return rc;
out_unregister_raw_proto:
proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
out_free_reserved_ports:
kfree(sysctl_local_reserved_ports);
goto out;
}
fs_initcall(inet_init);
先贴出源码,下面逐个分析,为了了解架构,先分析它所涉及到的结构体!
3. 结构体
3.1 struct proto(传输层到网络层协议)
该结构体的操作集是“传输层”到“网络层”的接口
struct proto {
void (*close)(struct sock *sk,
long timeout);
int (*connect)(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
int (*disconnect)(struct sock *sk, int flags);
struct sock * (*accept)(struct sock *sk, int flags, int *err);
int (*ioctl)(struct sock *sk, int cmd,
unsigned long arg);
int (*init)(struct sock *sk);
void (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
int (*setsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
unsigned int optlen);
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
#ifdef CONFIG_COMPAT
int (*compat_setsockopt)(struct sock *sk,
int level,
int optname, char __user *optval,
unsigned int optlen);
int (*compat_getsockopt)(struct sock *sk,
int level,
int optname, char __user *optval,
int __user *option);
int (*compat_ioctl)(struct sock *sk,
unsigned int cmd, unsigned long arg);
#endif
int (*sendmsg)(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len);
int (*recvmsg)(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg,
size_t len, int noblock, int flags,
int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
struct sockaddr *uaddr, int addr_len);
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
void (*release_cb)(struct sock *sk);
void (*mtu_reduced)(struct sock *sk);
/* Keeping track of sk's, looking them up, and port selection methods. */
void (*hash)(struct sock *sk);
void (*unhash)(struct sock *sk);
void (*rehash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
void (*clear_sk)(struct sock *sk, int size);
/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
unsigned int inuse_idx; //表示在proto_inuse_idx数组中占用的一个bit位索引
#endif
/* Memory pressure */
void (*enter_memory_pressure)(struct sock *sk);
atomic_long_t *memory_allocated; /* Current allocated memory. */
struct percpu_counter *sockets_allocated; /* Current number of sockets. */
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
* All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
int *memory_pressure;
long *sysctl_mem;
int *sysctl_wmem;
int *sysctl_rmem;
int max_header;
bool no_autobind;
struct kmem_cache *slab;
unsigned int obj_size;
int slab_flags;
struct percpu_counter *orphan_count;
struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
union {
struct inet_hashinfo *hashinfo;
struct udp_table *udp_table;
struct raw_hashinfo *raw_hash;
} h;
struct module *owner;
char name[32];
struct list_head node;
#ifdef SOCK_REFCNT_DEBUG
atomic_t socks;
#endif
#ifdef CONFIG_MEMCG_KMEM
/*
* cgroup specific init/deinit functions. Called once for all
* protocols that implement it, from cgroups populate function.
* This function has to setup any files the protocol want to
* appear in the kmem cgroup filesystem.
*/
int (*init_cgroup)(struct mem_cgroup *memcg,
struct cgroup_subsys *ss);
void (*destroy_cgroup)(struct mem_cgroup *memcg);
struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg);
#endif
};
对应的源码:
这里以tcp_prot为例:
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.mtu_reduced = tcp_v4_mtu_reduced,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
//tcp协议注册(传输层)
rc = proto_register(&tcp_prot, 1); //第二个参数为1,表示在高速缓存内部分配空间
if (rc)
goto out_free_reserved_ports;
//udp协议注册(传输层)
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
//raw原始协议注册(传输层)
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
//icmp协议注册(传输层)
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
上面接口分别注册了“传输层”-->“网络层”的不同协议(tcp_prot, udp_prot, raw_prot, ping_prot)的接口操作集,它们都是通过以下接口进行协议注册:
int proto_register(struct proto *prot, int alloc_slab)
{
if (alloc_slab) {
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
SLAB_HWCACHE_ALIGN | prot->slab_flags,
NULL);
if (prot->slab == NULL) {
pr_crit("%s: Can't create sock SLAB cache!\n",
prot->name);
goto out;
}
//请求sock操作集
if (prot->rsk_prot != NULL) {
prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
if (prot->rsk_prot->slab_name == NULL)
goto out_free_sock_slab;
prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
prot->rsk_prot->obj_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
if (prot->rsk_prot->slab == NULL) {
pr_crit("%s: Can't create request sock SLAB cache!\n",
prot->name);
goto out_free_request_sock_slab_name;
}
}
//等待sock操作集
if (prot->twsk_prot != NULL) {
prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
if (prot->twsk_prot->twsk_slab_name == NULL)
goto out_free_request_sock_slab;
prot->twsk_prot->twsk_slab =
kmem_cache_create(prot->twsk_prot->twsk_slab_name,
prot->twsk_prot->twsk_obj_size,
0,
SLAB_HWCACHE_ALIGN |
prot->slab_flags,
NULL);
if (prot->twsk_prot->twsk_slab == NULL)
goto out_free_timewait_sock_slab_name;
}
}
mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list); //添加到全局链表proto_list中
assign_proto_idx(prot); //分配一个协议idx
mutex_unlock(&proto_list_mutex);
return 0;
out_free_timewait_sock_slab_name:
kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
if (prot->rsk_prot && prot->rsk_prot->slab) {
kmem_cache_destroy(prot->rsk_prot->slab);
prot->rsk_prot->slab = NULL;
}
out_free_request_sock_slab_name:
if (prot->rsk_prot)
kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
out:
return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
最终这些“传输层”-->“网络层”协议(tcp_prot, udp_prot, raw_prot, ping_prot)都注册到了全局链表“proto_list”中
mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list); //添加到全局链表proto_list中
assign_proto_idx(prot); //分配一个协议idx
mutex_unlock(&proto_list_mutex);
3.2 struct net_proto_family (BSD表示层)
该结构体表示协议族AF_INET, AF_UNIX
struct net_proto_family {
int family; //协议族 AF_INET
//应用层通过socket函数创建socket套接字时调用的接口
int (*create)(struct net *net, struct socket *sock,
int protocol, int kern);
struct module *owner;
};
static const struct net_proto_family inet_family_ops = {
.family = PF_INET, //协议族
.create = inet_create, //应用层创建socket套接字时调用的接口,该函数见下面分析
.owner = THIS_MODULE,
};
(void)sock_register(&inet_family_ops); //sock套接字协议族注册
int sock_register(const struct net_proto_family *ops)
{
int err;
if (ops->family >= NPROTO) {
printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
NPROTO);
return -ENOBUFS;
}
spin_lock(&net_family_lock);
//net_families[]是一个数组,数组成员字段对应不同的协议族
if (rcu_dereference_protected(net_families[ops->family], //
lockdep_is_held(&net_family_lock)))
err = -EEXIST;
else {
rcu_assign_pointer(net_families[ops->family], ops);
err = 0;
}
spin_unlock(&net_family_lock);
printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
return err;
}
EXPORT_SYMBOL(sock_register);
这个协议最终会将协议族添加到全局数组net_families[]中。
最终输出如下信息:
printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
3.3 struct net_protocol(inet层和网络层之间的连接协议)
struct net_protocol {
void (*early_demux)(struct sk_buff *skb);
int (*handler)(struct sk_buff *skb);
void (*err_handler)(struct sk_buff *skb, u32 info);
unsigned int no_policy:1,
netns_ok:1;
};
这个结构体比较简单,但他是inet层和网络层(IP层)之间的连接,所以也很重要。
1. 第一个和第二个参数是查找基于包多路径选路,对于需要打了转包的设备,还是需要关闭这个功能,可以查看这里了解它2. handler表示对应协议包的收包处理函数,当收到一个包的时候,在IP层将会判断这个包的协议,然后根据协议类型调用该结构中的收包函数,进而将包传给传输层处理
3. netns_ok表示是否支持虚拟网络? namespace网络命名空间?
下面列以下该对象的一些实例:
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
.no_policy = 1,
.netns_ok = 1,
};
static const struct net_protocol udp_protocol = {
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
.netns_ok = 1,
};
static const struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.no_policy = 1,
.netns_ok = 1,
};
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
.netns_ok = 1,
};
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
if (!prot->netns_ok) {
pr_err("Protocol %u is not namespace aware, cannot register.\n",
protocol);
return -EINVAL;
}
return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_protocol);
这些实例在inet_init函数中,通过以下代码将不同的协议接收函数添加到inet_protos[protocol] 全局链表中,从而完成IP层和传输层的链接。
3.4 struct inet_protosw
/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
struct list_head list;
/* These two fields form the lookup key. */
/*type和protocol两个域组成了socket的查找key,他们分别对应着应用
层socket函数的第二和第三个参数,通过这两个参数来确定使用的是哪种
socket类型。查找的时候主要是查找type,protocol只是用来防止初始化
好的协议被再次初始化,比如开机的时候已经初始化好了TCP协议,如果
应用层又调用了该协议的初始化函数,将直接退出。*/
unsigned short type; /* This is the 2nd argument to socket(2). */
unsigned short protocol; /* This is the L4 protocol number. */
/*表示的是该协议相关的处理函数,比如tcp_v4_connect和tcp_v4_init_sock等
相关的协议具体处理函数.
对应传输层到网络层协议*/
struct proto *prot;
//不同套接字类型的套接字管理函数集,也就是socket函数的第二个参数指
//定的type,不同的套接字类型有不同的管理函数集
const struct proto_ops *ops;
char no_check; /* checksum on rcv/xmit/none? */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
前面一直疑惑inet_protosw中的sw表示什么意思,后来才知道,原来是switch的缩写,表示inet层的协议切换,inetsw串联着PF_INET地址族所支持的协议类型,比如tcp, udp等。
这个函数起着承上启下的作用,它上层对应的是BSD层,下层对应的是inet层,它在网络协议栈的最上两层扮演着重要的角色。
这个结构体每个注释已经比较清楚了,但是这里还是再讲一下:
1. type和protocol两个域组成了socket的查找key,他们分别对应着应用层socket函数的第二和第三个参数,通过这两个参数来确定使用的是哪种socket类型。查找的时候主要是查找type,protocol只是用来防止初始化好的协议被再次初始化,比如开机的时候已经初始化好了TCP协议,如果应用层又调用了该协议的初始化函数,将直接退出。
2. prot表示的是该协议相关的处理函数,比如tcp_v4_connect和tcp_v4_init_sock等相关的协议具体处理函数
3. ops表示的是该socket类型的套接字的管理函数,比如处理inet_bind和inet_accept等对inet层的套接字调用
prot和ops都是操作函数的集合,非常重要,我们会在后面进一步分析
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
/* Register the socket-side information for inet_create. */
//初始化inet switch 链表
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
//遍历inet switch数组
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;
spin_lock_bh(&inetsw_lock);
if (p->type >= SOCK_MAX)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) {
if (protocol == answer->protocol)
break;
last_perm = lh;
}
answer = NULL;
}
if (answer)
goto out_permanent;
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);
return;
out_permanent:
pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
EXPORT_SYMBOL(inet_register_protosw);
最红会将该协议注册到
inetsw[p->type]全局链表中。