IP头部:
[ include/uapi/linux/ip.h ]
/* IPv4 header layout; any IP options follow the fixed fields below. */
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 ihl:4, // header length, in 32-bit words
version:4; // IP version (4 for an IPv4 header)
#elif defined (__BIG_ENDIAN_BITFIELD)
__u8 version:4,
ihl:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
__u8 tos; // Type of Service; rarely used
__be16 tot_len; // total packet length in bytes, header included
__be16 id; // identification; central to fragment handling
__be16 frag_off; // DF (Don't Fragment) and MF (More Fragments) flags plus the fragment offset
__u8 ttl; // time to live (the original note cites 64 as the default)
__u8 protocol; // protocol of the layer above IP
__sum16 check; // checksum of the IP header
__be32 saddr; // source address
__be32 daddr; // destination address
/*The options start here.
* (IP options, if present.)
*/
};
在IP层之上的协议(TCP,UDP),都要支持socket接口的调用。socket提供了一个结构,用来提供各接口:
[ include/net/sock.h ]
/* Per-protocol operations table used by the socket layer.
 * Excerpt only: the "..." below stands for members omitted from these notes.
 */
struct proto {
/* close hook: takes the sock and a timeout value */
void (*close)(struct sock *sk,
long timeout);
/* connect hook: takes the sock, a peer address and that address's length */
int (*connect)(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
...
};
然后又提供了一个全局列表,所有支持socket的协议都注册到此列表上:
[ net/core/sock.c ]
/* Global list head; proto_register() links each struct proto onto it through prot->node. */
static LIST_HEAD(proto_list);
注册函数为:
[ net/core/sock.c ]
/*
 * proto_register - add a protocol to the global proto_list.
 * @prot: protocol descriptor to register
 * @alloc_slab: when non-zero, also create the slab caches described by @prot
 *
 * Returns 0 on success. If a cache or a cache name cannot be allocated, the
 * pieces created so far are released in reverse order and -ENOBUFS is
 * returned.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
if (alloc_slab) {
/* Create the main cache, named after the protocol (e.g. "TCP") and
 * sized by prot->obj_size (e.g. sizeof(struct tcp_sock) for TCP).
 */
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
SLAB_HWCACHE_ALIGN | prot->slab_flags,
NULL);
if (prot->slab == NULL) {
pr_crit("%s: Can't create sock SLAB cache!\n",
prot->name);
goto out;
}
/* Optional request-sock cache.
 * NOTE(review): the original note describes rsk_prot as the socket's
 * response (ack) handling; confirm against the request_sock definitions.
 */
if (prot->rsk_prot != NULL) {
prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); // cache name
if (prot->rsk_prot->slab_name == NULL)
goto out_free_sock_slab;
/* Create the cache (e.g. "request_sock_TCP"), sized by
 * rsk_prot->obj_size (e.g. sizeof(struct tcp_request_sock) for TCP).
 */
prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
prot->rsk_prot->obj_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
if (prot->rsk_prot->slab == NULL) {
pr_crit("%s: Can't create request sock SLAB cache!\n",
prot->name);
goto out_free_request_sock_slab_name;
}
}
/* Optional cache for sockets in the TIMEWAIT state.
 */
if (prot->twsk_prot != NULL) {
prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); // cache name
if (prot->twsk_prot->twsk_slab_name == NULL)
goto out_free_request_sock_slab;
/* Create the cache (e.g. "tw_sock_TCP"), sized by twsk_obj_size
 * (e.g. sizeof(struct tcp_timewait_sock) for TCP).
 */
prot->twsk_prot->twsk_slab =
kmem_cache_create(prot->twsk_prot->twsk_slab_name,
prot->twsk_prot->twsk_obj_size,
0,
SLAB_HWCACHE_ALIGN |
prot->slab_flags,
NULL);
if (prot->twsk_prot->twsk_slab == NULL)
goto out_free_timewait_sock_slab_name;
}
}
mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list); // link the protocol onto the global list
/* assign_proto_idx() is defined elsewhere. Per the original note it stores
 * in prot->inuse_idx the index of this protocol's bit in an "in use"
 * bitmap, and the same index selects a per-CPU counter of how many
 * sockets currently use the protocol -- TODO confirm in net/core/sock.c.
 */
assign_proto_idx(prot);
mutex_unlock(&proto_list_mutex);
return 0;
/* Error unwinding: each label releases one resource, then falls through to
 * the labels for the resources acquired before it.
 */
out_free_timewait_sock_slab_name:
kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
if (prot->rsk_prot && prot->rsk_prot->slab) {
kmem_cache_destroy(prot->rsk_prot->slab);
prot->rsk_prot->slab = NULL;
}
out_free_request_sock_slab_name:
if (prot->rsk_prot)
kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
out:
return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
对于IP来说,socket对应的family为PF_INET,与之对应的结构为:
[ net/ipv4/af_inet.c ]
/* Socket-family descriptor for PF_INET; passed to sock_register() by inet_init(). */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create, // socket-creation hook for this family
.owner = THIS_MODULE,
};
对于socket支持的每个family,都有一个对应的net_proto_family结构。同样,内核提供一个全局数组:
[ net/socket.c ]
/* Registered socket families, indexed by family number; sock_register() publishes entries with rcu_assign_pointer(). */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
NPROTO为socket支持的family的总数。用下面的函数进行注册:
[ net/socket.c ]
/*
 * sock_register - install a socket family in net_families[].
 * @ops: family descriptor; ops->family selects the slot
 *
 * Returns 0 when the slot was free and has been filled, -EEXIST when the
 * slot is already occupied, and -ENOBUFS when ops->family is out of range.
 * The "Registered protocol family" line is printed for every in-range
 * family, whether or not the slot was actually taken.
 */
int sock_register(const struct net_proto_family *ops)
{
	int rc = -EEXIST;

	if (ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
		       NPROTO);
		return -ENOBUFS;
	}

	spin_lock(&net_family_lock);
	if (!rcu_dereference_protected(net_families[ops->family],
				       lockdep_is_held(&net_family_lock))) {
		/* Slot is free: publish ops in the global table. */
		rcu_assign_pointer(net_families[ops->family], ops);
		rc = 0;
	}
	spin_unlock(&net_family_lock);

	printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return rc;
}
EXPORT_SYMBOL(sock_register);
同一个family的socket有很多类型(如:SOCK_STREAM,SOCK_DGRAM,SOCK_RAW)。对IP层来说,所有这些类型都保存在一个全局数组中,数组以socket类型为下标,每个元素是一个列表:
[ net/ipv4/af_inet.c ]
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
* One list head per socket type; inet_register_protosw() adds entries to
* inetsw[p->type].
*/
static struct list_head inetsw[SOCK_MAX];
内核提供了一个初始化列表:
[ net/ipv4/af_inet.c ]
/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
* Each entry ties a socket type and IP protocol to its struct proto,
* its socket-level ops table and its flags.
*/
static struct inet_protosw inetsw_array[] =
{
/* Stream sockets over TCP. */
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
/* Datagram sockets over UDP. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
/* Datagram sockets over ICMP (ping_prot); not marked permanent. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
/* Raw sockets; IPPROTO_IP acts as a wild card here. */
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
将socket类型注册到全局列表数组inetsw(启动时由inetsw_array中的元素填充),调用下面函数:
[ net/ipv4/af_inet.c ]
/*
 * inet_register_protosw - link a socket-type entry into inetsw[p->type].
 * @p: entry to register
 *
 * Note that p->protocol is the IP protocol (TCP, UDP, ...) while p->type is
 * the socket type (SOCK_STREAM, SOCK_DGRAM, ...); the two must not be
 * confused.
 *
 * The request is logged and ignored when p->type is out of range or when a
 * permanent entry for the same protocol is already on the list.
 */
void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *pos;
	struct list_head *insert_after;
	struct inet_protosw *clash = NULL;
	int protocol = p->protocol;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX) {
		pr_err("Ignoring attempt to register invalid socket type %d\n",
		       p->type);
		goto unlock;
	}

	/* If we are trying to override a permanent protocol, bail.
	 * While scanning, remember the last permanent entry seen; with none,
	 * the insertion point is the list head itself.
	 */
	insert_after = &inetsw[p->type];
	list_for_each(pos, &inetsw[p->type]) {
		struct inet_protosw *cur =
			list_entry(pos, struct inet_protosw, list);

		/* Only permanent (unremovable) entries take part in the check. */
		if (!(cur->flags & INET_PROTOSW_PERMANENT))
			continue;

		/* Check only the non-wild match. */
		if (cur->protocol == protocol) {
			clash = cur;
			break;
		}
		insert_after = pos;
	}

	if (clash) {
		pr_err("Attempt to override permanent protocol %d\n", protocol);
		goto unlock;
	}

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry. This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	list_add_rcu(&p->list, insert_after);

unlock:
	spin_unlock_bh(&inetsw_lock);
}
内核支持不同的协议,如UDP,TCP。当数据到达IP层后,要根据上层协议的类型调用不同的接收函数,内核通过下面的方式处理这种情况:
- 定义一个结构封装各函数:
[ include/net/protocol.h ]
/* This is used to register protocols: the per-protocol callbacks and flags
 * stored in inet_protos[].
 */
struct net_protocol {
	void		(*early_demux)(struct sk_buff *skb);
	int		(*handler)(struct sk_buff *skb);
	void		(*err_handler)(struct sk_buff *skb, u32 info);
	unsigned int	no_policy:1,
			netns_ok:1,
			/* does the protocol do more stringent
			 * icmp tag validation than simple
			 * socket lookup?
			 */
			icmp_strict_tag_validation:1;
};
- 定义一个全局列表,所有协议都注册到此列表
[ net/ipv4/protocol.c ]
/* Protocol handler table indexed by IP protocol number; inet_add_protocol() fills empty slots. */
const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
[ include/net/protocol.h ]
/* This is one larger than the largest protocol value that can be
 * found in an ipv4 or ipv6 header.  Since in both cases the protocol
 * value is presented in a __u8, this is defined to be 256.
 */
#define MAX_INET_PROTOS 256
以下函数用来向全局列表注册net_protocol类型:
[ net/ipv4/protocol.c ]
/*
 * inet_add_protocol - install a protocol handler in inet_protos[].
 * @prot: handler descriptor
 * @protocol: IP protocol number, used as the table index
 *
 * Returns 0 when the slot was empty and now holds @prot, -1 when the slot
 * was already taken, and -EINVAL when @prot is not namespace aware.
 */
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	const struct net_protocol **slot;

	if (!prot->netns_ok) {
		pr_err("Protocol %u is not namespace aware, cannot register.\n",
		       protocol);
		return -EINVAL;
	}

	/* Claim the slot only if it is still NULL. */
	slot = (const struct net_protocol **)&inet_protos[protocol];
	if (cmpxchg(slot, NULL, prot) != NULL)
		return -1;

	return 0;
}
EXPORT_SYMBOL(inet_add_protocol);
所有接收的包都有不同的类型,如IP,802.3,ARP,IPv6等,当接收到不同类型的包后,要调用不同的处理函数,内核通过下面的方式处理这种情况。
- 定义一个结构,用来将类型和函数对应起来
[ include/linux/netdevice.h ]
/* Binds a link-layer packet type to the function that receives it. */
struct packet_type {
	__be16			type;	/* This is really htons(ether_type): the packet type. */
	struct net_device	*dev;	/* NULL is wildcarded here; otherwise the matching device. */
	int			(*func) (struct sk_buff *,
					 struct net_device *,
					 struct packet_type *,
					 struct net_device *);
	bool			(*id_match)(struct packet_type *ptype,
					    struct sock *sk);
	void			*af_packet_priv;
	struct list_head	list;
};
- 定义一个全局列表,所有packet_type类型为ETH_P_ALL(接收所有类型的包)的都挂在此列表上
[ net/core/dev.c ]
struct list_head ptype_all __read_mostly; /* Taps */
- 定义一个哈希表,其中的key为包的类型
[ net/core/dev.c ]
/* Hash buckets of packet handlers; ptype_head() selects a bucket with ntohs(type) & PTYPE_HASH_MASK. */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
[ include/linux/netdevice.h ]
/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *             sure which should go first, but I bet it won't make much
 *             difference if we are running VLANs.  The good news is that
 *             this protocol won't be in the list unless compiled in, so
 *             the average user (w/out VLANs) will not be adversely affected.
 *             --BLG
 *
 *		0800	IP
 *		8100    802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */
/* Each directive must sit on its own line; otherwise the second #define
 * becomes part of the first macro's replacement text.
 */
#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
以下函数用来向全局列表注册packet_type类型:
[ net/core/dev.c ]
/*
 *	Select the list a packet handler belongs on. Now that the input
 *	handler is smarter, none of the messy bookkeeping that used to live
 *	here is needed.
 *
 *	ETH_P_ALL handlers (those receiving every packet type) go on
 *	ptype_all; all others go in the ptype_base bucket chosen by the
 *	host-order type masked with PTYPE_HASH_MASK.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets MUST be last
 *	in their hash bucket, and checking protocol handlers MUST start from
 *	the promiscuous ptype_all chain in net_bh. It is true now, do not
 *	change it. Explanation: if a mangling handler were first on the
 *	list, it could not sense that the packet is cloned and should be
 *	copied-on-write, so it would change it and subsequent readers would
 *	get a broken packet.
 *							--ANK (980803)
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type != htons(ETH_P_ALL))
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	return &ptype_all;
}
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Register a protocol handler with the networking stack. @pt is linked
 *	into kernel lists and must not be freed until it has been removed
 *	from them again.
 *
 *	This call does not sleep, so it cannot guarantee that CPUs which are
 *	in the middle of receiving packets will see the new packet type
 *	(until the next received packet).
 */
void dev_add_pack(struct packet_type *pt)
{
	struct list_head *bucket;

	/* Work out which list this handler belongs on. */
	bucket = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, bucket);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
对于IP包,定义了如下的类型:
[ net/ipv4/af_inet.c ]
/* Link-layer packet type for IPv4: packets of type ETH_P_IP are handed to
 * ip_rcv(). inet_init() registers it with dev_add_pack().
 */
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP), /* Internet Protocol packet */
.func = ip_rcv,
};
它会在初始化IP模块时注册到全局列表当中去。其中的ip_rcv就是接收数据包的函数。
下面就可以看IP层的初始化:
[ net/ipv4/af_inet.c ]
/*
 * inet_init - bring up the IPv4 stack; registered below as an fs_initcall.
 *
 * Registers the TCP/UDP/RAW/PING struct proto entries, the PF_INET socket
 * family, the per-protocol receive handlers and the socket-type table, then
 * initialises the individual sub-modules and finally hooks ip_rcv() up to
 * ETH_P_IP packets.
 *
 * Returns 0 on success. A failure while allocating the reserved-ports bitmap
 * or in any proto_register() call unwinds the earlier steps and returns the
 * error; failures in the later steps are only logged (or panic, for ICMP).
 */
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
/* Compile-time check: inet_skb_parm must fit in the cb field of sk_buff. */
BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); // allocate 8192 zero-filled bytes (65536 / 8)
if (!sysctl_local_reserved_ports)
goto out;
rc = proto_register(&tcp_prot, 1); // register the TCP interface
if (rc)
goto out_free_reserved_ports;
rc = proto_register(&udp_prot, 1); // register the UDP interface
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1); // register the RAW interface
if (rc)
goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1); // register the PING interface
if (rc)
goto out_unregister_raw_proto;
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops); // register the PF_INET family with the socket layer
#ifdef CONFIG_SYSCTL
ip_static_sysctl_init(); // register sysctl entries (routing related, per the original note)
#endif
/*
* Add all the base protocols.
* inet_protos is the global array of supported protocol handlers.
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) // add ICMP
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) // add UDP
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) // add TCP
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) // add IGMP
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create.
* inetsw holds one list per socket type; initialise every list head
* before anything is registered on it.
*/
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
/* Populate inetsw from inetsw_array.
*/
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
/*
* Set the ARP module up
*/
arp_init();
/*
* Set the IP module up
*/
ip_init();
tcp_v4_init();
/* Setup TCP slab cache for open requests. */
tcp_init();
/* Setup UDP memory threshold */
udp_init();
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
ping_init();
/*
* Set the ICMP layer up
*/
if (icmp_init() < 0)
panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
ipfrag_init();
dev_add_pack(&ip_packet_type); // register the ETH_P_IP packet type
rc = 0;
out:
return rc;
/* Error unwinding: undo the registrations in reverse order. */
out_unregister_raw_proto:
proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
out_free_reserved_ports:
kfree(sysctl_local_reserved_ports);
goto out;
}
fs_initcall(inet_init); // run inet_init during system initialisation
为提高接收和发送的效率,尤其是在大负载下的效率,内核作了特别的处理
[ net/ipv4/af_inet.c ]
/*
 * ipv4_offload_init - register the IPv4 offload handlers.
 *
 * Failures of the UDP and TCP offload registrations are logged but do not
 * stop initialisation; the function always returns 0. Registered below as
 * an fs_initcall.
 */
static int __init ipv4_offload_init(void)
{
	int rc;

	/*
	 * Add offloads
	 */
	rc = udpv4_offload_init();
	if (rc < 0)
		pr_crit("%s: Cannot add UDP protocol offload\n", __func__);

	rc = tcpv4_offload_init();
	if (rc < 0)
		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);

	dev_add_offload(&ip_packet_offload);
	inet_add_offload(&ipip_offload, IPPROTO_IPIP);

	return 0;
}

fs_initcall(ipv4_offload_init); /* run ipv4_offload_init during kernel initialisation */
所有接收的包都有不同的类型,如IP,802.3,ARP,IPv6等,每种类型都有对应的packet_offload类型。其中IP层处理的数据包的类型是ETH_P_IP,对应结构为ip_packet_offload
[ net/ipv4/af_inet.c ]
/*
* IP protocol layer initialiser
* Offload descriptor for ETH_P_IP packets: supplies the GSO and GRO
* callbacks. ipv4_offload_init() registers it with dev_add_offload().
*/
static struct packet_offload ip_packet_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.callbacks = {
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
},
};
内核声明一个全局列表offload_base,通过下面的函数将packet_offload注册到该列表中:
[ net/core/dev.c ]
/* Global list head onto which dev_add_offload() links packet_offload entries. */
static struct list_head offload_base __read_mostly;
/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Register protocol offload handlers with the networking stack. The
 *	passed &packet_offload is linked onto the global offload_base list
 *	and must not be freed until it has been removed from that list.
 *
 *	This call does not sleep, so it cannot guarantee that CPUs which are
 *	in the middle of receiving packets will see the new offload handlers
 *	(until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	spin_lock(&offload_lock);
	list_add_rcu(&po->list, &offload_base);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
上面提到,内核为支持不同的协议(如UDP,TCP)声明了全局数组inet_protos;对于offload,内核用相同的方法声明了全局数组inet_offloads,并提供对应的注册函数:
[ net/ipv4/protocol.c ]
/* Offload handler table indexed by IP protocol number; inet_add_offload() fills empty slots. */
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
/*
 * inet_add_offload - install an offload handler in inet_offloads[].
 * @prot: offload descriptor
 * @protocol: IP protocol number, used as the table index
 *
 * Returns 0 when the slot was empty and now holds @prot, -1 when the slot
 * was already taken.
 */
int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
{
	const struct net_offload **slot =
		(const struct net_offload **)&inet_offloads[protocol];

	/* Claim the slot only if it is still NULL. */
	if (cmpxchg(slot, NULL, prot) != NULL)
		return -1;

	return 0;
}
EXPORT_SYMBOL(inet_add_offload);
这样就把不同协议的net_offload结构注册到了全局数组inet_offloads中。IP层自身以IPPROTO_IPIP注册的net_offload结构为:
[ net/ipv4/af_inet.c ]
/* Offload descriptor registered for IPPROTO_IPIP by ipv4_offload_init();
 * it supplies only the GSO callbacks.
 */
static const struct net_offload ipip_offload = {
.callbacks = {
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
},
};