IP层实现1--初始化

IP头部:

[ include/uapi/linux/ip.h  ]

/*
 * Fixed part of the IPv4 header (field layout per RFC 791).
 * ihl and version share the first byte; which one is declared first
 * depends on the bitfield endianness macro that is defined.
 */
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8	ihl:4,	/* header length, in 32-bit words */
		version:4;	/* IP version */
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8	version:4,
  		ihl:4;
#else
#error	"Please fix <asm/byteorder.h>"
#endif
	__u8	tos;	/* Type of Service */
	__be16	tot_len;	/* total packet length in bytes, header included */
	__be16	id;	/* identification; used when handling fragments */
	__be16	frag_off;	/* DF (Don't Fragment) / MF (More Fragments) flags and fragment offset */
	__u8	ttl;	/* time to live */
	__u8	protocol;	/* protocol of the layer above IP */
	__sum16	check;	/* IP header checksum */
	__be32	saddr;	/* source address */
	__be32	daddr;	/* destination address */
	/* The options start here.
	 * (IP options follow the fixed header.)
	 */
};
在IP层之上的协议(TCP,UDP),都要支持socket接口的调用。socket提供了一个结构,用来提供各接口:
[ include/net/sock.h ]
/*
 * Per-protocol table of socket-level callbacks.
 * Only the first two members are reproduced here; the "..." line below
 * marks members that were elided from this excerpt.
 */
struct proto {
	/* close callback: receives the sock and a timeout */
	void			(*close)(struct sock *sk,
					long timeout);
	/* connect callback: receives the sock, an address and its length */
	int			(*connect)(struct sock *sk,
					struct sockaddr *uaddr,
					int addr_len);
         ...
};
然后又提供了一个全局列表,所有支持socket的协议都注册到此列表上:
[ net/core/sock.c ]
/* Global list of registered struct proto; proto_register() links entries in via prot->node. */
static LIST_HEAD(proto_list);
注册函数为:
[ net/core/sock.c ]
/*
 * proto_register - add @prot to the global proto_list.
 * @prot:	protocol descriptor to register
 * @alloc_slab:	when non-zero, also create the slab caches described by
 *		@prot (sock objects, and, if the respective ops pointers are
 *		set, request-sock and timewait-sock objects)
 *
 * Returns 0 on success. If any cache or cache-name allocation fails,
 * everything created so far is torn down in reverse order and -ENOBUFS
 * is returned; @prot is not added to the list in that case.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		/* Create the cache for this protocol's sock objects.
		 * The cache is named after the protocol (prot->name) and each
		 * object is prot->obj_size bytes.
		 */
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		/* Optional request-sock ops: they get their own cache.
		 */
		if (prot->rsk_prot != NULL) {
			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);	/* cache name */
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			/* Create the cache "request_sock_<name>" with objects of
			 * prot->rsk_prot->obj_size bytes.
			 */
			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				pr_crit("%s: Can't create request sock SLAB cache!\n",
					prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		/* Optional timewait-sock ops: they get their own cache too.
		 */
		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);	/* cache name */

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			/* Create the cache "tw_sock_<name>" with objects of
			 * prot->twsk_prot->twsk_obj_size bytes.
			 * Note: unlike the two caches above, a failure here is
			 * not logged.
			 */
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);	/* link the protocol into the global list */
	/* NOTE(review): per the original annotation, assign_proto_idx()
	 * reserves a bit for this protocol in a bitmap and stores the bit
	 * index in prot->inuse_idx, which also indexes a per-CPU in-use
	 * counter. The helper is not visible here - confirm against its
	 * definition.
	 */
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

	/* Error unwinding: each label releases one resource and falls
	 * through to the labels for everything allocated before it.
	 */
out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	/* rsk_prot is optional, so only destroy its cache if it exists */
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	if (prot->rsk_prot)
		kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
对于IP来说,socket对应的family为PF_INET,与之对应的结构为:

[ net/ipv4/af_inet.c ]

/* Family descriptor for PF_INET; handed to sock_register() by inet_init(). */
static const struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,	/* called when a socket is created */
	.owner	= THIS_MODULE,
};
对于socket支持的每个family,都有一个对应的net_proto_family结构。同样,内核提供了一个全局数组:

[ net/socket.c ]

/* Registered families, indexed by ops->family; slots are filled by sock_register(). */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
NPROTO为socket支持的family的总数。用下面的函数进行注册:

[ net/socket.c ]

/*
 * sock_register - install a protocol family in net_families[].
 * @ops: family descriptor; ops->family selects the slot
 *
 * Returns 0 on success, -ENOBUFS if ops->family is not below NPROTO,
 * or -EEXIST if the slot is already occupied.
 *
 * The "Registered protocol family" message is emitted only when the
 * family was actually installed, so the log never claims a registration
 * that was rejected with -EEXIST.
 */
int sock_register(const struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
		       NPROTO);
		return -ENOBUFS;
	}

	/* The slot is checked and filled under net_family_lock. */
	spin_lock(&net_family_lock);
	if (rcu_dereference_protected(net_families[ops->family],
				      lockdep_is_held(&net_family_lock)))
		err = -EEXIST;
	else {
		rcu_assign_pointer(net_families[ops->family], ops);	/* publish ops in the global array */
		err = 0;
	}
	spin_unlock(&net_family_lock);

	if (!err)
		printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return err;
}
EXPORT_SYMBOL(sock_register);

socket 同一个family中有很多类型(如:SOCK_STREAM,SOCK_DGRAM,SOCK_RAW),对IP层来说,这些所有的类型都保存在全局列表中:

[ net/ipv4/af_inet.c ]

/* The inetsw table contains everything that inet_create needs to
 * build a new socket.
 * It holds one list head per socket type (SOCK_MAX of them);
 * inet_register_protosw() inserts entries into inetsw[p->type].
 */
static struct list_head inetsw[SOCK_MAX];
内核提供了一个初始化列表:
[ net/ipv4/af_inet.c ]
/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 * Each entry ties a socket type and protocol number to the struct proto
 * and the ops table used for sockets of that kind.
 */
static struct inet_protosw inetsw_array[] =
{
	{
		/* SOCK_STREAM / IPPROTO_TCP, permanent entry */
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.no_check =   0,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	{
		/* SOCK_DGRAM / IPPROTO_UDP, permanent entry */
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_PERMANENT,
       },

       {
		/* SOCK_DGRAM / IPPROTO_ICMP, served by ping_prot */
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_ICMP,
		.prot =       &ping_prot,
		.ops =        &inet_dgram_ops,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_REUSE,
       },

       {
	       /* SOCK_RAW, protocol left as the IPPROTO_IP wild card */
	       .type =       SOCK_RAW,
	       .protocol =   IPPROTO_IP,	/* wild card */
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .no_check =   UDP_CSUM_DEFAULT,
	       .flags =      INET_PROTOSW_REUSE,
       }
};
调用下面的函数,可将一种socket类型(inet_protosw)注册到全局数组inetsw中对应类型的链表上:

[ net/ipv4/af_inet.c ]

/*
 * inet_register_protosw - link @p into the inetsw[] list for its socket type.
 * @p: entry to register
 *
 * Nothing is inserted (and an error is logged) when p->type is not below
 * SOCK_MAX, or when a permanent entry with the same protocol number is
 * already on the list. The whole operation runs under inetsw_lock.
 */
void inet_register_protosw(struct inet_protosw *p)
{
	struct inet_protosw *entry;
	struct list_head *pos;
	struct list_head *insert_after;
	/* Protocol number (p->protocol), not to be confused with the
	 * socket type in p->type (SOCK_STREAM, SOCK_DGRAM, ...).
	 */
	int protocol = p->protocol;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX) {
		pr_err("Ignoring attempt to register invalid socket type %d\n",
		       p->type);
		goto unlock;
	}

	/* Walk the list for this socket type. Only permanent entries
	 * matter: one with the same protocol number blocks registration,
	 * any other one moves the insertion point forward.
	 */
	insert_after = &inetsw[p->type];
	list_for_each(pos, &inetsw[p->type]) {
		entry = list_entry(pos, struct inet_protosw, list);

		if (!(entry->flags & INET_PROTOSW_PERMANENT))
			continue;

		if (entry->protocol == protocol) {
			/* Permanent protocols cannot be overridden. */
			pr_err("Attempt to override permanent protocol %d\n", protocol);
			goto unlock;
		}

		insert_after = pos;
	}

	/* Insert right after the last permanent entry (or at the head when
	 * there is none), so that the new entry never shadows a permanent
	 * one on a wild-card match but may override existing non-permanent
	 * entries. Removing it later restores the previous behaviour.
	 */
	list_add_rcu(&p->list, insert_after);

unlock:
	spin_unlock_bh(&inetsw_lock);
}

内核支持不同的协议,如UDP,TCP。当数据到达IP层后,要根据上层协议的类型调用不同的接收函数,内核通过下面的方式处理这种情况:

  1. 定义一个结构封装各函数:
    [ include/net/protocol.h ]
    /* This is used to register protocols.
     * One instance per transport protocol is stored in inet_protos[] via
     * inet_add_protocol().
     * NOTE(review): handler is presumably the receive entry point and
     * err_handler the error entry point - confirm against the callers.
     */
    struct net_protocol {
    	void			(*early_demux)(struct sk_buff *skb);
    	int			(*handler)(struct sk_buff *skb);
    	void			(*err_handler)(struct sk_buff *skb, u32 info);
    	unsigned int		no_policy:1,
    				netns_ok:1,	/* must be set or inet_add_protocol() refuses the entry */
    				/* does the protocol do more stringent
    				 * icmp tag validation than simple
    				 * socket lookup?
    				 */
    				icmp_strict_tag_validation:1;
    };
    
  2. 定义一个全局数组(以协议号为下标),所有协议都注册到此数组中
    [ net/ipv4/protocol.c ]
    /* Handlers indexed by IP protocol number; slots are filled by inet_add_protocol(). */
    const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;

    [ include/net/protocol.h ]
    /* This is one larger than the largest protocol value that can be
     * found in an ipv4 or ipv6 header.  Since in both cases the protocol
     * value is presented in a __u8, this is defined to be 256.
     * It sizes the inet_protos[] and inet_offloads[] arrays.
     */
    #define MAX_INET_PROTOS		256

以下函数用来向全局数组inet_protos注册net_protocol结构:

[ net/ipv4/protocol.c ]

/*
 * inet_add_protocol - install @prot in inet_protos[@protocol].
 * @prot:	handler table to register
 * @protocol:	protocol number, used as the array index
 *
 * Returns 0 on success, -EINVAL if @prot does not have netns_ok set,
 * or -1 if the slot was already occupied.
 */
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	const struct net_protocol **slot;

	if (!prot->netns_ok) {
		pr_err("Protocol %u is not namespace aware, cannot register.\n",
			protocol);
		return -EINVAL;
	}

	/* Claim the slot only if it still holds NULL; cmpxchg returns the
	 * previous value, so a non-NULL result means it was already taken.
	 */
	slot = (const struct net_protocol **)&inet_protos[protocol];
	if (cmpxchg(slot, NULL, prot) != NULL)
		return -1;

	return 0;
}
EXPORT_SYMBOL(inet_add_protocol);

所有接收的包都有不同的类型,如IP,802.3,ARP,IPv6等,当接收到不同类型的包后,要调用不同的处理函数,内核通过下面的方式处理这种情况。

  1. 定义一个结构,用来将类型和函数对应起来
    [ include/linux/netdevice.h ]
    /* Binds a packet type to the function that handles packets of that type. */
    struct packet_type {
    	__be16			type;	/* This is really htons(ether_type); the packet type */
    	struct net_device	*dev;	/* NULL is wildcarded here; otherwise the matching network device */
    	/* handler invoked for packets of this type */
    	int			(*func) (struct sk_buff *,
    					 struct net_device *,
    					 struct packet_type *,
    					 struct net_device *);
    	bool			(*id_match)(struct packet_type *ptype,
    					    struct sock *sk);
    	void			*af_packet_priv;
    	struct list_head	list;	/* links the entry into ptype_all or a ptype_base[] bucket */
    };
    
  2. 定义一个全局列表,所有packet_type类型为ETH_P_ALL(接收所有类型的包)的都挂在此列表上
    [ net/core/dev.c ]
    /* Handlers registered with type ETH_P_ALL (see ptype_head()). */
    struct list_head ptype_all __read_mostly;    /* Taps */
  3. 定义一个哈希表,其中的key为包的类型
    [ net/core/dev.c ]
    /* Buckets for all other handlers, indexed by ntohs(type) & PTYPE_HASH_MASK. */
    struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

    [ include/linux/netdevice.h ]
    /*
     *	The list of packet types we will receive (as opposed to discard)
     *	and the routines to invoke.
     *
     *	Why 16. Because with 16 the only overlap we get on a hash of the
     *	low nibble of the protocol value is RARP/SNAP/X.25.
     *
     *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
     *             sure which should go first, but I bet it won't make much
     *             difference if we are running VLANs.  The good news is that
     *             this protocol won't be in the list unless compiled in, so
     *             the average user (w/out VLANs) will not be adversely affected.
     *             --BLG
     *
     *		0800	IP
     *		8100    802.1Q VLAN
     *		0001	802.3
     *		0002	AX.25
     *		0004	802.2
     *		8035	RARP
     *		0005	SNAP
     *		0805	X.25
     *		0806	ARP
     *		8137	IPX
     *		0009	Localtalk
     *		86DD	IPv6
     */
    /* Number of ptype_base[] buckets, and the mask that reduces a packet type to a bucket index. */
    #define PTYPE_HASH_SIZE	(16)
    #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
    

以下函数用来向全局列表注册packet_type类型:

[ net/core/dev.c ]

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

/* Pick the list a packet_type belongs on: a ptype_base[] bucket selected by
 * the low bits of its type, or ptype_all when the type is ETH_P_ALL.
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type != htons(ETH_P_ALL))
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	/* ETH_P_ALL: handler wants every packet type */
	return &ptype_all;
}

/**
 *	dev_add_pack - register a packet handler
 *	@pt: packet type declaration to register
 *
 *	Links @pt onto the list selected by ptype_head(), holding
 *	ptype_lock for the insertion. Because @pt itself becomes part of
 *	the kernel lists, it must stay allocated until it has been
 *	removed from them again.
 *
 *	The call does not sleep, so CPUs that are in the middle of
 *	receiving a packet are not guaranteed to see the new packet type
 *	until the next received packet.
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *bucket;

	/* choose the destination list before taking the lock */
	bucket = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, bucket);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
对于IP包,定义了如下的类型:

[ net/ipv4/af_inet.c ]

/* Link-layer packet type for IPv4: packets of type ETH_P_IP are handed
 * to ip_rcv(). Registered by inet_init() through dev_add_pack().
 */
static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),	/* Internet Protocol packet	*/
	.func = ip_rcv,
};
它会在初始化IP模块时注册到全局列表当中去。其中的ip_rcv就是接收数据包的函数。

下面就可以看IP层的初始化:
[ net/ipv4/af_inet.c ]
/*
 * inet_init - bring up the IPv4 protocol family.
 *
 * Registers the TCP/UDP/RAW/PING struct proto instances, the PF_INET
 * socket family, the per-protocol receive handlers, the socket-type
 * table, the sub-modules (ARP, IP, TCP, UDP, UDP-Lite, ping, ICMP, ...)
 * and finally the ETH_P_IP packet type.
 *
 * Returns 0 on success. Returns -EINVAL if the reserved-ports bitmap
 * cannot be allocated, or the proto_register() error code if one of the
 * four proto registrations fails (earlier registrations are undone).
 * Failures after that point are only logged, except icmp_init(), whose
 * failure panics.
 */
static int __init inet_init(void)
{
	struct inet_protosw *q;
	struct list_head *r;
	int rc = -EINVAL;

	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));

	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);	/* 8192 zero-filled bytes (presumably one bit per port - confirm) */
	if (!sysctl_local_reserved_ports)
		goto out;

	rc = proto_register(&tcp_prot, 1);	/* register the TCP proto */
	if (rc)
		goto out_free_reserved_ports;

	rc = proto_register(&udp_prot, 1);	/* register the UDP proto */
	if (rc)
		goto out_unregister_tcp_proto;

	rc = proto_register(&raw_prot, 1);	/* register the RAW proto */
	if (rc)
		goto out_unregister_udp_proto;

	rc = proto_register(&ping_prot, 1);	/* register the PING proto */
	if (rc)
		goto out_unregister_raw_proto;

	/*
	 *	Tell SOCKET that we are alive...
	 */

	(void)sock_register(&inet_family_ops);	/* register the PF_INET family; the result is deliberately ignored */

#ifdef CONFIG_SYSCTL
	ip_static_sysctl_init();	/* register sysctl entries (routing related, per the original note - confirm) */
#endif

	/*
	 *	Add all the base protocols.
	 *	inet_protos is the global array of supported protocols;
	 *	a failed registration is logged but does not abort init.
	 */

	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)	/* add ICMP */
		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)	/* add UDP */
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)	/* add TCP */
		pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)	/* add IGMP */
		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif

	/* Register the socket-side information for inet_create.
	 * inetsw holds one list per socket type; initialise every list
	 * head before anything is inserted.
	 */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	/* Populate inetsw from the static inetsw_array table.
	 */
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

	/*
	 *	Set the ARP module up
	 */

	arp_init();

	/*
	 *	Set the IP module up
	 */

	ip_init();

	tcp_v4_init();

	/* Setup TCP slab cache for open requests. */
	tcp_init();

	/* Setup UDP memory threshold */
	udp_init();

	/* Add UDP-Lite (RFC 3828) */
	udplite4_register();

	ping_init();

	/*
	 *	Set the ICMP layer up
	 */

	if (icmp_init() < 0)
		panic("Failed to create the ICMP control socket.\n");

	/*
	 *	Initialise the multicast router
	 */
#if defined(CONFIG_IP_MROUTE)
	if (ip_mr_init())
		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
	/*
	 *	Initialise per-cpu ipv4 mibs
	 */

	if (init_ipv4_mibs())
		pr_crit("%s: Cannot init ipv4 mibs\n", __func__);

	ipv4_proc_init();

	ipfrag_init();

	dev_add_pack(&ip_packet_type);	/* register the ETH_P_IP packet type so ip_rcv is used for it */

	rc = 0;
out:
	return rc;
	/* Error unwinding: undo the proto registrations in reverse order,
	 * then free the reserved-ports bitmap.
	 */
out_unregister_raw_proto:
	proto_unregister(&raw_prot);
out_unregister_udp_proto:
	proto_unregister(&udp_prot);
out_unregister_tcp_proto:
	proto_unregister(&tcp_prot);
out_free_reserved_ports:
	kfree(sysctl_local_reserved_ports);
	goto out;
}

fs_initcall(inet_init);	/* run inet_init during system initialisation */

为提高接收和发送的效率,尤其是在大负载下的效率,内核作了特别的处理

[ net/ipv4/af_inet.c ]

/*
 * ipv4_offload_init - register the IPv4 offload handlers.
 *
 * Registers the UDP and TCP offloads, the ETH_P_IP packet offload and
 * the IPIP net_offload. A failed registration is logged and does not
 * abort initialisation; the function always returns 0.
 */
static int __init ipv4_offload_init(void)
{
	/*
	 * Add offloads
	 */
	if (udpv4_offload_init() < 0)
		pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
	if (tcpv4_offload_init() < 0)
		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);

	dev_add_offload(&ip_packet_offload);
	/* inet_add_offload() returns -1 when the slot is already taken;
	 * report that the same way as the UDP/TCP failures above instead
	 * of dropping it silently.
	 */
	if (inet_add_offload(&ipip_offload, IPPROTO_IPIP) < 0)
		pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
	return 0;
}

fs_initcall(ipv4_offload_init);	/* run ipv4_offload_init during kernel initialisation */
所有接收的包都有不同的类型,如IP,802.3,ARP,IPv6等,每种类型都有对应的packet_offload类型。其中IP层处理的数据包的类型是ETH_P_IP,对应结构为ip_packet_offload

[ net/ipv4/af_inet.c ]

/*
 *	Packet offload descriptor for ETH_P_IP: supplies the GSO and GRO
 *	callbacks. Registered by ipv4_offload_init() via dev_add_offload().
 */

static struct packet_offload ip_packet_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.callbacks = {
		.gso_send_check = inet_gso_send_check,
		.gso_segment = inet_gso_segment,
		.gro_receive = inet_gro_receive,
		.gro_complete = inet_gro_complete,
	},
};
内核声明了一个全局链表offload_base,通过下面的函数将packet_offload注册到该链表中:

[ net/core/dev.c ]

/* Global list of registered packet_offload entries; dev_add_offload() inserts into it. */
static struct list_head offload_base __read_mostly;

/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration to register
 *
 *	Inserts @po into the global offload_base list while holding
 *	offload_lock. Because @po itself becomes part of the kernel
 *	lists, it must stay allocated until it has been removed from
 *	them again.
 *
 *	The call does not sleep, so CPUs that are in the middle of
 *	receiving a packet are not guaranteed to see the new offload
 *	handlers until the next received packet.
 */
void dev_add_offload(struct packet_offload *po)
{
	spin_lock(&offload_lock);
	list_add_rcu(&po->list, &offload_base);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
上面提到内核为支持不同的协议,如UDP,TCP,申明了全局数组inet_protos,内核用相同的方法处理大负载:

[ net/ipv4/protocol.c ]

/* Offload handlers indexed by IP protocol number; slots are filled by inet_add_offload(). */
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;

/*
 * inet_add_offload - install @prot in inet_offloads[@protocol].
 * @prot:	offload callbacks to register
 * @protocol:	protocol number, used as the array index
 *
 * Returns 0 on success, or -1 if the slot was already occupied.
 */
int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
{
	const struct net_offload **slot;

	/* Claim the slot only if it still holds NULL; cmpxchg returns the
	 * previous value, so a non-NULL result means it was already taken.
	 */
	slot = (const struct net_offload **)&inet_offloads[protocol];
	if (cmpxchg(slot, NULL, prot) != NULL)
		return -1;

	return 0;
}
EXPORT_SYMBOL(inet_add_offload);
这样就把不同协议的net_offload结构注册到了全局数组inet_offloads中。IP层(IPPROTO_IPIP)注册的net_offload结构为:

[ net/ipv4/af_inet.c ]

/* net_offload registered for IPPROTO_IPIP by ipv4_offload_init(); it
 * provides GSO callbacks only (no GRO callbacks are set).
 */
static const struct net_offload ipip_offload = {
	.callbacks = {
		.gso_send_check = inet_gso_send_check,
		.gso_segment	= inet_gso_segment,
	},
};

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值