linux内核 邻居子系统

基于linux2.4.0分析。

数据结构:


/*
 * Neighbour table: one instance per protocol (arp_tbl is the ARP instance).
 * It owns the hash buckets of neighbour entries together with the default
 * parameters, garbage-collection settings, timers and statistics that the
 * entries of this table share.
 */
struct neigh_table
{//neighbour table
	struct neigh_table	*next;//next table on the global neigh_tables list
	int			family;//address family (AF_INET for arp_tbl)
	int			entry_size;//size of one entry: struct neighbour plus the trailing key bytes
	int			key_len;//length of the lookup key in bytes (4 for an IPv4 address)
	__u32			(*hash)(const void *pkey, const struct net_device *);//hash function mapping (key, device) to a bucket index
	int			(*constructor)(struct neighbour *);//protocol-specific setup, called by neigh_create on a new entry
	int			(*pconstructor)(struct pneigh_entry *);//constructor for pneigh_entry objects (NULL for arp_tbl)
	void			(*pdestructor)(struct pneigh_entry *);//destructor for pneigh_entry objects (NULL for arp_tbl)
	void			(*proxy_redo)(struct sk_buff *skb);//handler for proxied packets (parp_redo for arp_tbl)
	char			*id;//table name; also passed as the slab cache name
	struct neigh_parms	parms;//default neighbour parameters of this table
	/* HACK. gc_* should follow parms without a gap! */
	int			gc_interval;//garbage-collection interval
	int			gc_thresh1;//lowest gc threshold
	int			gc_thresh2;//middle gc threshold: neigh_alloc forces gc above it if the last flush is older than 5*HZ
	int			gc_thresh3;//highest gc threshold: neigh_alloc always forces gc above it
	unsigned long		last_flush;//jiffies value compared by neigh_alloc before forcing gc
	struct timer_list 	gc_timer;//periodic timer, handler is neigh_periodic_timer
	struct timer_list 	proxy_timer;//proxy timer, handler is neigh_proxy_process
	struct sk_buff_head	proxy_queue;//queue of packets awaiting proxy processing
	int			entries;//number of allocated neighbour entries
	rwlock_t		lock;//read/write lock protecting the hash buckets
	unsigned long		last_rand;//NOTE(review): presumably when reachable_time was last re-randomised - confirm
	struct neigh_parms	*parms_list;//NOTE(review): presumably a list of additional neigh_parms - confirm
	kmem_cache_t		*kmem_cachep;//slab cache that neighbour entries are allocated from
	struct tasklet_struct	gc_task;//tasklet initialised only under CONFIG_SMP for neigh_periodic_timer
	struct neigh_statistics	stats;//table statistics (e.g. allocs)
	struct neighbour	*hash_buckets[NEIGH_HASHMASK+1];//hash buckets of neighbour entries
	struct pneigh_entry	*phash_buckets[PNEIGH_HASHMASK+1];//hash buckets of pneigh_entry objects
};


/*
 * The IPv4/ARP neighbour table. The initializer is positional, so the
 * values must stay in the field order of struct neigh_table.
 */
struct neigh_table arp_tbl =
{
	NULL,//next
	AF_INET,//family
	sizeof(struct neighbour) + 4,//entry_size: the entry plus a 4-byte IPv4 key
	4,//key_len: length of an IPv4 address
	arp_hash,//hash
	arp_constructor,//constructor
	NULL,//pconstructor
	NULL,//pdestructor
	parp_redo,//proxy_redo
	"arp_cache",//id
	/* parms: positional values for struct neigh_parms, whose definition is
	   not shown here - NOTE(review): confirm the field order before editing */
        { NULL, NULL, &arp_tbl, 0, NULL, NULL,
		  30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ },
	30*HZ, 128, 512, 1024,//gc_interval, gc_thresh1, gc_thresh2, gc_thresh3
};

接下来看邻居数据结构struct neighbour

/*
 * One neighbour entry: the mapping from a protocol address (the key stored
 * in primary_key) on a given device to a link-layer address, together with
 * its resolution state, timer and transmit hooks.
 */
struct neighbour
{
	struct neighbour	*next;//next entry in the same hash bucket
	struct neigh_table	*tbl;//table this entry belongs to
	struct neigh_parms	*parms;//parameters in effect for this entry
	struct net_device		*dev;//network device the neighbour is reached through
	unsigned long		used;//jiffies of last use (set by neigh_event_send)
	unsigned long		confirmed;//jiffies of last reachability confirmation
	unsigned long		updated;//jiffies of last update
	__u8			flags;//flag bits
	__u8			nud_state;//NUD_* state bits
	__u8			type;//address type (result of inet_addr_type for ARP)
	__u8			dead;//1 while the entry is not linked into the hash table, 0 once linked
	atomic_t		probes;//probe counter (read by arp_solicit)
	rwlock_t		lock;//read/write lock protecting this entry
	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];//link-layer (MAC) address
	struct hh_cache		*hh;//cached link-layer headers, used to speed up transmission
	atomic_t		refcnt;//reference count
	int			(*output)(struct sk_buff *skb);//current transmit function
	struct sk_buff_head	arp_queue;//packets queued while the address is being resolved
	struct timer_list	timer;//per-entry timer (handler is neigh_timer_handler)
	struct neigh_ops	*ops;//operations table
	u8			primary_key[0];//lookup key stored right after the struct (key_len bytes, e.g. the next-hop IPv4 address)
};

这个结构包括了许多状态字段以及网络层的虚拟函数和接口,也包括定时器队列和链路层的头部信息。

邻居子系统的初始化

在inet_init函数中,调用了arp_init函数。

inet_init=>arp_init

/*
 * Boot-time initialisation of ARP: set up the ARP neighbour table, register
 * the ARP packet type, create the "arp" proc entry and, when sysctl support
 * is configured, register the neighbour sysctl entries.
 */
void __init arp_init (void)
{
	neigh_table_init(&arp_tbl);//initialise arp_tbl and link it into the global neigh_tables list

	dev_add_pack(&arp_packet_type);//register the ARP packet type so received ARP frames reach arp_rcv

	proc_net_create ("arp", 0, arp_get_info);//create the "arp" proc entry backed by arp_get_info

#ifdef CONFIG_SYSCTL
	neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4");
#endif
}

inet_init=>arp_init=>neigh_table_init

void neigh_table_init(struct neigh_table *tbl)
{
	unsigned long now = jiffies;

	tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time);

	if (tbl->kmem_cachep == NULL)
		tbl->kmem_cachep = kmem_cache_create(tbl->id,
						     (tbl->entry_size+15)&~15,
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);//创建缓冲

#ifdef CONFIG_SMP
	tasklet_init(&tbl->gc_task, SMP_TIMER_NAME(neigh_periodic_timer), (unsigned long)tbl);
#endif
	init_timer(&tbl->gc_timer);//初始化定时器
	tbl->lock = RW_LOCK_UNLOCKED;
	tbl->gc_timer.data = (unsigned long)tbl;
	tbl->gc_timer.function = neigh_periodic_timer;//定时任务回调函数
	tbl->gc_timer.expires = now + tbl->gc_interval + tbl->parms.reachable_time;
	add_timer(&tbl->gc_timer);

	init_timer(&tbl->proxy_timer);
	tbl->proxy_timer.data = (unsigned long)tbl;
	tbl->proxy_timer.function = neigh_proxy_process;
	skb_queue_head_init(&tbl->proxy_queue);

	tbl->last_flush = now;
	tbl->last_rand = now + tbl->parms.reachable_time*20;
	write_lock(&neigh_tbl_lock);
	tbl->next = neigh_tables;
	neigh_tables = tbl;//插入链表,头插
	write_unlock(&neigh_tbl_lock);
}

在创建路由表rt_intern_hash函数中,对新建的路由项查找邻居结构,这是调用arp_bind_neighbour函数完成的。

rt_intern_hash=>arp_bind_neighbour

/*
 * Attach a neighbour entry to a route cache entry.
 *
 * Returns 0 when dst already has a neighbour or one was found/created and
 * stored in dst->neighbour, -EINVAL when dst has no device, or the error
 * encoded in the pointer returned by __neigh_lookup_errno.
 */
int arp_bind_neighbour(struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	struct neigh_table *table = &arp_tbl;
	struct neighbour *neigh;
	u32 key;

	if (!dev)
		return -EINVAL;
	if (dst->neighbour)
		return 0;	/* already bound */

	/* loopback and point-to-point devices use key 0 instead of the gateway */
	key = ((struct rtable*)dst)->rt_gateway;
	if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT))
		key = 0;

#ifdef CONFIG_ATM_CLIP
	if (dev->type == ARPHRD_ATM)
		table = &clip_tbl;
#endif

	neigh = __neigh_lookup_errno(table, &key, dev);	/* look up, create if missing */
	if (IS_ERR(neigh))
		return PTR_ERR(neigh);

	dst->neighbour = neigh;
	return 0;
}

rt_intern_hash=>arp_bind_neighbour=>__neigh_lookup_errno

/*
 * Return the neighbour entry for (pkey, dev) in tbl, creating it when the
 * lookup finds nothing. The result of neigh_create is passed through
 * unchanged, so it may be an ERR_PTR-encoded error.
 */
static inline struct neighbour *
__neigh_lookup_errno(struct neigh_table *tbl, const void *pkey,
  struct net_device *dev)
{
	struct neighbour *found = neigh_lookup(tbl, pkey, dev);

	return found ? found : neigh_create(tbl, pkey, dev);
}

rt_intern_hash=>arp_bind_neighbour=>__neigh_lookup_errno=>neigh_create

struct neighbour * neigh_create(struct neigh_table *tbl, const void *pkey,
				struct net_device *dev)
{
	struct neighbour *n, *n1;
	u32 hash_val;
	int key_len = tbl->key_len;//获取ip地址长度作为键值
	int error;

	n = neigh_alloc(tbl);//创建邻居结构
	if (n == NULL)
		return ERR_PTR(-ENOBUFS);

	memcpy(n->primary_key, pkey, key_len);//复制地址到邻居结构中
	n->dev = dev;//记录网络设备
	dev_hold(dev);//增加设备的引用计数

	/* Protocol specific setup. */
	if (tbl->constructor &&	(error = tbl->constructor(n)) < 0) {//执行构造函数
		neigh_release(n);
		return ERR_PTR(error);
	}

	/* Device specific setup. */
	if (n->parms && n->parms->neigh_setup &&
	    (error = n->parms->neigh_setup(n)) < 0) {//如果指定了安装函数就执行
		neigh_release(n);
		return ERR_PTR(error);
	}

	n->confirmed = jiffies - (n->parms->base_reachable_time<<1);//确认时间

	hash_val = tbl->hash(pkey, dev);//计算哈希值

	write_lock_bh(&tbl->lock);
	for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {//在哈希桶中查找要插入的队列
		if (dev == n1->dev &&
		    memcmp(n1->primary_key, pkey, key_len) == 0) {
			neigh_hold(n1);
			write_unlock_bh(&tbl->lock);
			neigh_release(n);//释放已经分配的对象
			return n1;//已存在,直接返回旧的对象
		}
	}

	n->next = tbl->hash_buckets[hash_val];//链入哈希桶的队列中
	tbl->hash_buckets[hash_val] = n;
	n->dead = 0;
	neigh_hold(n);
	write_unlock_bh(&tbl->lock);
	NEIGH_PRINTK2("neigh %p is created.\n", n);
	return n;//返回邻居结构
}

rt_intern_hash=>arp_bind_neighbour=>__neigh_lookup_errno=>neigh_create=>neigh_alloc

/*
 * Allocate and pre-initialise a neighbour entry for tbl.
 *
 * A forced garbage collection is attempted first when the table is above
 * gc_thresh3, or above gc_thresh2 with the last flush more than 5*HZ ago.
 * Returns NULL when that collection reports 0 and the table is still above
 * gc_thresh3, or when the slab allocation fails. The returned entry has
 * refcnt 1, state NUD_NONE, output neigh_blackhole and dead set to 1.
 */
static struct neighbour *neigh_alloc(struct neigh_table *tbl)
{
	struct neighbour *neigh;
	unsigned long stamp = jiffies;
	int over_hard = tbl->entries > tbl->gc_thresh3;
	int over_soft = tbl->entries > tbl->gc_thresh2 &&
			stamp - tbl->last_flush > 5*HZ;

	if (over_hard || over_soft) {
		/* entries is read again after the forced gc attempt */
		if (neigh_forced_gc(tbl) == 0 &&
		    tbl->entries > tbl->gc_thresh3)
			return NULL;
	}

	neigh = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC);
	if (!neigh)
		return NULL;

	memset(neigh, 0, tbl->entry_size);

	/* identity and defaults */
	neigh->tbl = tbl;
	neigh->parms = &tbl->parms;
	neigh->lock = RW_LOCK_UNLOCKED;
	neigh->nud_state = NUD_NONE;
	neigh->output = neigh_blackhole;
	neigh->used = stamp;
	neigh->updated = stamp;
	neigh->dead = 1;	/* cleared by neigh_create once the entry is hashed */
	atomic_set(&neigh->refcnt, 1);

	/* packet queue and per-entry timer */
	skb_queue_head_init(&neigh->arp_queue);
	init_timer(&neigh->timer);
	neigh->timer.function = neigh_timer_handler;
	neigh->timer.data = (unsigned long)neigh;

	/* accounting */
	tbl->stats.allocs++;
	neigh_glbl_allocs++;
	tbl->entries++;

	return neigh;
}

继续看neigh_create函数,拷贝地址结束后,调用constructor来初始化邻居结构。实际执行的是arp_constructor。

rt_intern_hash=>arp_bind_neighbour=>__neigh_lookup_errno=>neigh_create=>arp_constructor

/*
 * ARP-specific setup of a freshly allocated neighbour entry (installed as
 * arp_tbl.constructor and called from neigh_create).
 *
 * Records the address type, picks up the per-device ARP parameters when the
 * in_device has them, and selects the operations table and output function
 * depending on the device. Returns 0 on success, -EINVAL when the device
 * has no in_device.
 */
static int arp_constructor(struct neighbour *neigh)
{
	u32 addr = *(u32*)neigh->primary_key;//the key is the IPv4 address
	struct net_device *dev = neigh->dev;
	struct in_device *in_dev = in_dev_get(dev);//take a reference on the device's in_device

	if (in_dev == NULL)
		return -EINVAL;

	neigh->type = inet_addr_type(addr);//record the address type
	if (in_dev->arp_parms)
		neigh->parms = in_dev->arp_parms;//prefer the per-device ARP parameters

	in_dev_put(in_dev);//drop the reference taken by in_dev_get

	if (dev->hard_header == NULL) {//the device installed no link-layer header builder
		neigh->nud_state = NUD_NOARP;//no resolution needed
		neigh->ops = &arp_direct_ops;//use the direct operations table
		neigh->output = neigh->ops->queue_xmit;//transmit straight through queue_xmit
	} else {//the device builds a link-layer header
		/* Good devices (checked by reading texts, but only Ethernet is
		   tested)

		   ARPHRD_ETHER: (ethernet, apfddi)
		   ARPHRD_FDDI: (fddi)
		   ARPHRD_IEEE802: (tr)
		   ARPHRD_METRICOM: (strip)
		   ARPHRD_ARCNET:
		   etc. etc. etc.

		   ARPHRD_IPDDP will also work, if author repairs it.
		   I did not it, because this driver does not work even
		   in old paradigm.
		 */

#if 1
		/* So... these "amateur" devices are hopeless.
		   The only thing, that I can say now:
		   It is very sad that we need to keep ugly obsolete
		   code to make them happy.

		   They should be moved to more reasonable state, now
		   they use rebuild_header INSTEAD OF hard_start_xmit!!!
		   Besides that, they are sort of out of date
		   (a lot of redundant clones/copies, useless in 2.1),
		   I wonder why people believe that they work.
		 */
		switch (dev->type) {
		default:
			break;
		case ARPHRD_ROSE:	
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
		case ARPHRD_AX25:
#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
		case ARPHRD_NETROM:
#endif
			neigh->ops = &arp_broken_ops;//these device types get the "broken" operations table
			neigh->output = neigh->ops->output;//and always use its output function
			return 0;
#endif
		;}
#endif
		if (neigh->type == RTN_MULTICAST) {//multicast address
			neigh->nud_state = NUD_NOARP;//no resolution needed
			arp_mc_map(addr, neigh->ha, dev, 1);//fill neigh->ha via arp_mc_map
		} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {//device does no ARP, or is a loopback device
			neigh->nud_state = NUD_NOARP;//no resolution needed
			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);//use the device's own address
		} else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {//broadcast address or point-to-point device
			neigh->nud_state = NUD_NOARP;//no resolution needed
			memcpy(neigh->ha, dev->broadcast, dev->addr_len);//use the device's broadcast address
		}
		if (dev->hard_header_cache)//the driver installed a header-cache function
			neigh->ops = &arp_hh_ops;//use the header-caching operations table
		else
			neigh->ops = &arp_generic_ops;
		if (neigh->nud_state&NUD_VALID)//the entry is already in a valid state
			neigh->output = neigh->ops->connected_output;//use the connected output function
		else
			neigh->output = neigh->ops->output;//otherwise use the regular output function
	}
	return 0;
}

设置Mac地址之后还要设置函数表,代码中检查链路层是否指定了缓冲函数(hard_header_cache),这个函数供网络设备管理自己的缓冲区时使用。

        if (dev->hard_header_cache)
            neigh->ops = &arp_hh_ops;
        else
            neigh->ops = &arp_generic_ops;

这里要根据网卡驱动是否提供了缓冲函数来决定链入哪一个邻居函数表,我们结合以太网卡驱动程序来看。


/*
 * Excerpt of ether_setup (the rest of the body is elided in this article):
 * installs the generic Ethernet callbacks on a net_device. The relevant
 * one here is hard_header_cache, which makes arp_constructor choose
 * arp_hh_ops for Ethernet devices.
 */
void ether_setup(struct net_device *dev)
{
	/* Fill in the fields of the device structure with ethernet-generic values.
	   This should be in a common file instead of per-driver.  */
	
	dev->change_mtu		= eth_change_mtu;
	dev->hard_header	= eth_header;
	dev->rebuild_header 	= eth_rebuild_header;
	dev->set_mac_address 	= eth_mac_addr;
	dev->hard_header_cache	= eth_header_cache;
.......
}

设置为eth_header_cache,因此邻居结构函数表挂入的是arp_hh_ops,再看一下它的定义:

/*
 * Operations table chosen by arp_constructor for devices that provide
 * hard_header_cache. The initializer is positional; struct neigh_ops is not
 * shown here, so the slot names below are taken from how the members are
 * used elsewhere (ops->solicit, ops->output, ops->connected_output,
 * ops->queue_xmit) - NOTE(review): confirm against the struct definition.
 */
static struct neigh_ops arp_hh_ops =
{
	AF_INET,
	NULL,
	arp_solicit,//presumably solicit: sends the ARP request
	arp_error_report,
	neigh_resolve_output,//presumably output
	neigh_resolve_output,//presumably connected_output
	dev_queue_xmit,
	dev_queue_xmit//presumably queue_xmit
};

查找邻居结构:

在fib_detect_death函数中调用了neigh_lookup,这个函数在全局的邻居表arp_tbl中查找符合网关和设备的邻居结构,这里按照fib_detect_death来分析。

fib_detect_death=>neigh_lookup

/*
 * Find the neighbour entry in tbl whose device is dev and whose key equals
 * the key_len bytes at pkey. The bucket is walked under the table read
 * lock. Returns the entry with an extra reference taken, or NULL when no
 * entry matches.
 */
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
			       struct net_device *dev)
{
	int klen = tbl->key_len;
	u32 bucket = tbl->hash(pkey, dev);
	struct neighbour *cur;

	read_lock_bh(&tbl->lock);
	cur = tbl->hash_buckets[bucket];
	while (cur) {
		if (cur->dev == dev &&
		    memcmp(cur->primary_key, pkey, klen) == 0) {
			neigh_hold(cur);	/* reference handed to the caller */
			break;
		}
		cur = cur->next;
	}
	read_unlock_bh(&tbl->lock);

	return cur;
}

函数先取得邻居表指定的地址长度,这个值是4。接着调用arp_tbl的哈希函数,对应的是arp_hash,它是根据网关地址和网络设备来计算哈希值的。

邻居子系统的发送事件 :

当数据包发送时还要调用邻居子系统的发送事件函数neigh_event_send,它将调整邻居结构的状态,将数据包链入到邻居结构的处理队列。我们以linux 协议栈 准备连接请求 客户端发送连接请求时调用neigh_event_send函数为例,先回忆下neigh_resolve_output函数代码。

sys_socketcall=> sys_connect=>inet_stream_connect=>tcp_v4_connect=>tcp_connect=>tcp_transmit_skb=>ip_queue_xmit=>ip_queue_xmit2=>ip_output=>ip_finish_output=>ip_finish_output2=>neigh_resolve_output


/*
 * Excerpt of neigh_resolve_output (the beginning of the body, including the
 * declarations of dst and neigh and the jumps to "discard", is elided in
 * this article).
 *
 * When neigh_event_send returns 0 the link-layer header is built under the
 * neighbour lock and the packet is handed to ops->queue_xmit; a non-zero
 * result means the packet was taken over by the neighbour code and 0 is
 * returned.
 */
int neigh_resolve_output(struct sk_buff *skb)
{
......
	if (neigh_event_send(neigh, skb) == 0) {
		int err;
		struct net_device *dev = neigh->dev;
		if (dev->hard_header_cache && dst->hh == NULL) {
			/* take the write lock because the header cache may be initialised here */
			write_lock_bh(&neigh->lock);
			if (dst->hh == NULL)	/* re-check under the lock */
				neigh_hh_init(neigh, dst, dst->ops->protocol);
			err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
			write_unlock_bh(&neigh->lock);
		} else {
			/* only neigh->ha is read, the read lock is enough */
			read_lock_bh(&neigh->lock);
			err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
			read_unlock_bh(&neigh->lock);
		}
		if (err >= 0)
			return neigh->ops->queue_xmit(skb);
		/* building the header failed: drop the packet */
		kfree_skb(skb);
		return -EINVAL;
	}
	return 0;

discard:
	NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", dst, dst ? dst->neighbour : NULL);
	kfree_skb(skb);
	return -EINVAL;
}

neigh_resolve_output=>neigh_event_send

/*
 * Note that the neighbour is being used for transmission. Entries in a
 * CONNECTED, DELAY or PROBE state need no further work and yield 0; for
 * every other state the decision is delegated to __neigh_event_send.
 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	neigh->used = jiffies;

	if (neigh->nud_state & (NUD_CONNECTED|NUD_DELAY|NUD_PROBE))
		return 0;

	return __neigh_event_send(neigh, skb);
}

neigh_resolve_output=>neigh_event_send=>__neigh_event_send

/*
 * Slow path of neigh_event_send: drive the neighbour state machine for an
 * entry that is not CONNECTED, DELAY or PROBE.
 *
 * Returns 1 when skb was taken over (queued on arp_queue, or freed because
 * resolution is impossible), and 0 when the caller may go on and transmit
 * the packet itself.
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) {//re-check the state, now under the lock
		if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) {//neither STALE nor INCOMPLETE: resolution has not been started
			if (neigh->parms->mcast_probes + neigh->parms->app_probes) {//probing is enabled
				atomic_set(&neigh->probes, neigh->parms->ucast_probes);//start the counter at ucast_probes, so arp_solicit does not take its unicast branch
				neigh->nud_state = NUD_INCOMPLETE;//resolution is now in progress
				neigh_hold(neigh);//extra reference, presumably owned by the pending timer
				neigh->timer.expires = jiffies + neigh->parms->retrans_time;//timer expires after retrans_time
				add_timer(&neigh->timer);//arm the timer
				write_unlock_bh(&neigh->lock);//the lock is dropped around the solicit call
				neigh->ops->solicit(neigh, skb);//send the first solicitation directly
				atomic_inc(&neigh->probes);
				write_lock_bh(&neigh->lock);
			} else {
				/* no probes configured: resolution cannot succeed */
				neigh->nud_state = NUD_FAILED;
				write_unlock_bh(&neigh->lock);

				if (skb)
					kfree_skb(skb);
				return 1;
			}
		}
		if (neigh->nud_state == NUD_INCOMPLETE) {//resolution in progress (including one started just above)
			if (skb) {//there is a packet to park until resolution completes
				if (skb_queue_len(&neigh->arp_queue) >= neigh->parms->queue_len) {//queue is full: make room
					struct sk_buff *buff;
					buff = neigh->arp_queue.prev;
					__skb_unlink(buff, &neigh->arp_queue);//unlink the skb at arp_queue.prev
					kfree_skb(buff);
				}
				__skb_queue_head(&neigh->arp_queue, skb);//insert the packet at the head of the queue
			}
			write_unlock_bh(&neigh->lock);
			return 1;
		}
		if (neigh->nud_state == NUD_STALE) {
			/* stale entry: move to DELAY and arm the timer with delay_probe_time;
			   the function then returns 0, so the caller still transmits */
			NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
			neigh_hold(neigh);
			neigh->nud_state = NUD_DELAY;
			neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
			add_timer(&neigh->timer);
		}
	}
	write_unlock_bh(&neigh->lock);
	return 0;
}

上面通过add_timer函数开启了定时器,还将要发送的数据包链入邻居结构的数据包队列中。定时器并不会立即执行,而是在retrans_time之后到期(expires = jiffies + retrans_time);回调函数在neigh_alloc中设置为neigh_timer_handler,执行时又会重新开启定时器。对于新建的邻居结构,__neigh_event_send会在设置定时器之后通过neigh->ops->solicit直接调用arp_solicit函数(arp_hh_ops)。neigh_timer_handler回调函数中也会执行arp_solicit函数。

neigh_resolve_output=>neigh_event_send=>__neigh_event_send=>arp_solicit

neigh_timer_handler=>arp_solicit


/*
 * Send an ARP request for the address stored in neigh->primary_key.
 *
 * The probe counter selects the mode: while it is below ucast_probes the
 * request is unicast to the cached hardware address; within the following
 * app_probes no packet is sent here (neigh_app_ns is called under
 * CONFIG_ARPD); otherwise dst_ha stays NULL and arp_send substitutes the
 * device broadcast address.
 */
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
	u32 saddr;
	u8  *dst_ha = NULL;
	struct net_device *dev = neigh->dev;//device the neighbour is reached through
	u32 target = *(u32*)neigh->primary_key;//IPv4 address to resolve
	int probes = atomic_read(&neigh->probes);//current probe counter

	if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)//the triggering packet has a local source address: use it
		saddr = skb->nh.iph->saddr;
	else
		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);

	if ((probes -= neigh->parms->ucast_probes) < 0) {//still within the unicast probes
		if (!(neigh->nud_state&NUD_VALID))//no valid cached address: log a debug message
			printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
		dst_ha = neigh->ha;//unicast to the cached hardware address
		read_lock_bh(&neigh->lock);//hold the lock while neigh->ha is read by arp_send
	} else if ((probes -= neigh->parms->app_probes) < 0) {//within the application probes: nothing is sent from here
#ifdef CONFIG_ARPD
		neigh_app_ns(neigh);
#endif
		return;
	}

	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
		 dst_ha, dev->dev_addr, NULL);//build and send the ARP request
	if (dst_ha)
		read_unlock_bh(&neigh->lock);//release the lock taken in the unicast branch
}

这个函数的主要目的是获取源地址和网关地址,然后调用arp_send函数,创建并发送ARP包。

neigh_timer_handler=>arp_solicit=>arp_send

/*
 * Build an ARP packet and hand it to dev_queue_xmit.
 *
 * type      - ARP operation (e.g. ARPOP_REQUEST, ARPOP_REPLY)
 * ptype     - protocol type passed to the device's hard_header function
 * dest_ip   - target IP address written into the ARP body
 * dev       - device to send on; nothing is sent if it has IFF_NOARP set
 * src_ip    - sender IP address written into the ARP body
 * dest_hw   - link-layer destination; NULL means dev->broadcast
 * src_hw    - sender hardware address; NULL means dev->dev_addr
 * target_hw - target hardware address in the ARP body; NULL means zeroes
 *
 * Allocation or header failures are silent: the function just returns.
 */
void arp_send(int type, int ptype, u32 dest_ip, 
	      struct net_device *dev, u32 src_ip, 
	      unsigned char *dest_hw, unsigned char *src_hw,
	      unsigned char *target_hw)
{
	struct sk_buff *skb;
	struct arphdr *arp;
	unsigned char *arp_ptr;

	/*
	 *	No arp on this interface.
	 */
	
	if (dev->flags&IFF_NOARP)//the device does not do ARP
		return;

	/*
	 *	Allocate a buffer
	 */
	
	skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
				+ dev->hard_header_len + 15, GFP_ATOMIC);//ARP header + two (hw, ip) address pairs + link-layer header + alignment slack
	if (skb == NULL)
		return;

	skb_reserve(skb, (dev->hard_header_len+15)&~15);//reserve headroom for the link-layer header, rounded up to 16
	skb->nh.raw = skb->data;
	arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
	skb->dev = dev;//outgoing device
	skb->protocol = __constant_htons (ETH_P_ARP);//protocol is ARP
	if (src_hw == NULL)
		src_hw = dev->dev_addr;//default to the device's own hardware address
	if (dest_hw == NULL)//no link-layer destination given
		dest_hw = dev->broadcast;//default to the device broadcast address

	/*
	 *	Fill the device header for the ARP frame
	 */
	if (dev->hard_header &&
	    dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)//build the link-layer header (eth_header on Ethernet)
		goto out;

	/*
	 * Fill out the arp protocol part.
	 *
	 * The arp hardware type should match the device type, except for FDDI,
	 * which (according to RFC 1390) should always equal 1 (Ethernet).
	 */
	/*
	 *	Exceptions everywhere. AX.25 uses the AX.25 PID value not the
	 *	DIX code for the protocol. Make these device structure fields.
	 */
	switch (dev->type) {
	default:
		arp->ar_hrd = htons(dev->type);//hardware type
		arp->ar_pro = __constant_htons(ETH_P_IP);//protocol type
		break;

#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
	case ARPHRD_AX25:
		arp->ar_hrd = __constant_htons(ARPHRD_AX25);
		arp->ar_pro = __constant_htons(AX25_P_IP);
		break;

#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
	case ARPHRD_NETROM:
		arp->ar_hrd = __constant_htons(ARPHRD_NETROM);
		arp->ar_pro = __constant_htons(AX25_P_IP);
		break;
#endif
#endif

#ifdef CONFIG_FDDI
	case ARPHRD_FDDI:
		arp->ar_hrd = __constant_htons(ARPHRD_ETHER);
		arp->ar_pro = __constant_htons(ETH_P_IP);
		break;
#endif
#ifdef CONFIG_TR
	case ARPHRD_IEEE802_TR:
		arp->ar_hrd = __constant_htons(ARPHRD_IEEE802);
		arp->ar_pro = __constant_htons(ETH_P_IP);
		break;
#endif
	}

	arp->ar_hln = dev->addr_len;//hardware address length
	arp->ar_pln = 4;//protocol (IPv4) address length
	arp->ar_op = htons(type);//ARP operation

	arp_ptr=(unsigned char *)(arp+1);//first byte after the fixed ARP header: sender hardware address

	memcpy(arp_ptr, src_hw, dev->addr_len);//sender hardware address
	arp_ptr+=dev->addr_len;//now at the sender IP address
	memcpy(arp_ptr, &src_ip,4);//sender IP address
	arp_ptr+=4;//now at the target hardware address
	if (target_hw != NULL)
		memcpy(arp_ptr, target_hw, dev->addr_len);//target hardware address
	else
		memset(arp_ptr, 0, dev->addr_len);//unknown target: all zeroes
	arp_ptr+=dev->addr_len;//now at the target IP address
	memcpy(arp_ptr, &dest_ip, 4);//target IP address
	skb->dev = dev;

	dev_queue_xmit(skb);//transmit the packet
	return;

out:
	kfree_skb(skb);
}

邻居子系统的接收处理:

arp_init 函数中,曾向内核注册过arp数据包类型结构arp_packet_type,这是由dev_add_pack函数完成的。


/*
 * Packet type registered by arp_init via dev_add_pack so that received ARP
 * frames are delivered to arp_rcv. The initializer is positional; struct
 * packet_type is not shown here, so the slot names in the comments are
 * inferred from the members net_rx_action uses (type, dev, func, data,
 * next) - NOTE(review): confirm against the struct definition.
 */
static struct packet_type arp_packet_type =
{
	__constant_htons(ETH_P_ARP),	/* presumably type: ARP ethertype in network byte order */
	NULL,		/* All devices */
	arp_rcv,	/* presumably func: receive handler */
	(void*)1,	/* presumably data: non-NULL, so net_rx_action calls func directly */
	NULL		/* presumably next */
};

这个可以参考linux 内核协议栈原理分析之 tcp 服务器端的 send 过程中的接收过程。

我们直接进入net_rx_action函数



/*
 * Excerpt of net_rx_action (everything around the protocol dispatch is
 * elided in this article).
 *
 * Walks the ptype_base hash chain for the packet's type and delivers the
 * skb to every matching handler. Delivery is deferred by one match through
 * pt_prev: earlier matches receive the skb after its user count has been
 * raised, while the last match receives it without an extra count. If no
 * handler matches, the skb is freed.
 */
static void net_rx_action(struct softirq_action *h)
{
......

			for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) {
				if (ptype->type == type &&
				    (!ptype->dev || ptype->dev == skb->dev)) {
					/* a match: first deliver to the previous match, if any */
					if (pt_prev) {
						if (!pt_prev->data)
							deliver_to_old_ones(pt_prev, skb, 0);
						else {
							atomic_inc(&skb->users);
							pt_prev->func(skb,
								      skb->dev,
								      pt_prev);
						}
					}
					pt_prev = ptype;
				}
			}

			/* final delivery (arp_rcv for ARP frames), or drop if nothing matched */
			if (pt_prev) {
				if (!pt_prev->data)
					deliver_to_old_ones(pt_prev, skb, 1);
				else
					pt_prev->func(skb, skb->dev, pt_prev);
			} else
				kfree_skb(skb);
		}

......
}

net_rx_action=>arp_rcv

/*
 * Receive handler for ARP frames (registered through arp_packet_type).
 *
 * Validates the packet against the receiving device, answers requests for
 * local addresses and for addresses we proxy, and updates the neighbour
 * entry of the sender when one exists. Always returns 0; the skb is
 * consumed on every path (freed here, freed by skb_share_check on failure,
 * or queued by pneigh_enqueue).
 *
 * Fix relative to the original: when skb_share_check() failed, the function
 * jumped to out_of_mem and returned without dropping the in_device
 * reference taken by in_dev_get(). The out_of_mem label now releases it.
 */
int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
	struct arphdr *arp = skb->nh.arph;
	unsigned char *arp_ptr= (unsigned char *)(arp+1);//first byte after the fixed ARP header
	struct rtable *rt;
	unsigned char *sha, *tha;
	u32 sip, tip;
	u16 dev_type = dev->type;
	int addr_type;
	struct in_device *in_dev = in_dev_get(dev);//reference dropped at out_of_mem (or before the early return below)
	struct neighbour *n;

/*
 *	The hardware length of the packet should match the hardware length
 *	of the device.  Similarly, the hardware types should match.  The
 *	device should be ARP-able.  Also, if pln is not 4, then the lookup
 *	is not from an IP number.  We can't currently handle this, so toss
 *	it. 
 */  
	if (in_dev == NULL ||
	    arp->ar_hln != dev->addr_len    || //hardware address length must match the device
	    dev->flags & IFF_NOARP ||//device does not do ARP
	    skb->pkt_type == PACKET_OTHERHOST ||//frame addressed to another host
	    skb->pkt_type == PACKET_LOOPBACK ||//looped-back frame
	    arp->ar_pln != 4)//protocol address length must be 4 (IPv4)
		goto out;
	/* Make sure we own the skb; on failure skb is NULL and only the
	   in_device reference is left to release. */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto out_of_mem;

	switch (dev_type) {
	default:	
		if (arp->ar_pro != __constant_htons(ETH_P_IP))
			goto out;
		if (htons(dev_type) != arp->ar_hrd)
			goto out;
		break;
#ifdef CONFIG_NET_ETHERNET
	case ARPHRD_ETHER:
		/*
		 * ETHERNET devices will accept ARP hardware types of either
		 * 1 (Ethernet) or 6 (IEEE 802.2).
		 */
		if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) &&
		    arp->ar_hrd != __constant_htons(ARPHRD_IEEE802))
			goto out;
		if (arp->ar_pro != __constant_htons(ETH_P_IP))
			goto out;
		break;
#endif
#ifdef CONFIG_TR
	case ARPHRD_IEEE802_TR:
		/*
		 * Token ring devices will accept ARP hardware types of either
		 * 1 (Ethernet) or 6 (IEEE 802.2).
		 */
		if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) &&
		    arp->ar_hrd != __constant_htons(ARPHRD_IEEE802))
			goto out;
		if (arp->ar_pro != __constant_htons(ETH_P_IP))
			goto out;
		break;
#endif
#ifdef CONFIG_FDDI
	case ARPHRD_FDDI:
		/*
		 * According to RFC 1390, FDDI devices should accept ARP hardware types
		 * of 1 (Ethernet).  However, to be more robust, we'll accept hardware
		 * types of either 1 (Ethernet) or 6 (IEEE 802.2).
		 */
		if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) &&
		    arp->ar_hrd != __constant_htons(ARPHRD_IEEE802))
			goto out;
		if (arp->ar_pro != __constant_htons(ETH_P_IP))
			goto out;
		break;
#endif
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
	case ARPHRD_AX25:
		if (arp->ar_pro != __constant_htons(AX25_P_IP))
			goto out;
		if (arp->ar_hrd != __constant_htons(ARPHRD_AX25))
			goto out;
		break;
#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
	case ARPHRD_NETROM:
		if (arp->ar_pro != __constant_htons(AX25_P_IP))
			goto out;
		if (arp->ar_hrd != __constant_htons(ARPHRD_NETROM))
			goto out;
		break;
#endif
#endif
	}

	/* Understand only these message types */

	if (arp->ar_op != __constant_htons(ARPOP_REPLY) &&
	    arp->ar_op != __constant_htons(ARPOP_REQUEST))
		goto out;

/*
 *	Extract fields
 */
	sha=arp_ptr;//sender hardware address
	arp_ptr += dev->addr_len;//now at the sender IP address
	memcpy(&sip, arp_ptr, 4);//sender IP address
	arp_ptr += 4;//now at the target hardware address
	tha=arp_ptr;
	arp_ptr += dev->addr_len;
	memcpy(&tip, arp_ptr, 4);//target IP address
/* 
 *	Check for bad requests for 127.x.x.x and requests for multicast
 *	addresses.  If this is one such, delete it.
 */
	if (LOOPBACK(tip) || MULTICAST(tip))
		goto out;

/*
 *  Process entry.  The idea here is we want to send a reply if it is a
 *  request for us or if it is a request for someone else that we hold
 *  a proxy for.  We want to add an entry to our cache if it is a reply
 *  to us or if it is a request for our address.  
 *  (The assumption for this last is that if someone is requesting our 
 *  address, they are probably intending to talk to us, so it saves time 
 *  if we cache their address.  Their address is also probably not in 
 *  our cache, since ours is not in their cache.)
 * 
 *  Putting this another way, we only care about replies if they are to
 *  us, in which case we add them to the cache.  For requests, we care
 *  about those for us and those for our proxies.  We reply to both,
 *  and in the case of requests for us we add the requester to the arp 
 *  cache.
 */

	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
	if (sip == 0) {
		if (arp->ar_op == __constant_htons(ARPOP_REQUEST) &&
		    inet_addr_type(tip) == RTN_LOCAL)
			arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
		goto out;
	}

	if (arp->ar_op == __constant_htons(ARPOP_REQUEST) &&
	    ip_route_input(skb, tip, sip, 0, dev) == 0) {//a request, and the target address could be routed

		rt = (struct rtable*)skb->dst;//route attached to the skb by ip_route_input
		addr_type = rt->rt_type;//route type of the target address

		if (addr_type == RTN_LOCAL) {//the request is for one of our own addresses
			n = neigh_event_ns(&arp_tbl, sha, &sip, dev);//record the sender in the neighbour table
			if (n) {
				arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);//answer the sender
				neigh_release(n);
			}
			goto out;
		} else if (IN_DEV_FORWARD(in_dev)) {
			/* forwarding is enabled: consider answering on behalf of the target */
			if ((rt->rt_flags&RTCF_DNAT) ||
			    (addr_type == RTN_UNICAST  && rt->u.dst.dev != dev &&
			     (IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n)
					neigh_release(n);

				if (skb->stamp.tv_sec == 0 ||
				    skb->pkt_type == PACKET_HOST ||
				    in_dev->arp_parms->proxy_delay == 0) {
					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
				} else {
					/* delayed proxy reply: the skb is handed to the proxy
					   queue, so it must not be freed here */
					pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
					in_dev_put(in_dev);
					return 0;
				}
				goto out;
			}
		}
	}

	/* Update our ARP tables */

	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);

#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
	/* Unsolicited ARP is not accepted by default.
	   It is possible, that this option should be enabled for some
	   devices (strip is candidate)
	 */
	if (n == NULL &&
	    arp->ar_op == __constant_htons(ARPOP_REPLY) &&
	    inet_addr_type(sip) == RTN_UNICAST)
		n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
#endif

	if (n) {
		int state = NUD_REACHABLE;//default: the sender is confirmed reachable
		int override = 0;

		/* If several different ARP replies follows back-to-back,
		   use the FIRST one. It is possible, if several proxy
		   agents are active. Taking the first reply prevents
		   arp trashing and chooses the fastest router.
		 */
		if (jiffies - n->updated >= n->parms->locktime)
			override = 1;

		/* Broadcast replies and request packets
		   do not assert neighbour reachability.
		 */
		if (arp->ar_op != __constant_htons(ARPOP_REPLY) ||
		    skb->pkt_type != PACKET_HOST)
			state = NUD_STALE;
		neigh_update(n, sha, state, override, 1);
		neigh_release(n);
	}

out:
	kfree_skb(skb);
out_of_mem:
	/* Reached with skb already gone when skb_share_check() failed; the
	   in_device reference must be dropped on that path as well. */
	if (in_dev)
		in_dev_put(in_dev);
	return 0;
}

对于服务器来说,首先是将客户端发送的arp请求解包,获取客户端的Mac地址ip地址以及目的地址。然后调用ip_route_input查找或者创建目的地址的路由表,如果显示目的地址为本地类型,则可以创建客户端地址的邻居结构,将客户端的Mac地址记录到邻居结构中。如果需要发送arp应答,则服务器调用arp_send发送arp应答,发送后就释放该数据包。

然后arp应答到达客户端,客户端的接收过程与上面介绍的服务器过程一样,同样会执行arp_rcv,由于arp应答类型为ARPOP_REPLY,因此客户端会调用__neigh_lookup函数找到已经创建的邻居结构,然后通过neigh_update函数设置邻居结构为可到达状态,更新记录的服务器Mac地址,发送前面挂入到队列的数据包。注意传递给这个函数的第一个参数是查找到的邻居结构,第二个参数是服务器的Mac地址,第三个参数是可到达状态,第四个参数是覆盖(override)标志,表示当新的Mac地址与已缓存的地址不同时是否允许覆盖,第五个参数为1,表示本次更新来自arp。

/*
 * Update the state and, optionally, the link-layer address of a neighbour.
 *
 * neigh    - entry to update
 * lladdr   - new link-layer address, or NULL to keep the cached one
 * new      - new NUD_* state
 * override - non-zero allows replacing a valid cached address that differs
 *            from lladdr
 * arp      - non-zero marks an update coming from the protocol; such
 *            updates are refused for NOARP and PERMANENT entries
 *
 * Returns 0 on success, -EPERM when the update is refused (protocol update
 * of a NOARP/PERMANENT entry, or a differing address without override), and
 * -EINVAL when no address is supplied and none is cached. When the entry
 * becomes valid for the first time, the packets parked on arp_queue are
 * sent through the output function.
 */
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, int override, int arp)
{
	u8 old;
	int err;
	int notify = 0;
	struct net_device *dev = neigh->dev;//device of the neighbour

	write_lock_bh(&neigh->lock);
	old = neigh->nud_state;//state before the update

	err = -EPERM;
	if (arp && (old&(NUD_NOARP|NUD_PERMANENT)))
		goto out;

	if (!(new&NUD_VALID)) {//new state is not a valid one: stop the timer and just record the state
		neigh_del_timer(neigh);
		if (old&NUD_CONNECTED)
			neigh_suspect(neigh);
		neigh->nud_state = new;//record the new state
		err = 0;
		notify = old&NUD_VALID;
		goto out;
	}

	/* Compare new lladdr with cached one */
	if (dev->addr_len == 0) {
		/* First case: device needs no address. */
		lladdr = neigh->ha;
	} else if (lladdr) {
		/* The second case: if something is already cached
		   and a new address is proposed:
		   - compare new & old
		   - if they are different, check override flag
		 */
		if (old&NUD_VALID) {
			if (memcmp(lladdr, neigh->ha, dev->addr_len) == 0)
				lladdr = neigh->ha;
			else if (!override)
				goto out;
		}
	} else {
		/* No address is supplied; if we know something,
		   use it, otherwise discard the request.
		 */
		err = -EINVAL;
		if (!(old&NUD_VALID))
			goto out;
		lladdr = neigh->ha;
	}

	/* From here on, lladdr == neigh->ha means "address unchanged". */
	neigh_sync(neigh);//NOTE(review): presumably refreshes nud_state, since old is re-read right after - confirm
	old = neigh->nud_state;
	if (new&NUD_CONNECTED)
		neigh->confirmed = jiffies;
	neigh->updated = jiffies;

	/* If entry was valid and address is not changed,
	   do not change entry state, if new one is STALE.
	 */
	err = 0;
	if (old&NUD_VALID) {
		if (lladdr == neigh->ha)
			if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED)))
				goto out;
	}
	neigh_del_timer(neigh);
	neigh->nud_state = new;
	if (lladdr != neigh->ha) {
		/* the address changed: store it and refresh the cached headers */
		memcpy(&neigh->ha, lladdr, dev->addr_len);
		neigh_update_hhs(neigh);
		if (!(new&NUD_CONNECTED))
			neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1);
#ifdef CONFIG_ARPD
		notify = 1;
#endif
	}
	if (new == old)
		goto out;
	if (new&NUD_CONNECTED)//the entry is now in a connected state
		neigh_connect(neigh);//switch the output function for a connected entry
	else
		neigh_suspect(neigh);//switch the output function for a non-connected entry
	if (!(old&NUD_VALID)) {
		struct sk_buff *skb;

		/* Again: avoid dead loop if something went wrong */
        //send the packets that were parked on arp_queue during resolution
		while (neigh->nud_state&NUD_VALID &&
		       (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) {
			struct neighbour *n1 = neigh;
			write_unlock_bh(&neigh->lock);//the lock is dropped while the packet is sent
			/* On shaper/eql skb->dst->neighbour != neigh :( */
			if (skb->dst && skb->dst->neighbour)
				n1 = skb->dst->neighbour;
			n1->output(skb);//send through the neighbour's output function
			write_lock_bh(&neigh->lock);
		}
		skb_queue_purge(&neigh->arp_queue);//free whatever is still queued
	}
out:
	write_unlock_bh(&neigh->lock);
#ifdef CONFIG_ARPD
	if (notify && neigh->parms->app_probes)
		neigh_app_notify(neigh);
#endif
	return err;
}

函数先重新调整邻居结构的定时器,修改它为可到达状态,然后记录服务器的Mac地址到邻居结构中。至此客户端与服务器就成了事实上的“邻居”。对于设置邻居结构的发送函数,其结果仍然是neigh_resolve_output函数,接着将__neigh_event_send函数链入到arp队列的数据包,逐一发送给服务器。

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值