arp作为Neighboring Subsystem 模块在IPv4上的一种实现,符合Neighboring Subsystem的设计:
ARP实现会创建一个表,除了保存一些状态和属性,主要用来缓存ARP解析得到的地址。表的定义如下:
[ include/net/neighbour.h ]
[ include/net/neighbour.h ]
[ include/net/neighbour.h ]
[ include/net/neighbour.h ]
[ include/net/neighbour.h ]
[ include/uapi/linux/neighbour.h ]
[ net/ipv4/arp.c ]
arp_announce : 本地发出的ARP请求包,当源地址使用本机的IP地址时,提供一个等级限制:
ARP实现会创建一个表,除了保存一些状态和属性,主要用来缓存ARP解析得到的地址。表的定义如下:
[ include/net/neighbour.h ]
// Per-protocol neighbour table: tunables, statistics and the neighbour cache itself.
struct neigh_table {
struct neigh_table *next; // links every protocol's table into one list
int family; // address family of the protocol, e.g. AF_INET
int entry_size; // size of one neighbour object
int key_len; // size of the key used by the lookup functions
// Hash callback: maps (key, device, random seeds) to a 32-bit hash value.
__u32 (*hash)(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
int (*constructor)(struct neighbour *);
int (*pconstructor)(struct pneigh_entry *);
void (*pdestructor)(struct pneigh_entry *);
void (*proxy_redo)(struct sk_buff *skb);
char *id; // ID string of the protocol's table (the original note says it also names the memory pool -- TODO confirm)
struct neigh_parms parms; // parameters that decide how the neighbour protocol behaves
/* HACK. gc_* should follow parms without a gap! */
int gc_interval; // garbage collection: how often it runs
int gc_thresh1; // cache usage-level threshold (1 of 3)
int gc_thresh2; // cache usage-level threshold (2 of 3)
int gc_thresh3; // cache usage-level threshold (3 of 3)
unsigned long last_flush; // time of the most recent garbage collection
struct delayed_work gc_work;
struct timer_list proxy_timer; // starts running when proxy_queue holds entries
struct sk_buff_head proxy_queue; // queue of received packets asking for proxying (ARPOP_REQUEST)
atomic_t entries; // number of neighbour objects in the cache
rwlock_t lock;
unsigned long last_rand; // update timestamp, tied to neigh_parms->reachable_time
struct neigh_statistics __percpu *stats; // per-CPU statistics counters for this cache
struct neigh_hash_table __rcu *nht; // hash table of neighbour objects
struct pneigh_entry **phash_buckets; // stores the IP addresses that are being proxied
};
其中的neigh_parms结构为属性,定义如下:
[ include/net/neighbour.h ]
// Tunable parameters of the neighbour protocol; the per-variable values live in data[].
struct neigh_parms {
#ifdef CONFIG_NET_NS
struct net *net;
#endif
struct net_device *dev;
struct neigh_parms *next; // next parms block in a list (original note: those of the same family -- TODO confirm)
int (*neigh_setup)(struct neighbour *);
void (*neigh_cleanup)(struct neighbour *);
struct neigh_table *tbl;
void *sysctl_table; // NOTE(review): the original note pointed at net/ipv4/neighbour.c; the core code quoted in these notes is net/core/neighbour.c -- confirm
int dead; // 1 means this parms block may be deleted
atomic_t refcnt; // reference count
struct rcu_head rcu_head;
int reachable_time; // reachable time currently in effect (not a timestamp)
int data[NEIGH_VAR_DATA_MAX]; // one slot per NEIGH_VAR_* index below NEIGH_VAR_DATA_MAX
DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX);
};
// Indices of the neighbour tunables; the first NEIGH_VAR_DATA_MAX of them size a data[] array.
enum {
NEIGH_VAR_MCAST_PROBES, // number of multicast solicitations that can be sent to resolve a neighbor's address
NEIGH_VAR_UCAST_PROBES, // number of unicast solicitations that can be sent to confirm the reachability of an address.
NEIGH_VAR_APP_PROBES, // number of solicitations that can be sent by a user-space application when resolving an address
NEIGH_VAR_RETRANS_TIME, // retransmission time
NEIGH_VAR_BASE_REACHABLE_TIME, // interval of time since the most recent proof of reachability was received.
NEIGH_VAR_DELAY_PROBE_TIME, // how long a neighbor in the NUD_DELAY state waits before entering the NUD_PROBE state.
NEIGH_VAR_GC_STALETIME, // A neighbour structure is removed if it has not been used for gc_staletime time and no one holds a reference to it
NEIGH_VAR_QUEUE_LEN_BYTES, // maximum length of arp_queue, in bytes
NEIGH_VAR_PROXY_QLEN, // maximum length of proxy_queue
NEIGH_VAR_ANYCAST_DELAY,
NEIGH_VAR_PROXY_DELAY, // Amount of time that neighboring protocol packets handled by a proxy should be kept in a queue before being processed.
NEIGH_VAR_LOCKTIME, // Minimum time that has to pass between two updates of the fields of a neighbour entry
#define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1)
/* Following are used as a second way to access one of the above */
NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */
NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */
NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */
/* Following are used by "default" only */
NEIGH_VAR_GC_INTERVAL,
NEIGH_VAR_GC_THRESH1,
NEIGH_VAR_GC_THRESH2,
NEIGH_VAR_GC_THRESH3,
NEIGH_VAR_MAX
};
其中的neigh_statistics结构保存统计信息(各种计数器),定义如下:
[ include/net/neighbour.h ]
/* Event counters of one neighbour table; all fields are plain unsigned long counters. */
struct neigh_statistics {
unsigned long allocs; /* number of allocated neighs */
unsigned long destroys; /* number of destroyed neighs */
unsigned long hash_grows; /* number of hash resizes */
unsigned long res_failed; /* number of failed resolutions */
unsigned long lookups; /* number of lookups */
unsigned long hits; /* number of hits (among lookups) */
unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */
unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */
unsigned long periodic_gc_runs; /* number of periodic GC runs */
unsigned long forced_gc_runs; /* number of forced GC runs */
unsigned long unres_discards; /* number of unresolved drops */
};
哈希表的结构:
[ include/net/neighbour.h ]
/* Number of random seed words kept in hash_rnd[]. */
#define NEIGH_NUM_HASH_RND 4
/* Hash table that holds the neighbour objects of one neigh_table. */
struct neigh_hash_table {
struct neighbour __rcu **hash_buckets; /* array of bucket heads; each bucket is a chain of neighbours */
unsigned int hash_shift; /* presumably log2 of the bucket count -- confirm against the lookup code */
__u32 hash_rnd[NEIGH_NUM_HASH_RND]; /* random seed words for the hash function */
struct rcu_head rcu;
};
表里面的每个neighbour由下面的结构表示:
[ include/net/neighbour.h ]
// One cached neighbour: the L3 key (primary_key) together with its resolved hardware address (ha).
struct neighbour {
struct neighbour __rcu *next; // every neighbour is inserted into the hash table; next chains the entries of one bucket
struct neigh_table *tbl; // for IPv4 this points to arp_tbl
struct neigh_parms *parms; // parameters that decide how the neighbour protocol behaves
unsigned long confirmed; // timestamp: the most recent time the address was confirmed reachable
unsigned long updated; // timestamp of the last update
rwlock_t lock;
atomic_t refcnt; // reference count
struct sk_buff_head arp_queue; // packets whose hardware address has not been resolved yet wait here
unsigned int arp_queue_len_bytes;
struct timer_list timer;
unsigned long used; // timestamp of the last use
atomic_t probes; // number of probes sent to the target address
__u8 flags;
__u8 nud_state;
__u8 type;
__u8 dead; // set to 1 to mark this object as deletable
seqlock_t ha_lock;
unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))]; // hardware address that corresponds to the IP address
struct hh_cache hh; // L2 header caching
int (*output)(struct neighbour *, struct sk_buff *);
const struct neigh_ops *ops; // table of operation callbacks
struct rcu_head rcu;
struct net_device *dev; // device through which this neighbor is reachable
u8 primary_key[0]; // IP address; used as the key when searching the cache
};
在缓冲中的neighbour的状态值如下:
[ include/uapi/linux/neighbour.h ]
/*
* Neighbor Cache Entry States.
*/
/* A solicitation has been sent, but no reply has been received yet.
* In this state, there is no hardware address to use (not even an old one, as there is with NUD_STALE).
*/
#define NUD_INCOMPLETE 0x01
/* The address of the neighbor is cached and the latter is known
* to be reachable (there has been a proof of reachability).
*/
#define NUD_REACHABLE 0x02 // reachable (connected) state
#define NUD_STALE 0x04 // stale (expired) state
#define NUD_DELAY 0x08 // delay state
#define NUD_PROBE 0x10 // probe state
#define NUD_FAILED 0x20 // Marks a neighbor as unreachable
/* Dummy states */
/* This state is used to mark neighbors that do not need any protocol to resolve the L3-to-L2 mapping
*/
#define NUD_NOARP 0x40
/* The L2 address of the neighbor has been statically configured (i.e., with userspace commands)
* and therefore there is no need to use any neighboring protocol to take care of it
*/
#define NUD_PERMANENT 0x80
#define NUD_NONE 0x00
/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change
and make no address resolution or NUD.
NUD_PERMANENT also cannot be deleted by garbage collectors.
*/
对ARP,当然要创建一个neigh_table表:
[ net/ipv4/arp.c ]
// The ARP neighbour table (the IPv4 instance of struct neigh_table).
struct neigh_table arp_tbl = {
.family = AF_INET, // address family of the protocol
.key_len = 4, // size of the lookup key
.hash = arp_hash,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
.id = "arp_cache", // ID string of the protocol's table
.parms = {
.tbl = &arp_tbl, // back-pointer to the ARP table
.reachable_time = 30 * HZ, // initial reachable time
.data = {
[NEIGH_VAR_MCAST_PROBES] = 3, // number of multicast solicitations that can be sent to resolve a neighbor's address
[NEIGH_VAR_UCAST_PROBES] = 3, // number of unicast solicitations that can be sent to confirm the reachability of an address.
[NEIGH_VAR_RETRANS_TIME] = 1 * HZ, // retransmission time
[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, // interval of time since the most recent proof of reachability was received.
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, // how long a neighbor in the NUD_DELAY state waits before entering the NUD_PROBE state.
[NEIGH_VAR_GC_STALETIME] = 60 * HZ, // A neighbour structure is removed if it has not been used for gc_staletime
// time and no one holds a reference to it
[NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, // maximum length of arp_queue, in bytes
[NEIGH_VAR_PROXY_QLEN] = 64, // maximum length of proxy_queue
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, // Amount of time that neighboring protocol packets handled by a
// proxy should be kept in a queue before being processed.
[NEIGH_VAR_LOCKTIME] = 1 * HZ, // Minimum time that has to pass between two updates of the fields of a neighbour entry
},
},
.gc_interval = 30 * HZ, // garbage collection: how often it runs
.gc_thresh1 = 128, // cache usage-level threshold (1 of 3)
.gc_thresh2 = 512, // cache usage-level threshold (2 of 3)
.gc_thresh3 = 1024, // cache usage-level threshold (3 of 3)
};
EXPORT_SYMBOL(arp_tbl);
对于设备,它可以对ARP进行配置,配置的属性如下:
arp_announce:本机发出ARP请求包时,对选用哪个本机IP地址作为包中的源地址设置不同的限制等级:
0:(默认)可以使用任何接口上配置的任何本机地址
1:尽量避免使用不在目标地址所在子网内的本机地址
2:总是使用最适合目标地址的本机地址
arp_ignore:收到目标地址为本机地址的ARP请求包时的响应模式:
0:(默认)无论目标地址配置在哪个接口上,都响应
1:只在目标地址配置在接收该请求的接口上时才响应
2:只在目标地址配置在接收该请求的接口上,并且发送方的IP地址和目标地址在该接口的同一个子网内时才响应
3:不响应scope为host的本机地址,只响应scope为global和link的地址的解析请求
4-7:保留
8:对所有本机地址都不响应
ARP协议的头部定义如下:
[ include/uapi/linux/if_arp.h ]
[ net/ipv4/arp.c ]
[ net/core/neighbour.c ]
[ net/ipv4/arp.c ]
当ARP被调用时,要调用构造函数,在arp_table设置它为arp_constructor
[ net/ipv4/arp.c ]
[ include/net/neighbour.h ]
[ net/core/neighbour.c ]
[ include/net/neighbour.h ]
ARP协议的头部定义如下:
[ include/uapi/linux/if_arp.h ]
/*
* This structure defines an ethernet arp header.
* Only the fixed-size leading fields are real members; the address fields
* inside the "#if 0" section are compiled out and kept as documentation.
*/
struct arphdr {
__be16 ar_hrd; /* format of hardware address */
__be16 ar_pro; /* format of protocol address */
unsigned char ar_hln; /* length of hardware address */
unsigned char ar_pln; /* length of protocol address */
__be16 ar_op; /* ARP opcode (command) */
#if 0
/*
* Ethernet looks like this : This bit is variable sized however...
*/
unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */
unsigned char ar_sip[4]; /* sender IP address */
unsigned char ar_tha[ETH_ALEN]; /* target hardware address */
unsigned char ar_tip[4]; /* target IP address */
#endif
};
在初始化IP模块时,会对ARP模块进行初始化,调用的函数为:
[ net/ipv4/arp.c ]
/* Boot-time initialisation of the ARP module. */
void __init arp_init(void)
{
neigh_table_init(&arp_tbl); // initialise arp_tbl
dev_add_pack(&arp_packet_type); // register the handler for received packets of type ARP
arp_proc_init(); // create the arp file under proc
#ifdef CONFIG_SYSCTL
neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); // sysctl files for the table's parms
#endif
register_netdevice_notifier(&arp_netdev_notifier); // join the netdevice notifier chain
}
初始化arp_tbl:
[ net/core/neighbour.c ]
/*
 * Initialise @tbl and push it onto the head of the global neigh_tables list.
 *
 * While holding neigh_tbl_lock the list is scanned for a table that already
 * uses the same family. The new table is linked in either way; a duplicate
 * only produces an error message and a stack dump after the lock is dropped.
 */
void neigh_table_init(struct neigh_table *tbl)
{
	struct neigh_table *dup;

	neigh_table_init_no_netlink(tbl);

	write_lock(&neigh_tbl_lock);
	/* neigh_tables is the global list that every table hangs off. */
	dup = neigh_tables;
	while (dup && dup->family != tbl->family)
		dup = dup->next;

	tbl->next = neigh_tables;
	neigh_tables = tbl;
	write_unlock(&neigh_tbl_lock);

	if (unlikely(dup)) {
		pr_err("Registering multiple tables for family %d\n",
		       tbl->family);
		dump_stack();
	}
}
EXPORT_SYMBOL(neigh_table_init);
/*
 * Set up the internal state of a neighbour table: namespace and refcount of
 * the default parms, statistics, proc entry, both hash tables, entry size,
 * lock, periodic GC work, proxy timer/queue and the initial timestamps.
 * Any allocation or proc-creation failure here ends in panic().
 */
static void neigh_table_init_no_netlink(struct neigh_table *tbl)
{
unsigned long now = jiffies; // current time
unsigned long phsize;
write_pnet(&tbl->parms.net, &init_net); // bind the default parms to init_net
atomic_set(&tbl->parms.refcnt, 1); // set the reference count to 1
// reachable_time starts as a randomised value derived from BASE_REACHABLE_TIME
tbl->parms.reachable_time =
neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
tbl->stats = alloc_percpu(struct neigh_statistics); // statistics counters, allocated per CPU
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
#ifdef CONFIG_PROC_FS
// proc entry named after tbl->id, backed by neigh_stat_seq_fops
if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat,
&neigh_stat_seq_fops, tbl))
panic("cannot create neighbour proc dir entry");
#endif
RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); // initialise the neighbour hash table
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); // initialise the hash table of proxied addresses
if (!tbl->nht || !tbl->phash_buckets)
panic("cannot allocate neighbour cache hashes");
// Size of one neighbour object: if the protocol did not set it, use the offset
// of primary_key plus key_len, aligned to NEIGH_PRIV_ALIGN; otherwise only
// warn when the supplied size is not a multiple of NEIGH_PRIV_ALIGN.
if (!tbl->entry_size)
tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) +
tbl->key_len, NEIGH_PRIV_ALIGN);
else
WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
rwlock_init(&tbl->lock);
INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); // garbage collection work item
// first run is queued with a delay of reachable_time
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
tbl->parms.reachable_time);
setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); // timer for address proxying
// initialise the queue of packets waiting to be proxied
skb_queue_head_init_class(&tbl->proxy_queue,
&neigh_table_proxy_queue_class);
tbl->last_flush = now; // time of the most recent garbage collection
tbl->last_rand = now + tbl->parms.reachable_time * 20; // update timestamp, tied to neigh_parms->reachable_time
}
/*
 * Pick a random reachable time for the given base value.
 *
 * It is random distribution in the interval (1/2)*base...(3/2)*base.
 * It corresponds to default IPv6 settings and is not overridable,
 * because it is really reasonable choice.
 *
 * A base of 0 yields 0 (and never reaches the modulo).
 */
unsigned long neigh_rand_reach_time(unsigned long base)
{
	unsigned long half;

	if (!base)
		return 0;

	half = base >> 1;
	return (prandom_u32() % base) + half;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
为处理类型为ARP的包,要提供一个packet_type类型,并注册到内核中:
[ net/ipv4/arp.c ]
/* Packet handler description: frames whose type is ETH_P_ARP are handed to arp_rcv. */
static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
其中arp_rcv就是用来接收ARP包的函数。
创建neighbour对象时,要调用协议的构造函数;在arp_tbl中,它被设置为arp_constructor:
[ net/ipv4/arp.c ]
/*
 * ARP-specific setup of a freshly created neighbour entry.
 * Fills in the address type, the per-device parms, the initial NUD state,
 * the ops table and the output function.
 * Returns 0 on success, -EINVAL when the device has no IPv4 data.
 */
static int arp_constructor(struct neighbour *neigh)
{
__be32 addr = *(__be32 *)neigh->primary_key; // target address (the key of this entry)
struct net_device *dev = neigh->dev; // network device
struct in_device *in_dev;
struct neigh_parms *parms;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev); // IPv4 specific data
if (in_dev == NULL) {
rcu_read_unlock();
return -EINVAL;
}
neigh->type = inet_addr_type(dev_net(dev), addr); // address type: RTN_UNICAST, RTN_MULTICAST, ...
parms = in_dev->arp_parms; // parameters that decide how the neighbour protocol behaves
__neigh_parms_put(neigh->parms); // release the parms currently attached to neigh
neigh->parms = neigh_parms_clone(parms); // attach the device's ARP parms instead
rcu_read_unlock();
if (!dev->header_ops) { // device has no header_ops: no address resolution is used
neigh->nud_state = NUD_NOARP; // do not need any protocol to resolve the L3-to-L2 mapping
neigh->ops = &arp_direct_ops;
neigh->output = neigh_direct_output;
} else {
/* Good devices (checked by reading texts, but only Ethernet is
tested)
ARPHRD_ETHER: (ethernet, apfddi)
ARPHRD_FDDI: (fddi)
ARPHRD_IEEE802: (tr)
ARPHRD_METRICOM: (strip)
ARPHRD_ARCNET:
etc. etc. etc.
ARPHRD_IPDDP will also work, if author repairs it.
I did not it, because this driver does not work even
in old paradigm.
*/
#if 1
/* So... these "amateur" devices are hopeless.
The only thing, that I can say now:
It is very sad that we need to keep ugly obsolete
code to make them happy.
They should be moved to more reasonable state, now
they use rebuild_header INSTEAD OF hard_start_xmit!!!
Besides that, they are sort of out of date
(a lot of redundant clones/copies, useless in 2.1),
I wonder why people believe that they work.
*/
/* The listed device types get arp_broken_ops and return early; which case
 * labels exist depends on CONFIG_AX25 / CONFIG_NETROM. Without CONFIG_AX25
 * the ARPHRD_ROSE label just breaks out of the switch.
 */
switch (dev->type) {
default:
break;
case ARPHRD_ROSE:
#if IS_ENABLED(CONFIG_AX25)
case ARPHRD_AX25:/* AX.25 Level 2 (not CCITT X.25) */
#if IS_ENABLED(CONFIG_NETROM)
case ARPHRD_NETROM:/* from KA9Q: NET/ROM pseudo */
#endif
neigh->ops = &arp_broken_ops;
neigh->output = neigh->ops->output;
return 0;
#else
break;
#endif
}
#endif
if (neigh->type == RTN_MULTICAST) { // multicast address
neigh->nud_state = NUD_NOARP; // do not need any protocol to resolve the L3-to-L2 mapping
arp_mc_map(addr, neigh->ha, dev, 1); // derive the hardware address of neigh from the multicast IP
} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { // device does no ARP, or is a loopback interface
neigh->nud_state = NUD_NOARP; // do not need any protocol to resolve the L3-to-L2 mapping
memcpy(neigh->ha, dev->dev_addr, dev->addr_len); // hardware address of neigh = dev->dev_addr (the device's own address)
} else if (neigh->type == RTN_BROADCAST ||
(dev->flags & IFF_POINTOPOINT)) { // broadcast address, or the interface is a point-to-point link
neigh->nud_state = NUD_NOARP; // do not need any protocol to resolve the L3-to-L2 mapping
memcpy(neigh->ha, dev->broadcast, dev->addr_len); // hardware address of neigh = the device's broadcast address
}
if (dev->header_ops->cache) // the device provides a header cache callback
neigh->ops = &arp_hh_ops;
else
neigh->ops = &arp_generic_ops;
// NUD_VALID is not defined in this excerpt; presumably it includes the
// NUD_NOARP state set above, i.e. the entry already has a usable address
// and takes the connected output path -- TODO confirm.
if (neigh->nud_state & NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
neigh->output = neigh->ops->output;
}
return 0;
}
/*
 * Map a multicast IP address to a hardware address for the given device.
 *
 * @addr:  multicast IP address
 * @haddr: output buffer for the hardware address
 * @dev:   device whose type selects the mapping
 * @dir:   when non-zero, device types without a dedicated mapping fall back
 *         to the device's broadcast address
 *
 * Returns 0 when @haddr was filled in, -EINVAL otherwise.
 */
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
{
	int rc = 0;

	switch (dev->type) {
	case ARPHRD_ETHER:	/* ethernet, apfddi */
	case ARPHRD_FDDI:	/* fddi */
	case ARPHRD_IEEE802:	/* tr */
		/* Map a multicast IP onto multicast MAC for type ethernet. */
		ip_eth_mc_map(addr, haddr);
		break;
	case ARPHRD_INFINIBAND:	/* InfiniBand */
		ip_ib_mc_map(addr, dev->broadcast, haddr);
		break;
	case ARPHRD_IPGRE:	/* GRE over IP */
		ip_ipgre_mc_map(addr, dev->broadcast, haddr);
		break;
	default:
		if (dir)
			memcpy(haddr, dev->broadcast, dev->addr_len);
		else
			rc = -EINVAL;
		break;
	}

	return rc;
}
/*
 * Find the hardware address for the next hop of @skb and store it in @haddr.
 *
 * Returns 0 when @haddr has been filled in, 1 otherwise. The skb is freed
 * here when it carries no route or when no neighbour entry could be obtained.
 * When neigh_event_send() reports a non-zero result the skb is not freed
 * here (presumably neigh_event_send() has taken it over -- TODO confirm).
 */
int arp_find(unsigned char *haddr, struct sk_buff *skb)
{
	struct net_device *netdev = skb->dev;
	struct neighbour *neigh;
	__be32 nexthop;

	/* No route attached to the packet. */
	if (!skb_dst(skb)) {
		pr_debug("arp_find is called with dst==NULL\n");
		kfree_skb(skb);
		return 1;
	}

	/* Next hop: the gateway address, or the destination from the IP header. */
	nexthop = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);

	/*
	 * Per the original notes this handles local, multicast and broadcast
	 * addresses without consulting the cache.
	 */
	if (arp_set_predefined(inet_addr_type(dev_net(netdev), nexthop), haddr,
			       nexthop, netdev))
		return 0;

	/* Look the neighbour up in arp_tbl, creating it when missing. */
	neigh = __neigh_lookup(&arp_tbl, &nexthop, netdev, 1);
	if (!neigh) {
		kfree_skb(skb);
		return 1;
	}

	neigh->used = jiffies;

	/* Neither a valid entry nor a successful neigh_event_send(): give up. */
	if (!(neigh->nud_state & NUD_VALID) && neigh_event_send(neigh, skb) != 0) {
		neigh_release(neigh);
		return 1;
	}

	/* Copy the hardware address stored in the entry into haddr. */
	neigh_ha_snapshot(haddr, neigh, netdev);
	neigh_release(neigh);
	return 0;
}
EXPORT_SYMBOL(arp_find);
在缓存中查找neighbour:
[ include/net/neighbour.h ]
/*
 * Look up a neighbour by key and device; when it is missing and @creat is
 * non-zero, try to create it. Returns NULL when nothing was found and
 * creation was not requested, or when creation returned an error pointer.
 */
static inline struct neighbour *
__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)
{
	struct neighbour *found = neigh_lookup(tbl, pkey, dev);
	struct neighbour *fresh;

	if (found)
		return found;
	if (!creat)
		return NULL;

	fresh = neigh_create(tbl, pkey, dev);
	if (IS_ERR(fresh))
		return NULL;
	return fresh;
}
其中调用:
[ net/core/neighbour.c ]
// Search the table's hash table for the neighbour matching (pkey, dev).
// pkey: the target address used as the key; tbl->key_len bytes are compared.
// Returns the entry with its reference count raised, or NULL when no entry
// matches or the matching entry's reference count was already zero.
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
struct neighbour *n;
int key_len = tbl->key_len; // length of the address/key
u32 hash_val;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht); // hash table of neighbour objects
hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); // bucket index: the top hash_shift bits of the hash of pkey
for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); // walk the chain of this bucket
n != NULL;
n = rcu_dereference_bh(n->next)) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { // same device and same key
if (!atomic_inc_not_zero(&n->refcnt)) // take a reference unless the count already dropped to zero
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits); // counted as a hit even when the reference could not be taken
break;
}
}
rcu_read_unlock_bh();
return n;
}
EXPORT_SYMBOL(neigh_lookup);
如果没找到,创建一个:
[ include/net/neighbour.h ]
static inline struct neighbour *neigh_create(struct neigh_table *tbl,
const void *pkey,
struct net_device *dev)
{
return __neigh_create(tbl, pkey, dev, true);
}
[ net/core/neighbour.c ]
/*
 * Allocate a neighbour for (pkey, dev), run the protocol/device setup hooks
 * and insert it into the table's hash table.
 * If an entry with the same device and key already exists, that entry is
 * returned and the new one is released. With want_ref set, an extra
 * reference is taken on the returned entry.
 * Returns the neighbour, or an ERR_PTR() value on failure.
 */
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
u32 hash_val;
int key_len = tbl->key_len;// size of the lookup key, e.g. 4 for ARP
int error;
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); // allocate a neighbour
struct neigh_hash_table *nht;
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
}
memcpy(n->primary_key, pkey, key_len);// IP address; used as the key when searching the cache
n->dev = dev; // device through which this neighbor is reachable
dev_hold(dev);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
/* Optional per-driver construct hook. */
if (dev->netdev_ops->ndo_neigh_construct) {
error = dev->netdev_ops->ndo_neigh_construct(n);
if (error < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
}
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); // confirmation timestamp, backdated by twice BASE_REACHABLE_TIME
write_lock_bh(&tbl->lock);
// hash table of neighbour objects
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
// grow the hash table (hash_shift + 1) once the entry count exceeds the bucket count
if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
// compute the bucket index
hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) { // the parms are marked dead: refuse to insert
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
// scan the bucket for an existing entry with the same device and key
for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock));
n1 != NULL;
n1 = rcu_dereference_protected(n1->next,
lockdep_is_held(&tbl->lock))) {
if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
if (want_ref)
neigh_hold(n1);
rc = n1; // return the existing entry; the new one is released below
goto out_tbl_unlock;
}
}
n->dead = 0; // clear the "may be deleted" mark
if (want_ref)
neigh_hold(n);
// insert at the head of the bucket
rcu_assign_pointer(n->next,
rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(nht->hash_buckets[hash_val], n);
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
write_unlock_bh(&tbl->lock);
out_neigh_release:
neigh_release(n); // release the entry that was not inserted
goto out;
}
EXPORT_SYMBOL(__neigh_create);