TCP/IP协议栈源代码分析
前言
Linux内核的整体架构如下图所示:
根据内核的核心功能,Linux内核提出了5个子系统,分别负责如下的功能:
- Process Scheduler,也称作进程管理、进程调度。负责管理CPU资源,以便让各个进程可以以尽量公平的方式访问CPU。
- Memory Manager,内存管理。负责管理Memory(内存)资源,以便让各个进程可以安全地共享机器的内存资源。另外,内存管理会提供虚拟内存的机制,该机制可以让进程使用多于系统可用Memory的内存,不用的内存会通过文件系统保存在外部非易失存储器中,需要使用的时候,再取回到内存中。
- VFS(Virtual File System),虚拟文件系统。Linux内核将不同功能的外部设备,例如Disk设备(硬盘、磁盘、NAND Flash、Nor Flash等)、输入输出设备、显示设备等等,抽象为可以通过统一的文件操作接口(open、close、read、write等)来访问。
- Network,网络子系统。负责管理系统的网络设备,并实现多种多样的网络标准。
- IPC(Inter-Process Communication),进程间通信。IPC不管理任何的硬件,它主要负责Linux系统中进程之间的通信。
我们主要关注的是第四部分:网络子系统。
Linux基于TCP/IP协议的内核网络栈实现结构如下图所示:
以linux-5.4.34源代码为基础,对TCP/IP协议栈源代码进行了分析,现对以下几个问题,给出自己的思考与理解。
inet_init是如何被调用的?从start_kernel到inet_init调用路径
先给出从start_kernel到inet_init的调用路径:
即先运行start_kernel函数,进行一系列初始化(内存,计时器,调度器等),然后在函数的最后调用rest_init函数,该函数会通过kernel_thread函数启动一个内核线程,然后调用kernel_init_freeable函数初始化内核的一些关键部分(页表,中断等)然后调用do_basic_setup函数初始化剩余部分,最后调用do_initcalls函数,do_initcalls()循环调用do_initcall_level(level),level就是initcall的优先级数字,会按照等级,从 level0 ~ level7 来对内核的模块进行初始化,这时候就会调用inet_init()。do_initcalls() 是从特定的内存区域取出初始化函数的指针,而这些函数指针数组的声明在链接脚本arch/x86/kernel/vmlinux.lds中,由fs_initcall宏定义来添加。
下面通过源码来解析inet_init()是如何被调用的:
可以从net/ipv4/af_inet.c中找到inet_init函数的定义,定义如下:
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops);
#ifdef CONFIG_SYSCTL
ip_static_sysctl_init();
#endif
/*
* Add all the base protocols.
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
/*
* Set the ARP module up
*/
arp_init();
/*
* Set the IP module up
*/
ip_init();
/* Setup TCP slab cache for open requests. */
tcp_init();
/* Setup UDP memory threshold */
udp_init();
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
raw_init();
ping_init();
/*
* Set the ICMP layer up
*/
if (icmp_init() < 0)
panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
if (init_inet_pernet_ops())
pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
ipfrag_init();
dev_add_pack(&ip_packet_type);
ip_tunnel_core_init();
rc = 0;
out:
return rc;
out_unregister_raw_proto:
proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
goto out;
}
fs_initcall(inet_init);
函数的最后使用了fs_initcall(inet_init),这是一个宏定义,会把指定的函数添加到对应的section中,并返回一个函数指针,编译的时候需要把这个函数指针变量放置在对应位置,在这里会把inet_init添加到initcall中,将内核模块初始化函数按照功能优先级分组放置到.init段,.init段在链接脚本arch/x86/kernel/vmlinux.lds中定义。vmlinux.lds定义了整个内核编译之后的链接过程,决定内核各个段的存储位置,以及入口地址。
通过查看/init/main.c,可以发现其中定义了一个静态的initcall_levels数组,这是一个指针数组,数组的每个元素都是指向一个数组的指针,而这个initcall_levels数组就是具体添加每个内核模块初始化函数的地方(例如inet_init)。定义如下:
extern initcall_entry_t __initcall_start[];
extern initcall_entry_t __initcall0_start[];
extern initcall_entry_t __initcall1_start[];
extern initcall_entry_t __initcall2_start[];
extern initcall_entry_t __initcall3_start[];
extern initcall_entry_t __initcall4_start[];
extern initcall_entry_t __initcall5_start[];
extern initcall_entry_t __initcall6_start[];
extern initcall_entry_t __initcall7_start[];
extern initcall_entry_t __initcall_end[];
static initcall_entry_t *initcall_levels[] __initdata = {
__initcall0_start,
__initcall1_start,
__initcall2_start,
__initcall3_start,
__initcall4_start,
__initcall5_start,
__initcall6_start,
__initcall7_start,
__initcall_end,
};
然后在do_initcalls()被调用时,会循环调用do_initcall_level(level),会按照等级,从 level0 ~ level7 来调用对应函数,而inet_init已经被链接到了其中,所以会被调用。
do_initcalls函数和do_initcall_level函数的定义如下:
static void __init do_initcall_level(int level)
{
initcall_entry_t *fn;
strcpy(initcall_command_line, saved_command_line);
parse_args(initcall_level_names[level],
initcall_command_line, __start___param,
__stop___param - __start___param,
level, level,
NULL, &repair_env_string);
trace_initcall_level(initcall_level_names[level]);
for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
do_one_initcall(initcall_from_entry(fn));
}
static void __init do_initcalls(void)
{
int level;
for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
do_initcall_level(level);
}
跟踪分析TCP/IP协议栈如何将自己与上层套接口与下层数据链路层关联起来的?
简单的说,对于上层,TCP/IP协议栈通过数据结构struct socket、struct sock和定义在其上的一些函数操作与之关联。
而对于下层数据链路层,通过数据结构struct net_device和定义在其上的一些函数操作来与设备接口层进行关联。
TCP/IP协议栈会通过proto_register和inet_add_protocol等函数,注册每个协议自己的处理函数,这就使得不同的数据包到来时,会进行不同的处理,而这些处理需要通过上述两个数据结构来完成上下层之间的数据沟通。
TCP的三次握手源代码跟踪分析,跟踪找出设置和发送SYN/ACK的位置,以及状态转换的位置
状态转换的过程可以由下图表示:
客户端发送SYN报文:
跟踪tcp_v4_connect函数的执行过程,可以发现该函数建立了一个TCP连接,并且设置了 TCP_SYN_SENT并进一步调用了 tcp_connect(sk),tcp_connect调用tcp_connect_init初始化传输控制块相关数据(发送队列、发送窗口、序号等),调用tcp_connect_queue_skb将SYN报文添加到发送队列,调用tcp_transmit_skb填充TCP协议各字段,然后调用ip_queue_xmit发送报文到网络层,发送之后,调用inet_csk_reset_xmit_timer重置启动超时重传定时器。
对于服务器端,跟踪tcp_v4_rcv函数的执行过程:
服务器接收SYN报文(第一次握手):
发现tcp_v4_rcv调用__inet_lookup查找接收数据的socket,先调用__inet_lookup_established查找已建立连接的socket,相关的端口地址已建立连接的话,就不能再次建立连接,没有建立连接就调用__inet_lookup_listener检查是否有监听的socket,最后返回处理报文的socket,对于监听状态的socket,调用tcp_v4_do_rcv处理请求报文。
状态转换主要通过tcp_set_state函数进行,该函数位于
SYN报文处理及发送SYN+ACK报文(第二次握手):
tcp_v4_do_rcv最终调用tcp_rcv_state_process处理各种状态的输入,对于监听状态的socket,检查过滤掉各种不合法的标志的报文,只处理SYN标志的报文。tcp_conn_request调用inet_reqsk_alloc分配连接请求块(TCP_NEW_SYN_RECV),调用inet_csk_reqsk_queue_hash_add、inet_ehash_insert将请求控制块加入到全局的ehash散列表中(__inet_lookup_established从里面查找socket),调用tcp_v4_send_synack构造发送SYN+ACK报文。
客户端接收SYN+ACK发送ACK(第三次握手):
调用tcp_v4_rcv、tcp_v4_do_rcv、tcp_rcv_state_process处理SYN+ACK报文,客户端在TCP_SYN_SENT状态,处理第二次握手的SYN+ACK报文,调用tcp_rcv_synsent_state_process处理报文,初始化发送窗口,调用tcp_finish_connect完成连接(设置socket状态为TCP_ESTABLISHED,如果有等待socket的线程,唤醒该线程),不延迟发送ACK的情况下,调用tcp_send_ack发送第三次握手的ACK报文。
状态转换:
主要通过调用tcp_set_state函数进行,定义如下:
void tcp_set_state(struct sock *sk, int state)
{
int oldstate = sk->sk_state;
/* We defined a new enum for TCP states that are exported in BPF
* so as not force the internal TCP states to be frozen. The
* following checks will detect if an internal state value ever
* differs from the BPF value. If this ever happens, then we will
* need to remap the internal value to the BPF value before calling
* tcp_call_bpf_2arg.
*/
BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
switch (state) {
case TCP_ESTABLISHED:
if (oldstate != TCP_ESTABLISHED)
TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
break;
case TCP_CLOSE:
if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
sk->sk_prot->unhash(sk);
if (inet_csk(sk)->icsk_bind_hash &&
!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
inet_put_port(sk);
/* fall through */
default:
if (oldstate == TCP_ESTABLISHED)
TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
}
/* Change state AFTER socket is unhashed to avoid closed
* socket sitting in hash tables.
*/
inet_sk_state_store(sk, state);
}
EXPORT_SYMBOL_GPL(tcp_set_state);
该函数会根据上述握手挥手的过程,设置客户端和服务器的状态。
send在TCP/IP协议栈中的执行路径
从分层模型的角度来分析send的执行过程:
应用层:
对于TCP,应用程序在创建socket之后,调用connect()函数,通过socket使客户端和服务端建立连接。然后就可以调用send函数发送数据。
系统调用层:
send函数会调用系统调用sys_sendto,然后调用socket中的sock_sendmsg函数
传输层:
不同的协议有不同的发送函数,TCP调用tcp_sendmsg函数,而UDP则调用的是sock_sendmsg函数。tcp_sendmsg()的主要工作是传输用户层的数据,将数据放入skb中。然后调用tcp_push()发送,tcp_push函数调用tcp_write_xmit() 函数,依次调用发送函数tcp_transmit_skb将skb封装tcp头之后,回调ip_queue_xmit。
网络层:
ip_queue_xmit(skb)主要有路由查找校验、封装ip头和ip选项,最后通过ip_local_out发送数据包。
数据链路层:
网络层最后调用dev_queue_xmit函数来发送数据报。dev_queue_xmit函数会调用do_dev_queue_xmit函数,根据不同的设备调用不同的函数来发送数据。
上述调用过程可以概述如下:
以采用TCP协议为例,再加入网络层和数据链路层的调用,过程可以概括如下:
recv在TCP/IP协议栈中的执行路径
同样的,也可以分层对recv的执行过程进行分析:
数据链路层:
当数据包到达机器的物理网卡时会触发一个中断,中断处理程序分配skb_buff数据结构,并将从网卡I/O接收到的数据帧复制到skb_buff缓冲区,并设置skb_buff相应的参数。
然后发出软中断,通知内核接收新的数据帧。进入软中断处理流程,调用net_rx_action函数。进入 netif _receive_skb 处理流程。
netif_receive_skb 根据在全局数组 ptype_all 和 ptype_base 中注册的网络层数据报类型,将数据报发送到不同的网络层协议接收函数(INET域主要是ip_rcv和arp_rcv)。
网络层:
ip_rcv函数为网络层的入口函数。该函数做的第一件事就是数据校验,然后调用ip_rcv_finish这个函数。
ip_rcv_finish函数会调用ip_route_input函数来更新路由,然后寻找路由,决定消息是发送到本地机器,转发还是丢弃。
如果发送到本机,则调用ip_local_deliver函数,可以进行碎片整理(合并多个包),并调用ip_local_deliver_finish。最后调用下一层接口,包括tcp_v4_rcv(TCP)、udp_rcv(UDP)、icmp_rcv(ICMP)、igmp_rcv(IGMP)。如果需要转发,则进入转发流程,调用dev_queue_xmit,进入链路层处理流程。如果不是发送到本机,应该是转发,调用 ip_forward 进行转发 。
传输层:
在该层,我们会做一些完整性检查,如果发现问题就丢包。如果是tcp,则调用tcp_v4_do_rcv。
然后sk->sk_state == TCP_ESTABLISHED,调用tcp_rcv_builted,调用 tcp_data_queue 方法将消息放入队列。然后使用 tcp_ofo_queue 方法将消息插入接收到 Queued 。
应用层:
应用程序调用读取或者 recv 的时候,该调用被映射到 /net/socket.c 中的sys_recv系统调用,然后调用 sock_recvmsg 函数。
TCP 会调用 tcp_recvmsg。该函数从套接字缓冲区复制数据到缓冲区。
上述过程可以概括如下:
对于接收方而言,当有数据来的时候,都是通过终端来通知内核,最终通过回调,调用系统函数,过程如下:
综合上述两问,可以得到send和recv的完整过程:
路由表的结构和初始化过程
路由表结构
为了支持策略路由,Linux使用了多个路由表而不是一个,即使不使用策略路由,Linux也使用了两个路由表,一个用于上传给本地上层协议,另一个则用于转发。Linux使用多个路由表而不是一个,使不同策略的路由存放在不同的表中,有效地被免了查找庞大的路由表,在一定情度上提高了查找了效率。
路由表本身不是由一个结构表示,而是由多个结构组合而成。路由表可以说是一个分层的结构组合(trie树)。
在第一层,它先将所有的路由根据子网掩码(netmask)的长度(0~32)分成33个部分(structfn_zone),然后在同一子网掩码(同一层)中,再根据子网的不同(如10.1.1.0/24和10.1.2.0/24),划分为第二层(struct fib_node),在同一子网中,有可能由于TOS等属性的不同而使用不同的路由,这就是第三层(structfib_alias),第三层结构表示一个路由表项,而每个路由表项又包括一个相应的参数,如协议,下一跳路由地址(由structfib_nh存储)等等,这就是第四层(structfib_info)。分层的好处是显而易见的,它使路由表的更加优化,逻辑上也更加清淅,并且使数据可以共享(如structfib_info),从而减少了数据的冗余。
源码中struct fib_table *fib_tables[RT_TABLE_MAX+1]; // RT_TABLE_MAX 为255
是一个路由表的总体结构,是一个fib_table结构指针的数组,每个fib_table结构在内核中表示一个路由表,定义如下:
struct fib_table {
struct hlist_node tb_hlist;
u32 tb_id;
int tb_num_default;
struct rcu_head rcu;
unsigned long *tb_data;
unsigned long __data[0];
};
初始化过程
主要通过ip_fib_init函数完成初始化,定义如下:
void __init ip_fib_init(void)
{
fib_trie_init();
register_pernet_subsys(&fib_net_ops);
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
}
该函数完成了netlink路由添加、删除和dump命令处理函数部分的注册、路由表路由缓存的初始化、同时还会监听其他模块
通过目的IP查询路由表的到下一跳的IP地址的过程
会调用fib_lookup函数查找,定义如下:
static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
struct fib_result *res, unsigned int flags)
{
struct fib_table *tb;
int err = -ENETUNREACH;
rcu_read_lock();
tb = fib_get_table(net, RT_TABLE_MAIN);
if (tb)
err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF);
if (err == -EAGAIN)
err = -ENETUNREACH;
rcu_read_unlock();
return err;
}
该函数在收到数据包后会调用fib_get_table函数获取路由表,如果成果获取会调用fib_table_lookup进行查找,该函数会遍历上一问提到的数据结构(trie树),将路由信息填充到数据结构struct fib_result中。
ARP缓存的数据结构及初始化过程,包括ARP缓存的初始化
数据结构
通过查看net/ipv4/arp.c 可以发现arp_tbl的定义,它是一个neigh_table类型的结构体
struct neigh_table arp_tbl = {
.family = AF_INET,
.key_len = 4,
.protocol = cpu_to_be16(ETH_P_IP),
.hash = arp_hash,
.key_eq = arp_key_eq,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
.id = "arp_cache",
.parms = {
.tbl = &arp_tbl,
.reachable_time = 30 * HZ,
.data = {
[NEIGH_VAR_MCAST_PROBES] = 3,
[NEIGH_VAR_UCAST_PROBES] = 3,
[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
[NEIGH_VAR_LOCKTIME] = 1 * HZ,
},
},
.gc_interval = 30 * HZ,
.gc_thresh1 = 128,
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
};
从net/neighbour.h中可以查看到neigh_table和neighbour的定义:
邻居表:
struct neigh_table {
int family;
unsigned int entry_size;
unsigned int key_len;
__be16 protocol;
__u32 (*hash)(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
bool (*key_eq)(const struct neighbour *, const void *pkey);
int (*constructor)(struct neighbour *);
int (*pconstructor)(struct pneigh_entry *);
void (*pdestructor)(struct pneigh_entry *);
void (*proxy_redo)(struct sk_buff *skb);
bool (*allow_add)(const struct net_device *dev,
struct netlink_ext_ack *extack);
char *id;
struct neigh_parms parms;
struct list_head parms_list;
int gc_interval;
int gc_thresh1;
int gc_thresh2;
int gc_thresh3;
unsigned long last_flush;
struct delayed_work gc_work;
struct timer_list proxy_timer;
struct sk_buff_head proxy_queue;
atomic_t entries;
atomic_t gc_entries;
struct list_head gc_list;
rwlock_t lock;
unsigned long last_rand;
struct neigh_statistics __percpu *stats;
struct neigh_hash_table __rcu *nht;
struct pneigh_entry **phash_buckets;
};
邻居:
struct neighbour {
struct neighbour __rcu *next;
struct neigh_table *tbl;
struct neigh_parms *parms;
unsigned long confirmed;
unsigned long updated;
rwlock_t lock;
refcount_t refcnt;
unsigned int arp_queue_len_bytes;
struct sk_buff_head arp_queue;
struct timer_list timer;
unsigned long used;
atomic_t probes;
__u8 flags;
__u8 nud_state;
__u8 type;
__u8 dead;
u8 protocol;
seqlock_t ha_lock;
unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8);
struct hh_cache hh;
int (*output)(struct neighbour *, struct sk_buff *);
const struct neigh_ops *ops;
struct list_head gc_list;
struct rcu_head rcu;
struct net_device *dev;
u8 primary_key[0];
} __randomize_layout;
其中:
struct neighbour :
存储邻居的有关信息,例如地址、NUD状态、访问该邻居经过的设备等,需要注意的是,一个neighbour项并不是与一台主机有关,而是与一个地址有关,因为一台主机可能有多个地址(比如路由器,可能对应多个neighbour项)
struct neigh_table:
描述一种邻居协议的参数和所用函数。每个邻居协议都有该结构的一个实例(比如上面提到的arp_tbl)。所有实例都插入到一个由静态变量neigh_tables指向的一个全局表中,并且由neigh_tbl_lock来加锁保护。该锁只保护全局表的完整性,但不保护表中每个条目的内容。
struct neigh_parms:
对每个设备上邻居协议行为进行调整的一组参数。由于在大部分接口上都可以同时启动多个协议,所以一个net_device结构可以关联多个neigh_parms结构。除此之外,每个协议所用的全部neigh_parms结构都会链接到一个单向列表中,其根节点会保存在该协议的neigh_table中。
struct neigh_ops:
一组函数,用来表示IP协议和dev_queue_xmit之间的接口。
初始化过程
用net/ipv4/arp.c中的arp_init函数来初始化ARP协议,函数的定义如下:
void __init arp_init(void)
{
neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
dev_add_pack(&arp_packet_type);
arp_proc_init();
#ifdef CONFIG_SYSCTL
neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
#endif
register_netdevice_notifier(&arp_netdev_notifier);
}
首先,该函数会调用neigh_table_init函数,注册一个虚函数表和ARP协议使用的其他常用参数(即arp_tbl)。
然后,调用dev_add_pack函数,安装ARP协议的处理函数,用于处理ARP封包。
接着,会调用arp_proc_init函数,该函数会建立/proc/net/arp文件,通过读取该文件就可以看到ARP缓存的内容(其中包括ARP代理的地址)。
如果内核支持sysctl,就可以通过neigh_sysctl_register函数创建一个目录
/proc/sys/net/ipv4/neigh,用于输出neigh_parms结构的默认调节参数,第一个参数为NULL表示想注册为默认目录。
最后,register_netdevice_notifier会向内核注册一个回调函数,用于接收设备状态和配置变化的通知。
ARP的缓存即arp_tbl是通过neigh_table_init函数初始化的,这个函数主要完成以下工作:
- 为neighbour结构分配预备的内存池。
- 分配一个neigh_statistics结构来收集协议的统计信息。
- 分配两个hash表:hash_buckets和phash_buckets。这两个表分别作为解析过的地址关联缓存和代理的地址数据库。
- 在/proc/net中建立一个文件,用于转存缓存的内容。文件名来自neigh_table->id
- 启动gc_timer垃圾回收定时器。
- 初始化proxy_timer代理定时器和相关的proxy_queue队列。
- 添加neigh_table结构到neigh_tables全局列表中。
- 初始化其他参数,例如reachable_time等。
如何将IP地址解析出对应的MAC地址
在Linux源代码中,IP地址解析为对应的MAC地址主要通过ARP协议来完成,具体可以分为以下几个步骤:
- ARP请求发送:当需要解析某个IP地址对应的MAC地址时,内核会通过ARP协议发送一个请求广播到本地网络,包含发送方的IP地址和MAC地址,以及目标的IP地址。
- ARP表查找:内核会查找本地的ARP缓存表,看是否存在目标IP对应的MAC地址,如果存在则直接使用该MAC地址。
- ARP表缺失:如果ARP表中没有对应条目,内核会广播发送ARP请求。
- ARP应答:当目标主机收到ARP请求后,会回应ARP应答,其中包含自己的MAC地址,发送ARP请求的主机收到ARP应答后,会更新自己的ARP缓存表。
下面从源码的角度分析上述过程:
我们在上一个问题中,已经谈到,ARP协议初始化的过程中,会通过调用dev_add_pack函数,安装ARP协议的处理函数,用于处理ARP封包。
在net/ipv4/arp.c中可以看到该处理函数:
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
const struct arphdr *arp;
/* do not tweak dropwatch on an ARP we will ignore */
if (dev->flags & IFF_NOARP ||
skb->pkt_type == PACKET_OTHERHOST ||
skb->pkt_type == PACKET_LOOPBACK)
goto consumeskb;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
goto out_of_mem;
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
if (!pskb_may_pull(skb, arp_hdr_len(dev)))
goto freeskb;
arp = arp_hdr(skb);
if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
goto freeskb;
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
dev_net(dev), NULL, skb, dev, NULL,
arp_process);
consumeskb:
consume_skb(skb);
return NET_RX_SUCCESS;
freeskb:
kfree_skb(skb);
out_of_mem:
return NET_RX_DROP;
}
ARP包中提取的参数:
ARP包中的字段 | 局部变量名 |
---|---|
发送方Ethernet地址 | sha |
发送方IP地址 | sip |
目的Ehernet地址 | tha |
目的IP地址 | tip |
ARP封包可以从skb缓冲区中得到,skb也是函数arp_rcv的输入参数;ARP的包头就是skb->nh.arph。该函数的首要任务就是确保收到的ARP封包不是碎片,即内存中该封包可以线性访问。如果不是碎片,arp_rcv就会调用函数pskb_may_pull来保证在主缓存区中有足够的空间用于存放ARP包头和负载。
该函数也会判断是否将封包丢弃,如果满足以下条件就会被丢弃:
- 设备有IFF_NOARP标识,即收到封包的设备没有使用ARP协议。
- 封包的目的地址不是收到封包的接口(即目的地址不是接口的地址或广播地址)
当缓冲区被共享时,arp_rcv将调用skb_share_check函数克隆一个缓冲区,保证处理ARP封包时没有其他进程改变skb的内容。
如果Netfilter没有拦截封包,就会调用arp_process函数负责处理封包。
arp_process函数的定义如下:
static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct arphdr *arp;
unsigned char *arp_ptr;
struct rtable *rt;
unsigned char *sha;
unsigned char *tha = NULL;
__be32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
struct neighbour *n;
struct dst_entry *reply_dst = NULL;
bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
* is ARP'able.
*/
if (!in_dev)
goto out_free_skb;
arp = arp_hdr(skb);
switch (dev_type) {
default:
if (arp->ar_pro != htons(ETH_P_IP) ||
htons(dev_type) != arp->ar_hrd)
goto out_free_skb;
break;
case ARPHRD_ETHER:
case ARPHRD_FDDI:
case ARPHRD_IEEE802:
/*
* ETHERNET, and Fibre Channel (which are IEEE 802
* devices, according to RFC 2625) devices will accept ARP
* hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
* This is the case also of FDDI, where the RFC 1390 says that
* FDDI devices should accept ARP hardware of (1) Ethernet,
* however, to be more robust, we'll accept both 1 (Ethernet)
* or 6 (IEEE 802.2)
*/
if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
arp->ar_pro != htons(ETH_P_IP))
goto out_free_skb;
break;
case ARPHRD_AX25:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_AX25))
goto out_free_skb;
break;
case ARPHRD_NETROM:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_NETROM))
goto out_free_skb;
break;
}
/* Understand only these message types */
if (arp->ar_op != htons(ARPOP_REPLY) &&
arp->ar_op != htons(ARPOP_REQUEST))
goto out_free_skb;
/*
* Extract fields
*/
arp_ptr = (unsigned char *)(arp + 1);
sha = arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
switch (dev_type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
case ARPHRD_IEEE1394:
break;
#endif
default:
tha = arp_ptr;
arp_ptr += dev->addr_len;
}
memcpy(&tip, arp_ptr, 4);
/*
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
if (ipv4_is_multicast(tip) ||
(!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
goto out_free_skb;
/*
* For some 802.11 wireless deployments (and possibly other networks),
* there will be an ARP proxy and gratuitous ARP frames are attacks
* and thus should not be accepted.
*/
if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
goto out_free_skb;
/*
* Special case: We must set Frame Relay source Q.922 address
*/
if (dev_type == ARPHRD_DLCI)
sha = dev->broadcast;
/*
* Process entry. The idea here is we want to send a reply if it is a
* request for us or if it is a request for someone else that we hold
* a proxy for. We want to add an entry to our cache if it is a reply
* to us or if it is a request for our address.
* (The assumption for this last is that if someone is requesting our
* address, they are probably intending to talk to us, so it saves time
* if we cache their address. Their address is also probably not in
* our cache, since ours is not in their cache.)
*
* Putting this another way, we only care about replies if they are to
* us, in which case we add them to the cache. For requests, we care
* about those for us and those for our proxies. We reply to both,
* and in the case of requests for us we add the requester to the arp
* cache.
*/
if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
reply_dst = (struct dst_entry *)
iptunnel_metadata_reply(skb_metadata_dst(skb),
GFP_ATOMIC);
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
sha, dev->dev_addr, sha, reply_dst);
goto out_consume_skb;
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
rt = skb_rtable(skb);
addr_type = rt->rt_type;
if (addr_type == RTN_LOCAL) {
int dont_send;
dont_send = arp_ignore(in_dev, sip, tip);
if (!dont_send && IN_DEV_ARPFILTER(in_dev))
dont_send = arp_filter(sip, tip, dev);
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
sip, dev, tip, sha,
dev->dev_addr, sha,
reply_dst);
neigh_release(n);
}
}
goto out_consume_skb;
} else if (IN_DEV_FORWARD(in_dev)) {
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
(rt->dst.dev != dev &&
pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
sip, dev, tip, sha,
dev->dev_addr, sha,
reply_dst);
} else {
pneigh_enqueue(&arp_tbl,
in_dev->arp_parms, skb);
goto out_free_dst;
}
goto out_consume_skb;
}
}
}
/* Update our ARP tables */
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
addr_type = -1;
if (n || IN_DEV_ARP_ACCEPT(in_dev)) {
is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
sip, tip, sha, tha);
}
if (IN_DEV_ARP_ACCEPT(in_dev)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
if (!n &&
(is_garp ||
(arp->ar_op == htons(ARPOP_REPLY) &&
(addr_type == RTN_UNICAST ||
(addr_type < 0 &&
/* postpone calculation to as late as possible */
inet_addr_type_dev_table(net, dev, sip) ==
RTN_UNICAST)))))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
if (n) {
int state = NUD_REACHABLE;
int override;
/* If several different ARP replies follows back-to-back,
use the FIRST one. It is possible, if several proxy
agents are active. Taking the first reply prevents
arp trashing and chooses the fastest router.
*/
override = time_after(jiffies,
n->updated +
NEIGH_VAR(n->parms, LOCKTIME)) ||
is_garp;
/* Broadcast replies and request packets
do not assert neighbour reachability.
*/
if (arp->ar_op != htons(ARPOP_REPLY) ||
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
neigh_update(n, sha, state,
override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);
neigh_release(n);
}
out_consume_skb:
consume_skb(skb);
out_free_dst:
dst_release(reply_dst);
return NET_RX_SUCCESS;
out_free_skb:
kfree_skb(skb);
return NET_RX_DROP;
}
该函数的处理流程如下:
该函数首先会对它能理解的所有ARP封包类型进行合理性检查,然后根据封包类型执行相应的操作。函数的最后一部分是另一些常用代码,负责更新ARP缓存。
而arp_process只会处理ARPOP_REQUEST类型和ARPOP_REPLY类型封包,其他类型的ARP封包都会被丢弃。
ARPOP_REQUEST类型封包的处理过程如下:
ARPOP_REPLY类型封包的处理过程如下:
跟踪TCP send过程中的路由查询和ARP解析的最底层实现
路由查询的函数调用过程为:
ip_route_output_flow->
__ip_route_output_key->
ip_route_output_key_hash->
ip_route_output_key_hash_rcu->
fib_lookup
根据上一问的解析,ARP的底层主要通过以下几个函数实现:
发送ARP请求:
struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
struct net_device *dev, __be32 src_ip,
const unsigned char *dest_hw,
const unsigned char *src_hw,
const unsigned char *target_hw)
{
struct sk_buff *skb;
struct arphdr *arp;
unsigned char *arp_ptr;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
/*
* Allocate a buffer
*/
skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
if (!skb)
return NULL;
skb_reserve(skb, hlen);
skb_reset_network_header(skb);
arp = skb_put(skb, arp_hdr_len(dev));
skb->dev = dev;
skb->protocol = htons(ETH_P_ARP);
if (!src_hw)
src_hw = dev->dev_addr;
if (!dest_hw)
dest_hw = dev->broadcast;
/*
* Fill the device header for the ARP frame
*/
if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
goto out;
/*
* Fill out the arp protocol part.
*
* The arp hardware type should match the device type, except for FDDI,
* which (according to RFC 1390) should always equal 1 (Ethernet).
*/
/*
* Exceptions everywhere. AX.25 uses the AX.25 PID value not the
* DIX code for the protocol. Make these device structure fields.
*/
switch (dev->type) {
default:
arp->ar_hrd = htons(dev->type);
arp->ar_pro = htons(ETH_P_IP);
break;
#if IS_ENABLED(CONFIG_AX25)
case ARPHRD_AX25:
arp->ar_hrd = htons(ARPHRD_AX25);
arp->ar_pro = htons(AX25_P_IP);
break;
#if IS_ENABLED(CONFIG_NETROM)
case ARPHRD_NETROM:
arp->ar_hrd = htons(ARPHRD_NETROM);
arp->ar_pro = htons(AX25_P_IP);
break;
#endif
#endif
#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
arp->ar_hrd = htons(ARPHRD_ETHER);
arp->ar_pro = htons(ETH_P_IP);
break;
#endif
}
arp->ar_hln = dev->addr_len;
arp->ar_pln = 4;
arp->ar_op = htons(type);
arp_ptr = (unsigned char *)(arp + 1);
memcpy(arp_ptr, src_hw, dev->addr_len);
arp_ptr += dev->addr_len;
memcpy(arp_ptr, &src_ip, 4);
arp_ptr += 4;
switch (dev->type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
case ARPHRD_IEEE1394:
break;
#endif
default:
if (target_hw)
memcpy(arp_ptr, target_hw, dev->addr_len);
else
memset(arp_ptr, 0, dev->addr_len);
arp_ptr += dev->addr_len;
}
memcpy(arp_ptr, &dest_ip, 4);
return skb;
out:
kfree_skb(skb);
return NULL;
}
EXPORT_SYMBOL(arp_create);
void arp_send(int type, int ptype, __be32 dest_ip,
struct net_device *dev, __be32 src_ip,
const unsigned char *dest_hw, const unsigned char *src_hw,
const unsigned char *target_hw)
{
arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw,
target_hw, NULL);
}
EXPORT_SYMBOL(arp_send);
arp_rcv函数:
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
const struct arphdr *arp;
/* do not tweak dropwatch on an ARP we will ignore */
if (dev->flags & IFF_NOARP ||
skb->pkt_type == PACKET_OTHERHOST ||
skb->pkt_type == PACKET_LOOPBACK)
goto consumeskb;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
goto out_of_mem;
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
if (!pskb_may_pull(skb, arp_hdr_len(dev)))
goto freeskb;
arp = arp_hdr(skb);
if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
goto freeskb;
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
dev_net(dev), NULL, skb, dev, NULL,
arp_process);
consumeskb:
consume_skb(skb);
return NET_RX_SUCCESS;
freeskb:
kfree_skb(skb);
out_of_mem:
return NET_RX_DROP;
}
ARP处理函数:
static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct arphdr *arp;
unsigned char *arp_ptr;
struct rtable *rt;
unsigned char *sha;
unsigned char *tha = NULL;
__be32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
struct neighbour *n;
struct dst_entry *reply_dst = NULL;
bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
* is ARP'able.
*/
if (!in_dev)
goto out_free_skb;
arp = arp_hdr(skb);
switch (dev_type) {
default:
if (arp->ar_pro != htons(ETH_P_IP) ||
htons(dev_type) != arp->ar_hrd)
goto out_free_skb;
break;
case ARPHRD_ETHER:
case ARPHRD_FDDI:
case ARPHRD_IEEE802:
/*
* ETHERNET, and Fibre Channel (which are IEEE 802
* devices, according to RFC 2625) devices will accept ARP
* hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
* This is the case also of FDDI, where the RFC 1390 says that
* FDDI devices should accept ARP hardware of (1) Ethernet,
* however, to be more robust, we'll accept both 1 (Ethernet)
* or 6 (IEEE 802.2)
*/
if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
arp->ar_pro != htons(ETH_P_IP))
goto out_free_skb;
break;
case ARPHRD_AX25:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_AX25))
goto out_free_skb;
break;
case ARPHRD_NETROM:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_NETROM))
goto out_free_skb;
break;
}
/* Understand only these message types */
if (arp->ar_op != htons(ARPOP_REPLY) &&
arp->ar_op != htons(ARPOP_REQUEST))
goto out_free_skb;
/*
* Extract fields
*/
arp_ptr = (unsigned char *)(arp + 1);
sha = arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
switch (dev_type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
case ARPHRD_IEEE1394:
break;
#endif
default:
tha = arp_ptr;
arp_ptr += dev->addr_len;
}
memcpy(&tip, arp_ptr, 4);
/*
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
if (ipv4_is_multicast(tip) ||
(!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
goto out_free_skb;
/*
* For some 802.11 wireless deployments (and possibly other networks),
* there will be an ARP proxy and gratuitous ARP frames are attacks
* and thus should not be accepted.
*/
if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
goto out_free_skb;
/*
* Special case: We must set Frame Relay source Q.922 address
*/
if (dev_type == ARPHRD_DLCI)
sha = dev->broadcast;
/*
* Process entry. The idea here is we want to send a reply if it is a
* request for us or if it is a request for someone else that we hold
* a proxy for. We want to add an entry to our cache if it is a reply
* to us or if it is a request for our address.
* (The assumption for this last is that if someone is requesting our
* address, they are probably intending to talk to us, so it saves time
* if we cache their address. Their address is also probably not in
* our cache, since ours is not in their cache.)
*
* Putting this another way, we only care about replies if they are to
* us, in which case we add them to the cache. For requests, we care
* about those for us and those for our proxies. We reply to both,
* and in the case of requests for us we add the requester to the arp
* cache.
*/
if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
reply_dst = (struct dst_entry *)
iptunnel_metadata_reply(skb_metadata_dst(skb),
GFP_ATOMIC);
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
sha, dev->dev_addr, sha, reply_dst);
goto out_consume_skb;
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
rt = skb_rtable(skb);
addr_type = rt->rt_type;
if (addr_type == RTN_LOCAL) {
int dont_send;
dont_send = arp_ignore(in_dev, sip, tip);
if (!dont_send && IN_DEV_ARPFILTER(in_dev))
dont_send = arp_filter(sip, tip, dev);
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
sip, dev, tip, sha,
dev->dev_addr, sha,
reply_dst);
neigh_release(n);
}
}
goto out_consume_skb;
} else if (IN_DEV_FORWARD(in_dev)) {
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
(rt->dst.dev != dev &&
pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
sip, dev, tip, sha,
dev->dev_addr, sha,
reply_dst);
} else {
pneigh_enqueue(&arp_tbl,
in_dev->arp_parms, skb);
goto out_free_dst;
}
goto out_consume_skb;
}
}
}
/* Update our ARP tables */
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
addr_type = -1;
if (n || IN_DEV_ARP_ACCEPT(in_dev)) {
is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
sip, tip, sha, tha);
}
if (IN_DEV_ARP_ACCEPT(in_dev)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
if (!n &&
(is_garp ||
(arp->ar_op == htons(ARPOP_REPLY) &&
(addr_type == RTN_UNICAST ||
(addr_type < 0 &&
/* postpone calculation to as late as possible */
inet_addr_type_dev_table(net, dev, sip) ==
RTN_UNICAST)))))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
if (n) {
int state = NUD_REACHABLE;
int override;
/* If several different ARP replies follows back-to-back,
use the FIRST one. It is possible, if several proxy
agents are active. Taking the first reply prevents
arp trashing and chooses the fastest router.
*/
override = time_after(jiffies,
n->updated +
NEIGH_VAR(n->parms, LOCKTIME)) ||
is_garp;
/* Broadcast replies and request packets
do not assert neighbour reachability.
*/
if (arp->ar_op != htons(ARPOP_REPLY) ||
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
neigh_update(n, sha, state,
override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);
neigh_release(n);
}
out_consume_skb:
consume_skb(skb);
out_free_dst:
dst_release(reply_dst);
return NET_RX_SUCCESS;
out_free_skb:
kfree_skb(skb);
return NET_RX_DROP;
}
查找路由表函数fib_lookup:
static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
struct fib_result *res, unsigned int flags)
{
struct fib_table *tb;
int err = -ENETUNREACH;
rcu_read_lock();
tb = fib_get_table(net, RT_TABLE_MAIN);
if (tb)
err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF);
if (err == -EAGAIN)
err = -ENETUNREACH;
rcu_read_unlock();
return err;
}
static inline int fib_lookup(struct net *net, struct flowi4 *flp,
struct fib_result *res, unsigned int flags)
{
struct fib_table *tb;
int err = -ENETUNREACH;
flags |= FIB_LOOKUP_NOREF;
if (net->ipv4.fib_has_custom_rules)
return __fib_lookup(net, flp, res, flags);
rcu_read_lock();
res->tclassid = 0;
tb = rcu_dereference_rtnl(net->ipv4.fib_main);
if (tb)
err = fib_table_lookup(tb, flp, res, flags);
if (!err)
goto out;
tb = rcu_dereference_rtnl(net->ipv4.fib_default);
if (tb)
err = fib_table_lookup(tb, flp, res, flags);
out:
if (err == -EAGAIN)
err = -ENETUNREACH;
rcu_read_unlock();
return err;
}
总结
之前一直有过阅读Linux源码的想法,但一直没有付诸实践,借着本次机会对Linux部分源码进行了阅读,加深了对之前学过的一些网络协议的理解,同时感叹于Linux内核的丰富与强大,接下来还会对源码的其他部分进行阅读,丰富自己对Linux的理解。
最后特别感谢孟宁老师一学期以来的辛勤付出,学生受益匪浅,祝您工作顺利,生活愉快。
参考
Linux路由表及路由设置
TCP之send & recv
linux网络协议栈源码分析 - 传输层(TCP连接的建立)
TCP/IP协议栈在Linux内核中的运行时序分析
Linux源码在线阅读
课件:TCP协议栈源代码分析
书:深入理解Linux网络技术内幕 ChristianBenvenuti 夏安等