背景
最近在学习Linux协议栈相关内容。当被问到Linux内核网卡收包的过程时,我们能很快地简述一个大概的流程,但对于报文对外发送时的处理过程却有些模糊,故借此机会,对Linux内核报文发送过程做进一步学习。
以ipv4为例,Linux内核报文发送分两种情况:1.本机外出报文发送 2.转发报文发送;依据这两种情况,我们分别了解学习。
发送处理
本机报文
本机外发报文有多种情况,比如arp回应报文、icmp回应报文、网络不可达报文、本地特殊协议报文等等;在本机报文外发前,需要做一些准备工作,以ipv4报文为例:
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
}
/*
 * __ip_queue_xmit - route a locally generated skb, build its IPv4 header and
 * hand it to ip_local_out().
 * @sk:  socket the packet belongs to; supplies addresses, ports, protocol,
 *       priority and mark
 * @skb: packet to send; the IP header is pushed in front of its current data
 * @fl:  flow descriptor; its u.ip4 member is used for the route lookup and
 *       as the source of the header addresses
 * @tos: value written into the TOS byte and used for the route lookup
 *
 * Returns the result of ip_local_out(), or -EHOSTUNREACH after freeing the
 * skb when no usable route exists.
 */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
__u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
int res;
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
fl4 = &fl->u.ip4;
rt = skb_rtable(skb); /* route already attached to the skb, if any */
/* A non-NULL rt means the skb is already routed: skip the lookup. */
if (rt)
goto packet_routed;
/* Make sure we can route this packet. */
rt = (struct rtable *)__sk_dst_check(sk, 0);
/* No route came back from __sk_dst_check(): look one up from the socket's flow. */
if (!rt) {
__be32 daddr;
/* Use correct destination address if we have options. */
daddr = inet->inet_daddr;
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
/* Route lookup keyed on addresses, ports, protocol, TOS and bound device. */
rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
RT_CONN_FLAGS_TOS(sk, tos),
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &rt->dst);
}
/* Attach the route to the skb. */
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
/* Strict source routing cannot be satisfied through a gateway. */
if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
/* Reserve room for the fixed header plus any IP options. */
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
/* First 16 bits of the header: version 4, IHL 5 (no options yet), TOS. */
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
ip_copy_addrs(iph, fl4);
/* Transport layer set skb->h.foo itself. */
/* Append IP options; optlen is in bytes, IHL is in 32-bit words (hence >> 2). */
if (inet_opt && inet_opt->opt.optlen) {
iph->ihl += inet_opt->opt.optlen >> 2;
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
}
/* Segment count passed for ID selection: gso_segs, or 1 when that is zero. */
ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
/* Hand the finished packet to the local output path. */
res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
/* No route: count the failure, free the skb and report host unreachable. */
rcu_read_unlock();
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
return -EHOSTUNREACH;
}
对于IP报文而言,主要做了如下准备工作:
1) 确定 skb路由条目信息
2)构建并初始化ip_hdr头部信息
3)调用本地外发接口ip_local_out对外发送。
从内核代码来看,尽管本机报文外发有多种情况,但有一个共同的处理点:ip_local_out,函数处理逻辑如下:
/*
 * ip_local_out - common exit point for locally generated IPv4 packets.
 *
 * Runs __ip_local_out() first. Only when that returns 1 is the skb passed
 * on to dst_output(); any other value is returned to the caller unchanged.
 */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int verdict = __ip_local_out(net, sk, skb);

	/* Anything other than 1 means the packet does not continue from here. */
	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(net, sk, skb);
}
由此可见,核心处理函数为:__ip_local_out;
/*
 * __ip_local_out - finalise the IPv4 header of a locally generated packet
 * and run it through the NF_INET_LOCAL_OUT netfilter hook.
 *
 * Returns 0 when l3mdev_ip_out() hands back no skb, otherwise the value
 * returned by nf_hook() (dst_output is supplied as the continuation).
 */
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *iph;

	iph = ip_hdr(skb);

	/* Total length first, then the header checksum that depends on it. */
	iph_set_totlen(iph, skb->len);
	ip_send_check(iph);

	/* if egress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 */
	skb = l3mdev_ip_out(sk, skb);
	if (unlikely(skb == NULL))
		return 0;

	/* Mark the skb as carrying IPv4 at layer 3. */
	skb->protocol = htons(ETH_P_IP);

	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb,
		       NULL, skb_dst(skb)->dev, dst_output);
}
当ip报文经过netfilter框架处理后,通过dst_output继续外发;
/*
 * dst_output - pass the skb to the output handler of its attached dst entry.
 *
 * The handler is skb_dst(skb)->output. ip6_output and ip_output are named to
 * INDIRECT_CALL_INET as the expected targets (presumably so they can be
 * called directly rather than through the pointer; confirm against the
 * macro definition). Returns whatever the handler returns.
 */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return INDIRECT_CALL_INET(skb_dst(skb)->output,
ip6_output, ip_output,
net, sk, skb);
}
通过dst_output处理逻辑可知,其作用主要是调用报文路由条目(skb_dst(skb))上挂载的output函数继续外出处理:IPv6对应ip6_output,IPv4对应ip_output;net、sk、skb作为参数进一步传入ip_output。
/*
 * ip_output - IPv4 output handler reached through dst_output().
 *
 * Accounts the packet in the OUT statistics, switches skb->dev to the
 * route's device and runs the NF_INET_POST_ROUTING hook with
 * ip_finish_output as the continuation. The hook condition is false for
 * packets flagged IPSKB_REROUTED.
 */
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;	/* egress device from the route */
	struct net_device *indev = skb->dev;		/* device recorded on the skb so far */

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/* POST_ROUTING hook. */
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
报文经过Netfilter处理后后续处理过程如下:
ip_finish_output函数
__ip_finish_output函数
1)设置mtu
2)判断报文是否需要分片,若是,报文分片
3)ip_finish_output2函数处理
/*
 * ip_finish_output2 - last IPv4 output step: find the next-hop neighbour
 * and transmit through it.
 *
 * Returns the neigh_output() result, the lwtunnel_xmit() result when that
 * is negative or LWTUNNEL_XMIT_DONE, -ENOMEM when the headroom cannot be
 * expanded, or -EINVAL (skb freed) when no neighbour is found.
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
bool is_v6gw = false;
/* Update the multicast / broadcast output MIB counters. */
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
/* Make sure there is enough headroom for the link-layer header. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len);
if (!skb)
return -ENOMEM;
}
/* Give a lightweight tunnel attached to the route the chance to transmit. */
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
if (res < 0 || res == LWTUNNEL_XMIT_DONE)
return res;
}
rcu_read_lock();
/* Look up the neighbour entry for the route's gateway / this skb. */
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
if (!IS_ERR(neigh)) {
int res;
/* Confirm the neighbour (presumably refreshes its reachability timestamp; confirm). */
sock_confirm_neigh(skb, neigh);
/* if crossing protocols, can not use the cached header */
/* Transmit through the neighbour's output path. */
res = neigh_output(neigh, skb, is_v6gw);
rcu_read_unlock();
return res;
}
rcu_read_unlock();
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
return -EINVAL;
}
ip_finish_output2的作用:
1)根据skb, gw信息查找邻居信息
2)邻居信息确认,更新邻居信息超时时间
3)依据邻居输出接口neigh_output对外发送
neigh_output核心调用neigh_resolve_output函数
/*
 * neigh_resolve_output - build the link-layer header from the neighbour's
 * hardware address and queue the skb on the device.
 *
 * Nothing is sent here when neigh_event_send() returns non-zero (presumably
 * the neighbour is not resolved yet and the skb was taken over there;
 * confirm against neigh_event_send). In that case 0 is returned. Otherwise
 * returns the dev_queue_xmit() result, or -EINVAL after freeing the skb
 * when dev_hard_header() fails.
 */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
int rc = 0;
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
unsigned int seq;
/* Initialise the cached hardware header if the device supports one and none exists yet. */
if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
neigh_hh_init(neigh);
/* Retry if neigh->ha changed while the header was being built. */
do {
/* Move skb->data to the network header before (re)building the link-layer header. */
__skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
/* Fill in the link-layer (MAC) header. */
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
/* Pass the frame down to the device queue. */
rc = dev_queue_xmit(skb);
else
goto out_kfree_skb;
}
out:
return rc;
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
goto out;
}
neigh_resolve_output函数处理
dev_queue_xmit函数处理
__dev_queue_xmit(skb, NULL);处理
1)选择发送队列
2)确定队列排队规则
3)__dev_xmit_skb(skb, q, dev, txq)继续向下发送。
1)如果排队规则允许旁路(TCQ_F_CAN_BYPASS)且队列当前为空,则报文可以绕过排队直接发送;否则报文先入队,再由__qdisc_run(q)函数处理,开始发送;
/*
 * __qdisc_run - transmit from a qdisc until it stops or the budget runs out.
 * @q: queueing discipline to service
 *
 * Calls qdisc_restart() repeatedly, charging the packet count it reports
 * against a budget read from dev_tx_weight. When the budget is exhausted
 * the loop stops: a TCQ_F_NOLOCK qdisc gets __QDISC_STATE_MISSED set, any
 * other qdisc is rescheduled through __netif_schedule().
 */
void __qdisc_run(struct Qdisc *q)
{
	int budget = READ_ONCE(dev_tx_weight);
	int sent;

	while (qdisc_restart(q, &sent)) {
		budget -= sent;
		if (budget > 0)
			continue;

		/* Budget used up: defer the remainder instead of looping on. */
		if (q->flags & TCQ_F_NOLOCK)
			set_bit(__QDISC_STATE_MISSED, &q->state);
		else
			__netif_schedule(q);
		break;
	}
}
__qdisc_run()函数核心作两件事:
1)skb出队,并通过sch_direct_xmit()继续对外发送。
2)当本轮发送配额(dev_tx_weight)用尽而队列仍未发完时,通过__netif_schedule()触发发包软中断NET_TX_SOFTIRQ,由软中断继续后续的发送及善后处理。
硬件发送后续处理流程如下:
1)sch_direct_xmit函数处理
1) dev_hard_start_xmit函数处理
1)xmit_one()函数处理
1)netdev_start_xmit()函数处理
1)__netdev_start_xmit函数处理
__netdev_start_xmit函数具体定义如下:
/*
 * __netdev_start_xmit - invoke a device's transmit operation.
 * @ops:  operations table of the device
 * @skb:  packet to transmit
 * @dev:  device to transmit on
 * @more: value recorded in this CPU's softnet_data.xmit.more before the
 *        driver's ndo_start_xmit is called
 *
 * Returns the driver's ndo_start_xmit() result.
 */
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
					      struct sk_buff *skb,
					      struct net_device *dev,
					      bool more)
{
	netdev_tx_t status;

	__this_cpu_write(softnet_data.xmit.more, more);
	status = ops->ndo_start_xmit(skb, dev);

	return status;
}
由此可见,这里最终调用的是网络设备注册时定义的发送接口。此前在研究网卡收包过程的博客中曾提到过,以e100网卡为例,网络设备操作定义如下:
/*
 * net_device_ops table for the e100 driver.
 * .ndo_start_xmit (e100_xmit_frame) is the transmit entry point; the poll
 * controller hook is only present with CONFIG_NET_POLL_CONTROLLER.
 */
static const struct net_device_ops e100_netdev_ops = {
.ndo_open = e100_open,
.ndo_stop = e100_close,
.ndo_start_xmit = e100_xmit_frame,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_rx_mode = e100_set_multicast_list,
.ndo_set_mac_address = e100_set_mac_address,
.ndo_eth_ioctl = e100_do_ioctl,
.ndo_tx_timeout = e100_tx_timeout,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = e100_netpoll,
#endif
.ndo_set_features = e100_set_features,
};
发送接口为e100_xmit_frame(每种设备都有自己定义的网络设备操作接口);e100_xmit_frame核心处理为:
e100_exec_cb(nic, skb, e100_xmit_prepare); 在了解该函数前,有必要先了解下发包准备函数e100_xmit_prepare:
/*
 * e100_xmit_prepare - fill a transmit command block for @skb.
 * @nic: adapter state (tx_command, tx_threshold, cbs_avail, PCI device)
 * @cb:  command block to fill in
 * @skb: packet whose data is mapped for DMA towards the device
 *
 * Describes the packet with a single transmit buffer descriptor.
 * Returns 0 on success or -ENOMEM when the DMA mapping fails.
 */
static int e100_xmit_prepare(struct nic *nic, struct cb *cb,
struct sk_buff *skb)
{
dma_addr_t dma_addr;
cb->command = nic->tx_command;
/* Map the skb data for device-bound DMA and obtain its bus address. */
dma_addr = dma_map_single(&nic->pdev->dev, skb->data, skb->len,
DMA_TO_DEVICE);
/* If we can't map the skb, have the upper layer try later */
if (dma_mapping_error(&nic->pdev->dev, dma_addr))
return -ENOMEM;
/*
* Use the last 4 bytes of the SKB payload packet as the CRC, used for
* testing, ie sending frames with bad CRC.
*/
/* Set or clear cb_tx_nc in the command word depending on skb->no_fcs. */
if (unlikely(skb->no_fcs))
cb->command |= cpu_to_le16(cb_tx_nc);
else
cb->command &= ~cpu_to_le16(cb_tx_nc);
/* interrupt every 16 packets regardless of delay */
/* True when the low four bits of cbs_avail are zero, i.e. a multiple of 16. */
if ((nic->cbs_avail & ~15) == nic->cbs_avail)
cb->command |= cpu_to_le16(cb_i);
/* Point the descriptor array at the tbd embedded in this command block. */
cb->u.tcb.tbd_array = cb->dma_addr + offsetof(struct cb, u.tcb.tbd);
cb->u.tcb.tcb_byte_count = 0;
cb->u.tcb.threshold = nic->tx_threshold;
cb->u.tcb.tbd_count = 1;
cb->u.tcb.tbd.buf_addr = cpu_to_le32(dma_addr);
cb->u.tcb.tbd.size = cpu_to_le16(skb->len);
skb_tx_timestamp(skb);
return 0;
}
在将报文映射到dma缓冲区后,网卡直接读取dma数据对外发送。
/*
 * e100_exec_cb - take the next free command block, fill it through
 * @cb_prepare and issue every pending command block to the controller.
 * @nic:        adapter state; cb_lock is held with interrupts disabled for
 *              the whole operation
 * @skb:        buffer stored in the command block
 * @cb_prepare: callback that fills in the command block for @skb
 *
 * Returns 0 on success, -ENOMEM when no command block is available, the
 * error from @cb_prepare, or -ENOSPC when this call consumed the last
 * available command block.
 */
static int e100_exec_cb(struct nic *nic, struct sk_buff *skb,
int (*cb_prepare)(struct nic *, struct cb *, struct sk_buff *))
{
struct cb *cb;
unsigned long flags;
int err;
spin_lock_irqsave(&nic->cb_lock, flags);
if (unlikely(!nic->cbs_avail)) {
err = -ENOMEM;
goto err_unlock;
}
/* Claim the next command block from the ring. */
cb = nic->cb_to_use;
nic->cb_to_use = cb->next;
nic->cbs_avail--;
cb->skb = skb;
/* Let the callback prepare the block (for transmit: DMA-map the skb). */
err = cb_prepare(nic, cb, skb);
if (err)
goto err_unlock;
/* That was the last free block: report -ENOSPC but still issue it below. */
if (unlikely(!nic->cbs_avail))
err = -ENOSPC;
/* Order is important otherwise we'll be in a race with h/w:
* set S-bit in current first, then clear S-bit in previous. */
cb->command |= cpu_to_le16(cb_s);
dma_wmb();
cb->prev->command &= cpu_to_le16(~cb_s);
while (nic->cb_to_send != nic->cb_to_use) {
/* Hand the block's DMA address to the controller (presumably this is what starts the actual transmission; confirm against e100_exec_cmd). */
if (unlikely(e100_exec_cmd(nic, nic->cuc_cmd,
nic->cb_to_send->dma_addr))) {
/* Ok, here's where things get sticky. It's
* possible that we can't schedule the command
* because the controller is too busy, so
* let's just queue the command and try again
* when another command is scheduled. */
if (err == -ENOSPC) {
//request a reset
schedule_work(&nic->tx_timeout_task);
}
break;
} else {
nic->cuc_cmd = cuc_resume;
nic->cb_to_send = nic->cb_to_send->next;
}
}
err_unlock:
spin_unlock_irqrestore(&nic->cb_lock, flags);
return err;
}
转发报文
同样以IPv4报文为例,转发报文的共同处理点为ip_forward()函数,由ip_forward的函数调用路径可知,ip_forward也是在报文经过路由后处理。
ip_forward具体处理过程如下:
/*
 * ip_forward - validate a received IPv4 packet for forwarding, decrement
 * its TTL and pass it through the NF_INET_FORWARD netfilter hook.
 * @skb: packet to forward; its route is read via skb_rtable()
 *
 * Returns NET_RX_SUCCESS when ip_call_ra_chain() takes the packet,
 * NET_RX_DROP after freeing it on any failed check, and otherwise the
 * result of NF_HOOK() with ip_forward_finish as the continuation.
 */
int ip_forward(struct sk_buff *skb)
{
u32 mtu;
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
struct net *net;
SKB_DR(reason);
/* that should never happen */
/* Drop anything whose pkt_type is not PACKET_HOST. */
if (skb->pkt_type != PACKET_HOST)
goto drop;
/* A forwarded packet is not expected to have a socket attached. */
if (unlikely(skb->sk))
goto drop;
if (skb_warn_if_lro(skb))
goto drop;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
}
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
/* Adjust the checksum state for forwarding (presumably resets it so it is recomputed later; confirm against skb_forward_csum). */
skb_forward_csum(skb);
net = dev_net(skb->dev);
/*
* According to the RFC, we must first decrease the TTL field. If
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
/* TTL check: a TTL of 1 or less cannot be forwarded. */
if (ip_hdr(skb)->ttl <= 1)
goto too_many_hops;
/* xfrm (IPsec) forwarding check. */
if (!xfrm4_route_forward(skb)) {
SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
}
/* Route attached to the skb. */
rt = skb_rtable(skb);
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
/* Mark the skb as forwarded. */
IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
/* Too big for the route MTU: send ICMP "fragmentation needed" and drop. */
if (ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
SKB_DR_SET(reason, PKT_TOO_BIG);
goto drop;
}
/* We are about to mangle packet. Copy it! */
if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
/* Re-read the header pointer after skb_cow(). */
iph = ip_hdr(skb);
/* Decrease ttl after skb cow done */
ip_decrease_ttl(iph);
/*
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
/* Optionally derive skb->priority from the TOS field (sysctl controlled). */
if (READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority))
skb->priority = rt_tos2priority(iph->tos);
/* Run the FORWARD netfilter hook; ip_forward_finish continues on accept. */
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,
ip_forward_finish);
sr_failed:
/*
* Strict routing permits no gatewaying
*/
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
too_many_hops:
/* Tell the sender its packet died... */
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
SKB_DR_SET(reason, IP_INHDR);
drop:
kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
通过上述代码可知,转发报文经过Netfilter处理后由函数ip_forward_finish接管,后续处理过程如下:
1)dst_output函数处理
2)ip_output函数处理
3)后续的处理逻辑同本机报文外发一致,这里不再重复。
软中断处理
通过上述处理逻辑可知,在报文发送后会触发发送软中断NET_TX_SOFTIRQ,对应注册的处理函数如下:
/* Register net_tx_action as the handler for NET_TX_SOFTIRQ. */
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
即net_tx_action,函数定义如下:
/*
 * net_tx_action - NET_TX_SOFTIRQ handler.
 *
 * Works on the current CPU's softnet_data: releases the skbs queued on
 * completion_queue, then runs every qdisc queued on output_queue, and
 * finally calls xfrm_dev_backlog().
 */
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
/* skbs queued on this CPU's completion queue are released here. */
if (sd->completion_queue) {
struct sk_buff *clist;
/* Detach the whole list with interrupts disabled. */
local_irq_disable();
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_enable();
/* Walk the detached list and free each skb. */
while (clist) {
struct sk_buff *skb = clist;
clist = clist->next;
/* skbs on this list are expected to have a zero user count. */
WARN_ON(refcount_read(&skb->users));
if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
trace_consume_skb(skb, net_tx_action);
else
trace_kfree_skb(skb, net_tx_action,
get_kfree_skb_cb(skb)->reason);
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
__napi_kfree_skb(skb,
get_kfree_skb_cb(skb)->reason);
}
}
/* If this CPU has qdiscs scheduled for output, run them to keep transmitting. */
if (sd->output_queue) {
struct Qdisc *head;
/* Detach the list and reset the tail pointer with interrupts disabled. */
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
rcu_read_lock();
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
head = head->next_sched;
/* We need to make sure head->next_sched is read
* before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
/* Qdiscs without TCQ_F_NOLOCK are run under their qdisc lock. */
if (!(q->flags & TCQ_F_NOLOCK)) {
root_lock = qdisc_lock(q);
spin_lock(root_lock);
} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
&q->state))) {
/* There is a synchronize_net() between
* STATE_DEACTIVATED flag being set and
* qdisc_reset()/some_qdisc_is_busy() in
* dev_deactivate(), so we can safely bail out
* early here to avoid data race between
* qdisc_deactivate() and some_qdisc_is_busy()
* for lockless qdisc.
*/
clear_bit(__QDISC_STATE_SCHED, &q->state);
continue;
}
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
}
rcu_read_unlock();
}
xfrm_dev_backlog(sd);
}
至此,Linux内核对外发包处理过程已经结束,本人也在学习过程中,可能存在误解或不清晰的地方,希望一起交流学习。