内容主要参考:《深入理解Linux网络》,公众号「开发内功修炼」
一、数据包收包
1,linux启动部分
2,硬中断、软中断处理
2.1硬中断
注:当RingBuffer满的时候,新来的数据包将被丢弃。用ifconfig查看网卡时,可以看到里面有个overruns字段,表示因为环形队列满而被丢弃的包数。如果发现有丢包,可能需要通过ethtool命令来加大环形队列的长度。
-
DMA 操作完成以后,网卡会向 CPU 发起⼀个硬中断,通知 CPU 有数据到达
-
网卡的硬中断注册的处理函数是igb_msix_ring
// drivers/net/ethernet/intel/igb/igb_main.c
/* Hard-interrupt handler registered for an igb MSI-X ring vector:
 * record the interrupt-throttle value, schedule NAPI, and return.
 * All heavy lifting is deferred to softirq context. */
static irqreturn_t igb_msix_ring(int irq, void *data)
{
struct igb_q_vector *q_vector = data;
/* Write the ITR value calculated from the previous interrupt. */
igb_write_itr(q_vector);
napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
}
igb_write_itr 只是记录⼀下硬件中断频率(在减少对 CPU 的中断频率时用到)
顺着 napi_schedule 调用⼀路跟踪下去, __napi_schedule —> ____napi_schedule
//net/core/dev.c
/* Runs with hard interrupts off: queue this NAPI instance on the
 * current CPU's softnet_data poll list, then mark NET_RX_SOFTIRQ
 * pending so the softirq machinery will process it later. */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
...
list_add_tail(&napi->poll_list, &sd->poll_list);
// raise the NET_RX_SOFTIRQ softirq
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
//include/linux/list.h
/* Insert @new at the tail of the list headed by @head
 * (i.e. between head->prev and head). */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
__list_add(new, head->prev, head);
}
//list_add_tail=>__list_add
/* Core doubly-linked-list insertion used by list_add_tail(). */
static inline void __list_add(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
if (!__list_add_valid(new, prev, next))
return;
/* insert the new entry between the two known consecutive entries */
next->prev = new;
new->next = next;
new->prev = prev;
WRITE_ONCE(prev->next, new);
}
list_add_tail 修改了 CPU 变量 softnet_data ⾥的 poll_list ,将驱动napi_struct 传过来的 poll_list 添加了进来。(在sd→poll_list的prev和自身间插入napi→poll_list)
__raise_softirq_irqoff 触发了⼀个软中断 NET_RX_SOFTIRQ(注意它只是把软中断标记为挂起,并不会唤醒或立即执行处理线程),这个所谓的触发过程只是对⼀个变量进⾏了⼀次或运算⽽已。
追踪 or_softirq_pending() 函数,可以找到它(以 ia64 架构为例)最终操作的是每 CPU 结构体 cpuinfo_ia64( arch/ia64/include/asm/processor.h )中的软中断挂起字段。该结构定义了CPU类型、硬件BUG标志、CPU状态等。(https://blog.csdn.net/wangquan1992/article/details/117361412)可以看到,跟软中断相关的字段是每个CPU都有一个64位(32位机器就是32位)的掩码,它描述挂起的软中断:每一位对应一个软中断,比如第0位代表 HI_SOFTIRQ。or_softirq_pending 函数置位了掩码中 NET_RX_SOFTIRQ(枚举值为3,即第4个软中断)对应的位,表示该软中断挂起。
/* Mark softirq @nr pending on this CPU: a single OR into the per-CPU
 * pending mask. The softirq itself runs later (on irq exit or in the
 * ksoftirqd thread). Must be called with hard interrupts disabled. */
void __raise_softirq_irqoff(unsigned int nr) {
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
//include/linux/interrupt.h
/* Softirq vector numbers; the bit position in the per-CPU pending mask
 * equals the enum value (NET_RX_SOFTIRQ == 3 is the one raised above). */
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
IRQ_POLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
};
硬中断里只完成简单必要的⼯作,剩下的⼤部分的处理都是转交给软中断的。
2.2软中断
判断 softirq_pending 标志。这里读取的挂起掩码正是硬中断路径里 or_softirq_pending 写入的同一个 per-CPU 变量。不同的是,硬中断位置是为了写入标记,这里仅仅只是读取。如果硬中断中设置了 NET_RX_SOFTIRQ,这里自然能读取得到。接下来会真正进⼊线程函数 run_ksoftirqd 处理。
/* ksoftirqd thread gate: run the thread only when some softirq is
 * pending on this CPU (non-zero per-CPU pending mask). */
static int ksoftirqd_should_run(unsigned int cpu)
{
return local_softirq_pending();
}
执行run_ksoftirqd->__do_softirq,判断根据当前 CPU 的软中断类型,调⽤其注册的 action ⽅法
/* Core softirq dispatch loop (excerpt): for each bit set in the pending
 * mask, invoke the action handler registered for that softirq vector. */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
while ((softirq_bit = ffs(pending))) {
trace_softirq_entry(vec_nr);
h->action(h); /* e.g. net_rx_action for NET_RX_SOFTIRQ */
trace_softirq_exit(vec_nr);
wakeup_softirqd();
}
...
}
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
...
for (;;) {
struct napi_struct *n;
...
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);
...
}
}
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
}
核心逻辑是获取到当前 CPU变量 softnet_data,对其 poll_list 进⾏遍历, 然后执行到网卡驱动注册到的 poll 函数
//net/core/dev.c
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
for (;;) {
struct napi_struct *n;
n = list_first_entry(&list, struct napi_struct, poll_list);
//变量sd,调用poll函数
budget -= napi_poll(n, &repoll);
//budget 与 time_limit控制退出
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}
对于igb 网卡来说,就是 igb 驱动里的 igb_poll 函数。在读取操作中, igb_poll 的重点工作是对 igb_clean_rx_irq 的调用
/* NAPI poll callback registered by the igb driver (excerpt; local
 * variable declarations elided): clean completed TX descriptors first,
 * then receive up to @budget packets from the RX ring. */
static int igb_poll(struct napi_struct *napi, int budget)
{
if (q_vector->tx.ring)
clean_complete = igb_clean_tx_irq(q_vector, budget);
if (q_vector->rx.ring) {
int cleaned = igb_clean_rx_irq(q_vector, budget);
}
}
/* Pull received frames off the RX ring (up to @budget packets), build
 * sk_buffs, and feed each completed frame to napi_gro_receive().
 * NOTE(review): the original excerpt closed the while loop right after
 * igb_is_non_eop(), leaving the later `continue` statements outside any
 * loop; restored here to match the kernel source, where the header
 * cleanup, field population, and GRO hand-off are all inside the loop. */
static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
{
	struct igb_adapter *adapter = q_vector->adapter;
	struct igb_ring *rx_ring = q_vector->rx.ring;
	struct sk_buff *skb = rx_ring->skb;
	...
	while (likely(total_packets < budget)) {
		...
		/* retrieve a buffer from the ring */
		skb = igb_fetch_rx_buffer(rx_ring, rx_desc, skb);
		/* fetch next buffer in frame if non-eop */
		if (igb_is_non_eop(rx_ring, rx_desc))
			continue;
		/* verify the packet layout is correct */
		if (igb_cleanup_headers(rx_ring, rx_desc, skb)) {
			skb = NULL;
			continue;
		}
		/* populate checksum, timestamp, VLAN, and protocol */
		igb_process_skb_fields(rx_ring, rx_desc, skb);
		napi_gro_receive(&q_vector->napi, skb);
	}
	...
	return total_packets;
}
- 从ringbuff中取出数据skb(sk_buff);
- 收取完数据以后,对其进行⼀些校验
- 设置 skb 变量的 timestamp, VLAN id, protocol 等字段
- 进入到napi_gro_receive 中
napi_gro_receive函数代表的是网卡 GRO 特性,可以简单理解成把相关的小包合并成⼀个大包。
//net/core/gro.c
/* GRO entry point (excerpt; declarations and return elided): try to
 * merge the skb into an existing GRO flow via dev_gro_receive(), then
 * finish delivery according to the resulting GRO verdict. */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
skb_gro_reset_offset(skb);
ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
}
napi_skb_finish , 这个函数主要就是调⽤了 netif_receive_skb,数据包被送到协议栈中。
//file: net/core/gro.c
/* Dispatch on the GRO verdict: for GRO_NORMAL, hand the skb to
 * netif_receive_skb() and thus into the protocol stack.
 * NOTE(review): signature fixed to match the call site above,
 * napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); the original
 * excerpt used an older two-argument form and never closed the
 * function brace. */
static gro_result_t napi_skb_finish(struct napi_struct *napi,
				    struct sk_buff *skb,
				    gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
		if (netif_receive_skb(skb))
			ret = GRO_DROP;
		break;
	......
	}
}
3,网络协议栈处理
netif_receive_skb 函数会根据包的协议,假如是 udp 包,会将包依次送到 ip_rcv(),udp_rcv() 协议处理函数中进行处理。
netif_receive_skb→netif_receive_skb_internal(RPS处理)→__netif_receive_skb→__netif_receive_skb_one_core→__netif_receive_skb_core
新版本内核多了严谨的跳转步骤,但思路和关键处理不变
//net/core/dev.c
/* Protocol dispatch (excerpt): first deliver a copy to any taps, then
 * hand the skb to the L3 handler registered for skb->protocol
 * (ip_rcv for IPv4), looked up in the ptype_base hash table. */
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
/* pcap hook: taps registered on ptype_all get the packet here;
 * tcpdump captures packets from this entry point */
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
type = skb->protocol;
/* deliver only exact match when indicated */
if (likely(!deliver_exact)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
}
...
return ret;
}
//遍历list
/* Walk @ptype_list and deliver the skb to every handler whose ->type
 * matches @type. Delivery is deferred by one step (pt_prev) so the
 * caller can invoke the final handler without taking an extra skb
 * reference; the last matching handler is returned via *pt. */
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
struct packet_type **pt,
struct net_device *orig_dev,
__be16 type,
struct list_head *ptype_list)
{
struct packet_type *ptype, *pt_prev = *pt;
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (ptype->type != type)
continue;
if (pt_prev)
deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
*pt = pt_prev;
}
//fun
/* Invoke a single protocol handler's registered callback; for IPv4,
 * pt_prev->func is ip_rcv (registered at protocol-init time). */
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
函数处理的任务
- type = skb->protocol取出协议信息,
- 遍历注册在这个协议上的回调函数列表, ptype_base 是 hash table初始化时注册的
- pt_prev->func 协议层注册的处理函数ip_rcv
IP层处理
//net/ipv4/ip_input.c
/* IPv4 receive entry (excerpt): run the NF_INET_PRE_ROUTING netfilter
 * hooks; if the packet is not dropped, continue in ip_rcv_finish. */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct
packet_type *pt, struct net_device *orig_dev)
{
......
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev,
NULL,
ip_rcv_finish);
}
NF_HOOK 是⼀个钩⼦函数,当执⾏完注册的钩⼦后就会执⾏到最后⼀个参数指向的函数
ip_rcv_finish
ip_rcv_finish→ip_rcv_finish_core→ip_route_input_noref(file: net/ipv4/route.c)→ip_route_input_rcu→ip_route_input_mc→rt_dst_alloc
/* Route-entry setup (excerpt: allocation and most initialization
 * elided). The key point for the receive path: when the route is
 * local (RTCF_LOCAL), dst.input is set to ip_local_deliver, which
 * dst_input() will later invoke. */
struct rtable *rt_dst_alloc(struct net_device *dev,
unsigned int flags, u16 type,
bool nopolicy, bool noxfrm, bool will_cache)
{
rt->dst.output = ip_output;
if (flags & RTCF_LOCAL)
rt->dst.input = ip_local_deliver;
...
}
函数 ip_local_deliver 被赋值给了 dst.input
回到 ip_rcv_finish 中的 return dst_input(skb)
/* Invoke the input handler installed by the routing lookup —
 * ip_local_deliver for packets destined to this host. */
static inline int dst_input(struct sk_buff *skb)
{
return skb_dst(skb)->input(skb);
}
skb_dst(skb)->input 调用的 input 方法就是路由子系统赋的 ip_local_deliver.
回到ip_local_deliver→ip_local_deliver_finish→ip_protocol_deliver_rcu
/* L4 demultiplex (excerpt): look up the handler registered for
 * @protocol in inet_protos and invoke it — tcp_v4_rcv for TCP,
 * udp_rcv for UDP (INDIRECT_CALL_2 is an optimization that avoids an
 * indirect call for these two common cases). */
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
const struct net_protocol *ipprot;
int raw, ret;
resubmit:
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot) {
...
ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
skb);
...
}
}
协议注册小节看到 inet_protos 中保存着 tcp_v4_rcv() 和 udp_rcv() 的函数地址。这里将会根据包中的协议类型选择进行分发,在这里 skb 包将会进⼀步被派送到更上层的协议中,UDP 和 TCP。