内容主要参考:《深入理解Linux网络》,公众号「开发内功修炼」
一、数据包收包
1,linux启动部分
2,硬中断、软中断处理
2.1硬中断
注:当RingBuffer满的时候,新来的数据包将被丢弃。用ifconfig查看网卡时,可以看到里面有个overruns字段,表示因为环形队列满而被丢弃的包数。如果发现有丢包,可能需要通过ethtool命令来加大环形队列的长度。
-
DMA 操作完成以后,网卡会向 CPU 发起⼀个硬中断,通知 CPU 有数据到达
-
网卡的硬中断注册的处理函数是igb_msix_ring
// drivers/net/ethernet/intel/igb/igb_main.c
/* Hard-interrupt handler registered for an igb MSI-X ring vector:
 * record the interrupt-throttle value, schedule NAPI, and return.
 * All heavy lifting is deferred to softirq context. */
static irqreturn_t igb_msix_ring(int irq, void *data)
{
struct igb_q_vector *q_vector = data;
/* Write the ITR value calculated from the previous interrupt. */
igb_write_itr(q_vector);
napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
}
igb_write_itr 只是记录⼀下硬件中断频率(在减少对 CPU 的中断频率时用到)
顺着 napi_schedule 调用⼀路跟踪下去, __napi_schedule —> ____napi_schedule
//net/core/dev.c
/* Runs with hard interrupts off: queue this NAPI instance on the
 * current CPU's softnet_data poll list, then mark NET_RX_SOFTIRQ
 * pending so the softirq machinery will process it later. */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
...
list_add_tail(&napi->poll_list, &sd->poll_list);
// raise the NET_RX_SOFTIRQ softirq
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
//include/linux/list.h
/* Insert @new at the tail of the list headed by @head
 * (i.e. between head->prev and head). */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
__list_add(new, head->prev, head);
}
//list_add_tail=>__list_add
/* Core doubly-linked-list insertion used by list_add_tail(). */
static inline void __list_add(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
if (!__list_add_valid(new, prev, next))
return;
/* insert the new entry between the two known consecutive entries */
next->prev = new;
new->next = next;
new->prev = prev;
WRITE_ONCE(prev->next, new);
}
list_add_tail 修改了 CPU 变量 softnet_data ⾥的 poll_list ,将驱动napi_struct 传过来的 poll_list 添加了进来。(在sd→poll_list的prev和自身间插入napi→poll_list)
__raise_softirq_irqoff 触发了⼀个软中断 NET_RX_SOFTIRQ(注意它只是把软中断标记为挂起,并不会唤醒或立即执行处理线程),这个所谓的触发过程只是对⼀个变量进⾏了⼀次或运算⽽已。
追踪 or_softirq_pending() 函数,可以找到它(以 ia64 架构为例)最终操作的是每 CPU 结构体 cpuinfo_ia64( arch/ia64/include/asm/processor.h )中的软中断挂起字段。该结构定义了CPU类型、硬件BUG标志、CPU状态等。(https://blog.csdn.net/wangquan1992/article/details/117361412)可以看到,跟软中断相关的字段是每个CPU都有一个64位(32位机器就是32位)的掩码,它描述挂起的软中断:每一位对应一个软中断,比如第0位代表 HI_SOFTIRQ。or_softirq_pending 函数置位了掩码中 NET_RX_SOFTIRQ(枚举值为3,即第4个软中断)对应的位,表示该软中断挂起。
/* Mark softirq @nr pending on this CPU: a single OR into the per-CPU
 * pending mask. The softirq itself runs later (on irq exit or in the
 * ksoftirqd thread). Must be called with hard interrupts disabled. */
void __raise_softirq_irqoff(unsigned int nr) {
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
//include/linux/interrupt.h
/* Softirq vector numbers; the bit position in the per-CPU pending mask
 * equals the enum value (NET_RX_SOFTIRQ == 3 is the one raised above). */
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
IRQ_POLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
};
硬中断里只完成简单必要的⼯作,剩下的⼤部分的处理都是转交给软中断的。
2.2软中断
判断 softirq_pending 标志。这里读取的挂起掩码正是硬中断路径里 or_softirq_pending 写入的同一个 per-CPU 变量。不同的是,硬中断位置是为了写入标记,这里仅仅只是读取。如果硬中断中设置了 NET_RX_SOFTIRQ,这里自然能读取得到。接下来会真正进⼊线程函数 run_ksoftirqd 处理。
/* ksoftirqd thread gate: run the thread only when some softirq is
 * pending on this CPU (non-zero per-CPU pending mask). */
static int ksoftirqd_should_run(unsigned int cpu)
{
return local_softirq_pending();
}
执行run_ksoftirqd->__do_softirq,判断根据当前 CPU 的软中断类型,调⽤其注册的 action ⽅法
/* Core softirq dispatch loop (excerpt): for each bit set in the pending
 * mask, invoke the action handler registered for that softirq vector. */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
while ((softirq_bit = ffs(pending))) {
trace_softirq_entry(vec_nr);
h->action(h); /* e.g. net_rx_action for NET_RX_SOFTIRQ */
trace_softirq_exit(vec_nr);
wakeup_softirqd();
}
...
}
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
...
for (;;) {
struct napi_struct *n;
...
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);
...
}
}
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
}
核心逻辑是获取到当前 CPU变量 softnet_data,对其 poll_list 进⾏遍历, 然后执行到网卡驱动注册到的 poll 函数
//net/core/dev.c
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
for (;;) {
struct napi_struct *n;
n = list_first_entry(&list, struct napi_struct, poll_list);
//变量sd,调用poll函数
budget -= napi_poll(n, &repoll);
//budget 与 time_limit控制退出
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}
对于igb 网卡来说,就是 igb 驱动里的 igb_poll 函数。在读取操作中, igb_poll 的重点工作是对 igb_clean_rx_irq 的调用
/* NAPI poll callback registered by the igb driver (excerpt; local
 * variable declarations elided): clean completed TX descriptors first,
 * then receive up to @budget packets from the RX ring. */
static int igb_poll(struct napi_struct *napi, int budget)
{
if (q_vector->tx.ring)
clean_complete = igb_clean_tx_irq(q_vector, budget);
if (q_vector->rx.ring) {
int cleaned = igb_clean_rx_irq(q_vector, budget);
}
}
/* Pull received frames off the RX ring (up to @budget packets), build
 * sk_buffs, and feed each completed frame to napi_gro_receive().
 * NOTE(review): the original excerpt closed the while loop right after
 * igb_is_non_eop(), leaving the later `continue` statements outside any
 * loop; restored here to match the kernel source, where the header
 * cleanup, field population, and GRO hand-off are all inside the loop. */
static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
{
	struct igb_adapter *adapter = q_vector->adapter;
	struct igb_ring *rx_ring = q_vector->rx.ring;
	struct sk_buff *skb = rx_ring->skb;
	...
	while (likely(total_packets < budget)) {
		...
		/* retrieve a buffer from the ring */
		skb = igb_fetch_rx_buffer(rx_ring, rx_desc, skb);
		/* fetch next buffer in frame if non-eop */
		if (igb_is_non_eop(rx_ring, rx_desc))
			continue;
		/* verify the packet layout is correct */
		if (igb_cleanup_headers(rx_ring, rx_desc, skb)) {
			skb = NULL;
			continue;
		}
		/* populate checksum, timestamp, VLAN, and protocol */
		igb_process_skb_fields(rx_ring, rx_desc, skb);
		napi_gro_receive(&q_vector->napi, skb);
	}
	...
	return total_packets;
}
- 从ringbuff中取出数据skb(sk_buff);
- 收取完数据以后,对其进行⼀些校验
- 设置 skb 变量的 timestamp, VLAN id, protocol 等字段
- 进入到napi_gro_receive 中
napi_gro_receive函数代表的是网卡 GRO 特性,可以简单理解成把相关的小包合并成⼀个大包。
//net/core/gro.c
/* GRO entry point (excerpt; declarations and return elided): try to
 * merge the skb into an existing GRO flow via dev_gro_receive(), then
 * finish delivery according to the resulting GRO verdict. */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
skb_gro_reset_offset(skb);
ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
}
napi_skb_finish , 这个函数主要就是调⽤了 netif_receive_skb,数据包被送到协议栈中。
//file: net/core/gro.c
/* Dispatch on the GRO verdict: for GRO_NORMAL, hand the skb to
 * netif_receive_skb() and thus into the protocol stack.
 * NOTE(review): signature fixed to match the call site above,
 * napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); the original
 * excerpt used an older two-argument form and never closed the
 * function brace. */
static gro_result_t napi_skb_finish(struct napi_struct *napi,
				    struct sk_buff *skb,
				    gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
		if (netif_receive_skb(skb))
			ret = GRO_DROP;
		break;
	......
	}
}
3,网络协议栈处理
netif_receive_skb 函数会根据包的协议,假如是 udp 包,会将包依次送到 ip_rcv(),udp_rcv() 协议处理函数中进行处理。
netif_receive_skb→netif_receive_skb_internal(RPS处理)→__netif_receive_skb→__netif_receive_skb_one_core→__netif_receive_skb_core
新版本内核多了严谨的跳转步骤,但思路和关键处理不变
//net/core/dev.c
/* Protocol dispatch (excerpt): first deliver a copy to any taps, then
 * hand the skb to the L3 handler registered for skb->protocol
 * (ip_rcv for IPv4), looked up in the ptype_base hash table. */
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
/* pcap hook: taps registered on ptype_all get the packet here;
 * tcpdump captures packets from this entry point */
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
type = skb->protocol;
/* deliver only exact match when indicated */
if (likely(!deliver_exact)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
}
...
return ret;
}
//遍历list
/* Walk @ptype_list and deliver the skb to every handler whose ->type
 * matches @type. Delivery is deferred by one step (pt_prev) so the
 * caller can invoke the final handler without taking an extra skb
 * reference; the last matching handler is returned via *pt. */
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
struct packet_type **pt,
struct net_device *orig_dev,
__be16 type,
struct list_head *ptype_list)
{
struct packet_type *ptype, *pt_prev = *pt;
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (ptype->type != type)
continue;
if (pt_prev)
deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
*pt = pt_prev;
}
//fun
/* Invoke a single protocol handler's registered callback; for IPv4,
 * pt_prev->func is ip_rcv (registered at protocol-init time). */
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
函数处理的任务
- type = skb->protocol取出协议信息,
- 遍历注册在这个协议上的回调函数列表, ptype_base 是 hash table初始化时注册的
- pt_prev->func 协议层注册的处理函数ip_rcv
IP层处理
//net/ipv4/ip_input.c
/* IPv4 receive entry (excerpt): run the NF_INET_PRE_ROUTING netfilter
 * hooks; if the packet is not dropped, continue in ip_rcv_finish. */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct
packet_type *pt, struct net_device *orig_dev)
{
......
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev,
NULL,
ip_rcv_finish);
}
NF_HOOK 是⼀个钩⼦函数,当执⾏完注册的钩⼦后就会执⾏到最后⼀个参数指向的函数
ip_rcv_finish
ip_rcv_finish→ip_rcv_finish_core→ip_route_input_noref(file: net/ipv4/route.c)→ip_route_input_rcu→ip_route_input_mc→rt_dst_alloc
/* Route-entry setup (excerpt: allocation and most initialization
 * elided). The key point for the receive path: when the route is
 * local (RTCF_LOCAL), dst.input is set to ip_local_deliver, which
 * dst_input() will later invoke. */
struct rtable *rt_dst_alloc(struct net_device *dev,
unsigned int flags, u16 type,
bool nopolicy, bool noxfrm, bool will_cache)
{
rt->dst.output = ip_output;
if (flags & RTCF_LOCAL)
rt->dst.input = ip_local_deliver;
...
}
函数 ip_local_deliver 被赋值给了 dst.input
回到 ip_rcv_finish 中的 return dst_input(skb)
/* Invoke the input handler installed by the routing lookup —
 * ip_local_deliver for packets destined to this host. */
static inline int dst_input(struct sk_buff *skb)
{
return skb_dst(skb)->input(skb);
}
skb_dst(skb)->input 调用的 input 方法就是路由子系统赋的 ip_local_deliver.
回到ip_local_deliver→ip_local_deliver_finish→ip_protocol_deliver_rcu
/* L4 demultiplex (excerpt): look up the handler registered for
 * @protocol in inet_protos and invoke it — tcp_v4_rcv for TCP,
 * udp_rcv for UDP (INDIRECT_CALL_2 is an optimization that avoids an
 * indirect call for these two common cases). */
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
const struct net_protocol *ipprot;
int raw, ret;
resubmit:
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot) {
...
ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
skb);
...
}
}
协议注册小节看到 inet_protos 中保存着 tcp_v4_rcv() 和 udp_rcv() 的函数地址。这里将会根据包中的协议类型选择进行分发,在这里 skb 包将会进⼀步被派送到更上层的协议中,UDP 和 TCP。