1. Hardware interrupt handling
1.1 The hard IRQ callback
When an Ethernet packet reaches the NIC, the NIC raises a hardware interrupt and the hard IRQ callback runs. The callback is registered in the MAC driver's open function; the groundwork is laid at probe time:
stmmac_dvr_probe
{
// register the MAC netdev_ops
ndev->netdev_ops = &stmmac_netdev_ops;
// register the NAPI poll functions
stmmac_napi_add(ndev);
}
static const struct net_device_ops stmmac_netdev_ops = {
.ndo_open = stmmac_open,
.ndo_start_xmit = stmmac_xmit,
.ndo_stop = stmmac_release,
.ndo_change_mtu = stmmac_change_mtu,
...
};
Bringing the interface up with ifconfig up invokes the MAC's open:
static int stmmac_open(struct net_device *dev)
{
bfsize = stmmac_set_16kib_bfsize(priv, dev->mtu);
priv->dma_buf_sz = bfsize;//1536
priv->dma_rx_size = DMA_DEFAULT_RX_SIZE;//512
priv->dma_tx_size = DMA_DEFAULT_TX_SIZE;//512
ret = alloc_dma_desc_resources(priv);
ret = init_dma_desc_rings(dev, GFP_KERNEL);
// probe the NIC PHY
ret = stmmac_init_phy(dev);
// register the hard IRQ callback
ret = request_irq(dev->irq, stmmac_interrupt,
IRQF_SHARED, dev->name, dev);
}
struct stmmac_priv {
struct net_device *dev;
/* RX Queue */
struct stmmac_rx_queue rx_queue[MTL_MAX_RX_QUEUES];
unsigned int dma_rx_size;
/* TX Queue */
struct stmmac_tx_queue tx_queue[MTL_MAX_TX_QUEUES];
unsigned int dma_tx_size;
/* Generic channel for NAPI */
struct stmmac_channel channel[STMMAC_CH_MAX];
};
1.2 Mapping between the MAC rx_queue and the NIC DMA
alloc_dma_desc_resources allocates the rx_queue and tx_queue resources of stmmac_priv. The RX side:
int alloc_dma_rx_desc_resources(struct stmmac_priv *priv)
{
    /* RX queues buffers and DMA */
    for (queue = 0; queue < rx_count; queue++) {
        struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];

        // each pool holds 512 entries
        pp_params.pool_size = priv->dma_rx_size;
        // pages per buffer: DIV_ROUND_UP(1536, 4096) = 1
        num_pages = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE);
        pp_params.order = ilog2(num_pages); // order 0, i.e. single pages
        pp_params.nid = dev_to_node(priv->device);
        // create the page pool, which contains a ring buffer
        rx_q->page_pool = page_pool_create(&pp_params);
        // allocate 512 buf_pool entries
        rx_q->buf_pool = kcalloc(priv->dma_rx_size,
                                 sizeof(*rx_q->buf_pool),
                                 GFP_KERNEL);
        rx_q->dma_rx = dma_alloc_coherent(priv->device,
                                          priv->dma_rx_size *
                                          sizeof(struct dma_desc),
                                          &rx_q->dma_rx_phy,
                                          GFP_KERNEL);
    }
}
init_dma_desc_rings then maps the rx_q->dma_rx addresses into the DMA descriptors, so that when a packet arrives the NIC can DMA it straight into the ring buffer.
As for rx_q->dma_rx_phy, this is the DMA physical address handed back to the CPU; it is obtained as follows:
struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
*dma_handle = dma_get_device_base(dev, mem) +
((dma_addr_t)pageno << PAGE_SHIFT);
The mapping itself is established like this:
int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags)
{
    for (queue = 0; queue < rx_count; queue++) {
        struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];

        for (i = 0; i < priv->dma_rx_size; i++) { // runs 512 times
            struct dma_desc *p;

            p = rx_q->dma_rx + i;
            ret = stmmac_init_rx_buffers(priv, p, i, flags,
                                         queue);
        }
    }
}
stmmac_init_rx_buffers initializes the dma_desc descriptor for each ring entry:
int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p, int i, gfp_t flags, u32 queue)
{
    struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
    struct stmmac_rx_buffer *buf = &rx_q->buf_pool[i];

    // allocate an rx buffer page
    buf->page = page_pool_dev_alloc_pages(rx_q->page_pool);
    // page = pool->alloc.cache[--pool->alloc.count];
    // get the page's DMA address
    buf->addr = page_pool_get_dma_addr(buf->page);
    // write the address into the DMA descriptor
    stmmac_set_desc_addr(priv, p, buf->addr);
}
On dwmac4 this resolves to a plain descriptor write; shown here is the secondary-buffer variant (the primary-address callback fills des0/des1 analogously):
static void dwmac4_set_sec_addr(struct dma_desc *p, dma_addr_t addr, bool buf2_valid)
{
    p->des2 = cpu_to_le32(lower_32_bits(addr));
    p->des3 = cpu_to_le32(upper_32_bits(addr));

    if (buf2_valid)
        p->des3 |= cpu_to_le32(RDES3_BUFFER2_VALID_ADDR);
    else
        p->des3 &= cpu_to_le32(~RDES3_BUFFER2_VALID_ADDR);
}
The figure below shows the mapping between the NIC DMA and the CPU-side buf_pool: when data arrives, the NIC DMAs it into buf_pool, where the softirq later processes it.
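For reference, each buf_pool entry pairs a CPU page with the DMA address programmed into the descriptor. A sketch of the structure as it appears in ~5.x kernels (the exact field set varies by version):

struct stmmac_rx_buffer {
    struct page *page;     /* CPU-side page backing the buffer */
    struct page *sec_page; /* optional second buffer page */
    dma_addr_t addr;       /* DMA address the NIC writes into */
    dma_addr_t sec_addr;
};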
1.3 NAPI registration
Next, the NAPI poll functions are registered. About the NAPI mechanism: NAPI (New API) is an efficient packet-processing technique. Its core idea is not to read all network data from interrupt context; instead, the interrupt merely wakes a receive service routine, which then polls for the data. This improves receive efficiency for short packets and reduces the time spent in interrupt handling.
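To make the interrupt-to-poll handoff concrete, here is a minimal sketch of the generic NAPI driver pattern (not stmmac code; my_clean_rx and my_enable_rx_irq are hypothetical helpers):

static int my_napi_poll(struct napi_struct *napi, int budget)
{
    /* process up to budget packets from the RX ring (hypothetical helper) */
    int work_done = my_clean_rx(napi, budget);

    if (work_done < budget) {
        /* ring drained: exit polling mode and re-arm the RX interrupt */
        napi_complete_done(napi, work_done);
        my_enable_rx_irq(napi->dev); /* hypothetical helper */
    }
    /* returning budget keeps us on the poll_list for another round */
    return work_done;
}

stmmac registers its own poll functions following this same pattern: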
void stmmac_napi_add(struct net_device *dev)
{
for (queue = 0; queue < maxq; queue++) {
struct stmmac_channel *ch = &priv->channel[queue];
ch->priv_data = priv;
ch->index = queue;
// register a poll function for each channel's napi
if (queue < priv->plat->rx_queues_to_use) {
netif_napi_add(dev, &ch->rx_napi, stmmac_napi_poll_rx,
NAPI_POLL_WEIGHT);
}
if (queue < priv->plat->tx_queues_to_use) {
netif_tx_napi_add(dev, &ch->tx_napi, stmmac_napi_poll_tx,
NAPI_POLL_WEIGHT);
}
}
}
NAPI_POLL_WEIGHT is the weight value passed in, i.e. 64.
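Its definition in include/linux/netdevice.h (in the kernel versions this walkthrough appears to target):

#define NAPI_POLL_WEIGHT 64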
struct napi_struct {
struct list_head poll_list;
int weight;
int (*poll)(struct napi_struct *, int);
struct net_device *dev;
struct sk_buff *skb;
};
The channel structure:
struct stmmac_channel {
struct napi_struct rx_napi ____cacheline_aligned_in_smp;
struct napi_struct tx_napi ____cacheline_aligned_in_smp;
struct stmmac_priv *priv_data;
spinlock_t lock;
u32 index;
};
The hard IRQ handler callback:
irqreturn_t stmmac_interrupt(int irq, void *dev_id)
{
/* To handle DMA interrupts */
stmmac_dma_interrupt(priv);
}
It checks each channel and schedules NAPI as needed:
void stmmac_dma_interrupt(struct stmmac_priv *priv)
{
for (chan = 0; chan < channels_to_check; chan++)
status[chan] = stmmac_napi_check(priv, chan);
}
stmmac_napi_check calls __napi_schedule, passing tx_napi or rx_napi depending on whether the event is TX or RX:
{
struct stmmac_channel *ch = &priv->channel[chan];
if ((status & handle_rx) && (chan < priv->plat->rx_queues_to_use)) {
stmmac_disable_dma_irq(priv, priv->ioaddr, chan, 1, 0);
__napi_schedule(&ch->rx_napi);
}
if ((status & handle_tx) && (chan < priv->plat->tx_queues_to_use)) {
stmmac_disable_dma_irq(priv, priv->ioaddr, chan, 0, 1);
__napi_schedule(&ch->tx_napi);
}
}
This function raises the softirq:
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
preempt_check_resched_rt();
}
____napi_schedule hangs the napi's poll_list node onto this CPU's softnet_data poll_list,
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
and sets the NET_RX_SOFTIRQ pending bit:
void __raise_softirq_irqoff(unsigned int nr)
{
lockdep_assert_irqs_disabled();
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
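or_softirq_pending ORs the bit into this CPU's pending mask. In the generic (non-arch-specific) variant it expands roughly to:

#define local_softirq_pending_ref irq_stat.__softirq_pending
#define local_softirq_pending()   (__this_cpu_read(local_softirq_pending_ref))
#define or_softirq_pending(x)     (__this_cpu_or(local_softirq_pending_ref, (x)))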
2. Softirq receive handling
2.1 Softirq registration at kernel boot
Linux softirqs are processed in the per-CPU ksoftirqd kernel threads, which are created as follows:
smpboot_register_percpu_thread
__smpboot_create_thread
tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, ht->thread_comm);
int smpboot_thread_fn(void *data)
{
while (1)
{
if (!ht->thread_should_run(td->cpu)) {
preempt_enable_no_resched();
schedule();
} else {
__set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu);
}
}
}
Since these are per-CPU threads, every core registers its own smpboot_thread_fn kernel thread.
thread_should_run and thread_fn are assigned as follows:
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
};
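softirq_threads is handed to smpboot_register_percpu_thread at early boot; abridged from kernel/softirq.c:

static __init int spawn_ksoftirqd(void)
{
    cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
                              takeover_tasklets);
    BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
    return 0;
}
early_initcall(spawn_ksoftirqd);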
ksoftirqd_should_run first checks whether the local __softirq_pending mask has any bits set:
static int ksoftirqd_should_run(unsigned int cpu)
{
return local_softirq_pending();
}
run_ksoftirqd then processes whatever is pending:
static void run_ksoftirqd(unsigned int cpu)
{
// disable local hard interrupts
ksoftirqd_run_begin();
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
// re-enable local hard interrupts
ksoftirqd_run_end();
cond_resched();
return;
}
ksoftirqd_run_end();
}
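__do_softirq is where h->action(h) actually happens: it snapshots and clears the pending mask, then walks the set bits and invokes each registered handler. An abridged sketch:

asmlinkage __visible void __softirq_entry __do_softirq(void)
{
    __u32 pending = local_softirq_pending();
    struct softirq_action *h;
    int softirq_bit;

    set_softirq_pending(0); /* clear the pending mask */
    local_irq_enable();

    h = softirq_vec;
    while ((softirq_bit = ffs(pending))) {
        h += softirq_bit - 1;
        h->action(h); /* e.g. net_rx_action for NET_RX_SOFTIRQ */
        h++;
        pending >>= softirq_bit;
    }

    local_irq_disable();
    /* softirqs re-raised meanwhile are run again or deferred back to ksoftirqd */
}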
Now let's see which callback h->action(h) resolves to.
First, the kernel's softirq vector table:
//interrupt.h
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
IRQ_POLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
};
During network initialization, an sd (softnet_data) is set up for each CPU, and the NET_TX_SOFTIRQ and NET_RX_SOFTIRQ softirq callbacks are registered in softirq_vec:
int __init net_dev_init(void)
{
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
INIT_LIST_HEAD(&sd->poll_list);
}
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
}
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
}
softirq_vec is a global array of softirq_action entries:

struct softirq_action
{
    void (*action)(struct softirq_action *);
};

static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
This is equivalent to:
softirq_vec[NET_RX_SOFTIRQ].action = net_rx_action;
softirq_vec[NET_TX_SOFTIRQ].action = net_tx_action;
2.2 net_rx_action analysis
Based on the above, for NET_RX_SOFTIRQ, h->action(h) executes net_rx_action.
void net_rx_action(struct softirq_action *h)
{
    // get this CPU's softnet_data
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    unsigned long time_limit = jiffies +
        usecs_to_jiffies(netdev_budget_usecs);
    int budget = netdev_budget;
    LIST_HEAD(list);
    LIST_HEAD(repoll);
    struct napi_struct *n;

    local_irq_disable();
    // move the queued napi entries off sd->poll_list
    list_splice_init(&sd->poll_list, &list);
    local_irq_enable();

    for (;;) {
        n = list_first_entry(&list, struct napi_struct, poll_list);
        budget -= napi_poll(n, &repoll);

        if (unlikely(budget <= 0 ||
                     time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }
}
net_rx_action first obtains the sd structure of the CPU that took the hard interrupt and detaches the napi entries from sd->poll_list (queued there by the hard IRQ), then processes them in the for loop; time_limit bounds the run so a single softirq pass does not hog the CPU.
int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    work = 0;
    if (test_bit(NAPI_STATE_SCHED, &n->state)) {
        work = n->poll(n, weight);
        trace_napi_poll(n, work, weight);
    }
}
n->poll is the poll method that stmmac_napi_add registered:
int stmmac_napi_poll_rx(struct napi_struct *napi, int budget)
{
struct stmmac_channel *ch =
container_of(napi, struct stmmac_channel, rx_napi);
struct stmmac_priv *priv = ch->priv_data;
u32 chan = ch->index;
work_done = stmmac_rx(priv, budget, chan);
return work_done;
}
int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
{
    struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
    struct stmmac_channel *ch = &priv->channel[queue];
    struct sk_buff *skb = NULL;

    // get the frame length
    buf1_len = stmmac_rx_buf1_len(priv, p, status, len);
    len += buf1_len;
    buf2_len = stmmac_rx_buf2_len(priv, p, status, len);
    len += buf2_len;

    skb = napi_alloc_skb(&ch->rx_napi, buf1_len);
    // inside napi_alloc_skb: skb->dev = napi->dev;

    // sync the buffer so the CPU can read the DMA'd data
    dma_sync_single_for_cpu(priv->device, buf->addr,
                            buf1_len, DMA_FROM_DEVICE);
    // copy the ring-buffer data into the skb
    skb_copy_to_linear_data(skb, page_address(buf->page), buf1_len);
    // set the skb data length
    skb_put(skb, buf1_len);

    napi_gro_receive(&ch->rx_napi, skb);
}
The frame-length callback:
static int dwmac4_wrback_get_rx_frame_len(struct dma_desc *p, int rx_coe)
{
return (le32_to_cpu(p->des3) & RDES3_PACKET_SIZE_MASK);
}
3. From softirq completion to the protocol stack
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
gro_result_t ret;
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb, 0);
ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
return ret;
}
dev_gro_receive's main job is to merge small packets into larger ones before passing them up, which improves efficiency.
gro_result_t napi_skb_finish(struct napi_struct *napi,
                             struct sk_buff *skb, gro_result_t ret)
{
    switch (ret) {
    case GRO_NORMAL:
        gro_normal_one(napi, skb, 1);
        break;
    ...
    }
}
gro_normal_one hangs the skb on the napi and counts it:
void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
{
    list_add_tail(&skb->list, &napi->rx_list);
    napi->rx_count += segs;
    if (napi->rx_count >= gro_normal_batch)
        gro_normal_list(napi);
}
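Once the batch threshold is reached, gro_normal_list flushes the accumulated skbs into the stack (abridged; netif_receive_skb_list_internal is the list variant of the normal receive path):

static void gro_normal_list(struct napi_struct *napi)
{
    if (!napi->rx_count)
        return;
    netif_receive_skb_list_internal(&napi->rx_list);
    INIT_LIST_HEAD(&napi->rx_list);
    napi->rx_count = 0;
}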
This eventually lands in:
int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
orig_dev = skb->dev;
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
...
}
This function performs an important list walk: it iterates over ptype_all. When a RAW socket is created, a packet_type with .func = packet_rcv is hung on the ptype_all list. So for a RAW socket, deliver_skb here executes packet_rcv; this is also tcpdump's entry point.
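The hook is installed in packet_create (net/packet/af_packet.c; abridged):

static int packet_create(struct net *net, struct socket *sock, int protocol, int kern)
{
    ...
    po->prot_hook.func = packet_rcv;
    if (sock->type == SOCK_PACKET)
        po->prot_hook.func = packet_rcv_spkt;
    po->prot_hook.af_packet_priv = sk;
    if (proto) {
        po->prot_hook.type = proto;
        __register_prot_hook(sk); /* ends up in dev_add_pack() */
    }
    ...
}

deliver_skb then simply invokes that func pointer: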
int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
packet_rcv's main job is to put the skb on the socket's receive queue and then wake up any process blocked waiting to receive.
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_ll *sll;
struct packet_sock *po;
u8 *skb_head = skb->data;
int skb_len = skb->len;
sk = pt->af_packet_priv;// po->prot_hook.af_packet_priv = sk;
po = pkt_sk(sk);
...
spin_lock(&sk->sk_receive_queue.lock);
po->stats.stats1.tp_packets++;
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
return 0;
}
As shown, the function retrieves sk from the packet_type, hangs the skb on sk's receive queue sk_receive_queue, and then calls sk_data_ready(sk) to wake the waiting process. During packet_create, sk_data_ready is set to sock_def_readable.
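Concretely, the default is installed by sock_init_data, which packet_create calls (net/core/sock.c; abridged):

void sock_init_data(struct socket *sock, struct sock *sk)
{
    ...
    sk->sk_data_ready  = sock_def_readable;
    sk->sk_write_space = sock_def_write_space;
    ...
}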
void sock_def_readable(struct sock *sk)
{
    struct socket_wq *wq;

    rcu_read_lock();
    wq = rcu_dereference(sk->sk_wq);
    if (skwq_has_sleeper(wq))
        wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                        EPOLLRDNORM | EPOLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}
First it takes the wait_queue_head from the sk->sk_wq wait queue, then calls:
__wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
Passing 1 means only one process is woken at a time.
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
wake_flags, key, &bookmark);
__wake_up_common takes the wait_queue_entry entries off wq_head; each entry's private member stores the blocked thread's current pointer:
.private = current
.func = receiver_wake_function
__wake_up_common calls the entry's func, i.e. receiver_wake_function, to perform the wakeup:
ret = curr->func(curr, mode, wake_flags, key);
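Those two fields are filled in on the blocking side: a reader sleeping in recvmsg ends up in __skb_wait_for_more_packets (net/core/datagram.c), which builds the wait entry with DEFINE_WAIT_FUNC (abridged):

#define DEFINE_WAIT_FUNC(name, function)            \
    struct wait_queue_entry name = {                \
        .private = current,                         \
        .func    = function,                        \
        .entry   = LIST_HEAD_INIT((name).entry),    \
    }

int __skb_wait_for_more_packets(struct sock *sk, ...)
{
    DEFINE_WAIT_FUNC(wait, receiver_wake_function);

    prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
    ...
}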
This ultimately reaches default_wake_function, which wakes the thread:
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key)
{
WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
return try_to_wake_up(curr->private, mode, wake_flags);
}
curr->private is the blocked thread's current pointer, i.e. its task_struct. try_to_wake_up sets the thread's state to TASK_RUNNING and places it on the run queue to await scheduling.
The diagram below illustrates the path of a network packet from the NIC up into the sock's receive queue.