1. Hardware interrupt handling
1.1 The hard IRQ callback
When an Ethernet packet reaches the NIC, the NIC raises a hardware interrupt and the hard IRQ callback runs. The callback is registered in the MAC driver's open function; the groundwork is laid at probe time:
stmmac_dvr_probe
{
// register the MAC netdev_ops
ndev->netdev_ops = &stmmac_netdev_ops;
// register the NAPI poll functions
stmmac_napi_add(ndev);
}
static const struct net_device_ops stmmac_netdev_ops = {
.ndo_open = stmmac_open,
.ndo_start_xmit = stmmac_xmit,
.ndo_stop = stmmac_release,
.ndo_change_mtu = stmmac_change_mtu,
...
};
Bringing the interface up with ifconfig up invokes the MAC's open:
static int stmmac_open(struct net_device *dev)
{
bfsize = stmmac_set_16kib_bfsize(priv, dev->mtu);
priv->dma_buf_sz = bfsize;//1536
priv->dma_rx_size = DMA_DEFAULT_RX_SIZE;//512
priv->dma_tx_size = DMA_DEFAULT_TX_SIZE;//512
ret = alloc_dma_desc_resources(priv);
ret = init_dma_desc_rings(dev, GFP_KERNEL);
// probe the NIC PHY
ret = stmmac_init_phy(dev);
// register the hard IRQ callback
ret = request_irq(dev->irq, stmmac_interrupt,
IRQF_SHARED, dev->name, dev);
}
struct stmmac_priv {
struct net_device *dev;
/* RX Queue */
struct stmmac_rx_queue rx_queue[MTL_MAX_RX_QUEUES];
unsigned int dma_rx_size;
/* TX Queue */
struct stmmac_tx_queue tx_queue[MTL_MAX_TX_QUEUES];
unsigned int dma_tx_size;
/* Generic channel for NAPI */
struct stmmac_channel channel[STMMAC_CH_MAX];
};
1.2 Mapping between the MAC rx_queue and the NIC DMA
alloc_dma_desc_resources allocates the rx_queue and tx_queue resources of stmmac_priv. The RX side:
int alloc_dma_rx_desc_resources(struct stmmac_priv *priv)
{
    /* RX queues buffers and DMA */
    for (queue = 0; queue < rx_count; queue++) {
        struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];

        // each pool holds 512 entries
        pp_params.pool_size = priv->dma_rx_size;
        // pages per buffer: DIV_ROUND_UP(1536, 4096) = 1
        num_pages = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE);
        pp_params.order = ilog2(num_pages); // order 0, i.e. single pages
        pp_params.nid = dev_to_node(priv->device);
        // create the page pool, which contains a ring buffer
        rx_q->page_pool = page_pool_create(&pp_params);
        // allocate 512 buf_pool entries
        rx_q->buf_pool = kcalloc(priv->dma_rx_size,
                                 sizeof(*rx_q->buf_pool),
                                 GFP_KERNEL);
        rx_q->dma_rx = dma_alloc_coherent(priv->device,
                                          priv->dma_rx_size *
                                          sizeof(struct dma_desc),
                                          &rx_q->dma_rx_phy,
                                          GFP_KERNEL);
    }
}
init_dma_desc_rings then maps the rx_q->dma_rx addresses into the DMA descriptors, so that when a packet arrives the NIC can DMA it straight into the ring buffer.
As for rx_q->dma_rx_phy, this is the DMA physical address handed back to the CPU; it is obtained as follows:
struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
*dma_handle = dma_get_device_base(dev, mem) +
((dma_addr_t)pageno << PAGE_SHIFT);
The mapping itself is established like this:
int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags)
{
    for (queue = 0; queue < rx_count; queue++) {
        struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];

        for (i = 0; i < priv->dma_rx_size; i++) { // runs 512 times
            struct dma_desc *p;

            p = rx_q->dma_rx + i;
            ret = stmmac_init_rx_buffers(priv, p, i, flags,
                                         queue);
        }
    }
}
stmmac_init_rx_buffers initializes the dma_desc descriptor for each ring entry:
int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p, int i, gfp_t flags, u32 queue)
{
    struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
    struct stmmac_rx_buffer *buf = &rx_q->buf_pool[i];

    // allocate an rx buffer page
    buf->page = page_pool_dev_alloc_pages(rx_q->page_pool);
    // page = pool->alloc.cache[--pool->alloc.count];
    // get the page's DMA address
    buf->addr = page_pool_get_dma_addr(buf->page);
    // write the address into the DMA descriptor
    stmmac_set_desc_addr(priv, p, buf->addr);
}
On dwmac4 this resolves to a plain descriptor write; shown here is the secondary-buffer variant (the primary-address callback fills des0/des1 analogously):
static void dwmac4_set_sec_addr(struct dma_desc *p, dma_addr_t addr, bool buf2_valid)
{
    p->des2 = cpu_to_le32(lower_32_bits(addr));
    p->des3 = cpu_to_le32(upper_32_bits(addr));

    if (buf2_valid)
        p->des3 |= cpu_to_le32(RDES3_BUFFER2_VALID_ADDR);
    else
        p->des3 &= cpu_to_le32(~RDES3_BUFFER2_VALID_ADDR);
}
The figure below shows the mapping between the NIC DMA and the CPU-side buf_pool: when data arrives, the NIC DMAs it into buf_pool, where the softirq later processes it.
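For reference, each buf_pool entry pairs a CPU page with the DMA address programmed into the descriptor. A sketch of the structure as it appears in ~5.x kernels (the exact field set varies by version):

struct stmmac_rx_buffer {
    struct page *page;     /* CPU-side page backing the buffer */
    struct page *sec_page; /* optional second buffer page */
    dma_addr_t addr;       /* DMA address the NIC writes into */
    dma_addr_t sec_addr;
};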
1.3 NAPI registration
Next, the NAPI poll functions are registered. About the NAPI mechanism: NAPI (New API) is an efficient packet-processing technique. Its core idea is not to read all network data from interrupt context; instead, the interrupt merely wakes a receive service routine, which then polls for the data. This improves receive efficiency for short packets and reduces the time spent in interrupt handling.
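To make the interrupt-to-poll handoff concrete, here is a minimal sketch of the generic NAPI driver pattern (not stmmac code; my_clean_rx and my_enable_rx_irq are hypothetical helpers):

static int my_napi_poll(struct napi_struct *napi, int budget)
{
    /* process up to budget packets from the RX ring (hypothetical helper) */
    int work_done = my_clean_rx(napi, budget);

    if (work_done < budget) {
        /* ring drained: exit polling mode and re-arm the RX interrupt */
        napi_complete_done(napi, work_done);
        my_enable_rx_irq(napi->dev); /* hypothetical helper */
    }
    /* returning budget keeps us on the poll_list for another round */
    return work_done;
}

stmmac registers its own poll functions following this same pattern: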
void stmmac_napi_add(struct net_device *dev)
{
for (queue = 0; queue < maxq; queue++) {
struct stmmac_channel *ch = &priv->channel[queue];
ch->priv_data = priv;
ch->index = queue;
// register a poll function for each channel's napi
if (queue < priv->plat->rx_queues_to_use) {
netif_napi_add(dev, &ch->rx_napi, stmmac_napi_poll_rx,
NAPI_POLL_WEIGHT);
}
if (queue < priv->plat->tx_queues_to_use) {
netif_tx_napi_add(dev, &ch->tx_napi, stmmac_napi_poll_tx,
NAPI_POLL_WEIGHT);
}
}
}
NAPI_POLL_WEIGHT is the weight value passed in, i.e. 64.
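Its definition in include/linux/netdevice.h (in the kernel versions this walkthrough appears to target):

#define NAPI_POLL_WEIGHT 64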
struct napi_struct {
struct list_head poll_list;
int weight;
int (*poll)(struct napi_struct *, int);
struct net_device *dev;
struct sk_buff *skb;
};
The channel structure:
struct stmmac_channel {
struct napi_struct rx_napi ____cacheline_aligned_in_smp;
struct napi_struct tx_napi ____cacheline_aligned_in_smp;
struct stmmac_priv *priv_data;
spinlock_t lock;
u32 index;
};
The hard IRQ handler callback:
irqreturn_t stmmac_interrupt(int irq, void *dev_id)
{
/* To handle DMA interrupts */
stmmac_dma_interrupt(priv);
}
It checks each channel and schedules NAPI as needed:
void stmmac_dma_interrupt(struct stmmac_priv *priv)
{
for (chan = 0; chan < channels_to_check; chan++)
status[chan] = stmmac_napi_check(priv, chan);
}
stmmac_napi_check calls __napi_schedule, passing tx_napi or rx_napi depending on whether the event is TX or RX:
{
struct stmmac_channel *ch = &priv->channel[chan];
if ((status & handle_rx) && (chan < priv->plat->rx_queues_to_use)) {
stmmac_disable_dma_irq(priv, priv->ioaddr, chan, 1, 0);
__napi_schedule(&ch->rx_napi);
}
if ((status & handle_tx) && (chan < priv->plat->tx_queues_to_use)) {
stmmac_disable_dma_irq(priv, priv->ioaddr, chan, 0, 1);
__napi_schedule(&ch->tx_napi);
}
}
This function raises the softirq:
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
preempt_check_resched_rt();
}
____napi_schedule hangs the napi's poll_list node onto this CPU's softnet_data poll_list,
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
and sets the NET_RX_SOFTIRQ pending bit:
void __raise_softirq_irqoff(unsigned int nr)
{
lockdep_assert_irqs_disabled();
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
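or_softirq_pending ORs the bit into this CPU's pending mask. In the generic (non-arch-specific) variant it expands roughly to:

#define local_softirq_pending_ref irq_stat.__softirq_pending
#define local_softirq_pending()   (__this_cpu_read(local_softirq_pending_ref))
#define or_softirq_pending(x)     (__this_cpu_or(local_softirq_pending_ref, (x)))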
2. Softirq receive handling
2.1 Softirq registration at kernel boot
Linux softirqs are processed in the per-CPU ksoftirqd kernel threads, which are created as follows:
smpboot_register_percpu_thread
__smpboot_create_thread
tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, ht->thread_comm);
int smpboot_thread_fn(void *data)
{
while (1)
{
if (!ht->thread_should_run(td->cpu)) {
preempt_enable_no_resched();
schedule();
} else {
__set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu);
}
}
}
Since these are per-CPU threads, every core registers its own smpboot_thread_fn kernel thread.
thread_should_run and thread_fn are assigned as follows:
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
};
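softirq_threads is handed to smpboot_register_percpu_thread at early boot; abridged from kernel/softirq.c:

static __init int spawn_ksoftirqd(void)
{
    cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
                              takeover_tasklets);
    BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
    return 0;
}
early_initcall(spawn_ksoftirqd);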
ksoftirqd_should_run first checks whether the local __softirq_pending mask has any bits set:
static int ksoftirqd_should_run(unsigned int cpu)
{
return local_softirq_pending();
}
run_ksoftirqd then processes whatever is pending:
static void run_ksoftirqd(unsigned int cpu)
{
// disable local hard interrupts
ksoftirqd_run_begin();
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
// re-enable local hard interrupts
ksoftirqd_run_end();
cond_resched();
return;
}
ksoftirqd_run_end();
}
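__do_softirq is where h->action(h) actually happens: it snapshots and clears the pending mask, then walks the set bits and invokes each registered handler. An abridged sketch:

asmlinkage __visible void __softirq_entry __do_softirq(void)
{
    __u32 pending = local_softirq_pending();
    struct softirq_action *h;
    int softirq_bit;

    set_softirq_pending(0); /* clear the pending mask */
    local_irq_enable();

    h = softirq_vec;
    while ((softirq_bit = ffs(pending))) {
        h += softirq_bit - 1;
        h->action(h); /* e.g. net_rx_action for NET_RX_SOFTIRQ */
        h++;
        pending >>= softirq_bit;
    }

    local_irq_disable();
    /* softirqs re-raised meanwhile are run again or deferred back to ksoftirqd */
}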
Now let's see which callback h->action(h) resolves to.
First, the kernel's softirq vector table:
//interrupt.h
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
IRQ_POLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
};
During network initialization, an sd (softnet_data) is set up for each CPU, and the NET_TX_SOFTIRQ and NET_RX_SOFTIRQ softirq callbacks are registered in softirq_vec:
int __init net_dev_init(void)
{
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
INIT_LIST_HEAD(&sd->poll_list);
}
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
}
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
}
softirq_vec is a global array of softirq_action entries:

struct softirq_action
{
    void (*action)(struct softirq_action *);
};

static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
This is equivalent to:
softirq_vec[NET_RX_SOFTIRQ].action = net_rx_action;
softirq_vec[NET_TX_SOFTIRQ].action = net_tx_action;
2.2 net_rx_action analysis
Based on the above, for NET_RX_SOFTIRQ, h->action(h) executes net_rx_action.
void net_rx_action(struct softirq_action *h)
{
    // get this CPU's softnet_data
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    unsigned long time_limit = jiffies +
        usecs_to_jiffies(netdev_budget_usecs);
    int budget = netdev_budget;
    LIST_HEAD(list);
    LIST_HEAD(repoll);
    struct napi_struct *n;

    local_irq_disable();
    // move the queued napi entries off sd->poll_list
    list_splice_init(&sd->poll_list, &list);
    local_irq_enable();

    for (;;) {
        n = list_first_entry(&list, struct napi_struct, poll_list);
        budget -= napi_poll(n, &repoll);

        if (unlikely(budget <= 0 ||
                     time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }
}
net_rx_action first obtains the sd structure of the CPU that took the hard interrupt and detaches the napi entries from sd->poll_list (queued there by the hard IRQ), then processes them in the for loop; time_limit bounds the run so a single softirq pass does not hog the CPU.
int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    work = 0;
    if (test_bit(NAPI_STATE_SCHED, &n->state)) {
        work = n->poll(n, weight);
        trace_napi_poll(n, work, weight);
    }
}
n->poll is the poll method that stmmac_napi_add registered:
int stmmac_napi_poll_rx(struct napi_struct *napi, int budget)
{
struct stmmac_channel *ch =
container_of(napi, struct stmmac_channel, rx_napi);
struct stmmac_priv *priv = ch->priv_data;
u32 chan = ch->index;
work_done = stmmac_rx(priv, budget, chan);
return work_done;
}
int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
{
    struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
    struct stmmac_channel *ch = &priv->channel[queue];
    struct sk_buff *skb = NULL;

    // get the frame length
    buf1_len = stmmac_rx_buf1_len(priv, p, status, len);
    len += buf1_len;
    buf2_len = stmmac_rx_buf2_len(priv, p, status, len);
    len += buf2_len;

    skb = napi_alloc_skb(&ch->rx_napi, buf1_len);
    // inside napi_alloc_skb: skb->dev = napi->dev;

    // sync the buffer so the CPU can read the DMA'd data
    dma_sync_single_for_cpu(priv->device, buf->addr,
                            buf1_len, DMA_FROM_DEVICE);
    // copy the ring-buffer data into the skb
    skb_copy_to_linear_data(skb, page_address(buf->page), buf1_len);
    // set the skb data length
    skb_put(skb, buf1_len);

    napi_gro_receive(&ch->rx_napi, skb);
}
The frame-length callback:
static int dwmac4_wrback_get_rx_frame_len(struct dma_desc *p, int rx_coe)
{
return (le32_to_cpu(p->des3) & RDES3_PACKET_SIZE_MASK);
}
3. From softirq completion to the protocol stack
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
gro_result_t ret;
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb, 0);
ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
return ret;
}
dev_gro_receive's main job is to merge small packets into larger ones before passing them up, which improves efficiency.
gro_result_t napi_skb_finish(struct napi_struct *napi,
                             struct sk_buff *skb, gro_result_t ret)
{
    switch (ret) {
    case GRO_NORMAL:
        gro_normal_one(napi, skb, 1);
        break;
    ...
    }
}
gro_normal_one hangs the skb on the napi and counts it:
void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
{
    list_add_tail(&skb->list, &napi->rx_list);
    napi->rx_count += segs;
    if (napi->rx_count >= gro_normal_batch)
        gro_normal_list(napi);
}
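Once the batch threshold is reached, gro_normal_list flushes the accumulated skbs into the stack (abridged; netif_receive_skb_list_internal is the list variant of the normal receive path):

static void gro_normal_list(struct napi_struct *napi)
{
    if (!napi->rx_count)
        return;
    netif_receive_skb_list_internal(&napi->rx_list);
    INIT_LIST_HEAD(&napi->rx_list);
    napi->rx_count = 0;
}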
This eventually lands in:
int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
orig_dev = skb->dev;
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
...
}
This function performs an important list walk: it iterates over ptype_all. When a RAW socket is created, a packet_type with .func = packet_rcv is hung on the ptype_all list. So for a RAW socket, deliver_skb here executes packet_rcv; this is also tcpdump's entry point.
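The hook is installed in packet_create (net/packet/af_packet.c; abridged):

static int packet_create(struct net *net, struct socket *sock, int protocol, int kern)
{
    ...
    po->prot_hook.func = packet_rcv;
    if (sock->type == SOCK_PACKET)
        po->prot_hook.func = packet_rcv_spkt;
    po->prot_hook.af_packet_priv = sk;
    if (proto) {
        po->prot_hook.type = proto;
        __register_prot_hook(sk); /* ends up in dev_add_pack() */
    }
    ...
}

deliver_skb then simply invokes that func pointer: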
int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
packet_rcv's main job is to put the skb on the socket's receive queue and then wake up any process blocked waiting to receive.
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_ll *sll;
struct packet_sock *po;
u8 *skb_head = skb->data;
int skb_len = skb->len;
sk = pt->af_packet_priv;// po->prot_hook.af_packet_priv = sk;
po = pkt_sk(sk);
...
spin_lock(&sk->sk_receive_queue.lock);
po->stats.stats1.tp_packets++;
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
return 0;
}
As shown, the function retrieves sk from the packet_type, hangs the skb on sk's receive queue sk_receive_queue, and then calls sk_data_ready(sk) to wake the waiting process. During packet_create, sk_data_ready is set to sock_def_readable.
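Concretely, the default is installed by sock_init_data, which packet_create calls (net/core/sock.c; abridged):

void sock_init_data(struct socket *sock, struct sock *sk)
{
    ...
    sk->sk_data_ready  = sock_def_readable;
    sk->sk_write_space = sock_def_write_space;
    ...
}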
void sock_def_readable(struct sock *sk)
{
    struct socket_wq *wq;

    rcu_read_lock();
    wq = rcu_dereference(sk->sk_wq);
    if (skwq_has_sleeper(wq))
        wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                        EPOLLRDNORM | EPOLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}
First it takes the wait_queue_head from the sk->sk_wq wait queue, then calls:
__wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
Passing 1 means only one process is woken at a time.
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
wake_flags, key, &bookmark);
__wake_up_common takes the wait_queue_entry entries off wq_head; each entry's private member stores the blocked thread's current pointer:
.private = current
.func = receiver_wake_function
__wake_up_common calls the entry's func, i.e. receiver_wake_function, to perform the wakeup:
ret = curr->func(curr, mode, wake_flags, key);
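Those two fields are filled in on the blocking side: a reader sleeping in recvmsg ends up in __skb_wait_for_more_packets (net/core/datagram.c), which builds the wait entry with DEFINE_WAIT_FUNC (abridged):

#define DEFINE_WAIT_FUNC(name, function)            \
    struct wait_queue_entry name = {                \
        .private = current,                         \
        .func    = function,                        \
        .entry   = LIST_HEAD_INIT((name).entry),    \
    }

int __skb_wait_for_more_packets(struct sock *sk, ...)
{
    DEFINE_WAIT_FUNC(wait, receiver_wake_function);

    prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
    ...
}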
This ultimately reaches default_wake_function, which wakes the thread:
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key)
{
WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
return try_to_wake_up(curr->private, mode, wake_flags);
}
curr->private is the blocked thread's current pointer, i.e. its task_struct. try_to_wake_up sets the thread's state to TASK_RUNNING and places it on the run queue to await scheduling.
The diagram below illustrates the path of a network packet from the NIC up into the sock's receive queue.