中断处理函数通常拆分成上半部分(top half)和下半部分(bottom half)。top half用来执行中断发生后最关键的操作,执行时间要尽量短;bottom half则执行一些比较耗时的工作。Linux早期的Bottom Half(BH)机制有两个缺点:
- (1)在任意时刻,系统只能有一个CPU可以执行Bottom Half代码,以防止两个或多个CPU同时执行Bottom Half函数而相互干扰。因此BH代码的执行是严格“串行化”的。
- (2)BH函数不允许嵌套。

这两个缺点在单CPU系统中无关紧要,但在SMP系统中却是非常致命的,因为BH机制的严格串行化执行显然没有充分利用SMP系统的多CPU特点。为此,Linux 2.4内核在BH机制的基础上进行了扩展,这就是所谓的“软中断请求”(softirq)机制。
softirq与tasklet的区别:
1.tasklet是通过softirq实现的,在softirq上下文中运行,tasklet代码必须是原子的
2.tasklet始终运行在提交(调度)它的那个处理器上(即驱动调用tasklet_schedule后,进入do_softirq时此tasklet在哪个cpu上执行了,紧接着再来此tasklet就只会在这个cpu上执行。如果tasklet count 为0后,后面就会重新选择cpu了,具体取决于下一次是在哪个cpu上调用tasklet_schedule。)
tasklet 参考《linux 内核设计与实现》第三版介绍。在do_softirq里面调用tasklet_action,判断tasklet是否处于
TASKLET_STATE_RUN 状态,若是则表明其他core正在执行此tasklet,就跳过。如果不是此状态就置为此状态,这样其他core就不会执行此tasklet了。
softirq 是可以重入的:softirq执行时,本cpu上的软中断被关闭,但其他处理器依然可以处理软中断。而且当某个软中断正在cpu0上执行时,如果此时又来了同一个软中断,其他cpu可以同时处理此软中断。所以软中断处理程序里的数据如果是percpu类型,就可以避免锁竞争了。
HARDIRQ是关闭中断关闭抢占执行的,SOFTIRQ(比如tasklet)是开启中断关闭抢占执行的。基于中断上下文的,中间都不能睡眠。
工作队列是基于进程上下文的。
目前代码里softirq 中断类型:
/*
 * Softirq vector numbers.
 *
 * Each value doubles as the bit position of that softirq in the per-CPU
 * pending mask, so the numbering must stay dense and start at zero.
 */
enum {
	HI_SOFTIRQ		= 0,
	TIMER_SOFTIRQ		= 1,
	NET_TX_SOFTIRQ		= 2,
	NET_RX_SOFTIRQ		= 3,
	BLOCK_SOFTIRQ		= 4,
	BLOCK_IOPOLL_SOFTIRQ	= 5,
	TASKLET_SOFTIRQ		= 6,
	SCHED_SOFTIRQ		= 7,
	HRTIMER_SOFTIRQ		= 8,
	RCU_SOFTIRQ		= 9,	/* RCU should preferably remain the last softirq */

	NR_SOFTIRQS		= 10	/* count of the vectors listed above */
};
注册的地方,下面为wifi用到了两个软中断的注册:
/*
 * net_dev_init - boot-time initialisation of the core network device layer.
 *
 * Sets up the packet-type lists, the per-CPU softnet_data receive state and
 * the loopback/default pernet devices, then registers the NET_TX and NET_RX
 * softirq handlers.
 *
 * Returns 0 on success. Every failing step jumps to "out" with rc still at
 * its initial value, so all failures are reported as -ENOMEM.
 */
static int __init net_dev_init(void)
{
int i, rc = -ENOMEM;
BUG_ON(!dev_boot_phase);
if (dev_proc_init())
goto out;
if (netdev_kobject_init())
goto out;
/* Empty packet-type lists: the catch-all list, the hash buckets, offloads. */
INIT_LIST_HEAD(&ptype_all);
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
INIT_LIST_HEAD(&offload_base);
if (register_pernet_subsys(&netdev_net_ops))
goto out;
/*
 * Initialise the packet receive queues.
 */
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
memset(sd, 0, sizeof(*sd));
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
sd->completion_queue = NULL;
INIT_LIST_HEAD(&sd->poll_list);
sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
sd->csd.func = rps_trigger_softirq;
sd->csd.info = sd;
sd->csd.flags = 0;
sd->cpu = i;
#endif
/* The per-CPU backlog is polled through process_backlog(). */
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
sd->backlog.gro_list = NULL;
sd->backlog.gro_count = 0;
}
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
 * is present in a network namespace the loopback device must
 * be present. Since we now dynamically allocate and free the
 * loopback device ensure this invariant is maintained by
 * keeping the loopback device as the first device on the
 * list of network devices. Ensuring the loopback devices
 * is the first device that appears and the last network device
 * that disappears.
 */
if (register_pernet_device(&loopback_net_ops))
goto out;
if (register_pernet_device(&default_device_ops))
goto out;
/* Install the handlers for the two networking softirq vectors. */
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
hotcpu_notifier(dev_cpu_callback, 0);
dst_init();
rc = 0;
out:
return rc;
}
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
}
注册完成后怎么使用呢?
如果要启动某个软中断,则需要置位irq_stat[cpu].__softirq_pending中的相应位,而后续的处理工作则由do_softirq处理。设置__softirq_pending的操作则由函数raise_softirq_irqoff来实现。可以通过以下三种方法启动软中断:
1)在中断上下文中,通过调用函数raise_softirq,置位irq_stat[cpu].__softirq_pending中的相应软中断位,则会在硬中断结束后在函数irq_exit中调用invoke_softirq,实现软中断处理;
2)在非中断上下文中,通过调用raise_softirq_irqoff,置位irq_stat[cpu].__softirq_pending中的相应软中断位,并唤醒软中断守护进程,通过软中断守护进程实现软中断的处理;
3)在__do_softirq中,当该函数执行完时还有未决的软中断,则唤醒软中断守护进程,由软中断守护进程继续处理未决的软中断;
以上3种方法中,不管是通过调用函数invoke_softirq,还是通过软中断守护进程来处理软中断,最终都会调用函数do_softirq、__do_softirq。
1. 一种是在中断上下文里,在中断退出时
/*
 * irq_exit - leave hard interrupt context.
 *
 * Removes HARDIRQ_OFFSET from the preempt count. If that leaves the CPU
 * outside any interrupt context and softirqs are pending, they are handed
 * to invoke_softirq() before the tick and RCU exit hooks run.
 */
void irq_exit(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
#else
WARN_ON_ONCE(!irqs_disabled());
#endif
account_irq_exit_time(current);
trace_hardirq_exit();
sub_preempt_count(HARDIRQ_OFFSET);
/* Only the outermost interrupt exit may process pending softirqs. */
if (!in_interrupt() && local_softirq_pending())
invoke_softirq(); // this is where do_softirq() gets triggered
tick_irq_exit();
rcu_irq_exit();
}
/*
 * invoke_softirq - process pending softirqs, or delegate them.
 *
 * With forced irq threading the work is left to the softirq daemon.
 * Otherwise do_softirq() is called rather than running the handlers on the
 * current stack: the current stack would be safe if it is the (nearly
 * empty) irq stack, but there is no way to tell whether the architecture
 * calls irq_exit() on it, and a deep task stack could be overrun.
 */
static inline void invoke_softirq(void)
{
	if (force_irqthreads) {
		wakeup_softirqd();
		return;
	}

	do_softirq();
}
2.
/*
 * raise_softirq - mark softirq @nr pending from any interrupt state.
 *
 * Wraps raise_softirq_irqoff() in local_irq_save()/local_irq_restore(), so
 * the caller's interrupt state is preserved.
 */
void raise_softirq(unsigned int nr)
{
unsigned long flags;
local_irq_save(flags);
raise_softirq_irqoff(nr);
local_irq_restore(flags);
}
/*
 * raise_softirq_irqoff - mark softirq @nr pending; the raise_softirq()
 * wrapper calls this with interrupts disabled.
 *
 * When called from interrupt context nothing more is needed here. Outside
 * interrupt context the softirq daemon is woken so the softirq is
 * processed.
 */
inline void raise_softirq_irqoff(unsigned int nr)
{
	__raise_softirq_irqoff(nr);

	if (in_interrupt())
		return;

	wakeup_softirqd();
}
wifi 里面调用 流程
/*
 * wlan_rx_skb_process - copy one received frame into an skb and pass it up.
 * @vif_id: index of the virtual interface in g_wlan.netif[]
 * @pData:  start of the received frame
 * @len:    frame length in bytes
 *
 * Allocates an skb with NET_IP_ALIGN bytes of headroom, copies the frame
 * into it, updates the interface rx counters and delivers it with
 * netif_rx() (interrupt context) or netif_rx_ni() (other contexts).
 *
 * Returns OK on success, ERROR on invalid arguments or allocation failure.
 * An allocation failure is counted in ndev->stats.rx_dropped so the lost
 * frame is visible in the interface statistics.
 */
static int wlan_rx_skb_process(const unsigned char vif_id, unsigned char *pData, unsigned short len)
{
	struct sk_buff *skb;
	struct net_device *ndev = g_wlan.netif[vif_id].ndev;

	if ((NULL == pData) || (0 == len) || (NULL == ndev)) {
		printkd("[%s][%d][err]\n", __func__, (int )vif_id);
		return ERROR;
	}

	skb = dev_alloc_skb(len + NET_IP_ALIGN);
	if (NULL == skb) {
		/* Out of memory: the frame is lost, so account for it. */
		ndev->stats.rx_dropped++;
		return ERROR;
	}

	/* Reserve the alignment headroom, then copy the payload in. */
	skb_reserve(skb, NET_IP_ALIGN);
	memcpy(skb->data, pData, len);
	skb_put(skb, len);

	skb->dev = ndev;
	skb->protocol = eth_type_trans(skb, ndev);

	ndev->stats.rx_packets++;
	printkp("rx_skb:%d\n", (int)(ndev->stats.rx_packets) );
	ndev->stats.rx_bytes += skb->len;

	/*
	 * netif_rx() only queues the skb; outside interrupt context
	 * netif_rx_ni() is used instead.
	 */
	if (in_interrupt()) {
		printk(KERN_ERR "cyx interuput\n");
		netif_rx(skb);
	} else {
		printk(KERN_ERR "cyx not interuput\n");
		netif_rx_ni(skb);
	}

	return OK;
}
/*
 * netif_rx_ni - netif_rx() variant that also runs any softirq left pending.
 *
 * Queues the skb via netif_rx() and then, if a softirq is pending on this
 * CPU, runs do_softirq() directly.
 * Returns the result of netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
int err;
preempt_disable(); // disable preemption around the enqueue and the softirq run below
err = netif_rx(skb);
if (local_softirq_pending())
do_softirq(); // in the wifi rx path the skb has just been queued; do_softirq() is called explicitly here
preempt_enable();
return err;
}
netif_rx()调用enqueue_to_backlog()来处理。
首先获取当前cpu的softnet_data实例sd,然后:
1. 如果接收队列sd->input_pkt_queue不为空,说明已经有软中断在处理数据包了,
则不需要再次触发软中断,直接将数据包添加到接收队列尾部即可。
2. 如果接收队列sd->input_pkt_queue为空,说明当前没有软中断在处理数据包,
则把虚拟设备backlog添加到sd->poll_list中以便进行轮询,最后设置NET_RX_SOFTIRQ
标志触发软中断。
3. 如果接收队列sd->input_pkt_queue满了,则直接丢弃数据包。
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * Returns NET_RX_SUCCESS when the skb was appended to input_pkt_queue.
 * Returns NET_RX_DROP (after freeing the skb and bumping the drop counters)
 * when the device is not running, the queue length exceeds
 * netdev_max_backlog, or skb_flow_limit() rejects the skb.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
struct softnet_data *sd;
unsigned long flags;
unsigned int qlen;
sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
/* Non-empty queue: just append the skb. */
if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
 * We can use non atomic operation since we own the queue lock
 */
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
if (!rps_ipi_queued(sd))
____napi_schedule(sd, &sd->backlog);
}
/* Empty queue: after scheduling the backlog, queue the skb as above. */
goto enqueue;
}
drop:
sd->dropped++;
rps_unlock(sd);
local_irq_restore(flags);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
/*
 * Called with irq disabled.
 *
 * Appends @napi to the poll_list of @sd and marks NET_RX_SOFTIRQ pending.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
/*
 * __raise_softirq_irqoff - set the pending bit for softirq @nr.
 *
 * Emits the softirq_raise trace event and ORs bit @nr into the pending
 * mask. Unlike raise_softirq_irqoff() it does not wake the softirq daemon.
 */
void __raise_softirq_irqoff(unsigned int nr)
{
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
执行软中断
/*
 * do_softirq - process the softirqs pending on this CPU, if allowed.
 *
 * Does nothing when in_interrupt() is non-zero, i.e. when the preempt count
 * already records hardirq, softirq or NMI context; this also keeps softirq
 * processing from nesting on the same CPU.
 *
 * Otherwise the pending mask is checked and __do_softirq() is called with
 * interrupts disabled. local_irq_save()/local_irq_restore() are used rather
 * than a plain disable/enable pair so that the caller's interrupt state is
 * restored exactly as it was on entry.
 */
asmlinkage void do_softirq(void)
{
	unsigned long irq_state;

	if (in_interrupt())
		return;

	local_irq_save(irq_state);

	if (local_softirq_pending())
		__do_softirq();

	local_irq_restore(irq_state);
}
从下面函数可以看到,在执行可延迟函数之前第一件事就是开中断。但在开中断之前,先禁用了下半部(__local_bh_disable),其实就是关抢占。这样就算被硬中断打断了,返回内核时也不会被抢占,还是继续执行这里的代码,也不会被调度出去。
那么这样的后果就是软中断上下文里的代码会一直执行下去,直到超过限定的时间(MAX_SOFTIRQ_TIME)、达到限定的重启次数(MAX_SOFTIRQ_RESTART)或者需要重新调度(need_resched),然后唤醒软中断守护进程。
/*
 * __do_softirq - run the handlers of every softirq pending on this CPU.
 *
 * The pending mask is snapshotted and cleared, then interrupts are enabled
 * while the handlers run. Afterwards interrupts are disabled again and the
 * mask is re-read: if new softirqs were raised the loop restarts as long as
 * the time limit (MAX_SOFTIRQ_TIME jiffies), need_resched() and the restart
 * counter (MAX_SOFTIRQ_RESTART) permit; otherwise the softirq daemon is
 * woken to handle the remainder.
 */
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
__u32 pending;
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
int cpu;
unsigned long old_flags = current->flags;
int max_restart = MAX_SOFTIRQ_RESTART;
/*
 * Mask out PF_MEMALLOC as the current task context is borrowed for the
 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
 * again if the socket is related to swap
 */
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
account_irq_enter_time(current);
/* Add SOFTIRQ_OFFSET to the preempt count for the duration of the run. */
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
lockdep_softirq_enter();
cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
local_irq_enable(); // enable interrupts while the handlers run
h = softirq_vec;
do {
if (pending & 1) {
unsigned int vec_nr = h - softirq_vec;
int prev_count = preempt_count();
kstat_incr_softirqs_this_cpu(vec_nr);
trace_softirq_entry(vec_nr);
h->action(h); // the handler installed by open_softirq(), e.g. net_rx_action for NET_RX_SOFTIRQ
trace_softirq_exit(vec_nr);
/* A handler must leave the preempt count as it found it; repair it if not. */
if (unlikely(prev_count != preempt_count())) {
printk(KERN_ERR "huh, entered softirq %u %s %p"
"with preempt_count %08x,"
" exited with %08x?\n", vec_nr,
softirq_to_name[vec_nr], h->action,
prev_count, preempt_count());
preempt_count() = prev_count;
}
rcu_bh_qs(cpu);
}
h++;
pending >>= 1;
} while (pending); // walk the snapshot bit by bit; each set bit means that softirq is pending, so its action is called
local_irq_disable();
/* Softirqs raised while the handlers ran show up here. */
pending = local_softirq_pending();
if (pending) {
if (time_before(jiffies, end) && !need_resched() &&
--max_restart)
goto restart;
wakeup_softirqd();
}
lockdep_softirq_exit();
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
/*
 * net_rx_action - handler registered for NET_RX_SOFTIRQ.
 *
 * Polls the NAPI instances queued on this CPU's softnet_data poll_list.
 * The loop ends when the list is empty, or when the packet budget
 * (netdev_budget) is used up or two jiffies have elapsed; in those last two
 * cases time_squeeze is incremented and NET_RX_SOFTIRQ is raised again so
 * the remaining work is picked up later.
 */
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&sd->poll_list)) { // entries are added to this list by ____napi_schedule(); this loop is the NAPI polling mechanism
struct napi_struct *n;
int work, weight;
/* If softirq window is exhausted then punt.
 * Allow this to run for 2 jiffies since which will allow
 * an average latency of 1.5/HZ.
 */
if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
/* Even though interrupts have been re-enabled, this
 * access is safe because interrupts can only add new
 * entries to the tail of this list, and only ->poll()
 * calls can remove this head entry from the list.
 */
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
 * with netpoll's poll_napi(). Only the entity which
 * obtains the lock and sees NAPI_STATE_SCHED set will
 * actually make the ->poll() call. Therefore we avoid
 * accidentally calling ->poll() when NAPI is not scheduled.
 */
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
printk("cyx before poll \n");
work = n->poll(n, weight); // for the per-CPU backlog this is process_backlog, assigned to sd->backlog.poll in net_dev_init()
printk("cyx after poll \n");
trace_napi_poll(n);
}
WARN_ON_ONCE(work > weight);
budget -= work;
local_irq_disable();
/* Drivers must not modify the NAPI state if they
 * consume the entire weight. In such cases this code
 * still "owns" the NAPI instance and therefore can
 * move the instance around on the list at-will.
 */
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n))) {
local_irq_enable();
napi_complete(n);
local_irq_disable();
} else {
if (n->gro_list) {
/* flush too old packets
 * If HZ < 1000, flush all packets.
 */
local_irq_enable();
napi_gro_flush(n, HZ >= 1000);
local_irq_disable();
}
/* Full weight consumed: rotate this instance to the tail of the list. */
list_move_tail(&n->poll_list, &sd->poll_list);
}
}
netpoll_poll_unlock(have);
}
out:
net_rps_action_and_irq_enable(sd);
#ifdef CONFIG_NET_DMA
/*
 * There may not be any more sk_buffs coming right now, so push
 * any pending DMA copies to hardware
 */
dma_issue_pending_all();
#endif
return;
softnet_break:
/* Budget or time window exhausted: record it and re-raise the softirq. */
sd->time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
/*
 * process_backlog - poll callback of the per-CPU backlog.
 * @napi:  the backlog member embedded in this CPU's softnet_data
 * @quota: maximum number of skbs to deliver in this call
 *
 * Delivers skbs from process_queue through __netif_receive_skb(). When
 * process_queue runs dry, input_pkt_queue is spliced onto it under the rps
 * lock; when input_pkt_queue is empty too, napi->state is cleared and the
 * loop ends.
 *
 * Returns the number of skbs processed (at most @quota).
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
/* Check if we have pending ipi, its better to send them now,
 * not waiting net_rx_action() end.
 */
if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
napi->weight = weight_p;
local_irq_disable();
while (1) {
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sd->process_queue))) { // take the next skb
rcu_read_lock();
local_irq_enable();
__netif_receive_skb(skb); // dispatch the skb according to its protocol type
rcu_read_unlock();
local_irq_disable();
input_queue_head_incr(sd);
/* Quota reached: stop here and report the work done. */
if (++work >= quota) {
local_irq_enable();
return work;
}
}
rps_lock(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
 * Inline a custom version of __napi_complete().
 * only current cpu owns and manipulates this napi,
 * and NAPI_STATE_SCHED is the only possible flag set
 * on backlog.
 * We can use a plain write instead of clear_bit(),
 * and we dont need an smp_mb() memory barrier.
 */
napi->state = 0;
rps_unlock(sd);
break;
}
/* Move everything queued so far onto process_queue and go round again. */
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
rps_unlock(sd);
}
local_irq_enable();
return work;
}
/*
 * __netif_receive_skb_core - deliver one received skb to its handlers.
 * @skb:        the received packet
 * @pfmemalloc: when true the tap lists are skipped, and the skb is dropped
 *              unless skb_pfmemalloc_protocol() accepts it
 *
 * In order: resets the header offsets, untags VLAN frames, delivers to the
 * global and per-device ptype_all lists, runs the optional ingress hooks,
 * VLAN receive and the device rx_handler, and finally delivers to the
 * handlers matching skb->protocol. Delivery is deferred by one step through
 * pt_prev so that the last matching handler is called directly via
 * pt_prev->func().
 *
 * Returns the last handler's result, NET_RX_SUCCESS if the rx_handler
 * consumed the skb, or NET_RX_DROP if nothing took it.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
net_timestamp_check(!netdev_tstamp_prequeue, skb);
trace_netif_receive_skb(skb);
orig_dev = skb->dev;
skb_reset_network_header(skb);
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
/* Re-entered whenever VLAN receive or the rx_handler asks for another pass. */
another_round:
skb->skb_iif = skb->dev->ifindex;
__this_cpu_inc(softnet_data.processed);
if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
}
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
if (pfmemalloc)
goto skip_taps;
/* Entries on the global ptype_all list, then on the device's own list. */
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_key_false(&ingress_needed)) {
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
goto out;
}
#endif
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = 0;
ncls:
#endif
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
if (skb_vlan_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb))
goto another_round;
else if (unlikely(!skb))
goto out;
}
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
ret = NET_RX_SUCCESS;
goto out;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
/* fall through */
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
if (unlikely(skb_vlan_tag_present(skb))) {
if (skb_vlan_tag_get_id(skb))
skb->pkt_type = PACKET_OTHERHOST;
/* Note: we might in the future use prio bits
 * and set skb->priority like in vlan_do_receive()
 * For the time being, just ignore Priority Code Point
 */
skb->vlan_tci = 0;
}
type = skb->protocol;
/* deliver only exact match when indicated */
if (likely(!deliver_exact)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&orig_dev->ptype_specific);
if (unlikely(skb->dev != orig_dev)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&skb->dev->ptype_specific);
}
if (pt_prev) {
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
goto drop;
else
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); // the skb is passed up to the protocol handler here
} else {
drop:
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
 * me how you were going to use this. :-)
 */
ret = NET_RX_DROP;
}
out:
return ret;
}
ip_input.c ip_rcv //走到IP层了哦
packet_type的func函数初始化
[af_inet.c]
/* Packet-type entry that names ip_rcv() as the handler for ETH_P_IP frames. */
static struct packet_type ip_packet_type __read_mostly = {
/* Protocol type: ETH_P_IP (0x0800, Internet Protocol packet). */
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
};
static int __init inet_init(void)
{
dev_add_pack(&ip_packet_type);
==========================数据的发送软中断流程==========================
3.1 没有软中断的发送流程
当tcp/ip层有数据持续下发时,一直在喂狗,此时dev_watchdog不会被触发,则不去执行net_tx_action,这里发送的流程如下
数据的发送:
… run_timer_softirq
… call_timer_fn
… tcp_write_timer
tcp_timer.c tcp_write_timer_handler
tcp_timer.c tcp_retransmit_timer
tcp_output.c tcp_retransmit_skb
tcp_output.c __tcp_retransmit_skb
tcp_output.c tcp_transmit_skb //组建tcp header 20字节
err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
ip_output.c ip_queue_xmit //组建IP header 20字节(不含选项)
ip_output.c ip_local_out
ip_output.c ip_output
ip_finish_output
ip_output.c ip_finish_output2 entry
ip_output.c ip_finish_output2 before dst_neigh_output
neighbour.h neigh_hh_output before dev_queue_xmit
dev.c dev_queue_xmit
dev.c __dev_xmit_skb
sch_generic.c sch_direct_xmit
dev.c dev_hard_start_xmit
ndo_start_xmit
* Returns to the caller:
* 0 - queue is empty or throttled.
* >0 - queue is not empty.
*/
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock)
{
int ret = NETDEV_TX_BUSY;
/* And release qdisc */
spin_unlock(root_lock);
printk("cyx sch_generic.c %s entry \n",__FUNCTION__);
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_stopped(txq)) {
printk("cyx sch_generic.c %s before dev_hard_start_xmit \n",__FUNCTION__);
ret = dev_hard_start_xmit(skb, dev, txq);
3.2 软中断时的数据发送流程
当没有数据时,看门狗会定时触发dev_watchdog,然后触发net_tx_action 来检测链路是否有异常,这里面会从队列里提取帧,如果没有帧则返回,有帧则调用sch_direct_xmit发送。似乎这个net_tx_action只是用来发送队列里未发送的帧
sch_generic.c dev_watchdog
netdevice.h netif_tx_unlock
netdevice.h netif_schedule_queue
dev.c __netif_reschedule
dev.c net_tx_action
dev.c net_tx_action
sch_generic.c __qdisc_run
net_tx_action
1)释放那些已成功传输的缓冲区的sk_buff结构。驱动程序调用dev_kfree_skb_irq把这些缓冲区挂到completion_queue上,再由net_tx_action负责真正释放。
因为net_tx_action是在中断环境外运行的,设备驱动程序可以在任何时刻添加元素,所以net_tx_action在访问softnet_data结构时必须关闭中断。为了尽可能让中断关闭的时间短一点,net_tx_action先把指向该列表的指针保存到局部变量clist,再把completion_queue设成NULL以清空该列表。如此一来,net_tx_action就能在开中断的情况下遍历此列表,用__kfree_skb释放每个元素,同时驱动程序又能持续把新元素加到completion_queue里。
if (sd->completion_queue) {
struct sk_buff *clist;
local_irq_disable();
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_enable();
while (clist) {
struct sk_buff *skb = clist;
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
trace_kfree_skb(skb, net_tx_action);
__kfree_skb(skb);
}
}
2) 跟上面的道理一样,这里也使用了局部变量 head来过渡待传送的数据帧,原始的 sd->output_queue则被置成NULL,可以继续添加数据帧进来
if (sd->output_queue) {
struct Qdisc *head;
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock;
head = head->next_sched;
root_lock = qdisc_lock(q);
if (spin_trylock(root_lock)) {
smp_mb__before_clear_bit();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
printk("cyx dev.c %s before qdisc_run \n ",__FUNCTION__);
qdisc_run(q);
spin_unlock(root_lock);
} else {
if (!test_bit(__QDISC_STATE_DEACTIVATED,
&q->state)) {
printk("cyx dev.c %s before __netif_reschedule \n ",__FUNCTION__);
__netif_reschedule(q);
3.2.1 队列的停止和恢复
当内存(发送用的sblock)不足时,我们直接调用netif_stop_queue停止发送队列,即置位发送队列的XOFF状态位(旧内核中为__LINK_STATE_XOFF,较新内核中为__QUEUE_STATE_DRV_XOFF)
static int itm_wlan_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct itm_priv *priv = netdev_priv(dev);
struct sblock blk;
int ret;
18
/*
* Get a free sblock.
*/
ret = sblock_get(WLAN_CP_ID, WLAN_SBLOCK_CH, &blk, 0);
if (ret) {
dev_err(&dev->dev, "Failed to get free sblock (%d)\n", ret);
netif_stop_queue(dev);
priv->ndev->stats.tx_fifo_errors++;
return NETDEV_TX_BUSY;
}
那什么时候恢复队列呢?两种情况:
1)设备驱动程序会使用一个看门狗定时器,令挂起的传输得以恢复,net_device->tx_timeout 通常会复位网络卡,然后netif_wake_queue
每个网络设备都对应有定时器(当设备以dev_activate启动时,由dev_watchdog_up启动)
watchdog_timer ,定时器,当定时器到期所执行的处理函数是dev_watchdog
watchdog_timeo,等待的时间量,当其设为0时,watchdog_timer 就不会启动.默认是5秒,接口越慢时限就越大。
2)当设备通知驱动程序已有足够的内存可处理一个特定尺寸的帧的传输时,该设备就可被唤醒。
/*
 * Context queries derived from preempt_count().
 *
 * The *_count() macros extract the hardirq, softirq, or combined
 * hardirq/softirq/NMI bit fields of the preempt count. The in_*() macros
 * are the matching predicates: non-zero means the corresponding field is
 * non-zero.
 */
#define hardirq_count()		(preempt_count() & HARDIRQ_MASK)
#define softirq_count()		(preempt_count() & SOFTIRQ_MASK)
#define irq_count()		(preempt_count() & (NMI_MASK | SOFTIRQ_MASK | HARDIRQ_MASK))

#define in_irq()		(hardirq_count())
#define in_softirq()		(softirq_count())
/* Non-zero if any of the hardirq, softirq or NMI fields is set. */
#define in_interrupt()		(irq_count())