中断机制的出现,主要用来解决CPU频率(以及cache/memory访问速率)和IO速率之间越来越不匹配的问题。当CPU需要访问外部设备数据时,有两种处理策略:
(1)轮询方式:CPU发送命令给设备控制寄存器,通知外部设备从内存读取数据/将设备数据写入内存。然后,CPU不停的查询设备状态寄存器,了解数据是否传输完成,直到完成后,CPU再去处理其他任务。在数据传输过程中,CPU不能去做其他任务,只能不停的轮询设备状态,当传输的数据量较大/外设速率较低时,浪费的CPU资源是很可观的。这种情况下,出现了中断机制,如(2)所述。
(2)中断机制:当CPU需要访问外设数据时,发送命令通知外设去内存中读写数据。然后,CPU继续执行其他任务(一般是,CPU将当前进程睡眠在等待队列,切换到其他进程运行)。当外设处理完数据传输之后,设置好自身状态寄存器,再通过中断控制线,给CPU发送中断请求。CPU收到外设的中断请求后,查询设备的状态寄存器,唤醒之前睡眠在等待队列上的进程,执行后续处理。这样,就避免了轮询方式下,CPU资源的浪费。
那么,Linux中,中断是如何实现的?中断机制的实现依赖于硬件和软件的协作处理,硬件上需要CPU和外设都能够支持中断,软件上需要处理好中断请求线跟中断号的对应关系。更进一步,提供更灵活的机制,使得用户可以针对特定的硬件注册特定的中断处理函数。以x86 Linux平台为例,分析中断机制的实现。
我们知道,x86 CPU架构提供了中断机制的支持。CPU在保护模式下,通过IDT(中断描述符表)来索引中断处理函数。IDT以中断向量为索引,每个表项记录了对应中断处理例程的入口地址。中断向量为8位,因此最多可以提供256个中断向量。x86 CPU为其中一部分向量预先定义了用途(异常),Linux在启动过程中,切换到保护模式后,对预置的中断映射表进行了改写:前32个中断向量(0x00~0x1f)保留给Intel定义的异常处理;外部中断从0x20开始映射,其中0x30~0x3f映射16个legacy(ISA/8259A)中断请求(参见后文 init_IRQ() 中的 ISA_IRQ_VECTOR);0x80作为系统调用(int 0x80 软件中断,注意不是下半部的softirq)的中断向量。
中断向量0x30~0x3f映射的是8259A的16个中断请求。8259A是Intel的中断控制器,单片可以提供8个外部中断;两片8259A级联(Master/Slave)后,由于Master的一个输入引脚用于连接Slave,总共可以提供15个外部中断。每个8259A芯片提供两个端口地址(Master:0x20和0x21,Slave:0xA0和0xA1)供程序员进行编程控制。这里不去深入讨论8259A的编程细节,主要讨论Linux对中断机制的封装和处理。
Linux使用struct irq_desc,struct irq_data 和 struct irqaction三个数据结构描述了中断处理的主架构。irq_desc是对某个中断请求线的描述,最重要的结构是包含了irq_data和irqaction。irq_data包含了芯片硬件相关的信息,抽象了中断控制器的操作接口。irqaction是中断处理链表:一个中断请求线可能被多个硬件设备共享,这样一个中断请求线可以对应多个中断处理函数。
数据结构:
/*
 * struct irq_desc - descriptor of one interrupt request line.
 *
 * Its most important members are the embedded irq_data (interrupt-controller
 * side information) and the action list (handlers registered for this line).
 */
struct irq_desc {
struct irq_common_data irq_common_data;
struct irq_data irq_data; // interrupt-controller information
unsigned int __percpu *kstat_irqs;
irq_flow_handler_t handle_irq; // flow handler; typically walks the action list and calls each action's handler in turn
#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
irq_preflow_handler_t preflow_handler;
#endif
struct irqaction *action; /* IRQ action list */ // handlers registered by users (several devices may share one interrupt line)
unsigned int status_use_accessors;
unsigned int core_internal_state__do_not_mess_with_it;
unsigned int depth; /* nested irq disables */
unsigned int wake_depth; /* nested wake enables */
unsigned int irq_count; /* For detecting broken IRQs */
unsigned long last_unhandled; /* Aging timer for unhandled count */
unsigned int irqs_unhandled;
atomic_t threads_handled;
int threads_handled_last;
raw_spinlock_t lock;
struct cpumask *percpu_enabled;
#ifdef CONFIG_SMP
const struct cpumask *affinity_hint;
struct irq_affinity_notify *affinity_notify;
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_var_t pending_mask;
#endif
#endif
unsigned long threads_oneshot;
atomic_t threads_active;
wait_queue_head_t wait_for_threads;
#ifdef CONFIG_PM_SLEEP
unsigned int nr_actions;
unsigned int no_suspend_depth;
unsigned int cond_suspend_depth;
unsigned int force_resume_depth;
#endif
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *dir;
#endif
int parent_irq;
struct module *owner;
const char *name;
} ____cacheline_internodealigned_in_smp;
/*
 * struct irq_data - chip/hardware related information of an interrupt line;
 * abstracts the operations of the interrupt controller.
 */
struct irq_data {
u32 mask;
unsigned int irq; // IRQ number of this interrupt request line
unsigned long hwirq;
struct irq_common_data *common;
struct irq_chip *chip; // interrupt-controller chip description and operations
struct irq_domain *domain;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
struct irq_data *parent_data;
#endif
void *chip_data; // private data of the interrupt controller
};
/*
 * struct irqaction - one registered handler on an interrupt line.
 *
 * Actions are chained through ->next, so a line shared by several devices
 * carries one irqaction per registered handler.
 */
struct irqaction {
irq_handler_t handler; // the actual interrupt handler function
void *dev_id; // device private data (passed back to the handler)
void __percpu *percpu_dev_id;
struct irqaction *next; // next action in the list
irq_handler_t thread_fn;
struct task_struct *thread;
struct irqaction *secondary;
unsigned int irq; // IRQ number this action is registered on
unsigned int flags;
unsigned long thread_flags;
unsigned long thread_mask;
const char *name;
struct proc_dir_entry *dir;
} ____cacheline_internodealigned_in_smp;
源码分析:
(1)irq初始化:
/*
 * init_IRQ - boot-time interrupt setup.
 *
 * Fills CPU 0's vector_irq table so that the ISA vector of every legacy IRQ
 * points at that IRQ's descriptor, then calls the platform hook
 * x86_init.irqs.intr_init().
 */
void __init init_IRQ(void)
{
	int irq = 0;

	/*
	 * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15.
	 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
	 * then this configuration will likely be static after the boot. If
	 * these IRQ's are handled by more modern controllers like IO-APIC,
	 * then this vector space can be freed and re-used dynamically as the
	 * irq's migrate etc.
	 *
	 * The table is indexed by interrupt vector and stores the irq_desc, so
	 * the entry code can go from an incoming vector straight to its
	 * descriptor (see the lookup in do_IRQ()).
	 */
	while (irq < nr_legacy_irqs()) {
		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
		irq++;
	}

	/* Platform hook; per the article this ends up calling init_ISA_irqs(). */
	x86_init.irqs.intr_init();
}
/*
 * init_ISA_irqs - set up the descriptors of the legacy IRQs.
 *
 * Initialises the legacy PIC and installs its irq_chip together with
 * handle_level_irq as the flow handler on every legacy IRQ.
 */
void __init init_ISA_irqs(void)
{
	struct irq_chip *pic_chip = legacy_pic->chip;
	int irq;

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
	init_bsp_APIC();
#endif

	legacy_pic->init(0);

	/* Give each legacy irq_desc its chip and its handle_irq flow handler. */
	for (irq = 0; irq < nr_legacy_irqs(); ++irq) {
		irq_set_chip_and_handler(irq, pic_chip, handle_level_irq);
	}
}
这样,一个中断请求irq进来时,我们就可以根据得到的 irq_desc,使用chip中的函数响应中断请求,使用handle_level_irq遍历每个irqaction,并调用其中的 handler 完成中断处理。从初始化的流程来看,我们已经初始化了从外部irq到最终的handler路径上的所有必要元素。剩下的是如何建立irq和处理函数handler之间的映射。
kernel中提供了 request_irq()和 free_irq()来注册和注销irq和handler之间的关系,供用户注册和注销自己的中断处理函数:
(2)中断处理函数注册:
request_irq()内部调用的是 request_threaded_irq()
/*
 * request_threaded_irq - register a handler (and optional thread function)
 * for an interrupt line.
 *
 * @irq:      interrupt line to register on
 * @handler:  primary handler; when NULL, irq_default_primary_handler is used
 *            and @thread_fn must then be supplied
 * @thread_fn: function stored in the action's thread_fn, may be NULL
 * @irqflags: IRQF_* flags
 * @devname:  name stored in the action
 * @dev_id:   cookie stored in the action; mandatory for IRQF_SHARED
 *
 * Returns 0 on success, -EINVAL on invalid arguments or a line that cannot be
 * requested, -ENOMEM when the action cannot be allocated, or the error
 * reported by __setup_irq().
 */
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
			 irq_handler_t thread_fn, unsigned long irqflags,
			 const char *devname, void *dev_id)
{
	struct irq_desc *desc;
	struct irqaction *act;
	int err;

	/*
	 * Sanity-check: shared interrupts must pass in a real dev-ID,
	 * otherwise we'll have trouble later trying to figure out
	 * which interrupt is which (messes up the interrupt freeing
	 * logic etc).
	 */
	if ((irqflags & IRQF_SHARED) && !dev_id)
		return -EINVAL;

	/*
	 * IRQF_COND_SUSPEND only makes sense for shared interrupts and
	 * it cannot be set along with IRQF_NO_SUSPEND.
	 */
	if ((irqflags & IRQF_COND_SUSPEND) &&
	    (!(irqflags & IRQF_SHARED) || (irqflags & IRQF_NO_SUSPEND)))
		return -EINVAL;

	/* Look up the descriptor of this interrupt line. */
	desc = irq_to_desc(irq);
	if (!desc)
		return -EINVAL;

	if (!irq_settings_can_request(desc))
		return -EINVAL;
	if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
		return -EINVAL;

	/* Without a primary handler a thread function is required. */
	if (!handler && !thread_fn)
		return -EINVAL;
	if (!handler)
		handler = irq_default_primary_handler;

	/* Every registration, exclusive or shared, gets its own action. */
	act = kzalloc(sizeof(*act), GFP_KERNEL);
	if (!act)
		return -ENOMEM;

	act->handler = handler;
	act->thread_fn = thread_fn;
	act->flags = irqflags;
	act->name = devname;
	act->dev_id = dev_id;

	/* Hook the new action into the descriptor. */
	chip_bus_lock(desc);
	err = __setup_irq(irq, desc, act);
	chip_bus_sync_unlock(desc);

	if (err) {
		kfree(act->secondary);
		kfree(act);
	}

#ifdef CONFIG_DEBUG_SHIRQ_FIXME
	if (!err && (irqflags & IRQF_SHARED)) {
		/*
		 * It's a shared IRQ -- the driver ought to be prepared for it
		 * to happen immediately, so let's make sure....
		 * We disable the irq to make sure that a 'real' IRQ doesn't
		 * run in parallel with our fake.
		 */
		unsigned long flags;

		disable_irq(irq);
		local_irq_save(flags);

		handler(irq, dev_id);

		local_irq_restore(flags);
		enable_irq(irq);
	}
#endif
	return err;
}
/*
 * __setup_irq - install a new irqaction on an interrupt descriptor.
 *
 * @irq:  interrupt number, stored into new->irq
 * @desc: descriptor of that interrupt line
 * @new:  action to install; appended to the tail of desc->action
 *
 * Sets up the handler thread(s) when a thread function is supplied, checks
 * that a new sharer is compatible with the actions already installed,
 * assigns the ONESHOT thread mask bit, performs the first-time setup of the
 * line when it is not yet shared, and finally links the action in.
 *
 * Returns 0 on success. On failure returns a negative errno (-EINVAL,
 * -ENOSYS, -ENODEV, -ENOMEM, -EBUSY or the error of a called helper); any
 * thread created here is stopped and the module reference is dropped.
 */
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
	struct irqaction *old, **old_ptr;
	unsigned long flags, thread_mask = 0;
	int ret, nested, shared = 0;
	cpumask_var_t mask;

	if (!desc)
		return -EINVAL;

	if (desc->irq_data.chip == &no_irq_chip)
		return -ENOSYS;
	if (!try_module_get(desc->owner))
		return -ENODEV;

	new->irq = irq;

	/*
	 * Check whether the interrupt nests into another interrupt
	 * thread.
	 */
	nested = irq_settings_is_nested_thread(desc);
	if (nested) {
		if (!new->thread_fn) {
			ret = -EINVAL;
			goto out_mput;
		}
		/*
		 * Replace the primary handler which was provided from
		 * the driver for non nested interrupt handling by the
		 * dummy function which warns when called.
		 */
		new->handler = irq_nested_primary_handler;
	} else {
		if (irq_settings_can_thread(desc)) {
			ret = irq_setup_forced_threading(new);
			if (ret)
				goto out_mput;
		}
	}

	/*
	 * Create a handler thread when a thread function is supplied
	 * and the interrupt does not nest into another interrupt
	 * thread.
	 */
	if (new->thread_fn && !nested) {
		ret = setup_irq_thread(new, irq, false);
		if (ret)
			goto out_mput;
		if (new->secondary) {
			ret = setup_irq_thread(new->secondary, irq, true);
			if (ret)
				goto out_thread;
		}
	}

	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto out_thread;
	}

	/*
	 * Drivers are often written to work w/o knowledge about the
	 * underlying irq chip implementation, so a request for a
	 * threaded irq without a primary hard irq context handler
	 * requires the ONESHOT flag to be set. Some irq chips like
	 * MSI based interrupts are per se one shot safe. Check the
	 * chip flags, so we can avoid the unmask dance at the end of
	 * the threaded handler for those.
	 */
	if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
		new->flags &= ~IRQF_ONESHOT;

	/*
	 * The following block of code has to be executed atomically
	 */
	raw_spin_lock_irqsave(&desc->lock, flags);
	old_ptr = &desc->action;
	old = *old_ptr;
	if (old) {
		/*
		 * Can't share interrupts unless both agree to and are
		 * the same type (level, edge, polarity). So both flag
		 * fields must have IRQF_SHARED set and the bits which
		 * set the trigger type must match. Also all must
		 * agree on ONESHOT.
		 */
		if (!((old->flags & new->flags) & IRQF_SHARED) ||
		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
		    ((old->flags ^ new->flags) & IRQF_ONESHOT))
			goto mismatch;

		/* All handlers must agree on per-cpuness */
		if ((old->flags & IRQF_PERCPU) !=
		    (new->flags & IRQF_PERCPU))
			goto mismatch;

		/* add new interrupt at end of irq queue */
		/* Walk to the last element of the action list. */
		do {
			/*
			 * Or all existing action->thread_mask bits,
			 * so we can find the next zero bit for this
			 * new action.
			 */
			thread_mask |= old->thread_mask;
			old_ptr = &old->next;
			old = *old_ptr;
		} while (old);
		shared = 1;
	}

	/*
	 * Setup the thread mask for this irqaction for ONESHOT. For
	 * !ONESHOT irqs the thread mask is 0 so we can avoid a
	 * conditional in irq_wake_thread().
	 */
	if (new->flags & IRQF_ONESHOT) {
		/*
		 * Unlikely to have 32 resp 64 irqs sharing one line,
		 * but who knows.
		 */
		if (thread_mask == ~0UL) {
			ret = -EBUSY;
			goto out_mask;
		}
		/*
		 * The thread_mask for the action is or'ed to
		 * desc->threads_active to indicate that the
		 * IRQF_ONESHOT thread handler has been woken, but not
		 * yet finished. The bit is cleared when a thread
		 * completes. When all threads of a shared interrupt
		 * line have completed desc->threads_active becomes
		 * zero and the interrupt line is unmasked. See
		 * handle.c:irq_wake_thread() for further information.
		 *
		 * If no thread is woken by primary (hard irq context)
		 * interrupt handlers, then desc->threads_active is
		 * also checked for zero to unmask the irq line in the
		 * affected hard irq flow handlers
		 * (handle_[fasteoi|level]_irq).
		 *
		 * The new action gets the first zero bit of
		 * thread_mask assigned. See the loop above which or's
		 * all existing action->thread_mask bits.
		 *
		 * thread_mask is an unsigned long, so the shifted constant
		 * must be unsigned long as well: with a plain int constant a
		 * free bit at position 31 or above would be an invalid shift.
		 */
		new->thread_mask = 1UL << ffz(thread_mask);

	} else if (new->handler == irq_default_primary_handler &&
		   !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
		/*
		 * The interrupt was requested with handler = NULL, so
		 * we use the default primary handler for it. But it
		 * does not have the oneshot flag set. In combination
		 * with level interrupts this is deadly, because the
		 * default primary handler just wakes the thread, then
		 * the irq lines is reenabled, but the device still
		 * has the level irq asserted. Rinse and repeat....
		 *
		 * While this works for edge type interrupts, we play
		 * it safe and reject unconditionally because we can't
		 * say for sure which type this interrupt really
		 * has. The type flags are unreliable as the
		 * underlying chip implementation can override them.
		 */
		pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
		       irq);
		ret = -EINVAL;
		goto out_mask;
	}

	if (!shared) {
		ret = irq_request_resources(desc);
		if (ret) {
			pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
			       new->name, irq, desc->irq_data.chip->name);
			goto out_mask;
		}

		init_waitqueue_head(&desc->wait_for_threads);

		/* Setup the type (level, edge polarity) if configured: */
		if (new->flags & IRQF_TRIGGER_MASK) {
			ret = __irq_set_trigger(desc,
						new->flags & IRQF_TRIGGER_MASK);

			/*
			 * NOTE(review): what irq_request_resources() acquired
			 * above is not visibly released on this error path --
			 * confirm whether a release is needed here.
			 */
			if (ret)
				goto out_mask;
		}

		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
				  IRQS_ONESHOT | IRQS_WAITING);
		irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);

		if (new->flags & IRQF_PERCPU) {
			irqd_set(&desc->irq_data, IRQD_PER_CPU);
			irq_settings_set_per_cpu(desc);
		}

		if (new->flags & IRQF_ONESHOT)
			desc->istate |= IRQS_ONESHOT;

		if (irq_settings_can_autoenable(desc))
			irq_startup(desc, true);
		else
			/* Undo nested disables: */
			desc->depth = 1;

		/* Exclude IRQ from balancing if requested */
		if (new->flags & IRQF_NOBALANCING) {
			irq_settings_set_no_balancing(desc);
			irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
		}

		/* Set default affinity mask once everything is setup */
		setup_affinity(desc, mask);

	} else if (new->flags & IRQF_TRIGGER_MASK) {
		unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
		unsigned int omsk = irq_settings_get_trigger_mask(desc);

		if (nmsk != omsk)
			/*
			 * hope the handler works with current trigger mode.
			 * omsk is the mode the line currently uses, nmsk the
			 * one this caller asked for; print them in the order
			 * the message states.
			 */
			pr_warning("irq %d uses trigger mode %u; requested %u\n",
				   irq, omsk, nmsk);
	}

	/* Link the new action at the tail of the action list. */
	*old_ptr = new;

	irq_pm_install_action(desc, new);

	/* Reset broken irq detection when installing new handler */
	desc->irq_count = 0;
	desc->irqs_unhandled = 0;

	/*
	 * Check whether we disabled the irq via the spurious handler
	 * before. Reenable it and give it another chance.
	 */
	if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
		desc->istate &= ~IRQS_SPURIOUS_DISABLED;
		__enable_irq(desc);
	}

	raw_spin_unlock_irqrestore(&desc->lock, flags);

	/*
	 * Strictly no need to wake it up, but hung_task complains
	 * when no hard interrupt wakes the thread up.
	 */
	if (new->thread)
		wake_up_process(new->thread);
	if (new->secondary)
		wake_up_process(new->secondary->thread);

	register_irq_proc(irq, desc);
	new->dir = NULL;
	register_handler_proc(irq, new);
	free_cpumask_var(mask);

	return 0;

mismatch:
	/* Only reached before the list walk, so old is the first action. */
	if (!(new->flags & IRQF_PROBE_SHARED)) {
		pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
		       irq, new->flags, new->name, old->flags, old->name);
#ifdef CONFIG_DEBUG_SHIRQ
		dump_stack();
#endif
	}
	ret = -EBUSY;

out_mask:
	raw_spin_unlock_irqrestore(&desc->lock, flags);
	free_cpumask_var(mask);

out_thread:
	if (new->thread) {
		struct task_struct *t = new->thread;

		new->thread = NULL;
		kthread_stop(t);
		put_task_struct(t);
	}
	if (new->secondary && new->secondary->thread) {
		struct task_struct *t = new->secondary->thread;

		new->secondary->thread = NULL;
		kthread_stop(t);
		put_task_struct(t);
	}
out_mput:
	module_put(desc->owner);
	return ret;
}
__setup_irq()函数用于将新的action 链接进 action链表,增加一个新的独占中断处理函数或者共享中断处理函数。
(3)中断处理执行路径:
当外部中断触发时,CPU根据中断向量查询IDT表获取对应的入口例程。外部中断的入口例程最终都会调用到 do_IRQ()函数,中断向量号经由 regs->orig_ax(以按位取反的形式保存)传入 do_IRQ(),do_IRQ()再根据该向量在 vector_irq 表中找到对应的 irq_desc:
/*
 * do_IRQ - common C entry point for external interrupts.
 *
 * Recovers the interrupt vector from regs->orig_ax, looks up the irq_desc
 * stored for that vector in this CPU's vector_irq table and passes it to
 * handle_irq(). When handle_irq() reports failure, the interrupt is
 * acknowledged and either reported or the retriggered vector slot is
 * released. Always returns 1.
 */
__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
	struct pt_regs *saved_regs = set_irq_regs(regs);
	struct irq_desc *desc;

	/*
	 * high bit used in ret_from_ code
	 *
	 * orig_ax carries the bitwise complement of the interrupt vector
	 * (the vector, not the IRQ line number).
	 */
	unsigned vec = ~regs->orig_ax;

	/*
	 * NB: Unlike exception entries, IRQ entries do not reliably
	 * handle context tracking in the low-level entry code. This is
	 * because syscall entries execute briefly with IRQs on before
	 * updating context tracking state, so we can take an IRQ from
	 * kernel mode with CONTEXT_USER. The low-level entry code only
	 * updates the context if we came from user mode, so we won't
	 * switch to CONTEXT_KERNEL. We'll fix that once the syscall
	 * code is cleaned up enough that we can cleanly defer enabling
	 * IRQs.
	 */
	entering_irq();

	/* entering_irq() tells RCU that we're not quiescent. Check it. */
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");

	/* Vector -> irq_desc lookup in this CPU's table. */
	desc = __this_cpu_read(vector_irq[vec]);

	/* Run the interrupt handling for this descriptor. */
	if (!handle_irq(desc, regs)) {
		ack_APIC_irq();

		if (desc == VECTOR_RETRIGGERED) {
			__this_cpu_write(vector_irq[vec], VECTOR_UNUSED);
		} else {
			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
					     __func__, smp_processor_id(),
					     vec);
		}
	}

	exiting_irq();

	set_irq_regs(saved_regs);
	return 1;
}
/*
 * handle_irq - run the flow handler of @desc.
 *
 * Returns false when @desc is NULL or an error pointer, true otherwise.
 * The flow handler is invoked directly via generic_handle_irq_desc() when
 * the interrupt came from user mode or when execute_on_irq_stack() returns
 * zero; otherwise execute_on_irq_stack() is taken to have dealt with it
 * (presumably by running it on the IRQ stack -- TODO confirm).
 */
bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
	int overflow = check_stack_overflow();

	if (IS_ERR_OR_NULL(desc))
		return false;

	/* Kernel-mode interrupt already handled by execute_on_irq_stack(). */
	if (!user_mode(regs) && execute_on_irq_stack(overflow, desc))
		return true;

	if (unlikely(overflow))
		print_stack_overflow();
	generic_handle_irq_desc(desc);

	return true;
}
static inline void generic_handle_irq_desc(struct irq_desc *desc)
{
desc->handle_irq(desc); // 调用 irq_desc中的 handle_irq,实际调用的是 handle_level_irq
}
/*
 * handle_level_irq - flow handler installed on the legacy IRQs by
 * init_ISA_irqs().
 *
 * Runs entirely under desc->lock (handle_irq_event() drops and re-takes it
 * internally). If the line has no action or is disabled, the interrupt is
 * only recorded as IRQS_PENDING; otherwise the registered actions are run
 * and cond_unmask_irq() is called afterwards.
 */
void handle_level_irq(struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
/* presumably masks and acknowledges the line at the chip -- helper not shown here */
mask_ack_irq(desc);
if (!irq_may_run(desc))
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
* keep it masked and get out of here
*/
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
goto out_unlock;
}
handle_irq_event(desc); // run the registered actions
cond_unmask_irq(desc);
out_unlock:
raw_spin_unlock(&desc->lock);
}
/*
 * handle_irq_event - run the actions registered on @desc.
 *
 * Entered with desc->lock held (handle_level_irq() takes it before calling
 * in). The lock is dropped while the handlers run and re-taken before
 * returning; IRQD_IRQ_INPROGRESS is set for the duration of that unlocked
 * section.
 *
 * Returns the value produced by handle_irq_event_percpu().
 */
irqreturn_t handle_irq_event(struct irq_desc *desc)
{
irqreturn_t ret;
desc->istate &= ~IRQS_PENDING;
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
/* Handlers run without desc->lock held. */
raw_spin_unlock(&desc->lock);
ret = handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
return ret;
}
/*
 * handle_irq_event_percpu - call every action registered on @desc.
 *
 * Walks the action list starting at desc->action (which must not be NULL),
 * invokes each handler with the IRQ number and the action's dev_id, wakes
 * the action's thread when the handler asks for it, and ORs all handler
 * results together.
 *
 * Returns the OR of all handler return values.
 */
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
	struct irqaction *act = desc->action;
	unsigned int irq = desc->irq_data.irq;
	unsigned int rand_flags = 0;
	irqreturn_t combined = IRQ_NONE;

	/* One pass per action on this descriptor. */
	do {
		irqreturn_t rc;

		trace_irq_handler_entry(irq, act);
		/* The handler that was registered for this action. */
		rc = act->handler(irq, act->dev_id);
		trace_irq_handler_exit(irq, act, rc);

		if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
			      irq, act->handler))
			local_irq_disable();

		if (rc == IRQ_WAKE_THREAD) {
			/*
			 * Catch drivers which return WAKE_THREAD but
			 * did not set up a thread function
			 */
			if (unlikely(!act->thread_fn)) {
				warn_no_thread(irq, act);
			} else {
				__irq_wake_thread(desc, act);
				/* Counts towards randomness like IRQ_HANDLED. */
				rand_flags |= act->flags;
			}
		} else if (rc == IRQ_HANDLED) {
			rand_flags |= act->flags;
		}

		combined |= rc;
		act = act->next;
	} while (act);

	add_interrupt_randomness(irq, rand_flags);

	if (!noirqdebug)
		note_interrupt(desc, combined);
	return combined;
}
以上是Linux中断的上半部,主要用来处理必要的和紧急的任务。由于中断请求线资源很少,并且中断处理函数的执行不能被调度,更不能睡眠,所以在中断的上半部处理函数中,执行的任务要尽量少,执行的时间要尽量短。对于中断中耗时的任务需要放到中断的下半部(软中断/tasklet/workqueue)去处理。