在kernel发生translation fault的时候会调用下面的函数
/*
 * Translation fault entry point: faults on addresses below TASK_SIZE are
 * forwarded to do_page_fault() and its result is returned; every other
 * address is reported through do_bad_area() and 0 is returned.
 */
static int __kprobes do_translation_fault(unsigned long addr,
					  unsigned int esr,
					  struct pt_regs *regs)
{
	/* At or above TASK_SIZE: take the bad-area path. */
	if (addr >= TASK_SIZE) {
		do_bad_area(addr, esr, regs);
		return 0;
	}

	return do_page_fault(addr, esr, regs);
}
我们以调用do_bad_area 为例
/*
 * Dispatch a bad-area fault according to the mode the fault was taken in:
 * user-mode faults go to __do_user_fault() with SIGSEGV/SEGV_MAPERR,
 * everything else goes to __do_kernel_fault() with the current task's
 * active_mm.
 */
static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
	struct task_struct *task = current;

	/*
	 * A fault taken in kernel mode has no user context to deliver a
	 * signal to, so it is handled by the kernel fault path.
	 */
	if (!user_mode(regs)) {
		__do_kernel_fault(task->active_mm, addr, esr, regs);
		return;
	}

	__do_user_fault(task, addr, esr, SIGSEGV, SEGV_MAPERR, regs);
}
在do_bad_area 中会根据fault是发生在user space还是kernel space 调用不同的函数,我们以__do_kernel_fault为例
而判断是在user space还是kernel space的宏是
/*
 * Evaluates to non-zero when the mode bits of regs->pstate (the bits
 * selected by PSR_MODE_MASK) are equal to PSR_MODE_EL0t.
 */
#define user_mode(regs) \
(((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t)
我们知道arm64 分为4个异常级别(EL0~EL3),其中el0就代表user space,因此根据pstate中的模式位就可以判断是user space还是kernel space
/*
 * Handle a fault taken in kernel mode: try an exception fixup first and,
 * if none applies, log the faulting address, dump the page table entry via
 * show_pte() and go through die().
 */
static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
			      unsigned int esr, struct pt_regs *regs)
{
	const char *what;

	/*
	 * Instruction aborts are never fixed up; for anything else a
	 * successful fixup_exception() means the fault has been dealt with.
	 */
	if (!is_el1_instruction_abort(esr)) {
		if (fixup_exception(regs))
			return;
	}

	/* Nothing recovered the fault: report it and terminate. */
	bust_spinlocks(1);

	/* Addresses inside the first page are reported as NULL dereferences. */
	if (addr < PAGE_SIZE)
		what = "NULL pointer dereference";
	else
		what = "paging request";
	pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
		 what, addr);

	show_pte(mm, addr);
	die("Oops", regs, esr);
	bust_spinlocks(0);
	do_exit(SIGKILL);
}
在__do_kernel_fault 这个函数中我们就可以看到kernel挂掉时打印的Unable to handle kernel %s at virtual address 这句话
/*
 * Excerpt of die(): calls panic() with "Fatal exception in interrupt" when
 * in_interrupt() is true, calls panic() with "Fatal exception" when
 * panic_on_oops is set, and otherwise calls do_exit(SIGSEGV) unless ret
 * equals NOTIFY_STOP.
 *
 * The parameters str, regs and err are not used by the lines shown here.
 */
void die(const char *str, struct pt_regs *regs, int err)
{
/* Checked first, so the interrupt-context message takes precedence. */
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
/*
 * NOTE(review): `ret` is neither declared nor assigned in this excerpt;
 * presumably the full upstream function sets it from a call that was
 * trimmed here (likely the one that consumes str/regs/err) -- confirm
 * against the kernel source before relying on this snippet.
 */
if (ret != NOTIFY_STOP)
do_exit(SIGSEGV);
}
在die中会根据是否在interrupt 打印不同的信息,因此仔细观察kernel挂掉时的log,可以分清是否是在中断中。
根据#define in_interrupt() (irq_count())
可以判断是否在中断中,也就是说只要进入中断,preempt_count()中对应的计数位就会增加,irq_count()就不为0
/*
 * Yields the bits of preempt_count() selected by HARDIRQ_MASK,
 * SOFTIRQ_MASK and NMI_MASK; the result is non-zero when any of those
 * bits is set.
 */
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
| NMI_MASK))
可见这里irq_count是从preempt_count()中取出硬件中断/软件中断/NMI中断这三个位域(不包括抢占计数本身),其中NMI中断在ARM上一般对应FIQ中断
在panic 中会调用smp_send_stop来让其他的cpu down掉
/*
 * Send IPI_CPU_STOP to every other online CPU, then poll num_online_cpus()
 * for at most USEC_PER_SEC rounds of udelay(1) and log a warning listing
 * cpu_online_mask if more than one CPU is still online afterwards.
 */
void smp_send_stop(void)
{
	unsigned long budget;

	if (num_online_cpus() > 1) {
		cpumask_t others;

		/* Target set: all online CPUs except the one running this code. */
		cpumask_copy(&others, cpu_online_mask);
		cpumask_clear_cpu(smp_processor_id(), &others);

		/* The announcement is only printed while booting or running. */
		if (system_state == SYSTEM_BOOTING ||
		    system_state == SYSTEM_RUNNING)
			pr_crit("SMP: stopping secondary CPUs\n");
		smp_cross_call(&others, IPI_CPU_STOP);
	}

	/* Wait up to one second for other CPUs to stop */
	for (budget = USEC_PER_SEC; num_online_cpus() > 1 && budget--; )
		udelay(1);

	if (num_online_cpus() > 1)
		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
			   cpumask_pr_args(cpu_online_mask));
}
从smp_send_stop 中可以发现,发生fault的cpu会通过发送IPI_CPU_STOP来让其他cpu也down掉。
/*
 * Translation fault entry point: faults on addresses below TASK_SIZE are
 * forwarded to do_page_fault() and its result is returned; every other
 * address is reported through do_bad_area() and 0 is returned.
 */
static int __kprobes do_translation_fault(unsigned long addr,
					  unsigned int esr,
					  struct pt_regs *regs)
{
	/* At or above TASK_SIZE: take the bad-area path. */
	if (addr >= TASK_SIZE) {
		do_bad_area(addr, esr, regs);
		return 0;
	}

	return do_page_fault(addr, esr, regs);
}
我们以调用do_bad_area 为例
/*
 * Dispatch a bad-area fault according to the mode the fault was taken in:
 * user-mode faults go to __do_user_fault() with SIGSEGV/SEGV_MAPERR,
 * everything else goes to __do_kernel_fault() with the current task's
 * active_mm.
 */
static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
	struct task_struct *task = current;

	/*
	 * A fault taken in kernel mode has no user context to deliver a
	 * signal to, so it is handled by the kernel fault path.
	 */
	if (!user_mode(regs)) {
		__do_kernel_fault(task->active_mm, addr, esr, regs);
		return;
	}

	__do_user_fault(task, addr, esr, SIGSEGV, SEGV_MAPERR, regs);
}
在do_bad_area 中会根据fault是发生在user space还是kernel space 调用不同的函数,我们以__do_kernel_fault为例
而判断是在user space还是kernel space的宏是
/*
 * Evaluates to non-zero when the mode bits of regs->pstate (the bits
 * selected by PSR_MODE_MASK) are equal to PSR_MODE_EL0t.
 */
#define user_mode(regs) \
(((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t)
我们知道arm64 分为4个异常级别(EL0~EL3),其中el0就代表user space,因此根据pstate中的模式位就可以判断是user space还是kernel space
/*
 * Handle a fault taken in kernel mode: try an exception fixup first and,
 * if none applies, log the faulting address, dump the page table entry via
 * show_pte() and go through die().
 */
static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
			      unsigned int esr, struct pt_regs *regs)
{
	const char *what;

	/*
	 * Instruction aborts are never fixed up; for anything else a
	 * successful fixup_exception() means the fault has been dealt with.
	 */
	if (!is_el1_instruction_abort(esr)) {
		if (fixup_exception(regs))
			return;
	}

	/* Nothing recovered the fault: report it and terminate. */
	bust_spinlocks(1);

	/* Addresses inside the first page are reported as NULL dereferences. */
	if (addr < PAGE_SIZE)
		what = "NULL pointer dereference";
	else
		what = "paging request";
	pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
		 what, addr);

	show_pte(mm, addr);
	die("Oops", regs, esr);
	bust_spinlocks(0);
	do_exit(SIGKILL);
}
在__do_kernel_fault 这个函数中我们就可以看到kernel挂掉时打印的Unable to handle kernel %s at virtual address 这句话
/*
 * Excerpt of die(): calls panic() with "Fatal exception in interrupt" when
 * in_interrupt() is true, calls panic() with "Fatal exception" when
 * panic_on_oops is set, and otherwise calls do_exit(SIGSEGV) unless ret
 * equals NOTIFY_STOP.
 *
 * The parameters str, regs and err are not used by the lines shown here.
 */
void die(const char *str, struct pt_regs *regs, int err)
{
/* Checked first, so the interrupt-context message takes precedence. */
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
/*
 * NOTE(review): `ret` is neither declared nor assigned in this excerpt;
 * presumably the full upstream function sets it from a call that was
 * trimmed here (likely the one that consumes str/regs/err) -- confirm
 * against the kernel source before relying on this snippet.
 */
if (ret != NOTIFY_STOP)
do_exit(SIGSEGV);
}
在die中会根据是否在interrupt 打印不同的信息,因此仔细观察kernel挂掉时的log,可以分清是否是在中断中。
根据#define in_interrupt() (irq_count())
可以判断是否在中断中,也就是说只要进入中断,preempt_count()中对应的计数位就会增加,irq_count()就不为0
/*
 * Yields the bits of preempt_count() selected by HARDIRQ_MASK,
 * SOFTIRQ_MASK and NMI_MASK; the result is non-zero when any of those
 * bits is set.
 */
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
| NMI_MASK))
可见这里irq_count是从preempt_count()中取出硬件中断/软件中断/NMI中断这三个位域(不包括抢占计数本身),其中NMI中断在ARM上一般对应FIQ中断
在panic 中会调用smp_send_stop来让其他的cpu down掉
/*
 * Send IPI_CPU_STOP to every other online CPU, then poll num_online_cpus()
 * for at most USEC_PER_SEC rounds of udelay(1) and log a warning listing
 * cpu_online_mask if more than one CPU is still online afterwards.
 */
void smp_send_stop(void)
{
	unsigned long budget;

	if (num_online_cpus() > 1) {
		cpumask_t others;

		/* Target set: all online CPUs except the one running this code. */
		cpumask_copy(&others, cpu_online_mask);
		cpumask_clear_cpu(smp_processor_id(), &others);

		/* The announcement is only printed while booting or running. */
		if (system_state == SYSTEM_BOOTING ||
		    system_state == SYSTEM_RUNNING)
			pr_crit("SMP: stopping secondary CPUs\n");
		smp_cross_call(&others, IPI_CPU_STOP);
	}

	/* Wait up to one second for other CPUs to stop */
	for (budget = USEC_PER_SEC; num_online_cpus() > 1 && budget--; )
		udelay(1);

	if (num_online_cpus() > 1)
		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
			   cpumask_pr_args(cpu_online_mask));
}
从smp_send_stop 中可以发现,发生fault的cpu会通过发送IPI_CPU_STOP来让其他cpu也down掉。