稳定性范畴, 参考5.x kernel。
kernel Oops
Oops指的就是内核的不正确行为,比如对驱动来说:static int i82092aa_pci_probe(struct pci_dev *dev,
const struct pci_device_id *id)
{
unsigned char configbyte;
int i, ret;
ret = pci_enable_device(dev);
if (ret)
return ret;
/* PCI Configuration Control */
pci_read_config_byte(dev, 0x40, &configbyte);
switch (configbyte&6) {
case 0:
socket_count = 2;
break;
case 2:
socket_count = 1;
break;
case 4:
case 6:
socket_count = 4;
break;
default:
dev_err(&dev->dev,
"Oops, you did something we didn't think of.\n");
ret = -EIO;
goto err_out_disable;
}
这里从PCI配置空间读出来的值有异常,我们就认为它是一个Oops:打印一条错误信息,探测失败。
分配内存失败也算一种Oops,只不过不需要打出错误信息。td = kmalloc (sizeof (struct FS_BPENTRY), GFP_ATOMIC);
fs_dprintk (FS_DEBUG_ALLOC, "Alloc transd: %p(%zd)\n", td, sizeof (struct FS_BPENTRY));
if (!td) {
/* Oops out of mem */
return -ENOMEM;
}
在体系架构方面的Oops,比如arm64的bug Oops:static int bug_handler(struct pt_regs *regs, unsigned int esr)
{
switch (report_bug(regs->pc, regs)) {
case BUG_TRAP_TYPE_BUG:
die("Oops - BUG", regs, 0);
break;
如果report_bug()返回的是BUG_TRAP_TYPE_BUG,那就报个Oops log。
再比如非法访问也会走die("Oops", ...):static void die_kernel_fault(const char *msg, unsigned long addr,
unsigned int esr, struct pt_regs *regs)
{
bust_spinlocks(1);
pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
addr);
mem_abort_decode(esr);
show_pte(addr);
die("Oops", regs, esr); //tj
bust_spinlocks(0);
do_exit(SIGKILL);
}
看下die():void die(const char *str, struct pt_regs *regs, int err)
{
int ret;
unsigned long flags;
raw_spin_lock_irqsave(&die_lock, flags);
oops_enter();
console_verbose();
bust_spinlocks(1);
ret = __die(str, err, regs); //tj
if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
oops_exit();
if (in_interrupt())
panic("Fatal exception in interrupt"); //tj
if (panic_on_oops)
panic("Fatal exception"); //tj
raw_spin_unlock_irqrestore(&die_lock, flags);
if (ret != NOTIFY_STOP)
do_exit(SIGSEGV);
}
再看下__die():static int __die(const char *str, int err, struct pt_regs *regs)
{
static int die_counter;
int ret;
pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n",
str, err, ++die_counter);
/* trap and error numbers are mostly meaningless on ARM */
ret = notify_die(DIE_OOPS, str, regs, err, 0, SIGSEGV);
if (ret == NOTIFY_STOP)
return ret;
print_modules();
show_regs(regs);
dump_kernel_instr(KERN_EMERG, regs);
return ret;
}
打印类似如下log:35.449887: <6> Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
35.449893: <6> Modules linked in:
35.449901: <6> Process init (pid: 1, stack limit = 0x00000000826895f7)
后面会call panic(),不过是有条件的:if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
如果这个Oops发生在中断上下文里,会走panic();如果不在中断里,但panic_on_oops被置位,也会走panic()。
可见,Oops不一定会导致panic。也就是说,bug_handler()对BUG_TRAP_TYPE_BUG也不是默认就panic?
btw: arm64的Oops是怎么触发的呢?稍后看。
Kernel panic
kernel panic就是不可恢复的错误了,怎么处理?要么复位(重启),要么就让系统停在这里。/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups.
*
* This function never returns.
*/
void panic(const char *fmt, ...)
{
...
pr_emerg("Kernel panic - not syncing: %s\n", buf);
...
if (panic_timeout > 0) { //tj: 延迟重启
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
pr_emerg("Rebooting in %d seconds..\n", panic_timeout); //tj
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
if (i >= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER