hardlockup
1.通过 /proc/sys/kernel/hardlockup_panic 控制是否使能,可以通过NMI watchdog对CPU硬件故障导致的挂死进行检测,检测到后触发linux panic
[root@c20200314JANX ~]# cat /proc/sys/kernel/hardlockup_panic
1
当前默认的检测超时时间为10s,hardlockup_panic配置为1后使能触发panic
内核代码对应 \linux-4.19.126\kernel\watchdog_hld.c中定义处理回调 watchdog_overflow_callback
/*
 * Callback function for perf event subsystem.
 *
 * Registered as the overflow handler of the per-CPU watchdog perf event.
 * Decides whether the current CPU looks hard-locked and, if so, reports it
 * once per lockup episode and optionally panics.
 *
 * @event: the perf event being serviced; its hw.interrupts count is reset
 *         so the watchdog never gets throttled.
 * @data:  sample data passed by perf; not used here.
 * @regs:  register state passed by perf; may be NULL, in which case the
 *         current stack is dumped instead.
 */
static void watchdog_overflow_callback(struct perf_event *event,
				       struct perf_sample_data *data,
				       struct pt_regs *regs)
{
	/* Ensure the watchdog never gets throttled */
	event->hw.interrupts = 0;

	/*
	 * A pending "touch" on this CPU suppresses exactly one check:
	 * consume the flag and skip the detection for this event.
	 */
	if (__this_cpu_read(watchdog_nmi_touch) == true) {
		__this_cpu_write(watchdog_nmi_touch, false);
		return;
	}

	/* Skip this event when the timestamp check rejects it. */
	if (!watchdog_check_timestamp())
		return;

	/* check for a hardlockup
	 * This is done by making sure our timer interrupt
	 * is incrementing.  The timer interrupt should have
	 * fired multiple times before we overflow'd.  If it hasn't
	 * then this is a good indication the cpu is stuck
	 */
	if (is_hardlockup()) {
		int this_cpu = smp_processor_id();

		/* only print hardlockups once */
		if (__this_cpu_read(hard_watchdog_warn) == true)
			return;

		/*
		 * Terminate the message with a newline so it is emitted as a
		 * complete line before the module list and dumps that follow.
		 */
		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu);
		print_modules();
		print_irqtrace_events(current);
		if (regs)
			show_regs(regs);
		else
			dump_stack();

		/*
		 * Perform all-CPU dump only once to avoid multiple hardlockups
		 * generating interleaving traces
		 */
		if (sysctl_hardlockup_all_cpu_backtrace &&
				!test_and_set_bit(0, &hardlockup_allcpu_dumped))
			trigger_allbutself_cpu_backtrace();

		if (hardlockup_panic)
			nmi_panic(regs, "Hard LOCKUP");

		/* Remember that this lockup episode has been reported. */
		__this_cpu_write(hard_watchdog_warn, true);
		return;
	}

	/* No lockup: re-arm reporting for the next episode on this CPU. */
	__this_cpu_write(hard_watchdog_warn, false);
	return;
}
hungtask
hungtask是linux内核提供的标准功能,用于检测进程长时间处于D状态(不可中断的深度睡眠状态,处于此状态的进程,不能通过signal来唤醒),通过hungtask可以检测:mutex、semaphore和rw_semaphore三种场景。
当业务进程mutex、semaphore和rw_semaphore锁资源使用不当时(一般是代码bug),就会导致业务任务长时间处于D状态而得不到调度,最终导致业务中断。
linux内核自带的hungtask只能检测到哪个任务长期处于D状态,并没有相关的锁资源信息,需要在 mutex、semaphore和rw_semaphore互斥对象操作过程中预埋维测手段,在触发hungtask时通过内核日志输出出来,以增强问题定位能力。
linux内核是通过khungtaskd来进行检测,该内核线程每隔一段时间调度一次,可以动态配置检测阈值时间,默认为120s。
[root@c20200314JANX ~]# cat /proc/sys/kernel/hung_task_timeout_secs
120
[root@c20200314JANX ~]# cat /proc/sys/kernel/hung_task_panic
0
当khungtaskd检测到某个进程处于D状态的时间差超过预设的检测时间时,则认为该进程发生了hungtask,此时可以获取问题进程正在获取的(locking)和已经获取的( locked)的 mutex、semaphore和rw_semaphore互斥对象信息,以及相关的调用栈信息。
这个信息可以输出到kbox中,系统panic复位恢复系统后导出kbox信息进行问题定位。
kbox可以对接到非易失存储设备中,掉电后数据不丢失;没有非易失存储设备时,panic默认触发软复位,bios初始化时不会重新初始化kbox保留内存段,确保内存段中的信息不会丢失。
softlockup
softlockup是linux内核提供的标准功能,用于检测内核长时间不发生调度的情况,为定位系统异常提供一个维测方法。对于一些由于软件原因导致CPU被挂死的场景,比如内核dead loop,spinlock长时间持有,spinlock死锁(重复加锁和交叉加锁),长时间持有rwlock,rwlock死锁,中断风暴等场景。linux下的普通进程通过调度器来分配执行时间,从而实现系统交互性。如果系统出现上述异常时,会造成其他业务进程得不到调度。
softlockup在每个CPU上启动watchdog线程,每隔一段时间就更新时间戳,然后在时钟中断处理函数中获取当前时间与watchdog最近一次被调度的时间戳比较,如果时间差超过检测时间的一半,则将watchdog唤醒,让它再次更新时间。如果时间差超过预设的检测时间,则认为在这段时间内CPU没有发生调度,softlockup会收集当前CPU的进程、寄存器和调用栈信息。可以动态打开、关闭softlockup功能,动态配置检测阈值时间,默认为60s。
softlockup检测到某个CPU出现不调度后,记录一次信息,如果下个检测点检测到异常未被处理,则不会重复记录信息,直到调度异常退出后,再次发生长时间不调度才会再次记录。
如果线程由于在等待spinlock而造成一段时间没有调度,会将此spinlock的 owner线程信息记录下来,如果是rwlock,会将写者加锁时的owner线程信息记录下来。
[root@c20200314JANX ~]# cat /proc/sys/kernel/softlockup_panic
0
[root@c20200314JANX ~]# cat /proc/sys/kernel/softlockup_thresh
60
4.19内核的watchdog_thresh
4.19内核中softlockup与hardlockup共享了一个超时时间阈值设置
cat /proc/sys/kernel/watchdog_thresh
10
hardlockup超时时间=watchdog_thresh
softlockup超时时间=2*watchdog_thresh
watchdog_thresh参数的范围是[0,60]
x86默认为20,softlockup超时时间默认为20s*2=40s;
ARM64版本watchdog_thresh的默认超时时间设置为10,即softlockup超时时间默认为10s*2=20s;
设置方式为echo X Y > /proc/sys/kernel/softlockup_thresh ,设置超时阈值为 X*1000(ms)+Y(ms)
kernel\watchdog.c
//softlockup超时设置
/*
 * sysctl handler for /proc/sys/kernel/watchdog_thresh
 *
 * Delegates the actual read/write to proc_dointvec_minmax() while holding
 * watchdog_mutex. After a successful write that changed watchdog_thresh,
 * proc_watchdog_update() is called. Returns the value returned by
 * proc_dointvec_minmax().
 */
int proc_watchdog_thresh(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int prev_thresh;
	int ret;

	mutex_lock(&watchdog_mutex);

	/* Snapshot the threshold so a change can be detected afterwards. */
	prev_thresh = READ_ONCE(watchdog_thresh);
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	/* Only a successful write can require an update. */
	if (ret == 0 && write) {
		if (READ_ONCE(watchdog_thresh) != prev_thresh)
			proc_watchdog_update();
	}

	mutex_unlock(&watchdog_mutex);

	return ret;
}
/*
 * The soft-lockup threshold is derived from the hard-lockup one rather than
 * configured separately. Hard lockups should be reported within a few
 * seconds, whereas soft lockups can produce false positives under extreme
 * conditions, so the soft threshold is kept at twice the hard threshold
 * (watchdog_thresh).
 */
static int get_softlockup_thresh(void)
{
	return 2 * watchdog_thresh;
}
static void set_sample_period(void)
{
/*
* convert watchdog_thresh from seconds to ns
* the divide by 5 is to give hrtimer several chances (two
* or three with the current relation between the soft
* and hard thresholds) to increment before the
* hardlockup detector generates a warning
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
watchdog_update_hrtimer_threshold(sample_period);
}
//hardlockup超时设置
kernel\watchdog_hld.c
static int hardlockup_detector_event_create(void)
{
unsigned int cpu = smp_processor_id();
struct perf_event_attr *wd_attr;
struct perf_event *evt;
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
watchdog_overflow_callback, NULL);
if (IS_ERR(evt)) {
pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
PTR_ERR(evt));
return PTR_ERR(evt);
}
this_cpu_write(watchdog_ev, evt);
return 0;
}