转载自 https://blog.csdn.net/hzj_001/article/details/100054659
主体涉及到了3个机制:kernel watchdog线程,高精度定时器(时钟中断),基于PMU硬件perf event的NMI(不可屏蔽中断)。
基本思想:
1.)(soft lockup):抢占被长时间关闭而导致其余进程无法调度
2.)(hard lockup):中断被长时间关闭而导致中断(包括时钟中断)无法得到响应
softlockup基本原理:
1)SoftLockup 检测首先需要对每一个CPU core注册叫做watchdog的kernel线程。即[watchdog/0],[watchdog/1],[watchdog/2]…
2)同时,系统会有一个高精度的计时器hrtimer,该计时器能定期产生时钟中断,该中断对应的中断回调函数是watchdog_timer_fn();此中断回调函数主要做3件事:
a.watchdog_interrupt_count函数更新hrtimer_interrupts变量(判断hardlockup会用)
b.wake_up_process唤醒watchdog线程(更新时间戳)
c.is_softlockup判断是否出现了soft_lockup
soft lock detector会检查时间戳,如果超过soft lockup threshold一直未更新,说明[watchdog/x]未得到运行机会,意味着CPU被霸占,也就是发生了soft lockup。
注意,这里面的内核线程[watchdog/x]的目的是更新时间戳,该时间戳是被watch的对象。而真正的看门狗,则是由时钟中断触发的 watchdog_timer_fn(),这里面 [watchdog/x]是被scheduler调用执行的,而watchdog_timer_fn()则是被中断触发的。
hardlockup基本原理:(也可参照另一篇https://blog.csdn.net/hzj_001/article/details/95059760)
1)注册一个基于PMU硬件的perf event,每经过watchdog_thresh(/proc/sys/kernel/watchdog_thresh)秒的时间会触发一次NMI中断
2)NMI中断处理函数通过检测hrtimer_interrupts的值相对于上一次NMI中断时保存的值(hrtimer_interrupts_saved)是否发生变化,来判断是否发生hardlockup:两次NMI中断之间该值没有变化,就说明时钟中断一直没有被响应
3)保存中断计数hrtimer_interrupts_saved=hrtimer_interrupts
1.watchdog线程
系统会为每个cpu 注册一个一般的kernel线程,名字叫watchdog/0, watchdog/1...以此类推。
主要作用:将当前cpu时间戳,更新至watchdog_touch_ts
2.watchdog_enable会注册一个高精度定时器,通过时钟中断响应函数来实现一些看门狗功能
1)注册一个高精度时钟定时器
/*
 * watchdog_enable - start the lockup detectors for one CPU.
 * @cpu: CPU number, forwarded to watchdog_nmi_enable().
 *
 * Sets up this CPU's watchdog hrtimer with watchdog_timer_fn() as its
 * callback, enables the NMI perf event, starts the timer with a period of
 * sample_period, raises the calling thread's priority and records an
 * initial timestamp.
 */
static void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
/* kick off the timer for the hardlockup detector */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
/* Enable the perf event */
watchdog_nmi_enable(cpu);
/* done here because hrtimer_start can only pin to smp_processor_id() */
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
/* initialize timestamp */
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
__touch_watchdog();
}
2)响应时钟中断,通过响应高精度时钟中断处理函数hrtimer_interrupt来执行中断回调函数watchdog_timer_fn。
/* watchdog kicker functions */
/*
 * watchdog_timer_fn - hrtimer callback that drives both lockup detectors.
 * @hrtimer: the per-CPU watchdog timer that fired.
 *
 * On every tick it bumps the interrupt count used by the hard lockup check,
 * wakes this CPU's watchdog thread, re-arms the timer, and then checks
 * whether the watchdog timestamp is stale (soft lockup). On a soft lockup it
 * prints diagnostics once per stuck task and optionally panics.
 *
 * Returns HRTIMER_NORESTART while watchdog parking is in progress,
 * HRTIMER_RESTART otherwise.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);// this CPU's watchdog_touch_ts; it is refreshed whenever the watchdog kernel thread gets to run
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
if (atomic_read(&watchdog_park_in_progress) != 0)
return HRTIMER_NORESTART;
/* kick the hardlockup detector */
watchdog_interrupt_count();// update the timer interrupt count
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));// wake the watchdog thread
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));// re-arm the timer for the next period
if (touch_ts == 0) {/* on the first run the watchdog_touch_ts timestamp may be zero, so it is refreshed to the current timestamp */
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
* make sure the scheduler tick is up to date.
*/
__this_cpu_write(softlockup_touch_sync, false);
sched_clock_tick();
}
/* Clear the guest paused flag on watchdog reset */
kvm_check_and_clear_guest_paused();
__touch_watchdog();// update watchdog_touch_ts
return HRTIMER_RESTART;
}
/* check for a softlockup
* This is done by making sure a high priority task is
* being scheduled. The task touches the watchdog to
* indicate it is getting cpu time. If it hasn't then
* this is a good indication some task is hogging the cpu
*/
/* Per the accompanying notes the threshold is 20s by default (2 * watchdog_thresh). */
duration = is_softlockup(touch_ts);// non-zero when touch_ts has not been updated for longer than the soft lockup threshold
if (unlikely(duration)) {
/*
* If a virtual machine is stopped by the host it can look to
* the watchdog like a soft lockup, check to see if the host
* stopped the vm before we issue the warning
*/
if (kvm_check_and_clear_guest_paused())
return HRTIMER_RESTART;
/* only warn once */
if (__this_cpu_read(soft_watchdog_warn) == true) {
/*
* soft_watchdog_warn is set once a watchdog timeout has already been
* reported. The intent is to warn only once for the same stuck task;
* if the stuck task has changed, the flag is reset to false so that
* a new warning can be raised.
*/
/*
* When multiple processes are causing softlockups the
* softlockup detector only warns on the first one
* because the code relies on a full quiet cycle to
* re-arm. The second process prevents the quiet cycle
* and never gets reported. Use task pointers to detect
* this.
*/
if (__this_cpu_read(softlockup_task_ptr_saved) !=
current) {
__this_cpu_write(soft_watchdog_warn, false);
__touch_watchdog();
}
return HRTIMER_RESTART;
}
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
* engaged in dumping cpu back traces
*/
if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
/* Someone else will report us. Let's give up */
__this_cpu_write(soft_watchdog_warn, true);
return HRTIMER_RESTART;
}
}
/* The checks above decide whether this is a soft lockup worth reporting; the code below dumps the diagnostic information. */
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
__this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)
show_regs(regs);
else
dump_stack();
if (softlockup_all_cpu_backtrace) {
/* Avoid generating two back traces for current
* given that one is already made above
*/
trigger_allbutself_cpu_backtrace();
clear_bit(0, &soft_lockup_nmi_warn);
/* Barrier to sync with other cpus */
smp_mb__after_atomic();
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)// panic if softlockup_panic is set (configurable through proc)
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true); // a soft lockup has now been reported once
} else
__this_cpu_write(soft_watchdog_warn, false);
return HRTIMER_RESTART;
}
3.watchdog_enable中会通过watchdog_nmi_enable注册一个基于PMU硬件的perf event,通过NMI中断回调函数watchdog_overflow_callback检测hardlockup
这个硬件在x86里叫performance monitoring,这个硬件有一个功能就是在cpu clock经过了多少个周期后发出一个NMI中断出来。
1)注册perf事件
/*
 * watchdog_nmi_enable - register the perf event used for hard lockup detection.
 * @cpu: CPU the counter is created for.
 *
 * NOTE(review): this is an abridged excerpt. The declaration of `event`,
 * the handling of a failed registration and the return statement are not
 * shown here; consult the full kernel source before reusing this code.
 */
static int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
wd_attr = &wd_hw_attr;
/* the event period is derived from watchdog_thresh by hw_nmi_get_sample_period() */
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
/* watchdog_overflow_callback is installed as the event's overflow handler */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
}
2)设置定时时间sample_period
/*
 * hw_nmi_get_sample_period - number of CPU cycles in one watchdog period.
 * @watchdog_thresh: length of the period.
 *
 * cpu_khz is scaled by 1000 (kHz to Hz, going by its name) and multiplied
 * by the threshold, giving the cycle count after which the perf event
 * overflows.
 */
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
	u64 cycles_per_unit = (u64)(cpu_khz) * 1000;

	return cycles_per_unit * watchdog_thresh;
}
在这里,根据当前cpu的频率,算出一个值,也就是watchdog_thresh秒(默认10秒)内cpu clock经过的周期数;换句话说,实际就是每隔watchdog_thresh秒(默认10s)会触发一次NMI中断。注意20s(2*watchdog_thresh)是softlockup的判定阈值,而不是NMI的周期。
3)响应NMI中断,执行中断回调函数watchdog_overflow_callback,检测hardlockup
/*
 * watchdog_overflow_callback - perf event overflow handler for the hard
 * lockup detector.
 * @event: the perf event that overflowed (unused here).
 * @data:  sample data (unused here).
 * @regs:  interrupted register state (unused here).
 *
 * Does nothing unless is_hardlockup() reports a lockup. In that case it
 * either panics or emits a warning, depending on hardlockup_panic.
 */
static void watchdog_overflow_callback(struct perf_event *event,
		struct perf_sample_data *data,
		struct pt_regs *regs)
{
	int this_cpu;

	/* Common case: timer interrupts are still arriving. */
	if (!is_hardlockup())
		return;

	this_cpu = smp_processor_id();
	if (hardlockup_panic)
		panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
	else
		WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
}
这个函数主要就是调用is_hardlockup
/* watchdog detector functions */
/*
 * is_hardlockup - check whether this CPU's timer interrupt count has stalled.
 *
 * Compares the current hrtimer_interrupts value with the one saved by the
 * previous call. An unchanged value means no timer interrupt was counted in
 * between, which is reported as a hard lockup. Otherwise the new value is
 * saved for the next check.
 *
 * Returns true on a suspected hard lockup, false otherwise.
 */
static bool is_hardlockup(void)
{
	unsigned long current_count = __this_cpu_read(hrtimer_interrupts);
	bool stalled = (__this_cpu_read(hrtimer_interrupts_saved) == current_count);

	if (!stalled)
		__this_cpu_write(hrtimer_interrupts_saved, current_count);

	return stalled;
}
这个函数主要就是查看hrtimer_interrupts变量在时钟中断处理函数里有没有被更新。假如没有更新,就意味着中断出了问题,可能被错误代码长时间的关中断了。
https://wenku.baidu.com/view/7e8d303f571252d380eb6294dd88d0d233d43ceb.html
个人总结下:
/* Per-CPU state used by the lockup detectors. */
/* the timer whose callback is watchdog_timer_fn() */
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
/* when set, watchdog_timer_fn() calls sched_clock_tick() before re-touching the timestamp */
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
/* true once a soft lockup has been reported, to avoid repeated warnings */
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
/* count of watchdog timer interrupts; incremented in watchdog_timer_fn() */
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
/* value of hrtimer_interrupts last seen by the watchdog thread */
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
1. 每个cpu创建线程,这个线程很奇怪:
smpboot_register_percpu_thread_cpumask -》__smpboot_create_thread-》
tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
ht->thread_comm);
所以线程主体是smpboot_thread_fn,这里while 1 会各种判断should_stop should_park should_run
/* Main loop of smpboot_thread_fn (abridged; the elided part handles should_stop/should_park). */
while(1){
。。。
if (!ht->thread_should_run(td->cpu)) {
preempt_enable_no_resched();
schedule(); // nothing to run, so give up the CPU; the thread is woken again by wake_up_process() in the watchdog timer callback
} else {
__set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu); // thread_fn is the watchdog() function in watchdog.c
}
}
这里while 1会一直判断should_run, should_run是判断两个计数是否不相等(不相等说明timer中断又发生过,watchdog线程需要运行一次):
/*
 * watchdog_should_run - tell the smpboot loop whether watchdog() has work.
 * @cpu: CPU number (unused; the per-CPU reads act on the current CPU).
 *
 * Returns non-zero when the timer interrupt count differs from the count
 * last recorded by the watchdog thread.
 */
static int watchdog_should_run(unsigned int cpu)
{
	unsigned long irq_count = __this_cpu_read(hrtimer_interrupts);
	unsigned long handled_count = __this_cpu_read(soft_lockup_hrtimer_cnt);

	return irq_count != handled_count;
}
/*
* The watchdog thread function - touches the timestamp.
* It only runs once every sample_period seconds (4 seconds by
* default) to reset the softlockup timestamp. If this gets delayed
* for more than 2*watchdog_thresh seconds then the debug-printout
* triggers in watchdog_timer_fn().
*/
static void watchdog(unsigned int cpu) {
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts)); //把中断次数赋值给soft_lockup_hrtimer_cnt
__touch_watchdog();
}
/* Commands for resetting the watchdog */
/*
 * __touch_watchdog - store the current timestamp in this CPU's
 * watchdog_touch_ts, marking the moment the watchdog was last touched.
 */
static void __touch_watchdog(void)
{
	unsigned long now = get_timestamp();

	__this_cpu_write(watchdog_touch_ts, now);
}
// Timer callback, triggered by the timer interrupt.
// NOTE(review): abridged copy of watchdog_timer_fn; the reporting logic and the
// return statement that follow the is_softlockup() check are omitted here.
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) {
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
if (atomic_read(&watchdog_park_in_progress) != 0)
return HRTIMER_NORESTART;
/* kick the hardlockup detector */
__this_cpu_inc(hrtimer_interrupts); // increment the timer interrupt count
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog)); // wake this CPU's watchdog thread (smpboot_thread_fn)
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
/* check for a softlockup
* This is done by making sure a high priority task is
* being scheduled. The task touches the watchdog to
* indicate it is getting cpu time. If it hasn't then
* this is a good indication some task is hogging the cpu
*/
duration = is_softlockup(touch_ts); // compare the current timestamp with the watchdog thread's last timestamp to tell whether the thread has not been scheduled
}
所以各cpu线程主体 smpboot_thread_fn 也就是 watchdog 函数, 多久执行一次取决于timer 里什么时候更新中断次数;watchdog线程其实只更新了下时间戳,将softlockup计数设置为中断次数。
目前timer 4秒执行一次;timer回调函数是驱动softlockup和hardlockup检测的核心:它直接判断softlockup,并更新hardlockup检测(NMI回调)所依赖的中断计数;
/* Lockup threshold in seconds; defaults to 10. */
int __read_mostly watchdog_thresh = 10;
/*
 * hrtimer period in nanoseconds: 2 * watchdog_thresh / 5 seconds, i.e. 4 s
 * with the default threshold (assuming NSEC_PER_SEC is 10^9).
 */
sample_period = watchdog_thresh * 2 * ((u64)NSEC_PER_SEC / 5);
/*
 * is_softlockup - report how stale the watchdog timestamp is.
 * @touch_ts: timestamp last stored by the watchdog thread.
 *
 * Returns 0 when the soft watchdog is disabled, when watchdog_thresh is 0,
 * or when the timestamp is no older than 2 * watchdog_thresh. Otherwise
 * returns the elapsed time since @touch_ts, in the units of
 * get_timestamp() (seconds, going by the surrounding notes).
 */
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp();
	unsigned long deadline;

	if (!(watchdog_enabled & SOFT_WATCHDOG_ENABLED) || !watchdog_thresh)
		return 0;

	/* Warn about unreasonable delays. */
	deadline = touch_ts + watchdog_thresh * 2;
	if (!time_after(now, deadline))
		return 0;

	return now - touch_ts;
}
/* watchdog detector functions */
ARM架构上没有NMI中断,所以现有的hardlockup检测代码在ARM上无法工作。可参考的修改方案思路:
用core0上的timer回调判断core1上的中断次数是否变化,core1上的timer回调判断core2上的中断次数,依此类推。