// 1.debug选项LOCKUP_DETECTOR,开启/关闭kernel中的soft lockup和hard lockup探测
// 2.实现:kernel/watchdog.c
// 3.实现原理:
// 1.涉及到了3部分内容:kernel线程,时钟中断,NMI中断
// 优先级:kernel线程 < 时钟中断 < NMI中断。
// 2.利用它们之间优先级的区别,调试系统运行中的两种问题:
// 抢占被长时间关闭而导致进程无法调度(soft lockup)
// 中断被长时间关闭而导致更严重的问题(hard lockup)
// 参考:http://blog.csdn.net/panzhenjie/article/details/10074551
// 内核版本 3.8.6
// Descriptor of the per-CPU watchdog threads; lockup_detector_init() hands it
// to smpboot_register_percpu_thread().
//   - thread_should_run / thread_fn: run condition and thread body
//   - setup and unpark both use watchdog_enable; park uses watchdog_disable
1.1 static struct smp_hotplug_thread watchdog_threads = {
	.store			= &softlockup_watchdog,
	.thread_comm		= "watchdog/%u",
	.thread_should_run	= watchdog_should_run,
	.thread_fn		= watchdog,
	.setup			= watchdog_enable,
	.unpark			= watchdog_enable,
	.park			= watchdog_disable,
};
// Initialise the lockup detector.
// Tasks:
//   1. compute the period of the watchdog hrtimer (set_sample_period)
//   2. register the per-CPU watchdog threads described by watchdog_threads
// On registration failure an error is logged and watchdog_disabled is set
// to -ENODEV.
1.2 void __init lockup_detector_init(void)
{
	// sample_period is later passed to hrtimer_start() by watchdog_enable().
	set_sample_period();

	// A zero return means the threads were registered; nothing more to do.
	if (!smpboot_register_percpu_thread(&watchdog_threads))
		return;

	pr_err("Failed to create watchdog threads, disabled\n");
	watchdog_disabled = -ENODEV;
}
// Compute sample_period, the interval at which the watchdog hrtimer fires.
// Call path: lockup_detector_init -> set_sample_period
// Notes:
//   1. sample_period is a period (interval between timer runs), not a rate.
//   2. It equals the soft-lockup threshold times NSEC_PER_SEC / 5, so the
//      timer fires five times within one soft-lockup threshold.
1.3 static void set_sample_period(void)
{
	// One fifth of a second expressed in nanoseconds.
	const u64 fifth_of_sec_ns = (u64)NSEC_PER_SEC / 5;

	sample_period = get_softlockup_thresh() * fifth_of_sec_ns;
}
// Time threshold after which a soft lockup is declared.
// Note: if the watchdog kthread has not updated its timestamp within
// watchdog_thresh * 2, is_softlockup() reports a soft lockup.
1.4 static int get_softlockup_thresh(void)
{
	return 2 * watchdog_thresh;
}
// Start lockup detection on the given CPU (used as both the .setup and
// .unpark callback of watchdog_threads).
// Tasks:
//   1. initialise the watchdog hrtimer and set its callback
//   2. if the watchdog is not enabled, park the current thread and stop here
//   3. enable the NMI-based hard-lockup detection for this CPU
//   4. start the watchdog hrtimer
//   5. switch the watchdog kthread to SCHED_FIFO
//   6. refresh the watchdog timestamp
// Note: running the kthread as SCHED_FIFO at MAX_RT_PRIO - 1 is what lets it
// get onto the CPU promptly once the watchdog timer has woken it.
2.1 static void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
// Set up the hrtimer used for lockup detection; done before the enabled
// check below, so the timer is initialised even when the thread parks.
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
// Watchdog not enabled: park the current thread instead of arming anything.
if (!watchdog_enabled) {
kthread_park(current);
return;
}
// Hard-lockup detection mechanism.
watchdog_nmi_enable(cpu);
// First expiry is one sample_period from now (relative, pinned mode).
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
// Give the watchdog thread the FIFO policy.
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
// Take an initial timestamp so detection starts from "now".
__touch_watchdog();
}
// Stop lockup detection on the given CPU (the .park callback of
// watchdog_threads).
// Tasks, in this order:
//   1. put the watchdog thread back to SCHED_NORMAL, priority 0
//   2. cancel the watchdog hrtimer
//   3. disable the NMI-based hard-lockup detection for this CPU
2.2 static void watchdog_disable(unsigned int cpu)
{
	struct hrtimer *wd_timer = &__raw_get_cpu_var(watchdog_hrtimer);

	watchdog_set_prio(SCHED_NORMAL, 0);
	hrtimer_cancel(wd_timer);
	watchdog_nmi_disable(cpu);
}
// Run condition of the watchdog thread (.thread_should_run callback).
// Returns 1 when this CPU's hrtimer_interrupts differs from
// soft_lockup_hrtimer_cnt, 0 when they are equal.
// Notes:
//   - watchdog() copies hrtimer_interrupts into soft_lockup_hrtimer_cnt each
//     time it runs, so the thread has work only after the timer has advanced
//     the counter again; it therefore never runs more often than the timer.
2.3 static int watchdog_should_run(unsigned int cpu)
{
	if (__this_cpu_read(hrtimer_interrupts) ==
	    __this_cpu_read(soft_lockup_hrtimer_cnt))
		return 0;

	return 1;
}
// Body of the watchdog thread (.thread_fn callback).
// Tasks:
//   1. set soft_lockup_hrtimer_cnt to the current hrtimer_interrupts, which
//      makes watchdog_should_run() return 0 until the counter changes again
//   2. refresh the watchdog timestamp
// The cpu argument is not used in the body.
2.4 static void watchdog(unsigned int cpu)
{
// Record which timer tick this run corresponds to.
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
// Prove the thread got scheduled by updating its timestamp.
__touch_watchdog();
}
// Refresh this CPU's watchdog timestamp: store get_timestamp() for the
// current processor into the per-CPU watchdog_touch_ts.
2.5 static void __touch_watchdog(void)
{
	__this_cpu_write(watchdog_touch_ts,
			 get_timestamp(smp_processor_id()));
}
// Watchdog hrtimer callback (installed by watchdog_enable).
// Main tasks:
//   1. read the timestamp of the watchdog thread's last run
//   2. bump the timer-run counter via watchdog_interrupt_count()
//   3. wake the watchdog kthread
//   4. re-arm the timer for the next sample_period
//   5. check the timestamp for a soft lockup; if one is found, print the
//      report, dump registers or stack, and panic when softlockup_panic is set
// Always returns HRTIMER_RESTART.
// Notes:
//   - Waking the kthread on every timer run keeps the kthread running at the
//     same rate as the timer.
//   - The "..." line below marks code omitted from this listing.
3.1 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
// Timestamp written by the watchdog thread the last time it ran; sampled
// before the thread is woken below.
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
// Count this timer run before waking the kthread (per the original note this
// increments hrtimer_interrupts), so watchdog_should_run() sees new work and
// the kthread updates its timestamp.
watchdog_interrupt_count();
// Wake this CPU's watchdog kthread.
wake_up_process(__this_cpu_read(softlockup_watchdog));
// Push the expiry forward by one sample_period for the next run.
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
...
// Non-zero duration means the thread's timestamp is older than the threshold.
duration = is_softlockup(touch_ts);
if (unlikely(duration)) {
printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
print_modules();
print_irqtrace_events(current);
// Dump the interrupted registers when available, otherwise the current stack.
if (regs)
show_regs(regs);
else
dump_stack();
if (softlockup_panic)
panic("softlockup: hung tasks");
}
return HRTIMER_RESTART;
}
// Check how long the watchdog thread has gone without updating its timestamp.
// The watchdog kthread is woken from the watchdog timer's interrupt context
// and is expected to preempt the current task when the interrupt returns. If
// preemption stays disabled, the kthread cannot run and cannot refresh its
// timestamp; once that lasts longer than the threshold, a soft lockup is
// declared.
// Returns the elapsed time (now - touch_ts) when the threshold has been
// exceeded, otherwise 0.
// Note: the threshold is watchdog_thresh * 2 (20 s according to the original
// note; the value of watchdog_thresh is not visible in this listing).
3.2 static int is_softlockup(unsigned long touch_ts)
{
	unsigned long cur_ts = get_timestamp(smp_processor_id());
	unsigned long deadline = touch_ts + get_softlockup_thresh();

	// Still within the allowed window: no lockup.
	if (!time_after(cur_ts, deadline))
		return 0;

	return cur_ts - touch_ts;
}
时间子系统16_soft lockup机制
最新推荐文章于 2024-03-26 08:00:00 发布