hung-task驱动加载为
subsys_initcall(hung_task_init);
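能看出subsys_initcall的定义主要有两个地方,一个是init.h,另一个是module.h
两处的区别在于:init.h中的定义用于代码编译进内核(内置)的情况,module.h中的定义用于代码编译为内核模块的情况
内置时subsys_initcall对应的initcall优先级是4;编译为内核模块时subsys_initcall等同于module_init,在模块加载时执行(module_init在内置时对应的优先级为6)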
/*
 * Init-time setup for the hung task detector.
 *
 * Registers panic_block on panic_notifier_list, registers
 * hungtask_pm_notify as a power-management notifier, and starts the
 * "khungtaskd" kernel thread running watchdog(). Always returns 0.
 */
static int __init hung_task_init(void)
{
/* Get notified when the kernel panics. */
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
/* Disable hung task detector on suspend */
pm_notifier(hungtask_pm_notify, 0);
/* NOTE(review): the kthread_run() result is stored but never checked for failure here. */
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
return 0;
}
hung_task_init函数初始化中主要做三件事:
- 注册panic回调函数,将panic_block回调函数加入panic_notifier_list
- 接收电源管理函数通知,当有电源管理事件进入之后,调用hungtask_pm_notify函数进行处理
- 开辟单独的线程watchdog进行系统状态监测
在hung_task注册的panic回调函数中,若系统有panic发生,则置位变量did_panic
/*
 * Panic notifier callback for the hung task detector.
 *
 * Sets did_panic, which check_hung_uninterruptible_tasks() tests before
 * scanning so that no hung-task reports are produced after a panic.
 * None of the arguments are used. Always returns NOTIFY_DONE.
 */
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
did_panic = 1;
return NOTIFY_DONE;
}
/* True while a suspend/hibernation/restore is in progress; watchdog() skips its scan then. */
static bool hung_detector_suspended;

/*
 * Power-management notifier for the hung task detector.
 *
 * Sets hung_detector_suspended when a suspend, hibernation or restore is
 * about to begin (the three *_PREPARE actions) and clears it again on the
 * matching PM_POST_* action. Any other action leaves the flag untouched.
 * @self and @hcpu are unused. Always returns NOTIFY_OK.
 */
static int hungtask_pm_notify(struct notifier_block *self,
			      unsigned long action, void *hcpu)
{
	if (action == PM_SUSPEND_PREPARE ||
	    action == PM_HIBERNATION_PREPARE ||
	    action == PM_RESTORE_PREPARE) {
		/* A power transition is starting: pause detection. */
		hung_detector_suspended = true;
	} else if (action == PM_POST_SUSPEND ||
		   action == PM_POST_HIBERNATION ||
		   action == PM_POST_RESTORE) {
		/* The power transition is over: resume detection. */
		hung_detector_suspended = false;
	}

	return NOTIFY_OK;
}
hung_task接收电源管理事件通知,其主要函数是hungtask_pm_notify,其中主要处理六类事件
#define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */
#define PM_POST_HIBERNATION 0x0002 /* Hibernation finished */
#define PM_SUSPEND_PREPARE 0x0003 /* Going to suspend the system */
#define PM_POST_SUSPEND 0x0004 /* Suspend finished */
#define PM_RESTORE_PREPARE 0x0005 /* Going to restore a saved image */
#define PM_POST_RESTORE 0x0006 /* Restore failed */
HIBERNATION休眠,系统运行状态存储到硬盘,需要完全切断电源
SUSPEND 挂起,系统运行状态存储到内存,不能完全切断电源
当系统将要进行休眠、挂起或者恢复动作时,会将标志位hung_detector_suspended置为true(此时watchdog线程不再进行hung task检测)
当系统完成休眠、挂起或者恢复动作时,会将标志位hung_detector_suspended置为false(恢复hung task检测)
/*
 * kthread which checks for tasks stuck in D state
 *
 * Loops forever. Each pass works out how many jiffies remain until the
 * next scan is due; if none remain it runs
 * check_hung_uninterruptible_tasks() (unless a reset was requested or the
 * detector is suspended) and restarts the interval, otherwise it sleeps
 * for the remaining time. @dummy is unused.
 */
static int watchdog(void *dummy)
{
/* jiffies value at which the last check interval started. */
unsigned long hung_last_checked = jiffies;
/* Run this thread at nice level 0. */
set_user_nice(current, 0);
for ( ; ; ) {
/* Re-read both sysctls on every pass so changes take effect. */
unsigned long timeout = sysctl_hung_task_timeout_secs;
unsigned long interval = sysctl_hung_task_check_interval_secs;
long t;
/* An interval of 0 means: use the hang timeout as the check interval. */
if (interval == 0)
interval = timeout;
/* Never let the check interval exceed the hang timeout. */
interval = min_t(unsigned long, interval, timeout);
/* Jiffies left until the next check; <= 0 means it is due now. */
t = hung_timeout_jiffies(hung_last_checked, interval);
if (t <= 0) {
/*
 * atomic_xchg() reads reset_hung_task and clears it in one step.
 * The scan is skipped if a reset was pending or the detector is
 * suspended.
 */
if (!atomic_xchg(&reset_hung_task, 0) &&
!hung_detector_suspended)
check_hung_uninterruptible_tasks(timeout);
/* Restart the interval from now, whether or not a scan ran. */
hung_last_checked = jiffies;
continue;
}
/* Not due yet: sleep for up to t jiffies, then re-evaluate. */
schedule_timeout_interruptible(t);
}
/* Not reached: the loop above has no exit. */
return 0;
}
在LINUX的时钟中断中涉及到两个全局变量,一个是xtime,它是timeval数据结构变量,另一个则是jiffies
xtime是从cmos电路中取得的时间,一般是从某一历史时刻开始到现在的时间,也就是为了取得我们操作系统上显示的日期。这个就是所谓的“实时时钟”,它的精确度是微秒。
jiffies记录着从电脑开机到现在总共发生的时钟中断次数。在linux内核中jiffies远比xtime重要,它的增长速度取决于系统的时钟频率(HZ,即每秒产生的时钟中断次数),频率的单位是Hz。这里不得不说一下频率的单位:1MHz=1,000,000Hz(6个零),1KHz=1,000Hz(3个零)
set_user_nice(current, 0);即设当前进程nice值为0,也就是普通进程优先级
unsigned long timeout = sysctl_hung_task_timeout_secs;
unsigned long interval = sysctl_hung_task_check_interval_secs;
root@spc:/proc# sysctl -a | grep hung
kernel.hung_task_check_count = 4194304 //khungtaskd一次检测的最大线程数
kernel.hung_task_check_interval_secs = 0
kernel.hung_task_panic = 0 //是否将hung task检测结果转为panic
kernel.hung_task_timeout_secs = 120 //khungtaskd两次检测的最大timeout时间
kernel.hung_task_warnings = 10 //hung task警告信息的发送次数。
t = hung_timeout_jiffies(hung_last_checked, interval);
if (t <= 0) {
if (!atomic_xchg(&reset_hung_task, 0) &&
!hung_detector_suspended)
check_hung_uninterruptible_tasks(timeout);
hung_last_checked = jiffies;
continue;
}
schedule_timeout_interruptible(t);
这段hung_task定期检查的代码,在linux kernel源码git log中,作者有写出这样做的原因
在hung_task的函数hung_timeout_jiffies函数中
/*
 * Number of jiffies left until the next check is due.
 *
 * @last_checked: jiffies value recorded when the current interval started
 * @timeout:      length of the interval in seconds; 0 disables the watchdog
 *
 * Returns last_checked - jiffies + timeout * HZ, which is <= 0 once the
 * interval has elapsed, or MAX_SCHEDULE_TIMEOUT when @timeout is 0.
 */
static long hung_timeout_jiffies(unsigned long last_checked,
				 unsigned long timeout)
{
	/* timeout of 0 will disable the watchdog */
	if (!timeout)
		return MAX_SCHEDULE_TIMEOUT;

	return last_checked - jiffies + timeout * HZ;
}
从hung_timeout_jiffies函数中可以看出,若timeout不为0,返回值应该是
last_checked - jiffies + timeout * HZ
若timeout值为0,返回值则是MAX_SCHEDULE_TIMEOUT
从传入的timeout参数interval来看,正常情况下,其值应该为非0值,所以正常情况下
hung_timeout_jiffies函数返回值应该为last_checked - jiffies + timeout * HZ
在watchdog的thread中,只有当函数hung_timeout_jiffies函数的返回值小于等于0,才会进入
check_hung_uninterruptible_tasks(timeout)去遍历系统中的进程是否处于D状态
从上文可知其返回值是last_checked - jiffies + timeout * HZ。当t>0时,last_checked的值
并不会更新,因此这里可以理解为:只有在上一次对last_checked赋值之后,jiffies又增长了至少timeout * HZ,
算式last_checked - jiffies + timeout * HZ的值才会小于等于0。由于hung_task_check_interval_secs默认为0,
interval会取sysctl_hung_task_timeout_secs的值(默认120),所以timeout * HZ这里对应的计时是120S,
也就是每隔120多S系统会检测一次thread的状态
接下来继续分析系统检测thread状态的函数
check_hung_uninterruptible_tasks
/*
 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
 * a really long time (120 seconds). If that happens, print out
 * a warning.
 *
 * @timeout: hang threshold in seconds, passed on to check_hung_task().
 *
 * Walks every thread of every process under rcu_read_lock(), examining at
 * most sysctl_hung_task_check_count threads. The lock dump, the all-CPU
 * backtrace and the optional panic all happen after rcu_read_unlock().
 */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
/* Upper bound on the number of threads examined in this scan. */
int max_count = sysctl_hung_task_check_count;
/* jiffies value of the last time the RCU read-side section was left. */
unsigned long last_break = jiffies;
struct task_struct *g, *t;
/*
 * If the system crashed already then all bets are off,
 * do not report extra hung tasks:
 */
if (test_taint(TAINT_DIE) || did_panic)
return;
/* Reset per scan; check_hung_task() sets it when it reports a task. */
hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
/* Stop once the per-scan budget is used up. */
if (!max_count--)
goto unlock;
/*
 * Once more than HUNG_TASK_LOCK_BREAK jiffies have passed, briefly
 * leave the RCU read-side section. Abort the scan if g or t is no
 * longer alive afterwards.
 */
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
if (!rcu_lock_break(g, t))
goto unlock;
last_break = jiffies;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
}
unlock:
rcu_read_unlock();
/* Dump all held locks if check_hung_task() asked for it. */
if (hung_task_show_lock)
debug_show_all_locks();
/* One-shot: clear the request before dumping every CPU's backtrace. */
if (hung_task_show_all_bt) {
hung_task_show_all_bt = false;
trigger_all_cpu_backtrace();
}
/* Set by check_hung_task() when sysctl_hung_task_panic is non-zero. */
if (hung_task_call_panic)
panic("hung_task: blocked tasks");
}
首先从代码中可以看出
if (test_taint(TAINT_DIE) || did_panic)
return;
如果系统处于Crash状态,系统将不会再进行hung_task检测
for_each_process_thread(g, t) {
if (!max_count--)
goto unlock;
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
if (!rcu_lock_break(g, t))
goto unlock;
last_break = jiffies;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
}
这里需要注意两个函数,for_each_process_thread和for_each_process,这两个函数的区别在于
for_each_process_thread会遍历系统中的所有进程,并且遍历进程下的所有线程
for_each_process会遍历系统中的所有进程
系统中的所有进程都挂在init_task的链表中,因此通过遍历init_task链表即可找到系统中所有进程
而进程中的所有线程,则都挂在进程的signal链表中,因此遍历进程的signal链表,即可找到这个进程对应
的所有线程
继续向下分析代码
if (!max_count--)
goto unlock;
这里可以看到max_count自减为0的时候,会跳转到unlock标签处,暂且不分析unlock标签之后执行的内容,
这里max_count是系统最大pid数,其定义可见
int max_count = sysctl_hung_task_check_count;
int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
这里PID_MAX_LIMIT通常是系统最大pid数
if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
if (!rcu_lock_break(g, t))
goto unlock;
last_break = jiffies;
}
这一段代码的作用,作者在上传code的时候同样写上了原因
/*check_hung_uninterruptible_tasks() is currently calling rcu_lock_break()
for every 1024 threads. But check_hung_task() is very slow if printk()
was called, and is very fast otherwise.
If many threads within some 1024 threads called printk(), the RCU grace
period might be extended enough to trigger RCU stall warnings.
Therefore, calling rcu_lock_break() for every some fixed jiffies will be
safer.*/
这块代码的作用是因为在进行之后的函数
check_hung_task
的时候,假如有许多thread需要调用printk打印消息,那么这个过程就会变得很慢,因为在for_each_process_thread开始之前,这段函数已经拿住了
rcu锁,若大量thread调用printk,而久久不放开rcu锁,便会触发rcu锁报警,因此,这里会用time_after定期执行rcu_lock_break去释放rcu,若释放rcu锁
再拿住,就不会触发报警,并且如果rcu_lock_break调度回来发现进程已经死掉,便会直接退出遍历,走到unlock
具体可见代码逻辑
/*
 * To avoid extending the RCU grace period for an unbounded amount of time,
 * periodically exit the critical section and enter a new one.
 *
 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
 * to exit the grace period. For classic RCU, a reschedule is required.
 *
 * @g, @t: the process and thread the caller's iteration currently points at.
 *
 * Must be called with rcu_read_lock() held; it is held again on return.
 * Returns true if both @g and @t are still alive, so the caller may keep
 * iterating from them, and false otherwise.
 */
static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
bool can_cont;
/* Take a reference on both tasks before leaving the read-side section. */
get_task_struct(g);
get_task_struct(t);
rcu_read_unlock();
/* Give the scheduler a chance to run something else. */
cond_resched();
rcu_read_lock();
/* The tasks may have exited while the lock was dropped. */
can_cont = pid_alive(g) && pid_alive(t);
/* Drop the references taken above. */
put_task_struct(t);
put_task_struct(g);
return can_cont;
}
接下来将进入本feature最核心的内容check_hung_task
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
从代码可见,如果thread状态为TASK_UNINTERRUPTIBLE状态,则会进行hung_task检测
/*
 * Decide whether task @t has been blocked for longer than @timeout seconds
 * and, if so, report it.
 *
 * @t:       task to examine (the caller only passes TASK_UNINTERRUPTIBLE tasks)
 * @timeout: hang threshold in seconds
 *
 * A task counts as hung when its context-switch count has not changed
 * since the previous check and t->last_switch_time is at least
 * @timeout * HZ jiffies in the past. On a hang the function fires the
 * sched_process_hang tracepoint, optionally arms the panic flags, and
 * prints a warning while sysctl_hung_task_warnings is non-zero. The actual
 * lock dump, backtraces and panic are left to the caller through the
 * hung_task_show_lock, hung_task_show_all_bt and hung_task_call_panic flags.
 */
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
/* Total context switches so far: voluntary plus involuntary. */
unsigned long switch_count = t->nvcsw + t->nivcsw;
/*
 * Ensure the task is not frozen.
 * Also, skip vfork and any other user process that freezer should skip.
 */
if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
return;
/*
 * When a freshly created task is scheduled once, changes its state to
 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
 * musn't be checked.
 */
if (unlikely(!switch_count))
return;
if (switch_count != t->last_switch_count) { // the thread has been scheduled since the last check
t->last_switch_count = switch_count; // record the new switch count in task_struct
t->last_switch_time = jiffies; // record the time at which the change was observed
return;
}
/* Unchanged switch count, but the timeout has not expired yet. */
if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
return;
// Reaching this point means this D-state thread has not been scheduled for longer than the timeout (120s by default), so the handling steps below run
/*Currently check_hung_task() prints a warning if it detects the
problem, but it is not convenient to watch the system logs if
user-space wants to be notified about the hang.
Add the new trace_sched_process_hang() into check_hung_task(),
this way a user-space monitor can easily wait for the hang and
potentially resolve a problem.*/
trace_sched_process_hang(t);
if (sysctl_hung_task_panic) {
// when sysctl_hung_task_panic is non-zero, set hung_task_show_lock and hung_task_call_panic
console_verbose();
hung_task_show_lock = true;
hung_task_call_panic = true;
}
/*
 * Ok, the task did not get scheduled for more than 2 minutes,
 * complain:
 */
if (sysctl_hung_task_warnings) { // non-zero sysctl_hung_task_warnings: print the warning
/* Only a positive budget is decremented; a negative value is left unchanged. */
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
pr_err(" %s %s %.*s\n",
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
hung_task_show_lock = true;
/* Ask the caller to dump every CPU's backtrace as well. */
if (sysctl_hung_task_all_cpu_backtrace)
hung_task_show_all_bt = true;
}
/* Touch the nmi_watchdog counters so the NMI watchdog is not triggered during this reporting */
touch_nmi_watchdog();
}
最后,我们分析一下,这个feature的handling部分
unlock:
rcu_read_unlock();
if (hung_task_show_lock)
debug_show_all_locks();
if (hung_task_show_all_bt) {
hung_task_show_all_bt = false;
trigger_all_cpu_backtrace();
}
if (hung_task_call_panic)
panic("hung_task: blocked tasks");
可以看到,函数首先会释放rcu锁
若在之前的check_hung_task函数中,检测到有D状态thread 120S未被调度,并且置位相关标志,则会
调用debug_show_all_locks打印系统中所有lock信息
调用trigger_all_cpu_backtrace,把所有cpu上程序的backtrace打印出来
如果hung_task_call_panic被置位(即sysctl_hung_task_panic非0且检测到了hung task),则会调用panic,进入系统错误处理流程