文章目录
1. 前言
限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
2. TASK_INTERRUPTIBLE 和 TASK_UNINTERRUPTIBLE
2.1 语义
以下是从文章 Process Scheduling in the Kernel 摘录的对进程状态 TASK_INTERRUPTIBLE
和 TASK_UNINTERRUPTIBLE
的说明:
TASK_INTERRUPTIBLE
identifies a process that is suspended (sleeping) until some condition becomes true.
Raising an interrupt, releasing a system resource the process is waiting for, or
delivering a signal are examples of conditions that might wake up the process,
that is, put its state back to TASK_RUNNING.
TASK_UNINTERRUPTIBLE
identifies a process that is suspended like in the TASK_INTERRUPTIBLE state, except that
in this case delivering a signal will not wake up the process. This process state is
seldom used.
简单翻译一下:
. TASK_INTERRUPTIBLE
进程进入睡眠,直到某个条件成立。产生中断、释放进程正在等待的系统资源、或投递信号,都是可能唤醒进程的条件,
唤醒即把进程状态重新置为运行态(TASK_RUNNING)。
. TASK_UNINTERRUPTIBLE
类似于 TASK_INTERRUPTIBLE,但投递信号无法唤醒进程。该进程状态很少使用。
2.2 实现
从 2.1
了解了对 TASK_INTERRUPTIBLE
和 TASK_UNINTERRUPTIBLE
的语义,本小节从代码层面看内核是如何实现
它们的。
2.2.1 TASK_INTERRUPTIBLE 实现
以 socket
通信 TCP 三次握手 过程中的 accept()
调用为例,来说明 TASK_INTERRUPTIBLE
的语义实现。
服务端调用 accept()
等待 TCP 连接的三次握手完成:
sys_accept()
sys_accept4()
sock->ops->accept() = inet_accept()
sk1->sk_prot->accept() = inet_csk_accept()
if (reqsk_queue_empty(queue)) {
/* 阻塞模式下,默认永不超时,即 timeo 为 MAX_SCHEDULE_TIMEOUT(除非通过 SO_RCVTIMEO 设置了接收超时);非阻塞模式下 timeo 为 0 */
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
...
/* 等待 TCP 连接的三次握手完成 */
error = inet_csk_wait_for_connect(sk, timeo);
...
}
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
struct inet_connection_sock *icsk = inet_csk(sk);
DEFINE_WAIT(wait);
int err;
for (;;) {
/* 将进程添加到等待队列 sk_sleep(sk) */
prepare_to_wait_exclusive(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
release_sock(sk);
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
timeo = schedule_timeout(timeo); /* (1) 主动调度:进入 TASK_INTERRUPTIBLE 睡眠等待 */
...
lock_sock(sk);
err = 0;
if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) /* 有连接完成三次握手, */
break; /* 正常结束等待 */
err = -EINVAL;
if (sk->sk_state != TCP_LISTEN)
break;
err = sock_intr_errno(timeo);
if (signal_pending(current)) /* 进程有信号挂起, */
break; /* 终止等待过程,处理信号 */
err = -EAGAIN; /* 等待超时的错误码 EAGAIN */
if (!timeo) /* 超时时间耗尽(如通过 SO_RCVTIMEO 设置了有限的超时时间), */
break; /* 终止等待过程,用户收到错误码 EAGAIN,提示可以重试 */
}
/* 将进程从等待队列 sk_sleep(sk) 移除,重新进入 TASK_RUNNING 状态 */
finish_wait(sk_sleep(sk), &wait);
return err;
}
/*
 * finish_wait - end a wait that was set up with prepare_to_wait*().
 * @wq_head: wait queue the entry may still be linked on
 * @wq_entry: the caller's wait entry
 *
 * Puts the current task back into TASK_RUNNING and, if the entry is still
 * linked, unlinks it from the queue while holding wq_head->lock.
 */
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
__set_current_state(TASK_RUNNING); /* the task goes back to the TASK_RUNNING state */
/*
* We can check for list emptiness outside the lock
* IFF:
* - we use the "careful" check that verifies both
* the next and prev pointers, so that there cannot
* be any half-pending updates in progress on other
* CPU's that we haven't seen yet (and that might
* still change the stack area.
* and
* - all other users take the lock (ie we can only
* have _one_ other CPU that looks at or modifies
* the list).
*/
if (!list_empty_careful(&wq_entry->entry)) {
spin_lock_irqsave(&wq_head->lock, flags);
list_del_init(&wq_entry->entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
}
看下调度细节:
/* kernel/time/timer.c */
signed long __sched schedule_timeout(signed long timeout)
{
...
schedule();
...
}
/* kernel/sched/core.c */
/*
 * schedule - voluntary entry into the scheduler for the current task.
 *
 * Calls __schedule(false) with preemption disabled, and repeats for as long
 * as need_resched() reports that another reschedule is pending.
 */
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
sched_submit_work(tsk);
do {
preempt_disable(); /* disable preemption */
__schedule(false); /* voluntary schedule (preempt == false) */
sched_preempt_enable_no_resched(); /* re-enable preemption */
} while (need_resched());
}
static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
...
struct rq *rq;
int cpu;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
...
local_irq_disable();
...
if (!preempt && prev->state) {
/*
* 如果进程 @prev 当前有信号挂起,不进入睡眠,
* 而是继续保持 可运行 状态,以备后续被调度时处理信号。
*/
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
/* 从可运行队列移除 */
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
prev->on_rq = 0;
...
}
}
next = pick_next_task(rq, prev, &rf); /* 挑选要执行的进程 */
...
if (likely(prev != next)) { /* 切换到不同进程 */
rq->nr_switches++;
rq->curr = next;
...
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf); /* 进程上下文切换 */
} else {
...
}
}
从上面分析看到,服务端在 accept()
中在 TASK_INTERRUPTIBLE
状态睡眠等待
。接着看在 3
种不同场景下唤醒进程的过程。
2.2.1.1 等待的条件成立时 唤醒
正常唤醒过程:连接的三次握手完成时,唤醒在 accept() 中等待的进程
:
tcp_child_process(sk, nsk, skb)
...
parent->sk_data_ready(parent) = sock_def_readable()
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
/* 唤醒在 accept() 中等待连接的进程 */
wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
...
...
2.2.1.2 信号 唤醒
异常唤醒过程,通过信号唤醒进程
:
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group, int from_ancestor_ns)
{
...
complete_signal(sig, t, group);
}
static void complete_signal(int sig, struct task_struct *p, int group)
{
...
signal_wake_up(t, sig == SIGKILL);
...
}
/*
 * signal_wake_up - wake task @t so that it can notice a pending signal.
 * @t: target task
 * @resume: when true, TASK_WAKEKILL is added to the states that get woken;
 *          complete_signal() passes (sig == SIGKILL) here.
 */
static inline void signal_wake_up(struct task_struct *t, bool resume)
{
signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
}
/*
 * signal_wake_up_state - flag @t as having a signal pending and try to wake it.
 * @t: target task
 * @state: extra task-state bits to wake in addition to TASK_INTERRUPTIBLE
 *         (signal_wake_up() passes TASK_WAKEKILL or 0)
 */
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
set_tsk_thread_flag(t, TIF_SIGPENDING); /* mark task @t as having a signal pending */
/*
* TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
* executing another processor and just now entering stopped state.
* By using wake_up_state, we ensure the process will wake up and
* handle its death signal.
*/
/*
* Wake the task if it is sleeping in TASK_INTERRUPTIBLE (or in a state
* selected by @state) so that it can handle the signal.
* NOTE(review): wake_up_state() is not shown here; the article's author
* describes it as setting the task to TASK_RUNNING, selecting a CPU to
* run it on, setting TIF_NEED_RESCHED, and more - confirm against the
* scheduler source.
* When wake_up_state() returns 0 (presumably: nothing was woken),
* kick_process() is called instead; presumably this prods a task that
* is already running so that it notices TIF_SIGPENDING - confirm.
*/
if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
kick_process(t);
}
2.2.1.3 中断 唤醒
异常唤醒过程,中断唤醒
。如果唤醒操作(如信号投递、或中断处理程序中发起的唤醒)给被中断的当前进程设置了 TIF_NEED_RESCHED
标志,将在中断处理结束时发生抢占调度,被唤醒的进程由此获得运行机会。由于进程被中断时,可能处于 内核态 或 用户态,所以有两种不同的执行路径。本文以 ARMv7
架构中断处理过程为例分别加以说明。
2.2.1.3.1 内核态的处理过程
/* arch/arm/kernel/entry-armv.S */
/*
 * __irq_svc - IRQ entry used when the interrupt arrived in kernel mode.
 * After irq_handler runs, and only with CONFIG_PREEMPT, it branches to
 * svc_preempt when the preempt count is zero and _TIF_NEED_RESCHED is set.
 */
.align 5
__irq_svc:
svc_entry
irq_handler /* handle the interrupt taken in kernel mode */
#ifdef CONFIG_PREEMPT
/* case where kernel-mode preemption is enabled */
ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
ldr r0, [tsk, #TI_FLAGS] @ get flags
teq r8, #0 @ if preempt count != 0
movne r0, #0 @ force flags to 0
tst r0, #_TIF_NEED_RESCHED /* in this article's scenario: test the TIF_NEED_RESCHED flag set as a result of signal delivery */
blne svc_preempt /* after the interrupt has been handled, start kernel-mode preemption */
#endif
svc_exit r5, irq = 1 @ return from exception
UNWIND(.fnend )
ENDPROC(__irq_svc)
#ifdef CONFIG_PREEMPT
/*
 * svc_preempt - kernel-mode preemption helper called from __irq_svc.
 * Saves lr in r8, then calls preempt_schedule_irq() repeatedly until
 * _TIF_NEED_RESCHED is clear in the (possibly new) task's flags, and
 * returns through r8.
 */
svc_preempt:
mov r8, lr
/* start kernel-mode preemption */
1: bl preempt_schedule_irq @ irq en/disable is done inside
ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
tst r0, #_TIF_NEED_RESCHED
reteq r8 @ go again
b 1b
#endif
/*
 * preempt_schedule_irq - preemption entry point used on the IRQ-return path
 * (called from svc_preempt).
 *
 * Must be entered with a zero preempt count and IRQs disabled; the BUG_ON()
 * below enforces this. Each pass disables preemption, enables IRQs, calls
 * __schedule(true), then disables IRQs again. The loop repeats while
 * need_resched() is true.
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;
/* Catch callers which need to be fixed */
BUG_ON(preempt_count() || !irqs_disabled());
prev_state = exception_enter();
do {
preempt_disable();
local_irq_enable();
__schedule(true); /* preemptive schedule (preempt == true) */
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
exception_exit(prev_state);
}
2.2.1.3.2 用户态的处理过程
/* arch/arm/kernel/entry-armv.S */
/*
 * __irq_usr - IRQ entry used when the interrupt arrived in user mode.
 * After irq_handler runs it always leaves through ret_to_user_from_irq,
 * which is where pending work (reschedule, signals) is checked.
 */
.align 5
__irq_usr:
usr_entry
kuser_cmpxchg_check
irq_handler /* handle the interrupt taken in user mode */
get_thread_info tsk
mov why, #0
b ret_to_user_from_irq
UNWIND(.fnend )
ENDPROC(__irq_usr)
/* arch/arm/include/asm/thread_info.h */
/*
* Change these and you break ASM code in entry-common.S
*/
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE)
/* arch/arm/kernel/entry-common.S */
ENTRY(ret_to_user_from_irq)
...
ldr r1, [tsk, #TI_FLAGS]
tst r1, #_TIF_WORK_MASK /* 检查 是否需要调度、是否有信号要处理 等等 */
bne slow_work_pending /* 处理 调度、信号 等等 */
no_work_pending:
...
restore_user_regs fast = 0, offset = 0
ENDPROC(ret_to_user_from_irq)
/* 处理 调度、信号 等等 */
slow_work_pending:
...
bl do_work_pending
...
/* arch/arm/kernel/signal.c */
asmlinkage int
do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
{
...
do {
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule(); /* 执行调度 */
} else {
...
}
...
thread_flags = current_thread_info()->flags;
} while (thread_flags & _TIF_WORK_MASK);
...
}
2.2.2 TASK_UNINTERRUPTIBLE 实现
TASK_UNINTERRUPTIBLE
状态的典型场景是 msleep()
调用:
/*
 * msleep - sleep in TASK_UNINTERRUPTIBLE until the requested time has passed.
 * @msecs: time to sleep, in milliseconds
 *
 * The timeout is msecs converted to jiffies, plus one. The loop goes back
 * to sleep with the remaining jiffies whenever
 * schedule_timeout_uninterruptible() returns a non-zero remainder, so an
 * early wakeup does not shorten the sleep.
 */
void msleep(unsigned int msecs)
{
unsigned long timeout = msecs_to_jiffies(msecs) + 1;
while (timeout)
timeout = schedule_timeout_uninterruptible(timeout);
}
/*
 * schedule_timeout_uninterruptible - schedule_timeout() in TASK_UNINTERRUPTIBLE.
 * @timeout: timeout in jiffies
 *
 * Sets the current task's state to TASK_UNINTERRUPTIBLE and returns whatever
 * schedule_timeout() returns (the remaining timeout, 0 once it has expired).
 */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
__set_current_state(TASK_UNINTERRUPTIBLE);
return schedule_timeout(timeout);
}
msleep()
将进程设置为 TASK_UNINTERRUPTIBLE
状态,不会让人觉得意外。毕竟,msleep()
的本意就是让进程睡够指定时间才被唤醒,睡眠过程不可被中断(即 UNINTERRUPTIBLE
)。如果时间没睡够中途就被唤醒,这不符合 msleep()
的语义。
再看一个驱动代码片段示例:
/* drivers/hwmon/abituguru.c */
static int abituguru_send_address(struct abituguru_data *data,
u8 bank_addr, u8 sensor_addr, int retries)
{
...
for (;;) {
...
if (abituguru_wait(data, ABIT_UGURU_STATUS_INPUT)) {
if (retries) {
/* 进入 TASK_UNINTERRUPTIBLE 等待超时时间 ABIT_UGURU_RETRY_DELAY 到达 */
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(ABIT_UGURU_RETRY_DELAY);
retries--;
continue;
}
...
}
...
}
}
/* kernel/time/timer.c */
signed long __sched schedule_timeout(signed long timeout)
{
struct timer_list timer;
unsigned long expire;
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
schedule();
goto out;
default:
...
}
expire = timeout + jiffies;
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
__mod_timer(&timer, expire, false);
schedule(); /* 主动调度出去,等待超时时间、或等待的事件 到达 */
...
timeout = expire - jiffies; /* 剩余的超时时间: 有可能 等待的事件到达 而被 提前唤醒 */
out:
return timeout < 0 ? 0 : timeout;
}
/*
 * process_timeout - timer callback that wakes the sleeping task on timeout.
 * @__data: the task_struct pointer cast to unsigned long; schedule_timeout()
 *          registers this callback with (unsigned long)current.
 */
static void process_timeout(unsigned long __data)
{
wake_up_process((struct task_struct *)__data);
}
/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
/*
 * wake_up_process - try to wake task @p.
 *
 * Passes TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) as the
 * state mask to try_to_wake_up(), so a task sleeping in either state
 * qualifies for the wakeup. Returns try_to_wake_up()'s result.
 */
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_NORMAL, 0);
}
为什么 TASK_UNINTERRUPTIBLE
进程不会因中断或信号而唤醒?从前面的 signal_wake_up_state()
分析已经有了答案,这里再重复一下:
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
...
if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
kick_process(t);
}
可见,信号只会唤醒 TASK_INTERRUPTIBLE
状态的进程(SIGKILL 会额外带上 TASK_WAKEKILL,从而还能唤醒状态中包含该位的进程,如 TASK_KILLABLE;单纯的 TASK_UNINTERRUPTIBLE 不包含该位)。既然信号不唤醒 TASK_UNINTERRUPTIBLE
状态进程,自然也不会进一步设置进程的 TIF_NEED_RESCHED
标志;同时,除非等待的条件满足(如超时到达),也没有其它可能对 TASK_UNINTERRUPTIBLE
状态进程设置 TIF_NEED_RESCHED
标志。综上所述,中断无法唤醒 TASK_UNINTERRUPTIBLE
状态的进程,因为中断处理结束时的抢占调度依赖于 TIF_NEED_RESCHED
标志。
在本小节驱动例子中,TASK_UNINTERRUPTIBLE
状态的进程等到超时时间 ABIT_UGURU_RETRY_DELAY
到达时被唤醒。
2.3 小结
TASK_INTERRUPTIBLE
在主动调度出去时,如果当前没有信号挂起
,就会从 CPU 的运行队列中移除
,但如果当前有信号挂起
,会继续保持 TASK_RUNNING
状态,且不会从 CPU 的运行队列中移除;TASK_UNINTERRUPTIBLE
在主动调度出去时,直接从 CPU 的运行队列中移除
。TASK_INTERRUPTIBLE
可看作浅度睡眠
。
TASK_INTERRUPTIBLE
睡眠期间,可能被等待的事件、信号、中断唤醒;TASK_UNINTERRUPTIBLE
睡眠期间,无法被 信号、中断唤醒,只能被等待的事件唤醒,如前面例子中的超时时间到达。TASK_UNINTERRUPTIBLE
可看作深度睡眠
。
TASK_INTERRUPTIBLE
状态的进程在 ps、top 等工具中显示为 S
态,TASK_UNINTERRUPTIBLE
状态的进程显示为 D
态。
Linux 内核提供对长时间处于 TASK_UNINTERRUPTIBLE
态进程的监测机制,细节可参考博文 Linux: hung task 检测机制简析 。