waitqueue
内核中提供了等待队列,作用是实现阻塞操作。等待队列用于使进程等待某一特定的事件发生而无需频繁的轮询,进程在等待期间睡眠,在某些事件发生时,由内核自动唤醒。
首先,Linux中所有的进程都是由task_struct这个结构管理。在生成进程的时候会分配一个task_struct结构,之后将通过这个结构对进程进行管理。task_struct位于独立的连续区间。task_struct结构中有一个state成员,有下面几种状态:
状态 | 说明 |
---|---|
TASK_RUNNING | 可运行状态(正在运行,或在运行队列中等待被调度) |
TASK_INTERRUPTIBLE | 等待状态,可接受信号 |
TASK_UNINTERRUPTIBLE | 等待状态,不能接受信号 |
TASK_ZOMBIE | 僵尸状态,exit后的状态 |
TASK_STOPPED | 延缓状态 |
1. 创建一个等待队列
Linux内核中,wait_queue_head_t代表一个等待队列头,wait_queue_head_t数据结构如下:
/* Head of a wait queue: a lock plus the list that waiting entries are linked into. */
struct __wait_queue_head {
spinlock_t lock; // spinlock that keeps operations on the list atomic
struct list_head task_list; // the list itself
};
typedef struct __wait_queue_head wait_queue_head_t;
等待队列中每个元素用wait_queue_t来表示,wait_queue_t数据结构如下:
typedef struct __wait_queue wait_queue_t;
/* One element of a wait queue, describing a single waiter. */
struct __wait_queue {
unsigned int flags; // WQ_FLAG_EXCLUSIVE: the waiter wants to be woken exclusively; 0: it may be woken together with other waiters
#define WQ_FLAG_EXCLUSIVE 0x01 // a macro defined inside a struct behaves like any other macro; it sits here to show that it is a value for flags
void *private; // address of the waiting task's task_struct
wait_queue_func_t func; // callback used to wake the suspended task
struct list_head task_list; // list node that links this entry into wait_queue_head_t's task_list
};
① 可以调用init_waitqueue_head接口来初始化此队列,init_waitqueue_head主要是将wait_queue_head_t结构体中的两个成员进行初始化。
staitc wait_queue_head_t prod_wq;
init_waitqueue_head(&prod_wq);
/*
 * init_waitqueue_head(q) - initialise the wait-queue head pointed to by q.
 *
 * Each expansion declares its own static lock_class_key and passes it,
 * together with the stringified argument (#q) as the name, to
 * __init_waitqueue_head().
 */
#define init_waitqueue_head(q) \
do { \
static struct lock_class_key __key; \
\
__init_waitqueue_head((q), #q, &__key); \
} while (0)
/*
 * Initialise both members of a wait-queue head.
 * @q:    head to initialise
 * @name: name given to the lock's lockdep class
 * @key:  lockdep class key for the lock
 */
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
spin_lock_init(&q->lock); // initialise the spinlock
lockdep_set_class_and_name(&q->lock, key, name); // lockdep bookkeeping (related to deadlock detection)
INIT_LIST_HEAD(&q->task_list); // initialise the list
}
② 也可以使用DECLARE_WAIT_QUEUE_HEAD来定义和初始化等待队列头。
/*
 * Static initialiser for a wait-queue head: the lock starts unlocked and
 * both task_list pointers refer to the head's own task_list
 * (presumably the empty-list representation of struct list_head).
 */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
.task_list = { &(name).task_list, &(name).task_list } }
/* Define a wait_queue_head_t called `name` and initialise it in one step. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
③ 定义和初始化等待队列项
/*
 * Static initialiser for a wait-queue entry: private is set to tsk, the
 * wake callback is default_wake_function and the list node pointers are NULL.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) { \
.private = tsk, \
.func = default_wake_function, \
.task_list = { NULL, NULL } }
/* Define a wait_queue_t called `name` for task `tsk` and initialise it. */
#define DECLARE_WAITQUEUE(name, tsk) \
wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
2. 让当前进程开始等待
内核提供了如下的接口来让当前进程在条件不满足的情况下,阻塞等待:
wait_event(wq, condition)
wait_event_timeout(wq, condition, timeout)
wait_event_interruptible(wq, condition)
wait_event_interruptible_timeout(wq, condition, timeout)
2.1 wait_event
wait_event的实现如下:
/*
 * Mark @wait as exclusive and append it to the tail of @q.
 * The list insertion is done with q->lock held and interrupts saved/disabled.
 */
void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
wait->flags |= WQ_FLAG_EXCLUSIVE; /* set before the entry becomes visible on the queue */
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue_tail(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
/*
 * Queue @wait on @q as a non-exclusive waiter and set the current task state.
 * @q:     wait-queue head
 * @wait:  entry to queue
 * @state: task state passed to set_current_state()
 *
 * WQ_FLAG_EXCLUSIVE is cleared first. With q->lock held, the entry is added
 * only if its list node is currently empty, so calling this repeatedly in a
 * wait loop does not queue the entry twice.
 */
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait);
set_current_state(state); /* done while still holding q->lock */
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);
/*
 * Clean up after waiting: set the current task state to TASK_RUNNING and,
 * if @wait is still linked on a list, unlink it while holding q->lock.
 */
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
__set_current_state(TASK_RUNNING);
/*
 * We can check for list emptiness outside the lock
 * IFF:
 * - we use the "careful" check that verifies both
 * the next and prev pointers, so that there cannot
 * be any half-pending updates in progress on other
 * CPU's that we haven't seen yet (and that might
 * still change the stack area.
 * and
 * - all other users take the lock (ie we can only
 * have _one_ other CPU that looks at or modifies
 * the list).
 */
if (!list_empty_careful(&wait->task_list)) {
spin_lock_irqsave(&q->lock, flags);
list_del_init(&wait->task_list);
spin_unlock_irqrestore(&q->lock, flags);
}
}
/*
 * __wait_event(wq, condition) - slow path of wait_event().
 *
 * Defines a wait entry with DEFINE_WAIT, then loops: prepare_to_wait() with
 * TASK_UNINTERRUPTIBLE, re-test condition (leave the loop if true),
 * otherwise schedule(). finish_wait() runs once the loop exits.
 * condition is evaluated on every iteration.
 */
#define __wait_event(wq, condition) \
do { \
DEFINE_WAIT(__wait); \
\
for (;;) { \
prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
schedule(); \
} \
finish_wait(&wq, &__wait); \
} while (0)
/*
 * wait_event(wq, condition) - block until condition evaluates true.
 *
 * Fast path: if condition is already true nothing else happens.
 * Otherwise control passes to __wait_event().
 */
#define wait_event(wq, condition) \
do { \
if (condition) \
break; \
__wait_event(wq, condition); \
} while (0)
里面有个宏定义即DEFINE_WAIT,详细如下:
/*
 * DEFINE_WAIT_FUNC(name, function) - define a wait_queue_t called `name`
 * whose private member is `current`, whose wake callback is `function`,
 * and whose list node is initialised with LIST_HEAD_INIT on itself.
 */
#define DEFINE_WAIT_FUNC(name, function) \
wait_queue_t name = { \
.private = current, \
.func = function, \
.task_list = LIST_HEAD_INIT((name).task_list), \
}
/* DEFINE_WAIT(name) - DEFINE_WAIT_FUNC with autoremove_wake_function as the callback. */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
可以看到private成员是当前task对象的地址current, func成员是autoremove_wake_function(在下面wake_up时再做说明)。
所以整个wait_event的逻辑就是:
① 首先判断条件是否满足,如果满足,直接退出;如果不满足,调用__wait_event
② __wait_event中首先基于当前进程构建一个等待队列项;然后进入死循环:
- 调用prepare_to_wait,该函数将新建的等待队列项加入到等待队列中,并修改当前任务的state为TASK_UNINTERRUPTIBLE;(注,该函数flags的结果必然是0,也就是说这个函数是将非独占进程添加到等待队列当中。而add_wait_queue_exclusive函数则是将独占进程添加到等待队列的尾部,也就是说一个等待队列,非独占进程总是在前面,独占进程总是在后面)
- 判断condition条件,满足就退出循环,不满足继续
- 调用schedule()进行任务调度后,重新开始循环
③ 退出循环后调用finish_wait,将当前任务的state设置为TASK_RUNNING,并将新建的等待队列项从等待队列中删除。
2.2 wait_event_timeout
wait_event_timeout 的实现如下:
/*
 * schedule_timeout - call schedule() with an on-stack timer armed.
 * @timeout: relative timeout; it is added to jiffies to get the expiry time.
 *
 * MAX_SCHEDULE_TIMEOUT: schedule() is called without arming any timer.
 * Negative @timeout: an error is printed, the stack is dumped, the task
 * state is set to TASK_RUNNING and the function returns without scheduling.
 *
 * Returns expire - jiffies, or @timeout itself on the two early-exit paths,
 * with negative results clamped to 0.
 */
signed long __sched schedule_timeout(signed long timeout)
{
struct timer_list timer;
unsigned long expire;
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
/*
 * These two special cases are useful to be comfortable
 * in the caller. Nothing more. We could take
 * MAX_SCHEDULE_TIMEOUT from one of the negative value
 * but I'd like to return a valid offset (>=0) to allow
 * the caller to do everything it want with the retval.
 */
schedule();
goto out;
default:
/*
 * Another bit of PARANOID. Note that the retval will be
 * 0 since no piece of kernel is supposed to do a check
 * for a negative retval of schedule_timeout() (since it
 * should never happens anyway). You just have the printk()
 * that will tell you if something is gone wrong and where.
 */
if (timeout < 0) {
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
current->state = TASK_RUNNING;
goto out;
}
}
expire = timeout + jiffies;
/* The timer callback process_timeout receives `current` as its argument. */
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
__mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
schedule();
del_singleshot_timer_sync(&timer);
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer);
/* Time left until the expiry; may be negative here, clamped below. */
timeout = expire - jiffies;
out:
return timeout < 0 ? 0 : timeout;
}
/*
 * __wait_event_timeout(wq, condition, ret) - slow path of wait_event_timeout().
 *
 * ret is an lvalue holding the remaining timeout on entry. Each iteration
 * queues the entry with TASK_UNINTERRUPTIBLE, leaves the loop if condition
 * is true, otherwise stores the result of schedule_timeout(ret) back into
 * ret and leaves the loop when that result is 0.
 * After the loop, if ret is 0 but condition is true, ret is forced to 1.
 */
#define __wait_event_timeout(wq, condition, ret) \
do { \
DEFINE_WAIT(__wait); \
\
for (;;) { \
prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
ret = schedule_timeout(ret); \
if (!ret) \
break; \
} \
if (!ret && (condition)) \
ret = 1; \
finish_wait(&wq, &__wait); \
} while (0)
/*
 * wait_event_timeout(wq, condition, timeout) - wait for condition, bounded by timeout.
 *
 * Statement expression evaluating to a long: `timeout` unchanged when
 * condition is already true, otherwise whatever __wait_event_timeout()
 * leaves in __ret.
 */
#define wait_event_timeout(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!(condition)) \
__wait_event_timeout(wq, condition, __ret); \
__ret; \
})
wait_event_timeout 和 wait_event逻辑类似,就一个地方差异较大,即schedule_timeout。
schedule_timeout中构建了一个定时器,该定时器到期后将调用process_timeout(通过中断的形式),传入的参数则是当前进程的指针current。然后调用schedule,等待调度器回到该位置(由于任务状态为TASK_UNINTERRUPTIBLE,不能通过调度或信号回到该位置)。这个时候就有两种情况(唤醒在后面wake_up部分详细说明):
① 超时了,调用process_timeout函数,该函数调用wake_up_process函数,核心代码类似wake_up_xxx(current)
② 在其他任务中调用了wake_up_xxx(wq)函数,将任务状态修改为TASK_RUNNING
一旦任务状态为TASK_RUNNING,就又回到了cpu的run queue中,可以通过调度回到函数中的schedule位置。
wait_event_timeout 返回值如下:
- 大于0: 表示condition满足,返回值表示距离设定超时还剩多久(jiffies);如果超时的同时condition恰好为真,则返回1
- 等于0: 表示超时发生
2.3 wait_event_interruptible
wait_event_interruptible 的实现如下:
/*
 * __wait_event_interruptible(wq, condition, ret) - slow path of
 * wait_event_interruptible().
 *
 * Each iteration queues the entry with TASK_INTERRUPTIBLE and leaves the
 * loop if condition is true. If no signal is pending for current it calls
 * schedule() and retries; otherwise ret is set to -ERESTARTSYS and the loop
 * ends. ret is left untouched when the loop exits because of condition.
 */
#define __wait_event_interruptible(wq, condition, ret) \
do { \
DEFINE_WAIT(__wait); \
\
for (;;) { \
prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
if (condition) \
break; \
if (!signal_pending(current)) { \
schedule(); \
continue; \
} \
ret = -ERESTARTSYS; \
break; \
} \
finish_wait(&wq, &__wait); \
} while (0)
/*
 * wait_event_interruptible(wq, condition) - wait for condition; a pending
 * signal ends the wait.
 *
 * Statement expression evaluating to an int: 0 when condition is (or
 * becomes) true, -ERESTARTSYS when the slow path stopped because a signal
 * was pending.
 */
#define wait_event_interruptible(wq, condition) \
({ \
int __ret = 0; \
if (!(condition)) \
__wait_event_interruptible(wq, condition, __ret); \
__ret; \
})
wait_event_interruptible的实现和wait_event类似,区别是多了一个signal_pending操作。
signal_pending检查给定进程是否有信号需要处理,返回0表示没有信号需要处理。
所以此时退出循环的条件是: 满足 condition 和 有信号 两者之一就行 (如果执行到schedule,需要另外一个进程调用wake_up_xxx(&wq)操作,或者该进程收到了信号,将任务加入到run queue中。)
wait_event_interruptible 返回值如下:
- -ERESTARTSYS: 表示等待被信号打断。该错误码的含义是:系统调用在睡眠期间被信号中断,待信号处理完成后,可以重新执行一次该系统调用。
- 等于0: 表示condition满足
2.4 wait_event_interruptible_timeout
wait_event_interruptible_timeout 的实现如下:
/*
 * __wait_event_interruptible_timeout(wq, condition, ret) - slow path of
 * wait_event_interruptible_timeout().
 *
 * ret is an lvalue holding the remaining timeout on entry. Each iteration
 * queues the entry with TASK_INTERRUPTIBLE and leaves the loop if condition
 * is true. With no signal pending it stores schedule_timeout(ret) into ret,
 * leaving the loop when that is 0 and retrying otherwise. With a signal
 * pending, ret becomes -ERESTARTSYS and the loop ends.
 * After the loop, if ret is 0 but condition is true, ret is forced to 1.
 */
#define __wait_event_interruptible_timeout(wq, condition, ret) \
do { \
DEFINE_WAIT(__wait); \
\
for (;;) { \
prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \
if (condition) \
break; \
if (!signal_pending(current)) { \
ret = schedule_timeout(ret); \
if (!ret) \
break; \
continue; \
} \
ret = -ERESTARTSYS; \
break; \
} \
if (!ret && (condition)) \
ret = 1; \
finish_wait(&wq, &__wait); \
} while (0)
/*
 * wait_event_interruptible_timeout(wq, condition, timeout) - wait for
 * condition, bounded by timeout; a pending signal also ends the wait.
 *
 * Statement expression evaluating to a long: `timeout` unchanged when
 * condition is already true, otherwise whatever
 * __wait_event_interruptible_timeout() leaves in __ret.
 */
#define wait_event_interruptible_timeout(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!(condition)) \
__wait_event_interruptible_timeout(wq, condition, __ret); \
__ret; \
})
wait_event_interruptible_timeout的实现和上面wait_event等类似,退出循环的条件是: 满足 condition、timeout 和 有信号 三者之一(如果执行到schedule,需要另外一个进程调用wake_up_xxx(&wq)操作,或者超时了,或者该进程收到了信号,这三者都会将任务加入到run queue中。)。
wait_event_interruptible_timeout 返回值如下:
- -ERESTARTSYS: 表示被信号激活唤醒
- 大于0: 表示condition满足,返回值表示距离设定超时还剩多久(jiffies);如果超时的同时condition恰好为真,则返回1
- 等于0: 表示超时发生
3. 唤醒等待队列上的进程
内核提供了如下接口来唤醒等待队列上的进程:
/*
 * Wake-up entry points. They differ only in the arguments they forward:
 *  - mode: TASK_NORMAL for the plain variants, TASK_INTERRUPTIBLE for the
 *    *_interruptible variants;
 *  - nr_exclusive: 1 for the plain form, nr for *_nr, 0 for *_all;
 *  - *_locked variants call __wake_up_locked() and *_sync calls
 *    __wake_up_sync() instead of __wake_up().
 */
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0)
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1)
可以看到这些接口调用了三个函数__wake_up,__wake_up_locked,__wake_up_sync。先看看__wake_up的实现:
/*
 * Walk every entry queued on @q and invoke its wake callback.
 * @q:            wait-queue head to scan
 * @mode:         forwarded to each entry's callback
 * @nr_exclusive: number of exclusive entries to wake before stopping
 * @wake_flags:   forwarded to each entry's callback
 * @key:          forwarded to each entry's callback
 *
 * The scan stops only after a callback has returned non-zero for an entry
 * flagged WQ_FLAG_EXCLUSIVE and the pre-decremented @nr_exclusive has
 * reached zero. Other entries never end the scan.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			     int nr_exclusive, int wake_flags, void *key)
{
	wait_queue_t *entry, *tmp;

	list_for_each_entry_safe(entry, tmp, &q->task_list, task_list) {
		/* Read the flags before the callback runs. */
		unsigned entry_flags = entry->flags;

		if (!entry->func(entry, mode, wake_flags, key))
			continue;	/* callback reported failure */

		if (!(entry_flags & WQ_FLAG_EXCLUSIVE))
			continue;	/* non-exclusive entries do not use up the budget */

		if (--nr_exclusive == 0)
			break;
	}
}
/*
 * Wake entries queued on @q.
 * @q:            wait-queue head
 * @mode:         forwarded to __wake_up_common()
 * @nr_exclusive: forwarded to __wake_up_common()
 * @key:          forwarded to __wake_up_common()
 *
 * Takes q->lock with interrupts saved/disabled around the scan and passes
 * 0 as wake_flags.
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, 0, key);
spin_unlock_irqrestore(&q->lock, flags);
}
可以看到__wake_up会调到__wake_up_common函数,该函数的逻辑是,遍历等待队列上的wait_queue_t结构体,进行如下的操作:
① 获取curr->flags值放入flags中。
② 进行判断,如果同时满足三个条件就退出循环。(注:if中的&&是短路求值,如果前面有一项不满足,后续的判断就不会执行,因此--nr_exclusive只会在前两个条件都成立时才执行)
第一个条件是curr->func的返回结果,依据前面的说明,该函数实际上就是autoremove_wake_function,其详细说明如下,如果返回1,表明已经将相关的任务加入到cpu的run queue,并修改任务的状态成功。依据前面定义的wait_event_xxx的实现,该项正常来说均返回1。
第二个条件是flags & WQ_FLAG_EXCLUSIVE,如果该wait_queue_t是独占的,就为真。对于一个任务队列来说,只有前面的非互斥项执行curr->func之后,才轮到互斥进程,也只有互斥进程flags & WQ_FLAG_EXCLUSIVE才为真。
第三个条件是!--nr_exclusive,如果nr_exclusive为0,先自减后变为负数,该项始终为假;如果nr_exclusive为1,则第一次就为真;如果nr_exclusive为一个正整数nr,则第nr次,该项为真。
所以可以得出:
wake_up 唤醒全部的非独占任务,唤醒一个独占任务。
wake_up_nr 唤醒全部的非独占任务,唤醒nr个独占任务。
wake_up_all 唤醒全部的非独占任务,唤醒全部独占任务。
wake_up_interruptible_xxx等函数类似上面。
/*
 * try_to_wake_up - wake @p if its state matches @state.
 * @p:          task to wake
 * @state:      mask of task states that may be woken
 * @wake_flags: wake-up flags, forwarded to the helpers called below
 *
 * Runs with p->pi_lock held and interrupts saved/disabled.
 * Returns 1 if p->state matched @state and the wake-up went ahead,
 * 0 if the state did not match and nothing was done.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	/*
	 * If we are going to wake up a thread waiting for CONDITION we
	 * need to ensure that CONDITION=1 done by the caller can not be
	 * reordered with p->state check below. This pairs with mb() in
	 * set_current_state() the waiting thread does.
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p); /* CPU that last ran this task */

	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

	/*
	 * SMP support; this section can be ignored on configurations
	 * without SMP. It decides whether the task should be moved to
	 * another CPU's run queue (load balancing).
	 */
#ifdef CONFIG_SMP
	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until its done referencing the task.
	 */
	while (p->on_cpu)
		cpu_relax();
	/*
	 * Pairs with the smp_wmb() in finish_lock_switch().
	 */
	smp_rmb();

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}
/*
 * Default wake callback for a wait-queue entry: calls try_to_wake_up() on
 * the task stored in curr->private with @mode and @wake_flags, and returns
 * its result. @key is not used.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr->private, mode, wake_flags);
}
/*
 * Wake callback that also dequeues the entry on success.
 *
 * Delegates to default_wake_function(). If that returns non-zero, the
 * entry's list node is unlinked and re-initialised. The result of
 * default_wake_function() is returned unchanged either way.
 */
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int woken = default_wake_function(wait, mode, sync, key);

	if (!woken)
		return woken;	/* nothing was woken: leave the entry queued */

	list_del_init(&wait->task_list);
	return woken;
}
以上是autoremove_wake_function的实现,具体看try_to_wake_up函数。
该函数有3个参数:
- p 任务结构体指针
- state 需要唤醒的进程状态掩码,即需要唤醒符合该状态掩码的进程
- wake_flags 此处等待队列传过来的值为0。表示是同步唤醒sync,还是异步唤醒 async;