1 等待队列的实现类似于进程中异步中断的原理,都是为了最大化cpu的利用率,避免了cpu在资源无法满足时的轮询或空等待。
等待队列的实现原理举例:cpu向磁盘发出读数据请求,由于磁盘速度慢,当前进程通过wait_queue_t结构睡眠在内核数据缓冲区的wait_queue_head_t中,并且发生进程切换;当磁盘准备好数据,向cpu发出异步中断请求,在中断处理过程中的软中断部分,cpu将数据拷贝到内核数据缓冲区中,然后唤醒睡眠在等待队列中的进程,在调度的适当时机,该进程得到执行。等待队列在System V进程间通信和网络通信中被大量使用。
2 结构体
2.1 等待队列头部
/*
 * Wait queue head: a spinlock protecting the list, plus the
 * doubly-linked list of waiters (wait_queue_t entries).
 */
struct __wait_queue_head {
spinlock_t lock; /* protects task_list against concurrent add/remove */
struct list_head task_list; /* list of sleeping wait_queue_t entries */
};
typedef struct __wait_queue_head wait_queue_head_t;
2.2 等待队列项
/*
 * Wait queue entry: one sleeping waiter on a wait_queue_head_t.
 */
struct __wait_queue {
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01 /* exclusive waiter: a wakeup wakes at most one such entry */
void *private; /* usually points to the waiting task's task_struct */
wait_queue_func_t func; /* wake-up callback (e.g. default_wake_function) */
struct list_head task_list; /* links this entry into wait_queue_head_t.task_list */
};
typedef struct __wait_queue wait_queue_t;
3 等待队列使用流程
3-1 初始化等待队列头部
静态初始化:
/* Static initialization: define and initialize a wait queue head in one step. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
/* Initializer: unlocked spinlock, and a list head pointing at itself (empty list). */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
.task_list = { &(name).task_list, &(name).task_list } }
动态初始化
/* Example: a wait queue head to be initialized at run time. */
static wait_queue_head_t head;
/* Dynamic initialization; the static __key gives lockdep a lock class per call site. */
#define init_waitqueue_head(q) \
do { \
static struct lock_class_key __key; \
\
__init_waitqueue_head((q), &__key); \
} while (0)
3-2 初始化等待队列项
静态初始化
/* Static init of a wait queue entry for the current task, using
 * autoremove_wake_function (the entry removes itself from the queue on wakeup). */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
#define DEFINE_WAIT_FUNC(name, function) \
wait_queue_t name = { \
.private = current, \
.func = function, \
.task_list = LIST_HEAD_INIT((name).task_list), \
}
动态初始化
/* Example: a wait queue entry to be initialized at run time. */
static wait_queue_t item;
/*
 * init_waitqueue_entry - initialize @q to wait on behalf of task @p,
 * as a non-exclusive waiter using the default wake function.
 */
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
q->flags = 0;
q->private = p;
q->func = default_wake_function;
}
3.3 将当前任务加入到等待队列,使进程睡眠
3.3.1 主函数 wait_event(wait_queue_head_t,c 表达式条件)
/**
 * wait_event - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
/* Fast path: if the condition is already true, skip sleeping entirely. */
#define wait_event(wq, condition) \
do { \
if (condition) \
break; \
__wait_event(wq, condition); \
} while (0)
3.3.2
/*
 * __wait_event - slow path of wait_event().
 * Builds a wait_queue_t on the stack (with .private = current), then loops:
 * enqueue + set TASK_UNINTERRUPTIBLE, re-check the condition, and call
 * schedule() until the condition holds.  finish_wait() then restores
 * TASK_RUNNING and removes the entry from the queue.
 *
 * NOTE(review): the original had free-text annotations placed AFTER the
 * backslash line-continuations, which breaks the macro; they are now
 * proper comments.
 */
#define __wait_event(wq, condition)					\
do {									\
	/* statically build a wait_queue_t; .private = current */	\
	DEFINE_WAIT(__wait);						\
									\
	for (;;) {							\
		/* enqueue and mark the task TASK_UNINTERRUPTIBLE */	\
		prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);	\
		if (condition)						\
			break;						\
		schedule();						\
	}								\
	/* condition met: set TASK_RUNNING and dequeue the entry */	\
	finish_wait(&wq, &__wait);					\
} while (0)
3.3.3 prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE)执行
/*
 * Note: we use "set_current_state()" _after_ the wait-queue add,
 * because we need a memory barrier there on SMP, so that any
 * wake-function that tests for the wait-queue being active
 * will be guaranteed to see waitqueue addition _or_ subsequent
 * tests in this thread will see the wakeup having taken place.
 *
 * The spin_unlock() itself is semi-permeable and only protects
 * one way (it only protects stuff inside the critical region and
 * stops them from bleeding out - it would still allow subsequent
 * loads to move into the critical region).
 */
void
prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
wait->flags &= ~WQ_FLAG_EXCLUSIVE; /* non-exclusive waiter */
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait); /* i.e. list_add(&wait->task_list, &q->task_list) */
set_current_state(state); /* after the add -- see the barrier note above */
spin_unlock_irqrestore(&q->lock, flags);
}
3.3.4 finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
/*
 * finish_wait - clean up after waiting in a queue
 * @q: waitqueue waited on
 * @wait: wait descriptor
 *
 * Sets current thread back to running state and removes
 * the wait descriptor from the given waitqueue if still
 * queued.
 */
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
__set_current_state(TASK_RUNNING); /* mark runnable before dequeueing */
/*
 * We can check for list emptiness outside the lock
 * IFF:
 * - we use the "careful" check that verifies both
 * the next and prev pointers, so that there cannot
 * be any half-pending updates in progress on other
 * CPU's that we haven't seen yet (and that might
 * still change the stack area.
 * and
 * - all other users take the lock (ie we can only
 * have _one_ other CPU that looks at or modifies
 * the list).
 */
if (!list_empty_careful(&wait->task_list)) {
spin_lock_irqsave(&q->lock, flags);
list_del_init(&wait->task_list);
spin_unlock_irqrestore(&q->lock, flags);
}
}
4 等待队列进程睡眠的其他函数
4-1 可中断睡眠
/*
 * wait_event_interruptible - sleep (TASK_INTERRUPTIBLE) until @condition
 * is true or a signal is delivered.  Evaluates to 0 when woken because
 * the condition became true, or -ERESTARTSYS when woken by a signal.
 *
 * NOTE(review): the original collapsed "({" onto the first line after
 * the backslash, breaking the continuation; layout restored.
 */
#define wait_event_interruptible(wq, condition)				\
({									\
	int __ret = 0;							\
	if (!(condition))						\
		__wait_event_interruptible(wq, condition, __ret);	\
	__ret;								\
})
该等待进程如果由于condition满足被唤醒,返回0,如果由于信号被唤醒(当进程收到信号后,该进程的
thread->flag |= TIF_SIGPENDING ),返回-ERESTARTSYS
4-2 超时唤醒的睡眠
/*
 * wait_event_timeout - sleep until @condition is true or @timeout jiffies
 * elapse.  Evaluates to the remaining jiffies (> 0) when the condition
 * became true, or 0 on timeout.
 *
 * NOTE(review): the original placed a "//" annotation after the backslash
 * continuation, breaking the macro; it is now a proper comment.
 */
#define wait_event_timeout(wq, condition, timeout)			\
({									\
	long __ret = timeout;						\
	/* exits when __ret reaches 0 or the condition holds */		\
	if (!(condition))						\
		__wait_event_timeout(wq, condition, __ret);		\
	__ret;								\
})
/*
 * schedule_timeout - sleep until @timeout jiffies have elapsed, or until
 * woken earlier.  Returns the remaining jiffies (0 on full expiry).
 * MAX_SCHEDULE_TIMEOUT means "no timer": sleep until explicitly woken.
 */
signed long __sched schedule_timeout(signed long timeout)
{
struct timer_list timer;
unsigned long expire;
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
/*
 * These two special cases are useful to be comfortable
 * in the caller. Nothing more. We could take
 * MAX_SCHEDULE_TIMEOUT from one of the negative value
 * but I'd like to return a valid offset (>=0) to allow
 * the caller to do everything it want with the retval.
 */
schedule();
goto out;
default:
/*
 * Another bit of PARANOID. Note that the retval will be
 * 0 since no piece of kernel is supposed to do a check
 * for a negative retval of schedule_timeout() (since it
 * should never happens anyway). You just have the printk()
 * that will tell you if something is gone wrong and where.
 */
if (timeout < 0) {
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
current->state = TASK_RUNNING;
goto out;
}
}
/* arm an on-stack timer that will wake this task at expiry */
expire = timeout + jiffies;
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
__mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
schedule();
del_singleshot_timer_sync(&timer);
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer);
timeout = expire - jiffies; /* remaining time; <= 0 if the timer fired */
out:
return timeout < 0 ? 0 : timeout;
}
当定时器唤醒该进程时,由于返回值timeout=0,所以该进程从睡眠等待中退出
4-3 超时或者信号均可唤醒的睡眠过程
/*
 * wait_event_interruptible_timeout - sleep until @condition is true, a
 * signal arrives, or @timeout jiffies elapse.  Evaluates to the remaining
 * jiffies (> 0) if the condition became true, 0 on timeout, or
 * -ERESTARTSYS if woken by a signal.
 */
#define wait_event_interruptible_timeout(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!(condition)) \
__wait_event_interruptible_timeout(wq, condition, __ret); \
__ret; \
})
返回0代表超时;返回正数返回剩余时间数,此时condition满足;返回-ERESTARTSYS代表被信号唤醒
6 完成量
完成量类似信号量,但其实现基于等待队列。完成量结构中有两类参与者:若干个进程等待某个操作完成,而一个进程在操作完成时发出声明,从而唤醒等待的进程。
6-1 完成量结构体 文件:include/linux/completion.h
/**
 * struct completion - structure used to maintain state for a "completion"
 *
 * This is the opaque structure used to maintain the state for a "completion".
 * Completions currently use a FIFO to queue threads that have to wait for
 * the "completion" event.
 *
 * See also: complete(), wait_for_completion() (and friends _timeout,
 * _interruptible, _interruptible_timeout, and _killable), init_completion(),
 * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
 * INIT_COMPLETION().
 */
struct completion {
unsigned int done; /* count of completions signalled but not yet consumed */
wait_queue_head_t wait; /* tasks blocked waiting for done > 0 */
};
done >= 0,其含义如下:
当done大于0时,表示操作已经完成但尚未被消费的次数,每当事件完成一次done += 1。
当done等于0时,表示该操作尚未完成,等待进程睡眠。当另一进程完成该操作时,它执行done += 1使done > 0,并调度唤醒睡眠进程;被唤醒的进程在继续执行前会执行done -= 1,使done重新为0,这样之后到来的进程仍会等待。而如果完成发生时没有进程在睡眠,done保持为1,之后再有进程等待该操作时,只需执行done -= 1即可直接继续执行而无需睡眠。
6-2 等待操作完成而睡眠的函数
extern void wait_for_completion(struct completion *); /* uninterruptible wait */
extern int wait_for_completion_interruptible(struct completion *x); /* 0 or -ERESTARTSYS */
extern int wait_for_completion_killable(struct completion *x); /* only fatal signals interrupt */
extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); /* remaining jiffies, 0 on timeout */
extern unsigned long wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout);
extern bool try_wait_for_completion(struct completion *x); /* non-blocking: consume one if available */
下面分析 wait_for_completion_interruptible
/* Interruptible wait: returns 0 when the completion fired, -ERESTARTSYS on signal. */
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
if (t == -ERESTARTSYS) /* woken by a signal */
return t;
return 0; /* the completion was done */
}
/* Takes the completion's wait-queue lock around the actual wait loop. */
static long __sched
wait_for_common(struct completion *x, long timeout, int state)
{
might_sleep();
spin_lock_irq(&x->wait.lock); /* protects x->done and the wait queue */
timeout = do_wait_for_common(x, timeout, state);
spin_unlock_irq(&x->wait.lock);
return timeout;
}
/*
 * Core completion wait loop.  Called with x->wait.lock held; the lock is
 * dropped around schedule_timeout() and re-taken afterwards.
 */
static inline long __sched
do_wait_for_common(struct completion *x, long timeout, int state)
{
if (!x->done) { /* completion NOT yet signalled: we must sleep */
DECLARE_WAITQUEUE(wait, current); /* on-stack entry for the current task */
wait.flags |= WQ_FLAG_EXCLUSIVE; /* a wakeup wakes at most one such waiter */
__add_wait_queue_tail(&x->wait, &wait); /* append at the tail: FIFO order */
do {
if (signal_pending_state(state, current)) { /* woken by a signal */
timeout = -ERESTARTSYS;
break;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout); /* loop until x->done != 0 or timeout hits 0 */
__remove_wait_queue(&x->wait, &wait);
if (!x->done)
return timeout; /* remaining jiffies, 0 on timeout, or -ERESTARTSYS */
}
x->done--; /* consume one completion event */
return timeout ?: 1;
}
6-3 操作完成而唤醒睡眠进程的函数
extern void complete(struct completion *);
/* complete_all(): like complete(), but does x->done += UINT_MAX/2 so that
 * every waiter (not just one) sees the completion.  (The original line had
 * this note as raw prose after the ';', which is not valid C.) */
extern void complete_all(struct completion *);
分析
/* Signal one completion event: bump done and wake one exclusive waiter. */
void complete(struct completion *x)
{
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
x->done++; /* record one finished operation */
__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); /* nr_exclusive = 1: wake at most one */
spin_unlock_irqrestore(&x->wait.lock, flags);
}
/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags; /* snapshot: curr may remove itself in func() */
if (curr->func(curr, mode, wake_flags, key) && /* per-entry wake callback */
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
/**
* list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @head: the head for your list.
* @member: the name of the list_struct within the struct.
*/
#define list_for_each_entry_safe(pos, n, head, member) \
for (pos = list_entry((head)->next, typeof(*pos), member), \
n = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.next, typeof(*n), member))
6-4 完成量举例