The wait-queue API comes up again and again in PCIe driver code; the two routines used most often are:
(1) wait_event_interruptible: puts the calling process to sleep on a wait queue until a given condition becomes true;
(2) wake_up_interruptible: wakes up the processes sleeping on a wait queue and sets their state to TASK_RUNNING.
The two are used as a pair, mainly for interrupt handling, process synchronization and timing.
In a real project, however, a driver may wait once but get woken up twice or even more times. What happens in that case? And how exactly are these two routines implemented? Answering that requires a walk through the kernel source.
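For concreteness, the usage pattern in question looks roughly like the following minimal sketch. All the names here (pcie_demo_wq, pcie_demo_isr, pcie_demo_read, data_ready) are made up for illustration and are not taken from any real driver: the interrupt handler sets a flag and calls wake_up_interruptible, while the read path sleeps in wait_event_interruptible until the flag becomes true.

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/fs.h>

static DECLARE_WAIT_QUEUE_HEAD(pcie_demo_wq);   /* the wait queue head */
static int data_ready;                          /* the wait condition */

/* interrupt handler: the device reports that data has arrived */
static irqreturn_t pcie_demo_isr(int irq, void *dev_id)
{
    data_ready = 1;                       /* make the condition true first */
    wake_up_interruptible(&pcie_demo_wq); /* then wake the sleeper */
    return IRQ_HANDLED;
}

/* read(): sleep until the interrupt handler reports data */
static ssize_t pcie_demo_read(struct file *filp, char __user *buf,
                              size_t count, loff_t *ppos)
{
    if (wait_event_interruptible(pcie_demo_wq, data_ready))
        return -ERESTARTSYS;              /* the sleep was broken by a signal */
    data_ready = 0;
    /* ... copy the device data to user space here ... */
    return 0;
}

With this picture in mind, the question becomes: what happens if pcie_demo_isr fires, and therefore calls wake_up_interruptible, twice while pcie_demo_read waits only once?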
1. Wait-queue definitions
A wait queue is normally anchored by a wait queue head, a data structure of type wait_queue_head_t, defined in the kernel as follows:
struct __wait_queue_head {
    spinlock_t lock;
    struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

struct list_head {
    struct list_head *next, *prev;
};
A driver that uses a wait queue usually begins by declaring and initializing a wait queue head:
#define DECLARE_WAIT_QUEUE_HEAD(name) \
    wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
This is a macro; it initializes the head via the macro __WAIT_QUEUE_HEAD_INITIALIZER:
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
    .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
    .task_list = { &(name).task_list, &(name).task_list } }
(1) This happens in two steps. The first is the initialization of the spinlock:
#define __SPIN_LOCK_UNLOCKED(lockname) \
    (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)

#define __SPIN_LOCK_INITIALIZER(lockname) \
    { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }

#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
    { \
        .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
        SPIN_DEBUG_INIT(lockname) \
        SPIN_DEP_MAP_INIT(lockname) }

# define SPIN_DEBUG_INIT(lockname) \
    .magic = SPINLOCK_MAGIC, \
    .owner_cpu = -1, \
    .owner = SPINLOCK_OWNER_INIT,

#define SPINLOCK_MAGIC 0xdead4ead
#define SPINLOCK_OWNER_INIT ((void *)-1L)

# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
The above is the complete initialization path of a spinlock; it is not the focus of this article, so it is not analyzed further.
(2) The second step is the initialization of the list head, which is trivial: its prev and next pointers are simply made to point to the head itself.
A wait queue head can also be defined first and then initialized explicitly at run time:
wait_queue_head_t q;
#define init_waitqueue_head(q) \
    do { \
        static struct lock_class_key __key; \
 \
        __init_waitqueue_head((q), #q, &__key); \
    } while (0)
Going one level deeper, __init_waitqueue_head looks like this:
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
    spin_lock_init(&q->lock);
    lockdep_set_class_and_name(&q->lock, key, name);
    INIT_LIST_HEAD(&q->task_list);
}
spin_lock_init and lockdep_set_class_and_name take care of the spinlock; the part worth looking at is INIT_LIST_HEAD:
static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}
As you can see, INIT_LIST_HEAD does exactly the same thing as the .task_list = { &(name).task_list, &(name).task_list } initializer inside __WAIT_QUEUE_HEAD_INITIALIZER.
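Put side by side, the static and the dynamic way of setting up a wait queue head look like the sketch below (my_dev and my_dev_probe are hypothetical names used only for illustration):

/* (a) static: definition and initialization in a single macro */
static DECLARE_WAIT_QUEUE_HEAD(rx_wq);

/* (b) dynamic: embed the head in a per-device structure and
 * initialize it at probe time with init_waitqueue_head() */
struct my_dev {
    wait_queue_head_t rx_wq;
    /* ... */
};

static int my_dev_probe(struct my_dev *dev)
{
    init_waitqueue_head(&dev->rx_wq);
    return 0;
}

Either way the result is the same: a locked, empty list head ready to have wait-queue elements linked onto it.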
Because wait queues are modified both by interrupt handlers and by ordinary kernel functions, they must be protected against concurrent access; this synchronization is provided by the lock spinlock in the wait queue head. The task_list field is the head of the list of waiting processes.
The wait queue itself is implemented as a doubly linked list whose elements have type wait_queue_t, defined as follows:
struct __wait_queue {
    unsigned int flags;
    void *private;
    wait_queue_func_t func;
    struct list_head task_list;
};
typedef struct __wait_queue wait_queue_t;
Each element of the wait queue represents a sleeping process that is waiting for some event to occur; the address of its process descriptor is stored in the private field, and the task_list field contains the pointers that link the element into the list of processes waiting for the same event.
As the initialization sequence above shows, a freshly initialized wait queue head is just an empty list head; the doubly linked list does not yet contain any actual wait_queue_t elements.
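Wait-queue elements are rarely built by hand. They are either created on the caller's stack inside the wait_event_* macros (analyzed in the next section) or declared with the DEFINE_WAIT helper. For reference, in kernels of this generation DEFINE_WAIT expands roughly as follows (paraphrased from include/linux/wait.h; the exact text may differ slightly between versions):

#define DEFINE_WAIT_FUNC(name, function)                    \
    wait_queue_t name = {                                   \
        .private    = current,                              \
        .func       = function,                             \
        .task_list  = LIST_HEAD_INIT((name).task_list),     \
    }

#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

Note that the element records the sleeping task (private = current) and the wake-up callback (func); these are exactly the two fields that the next section shows prepare_to_wait_event filling in.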
2. Analysis of wait_event_interruptible
#define wait_event_interruptible(wq, condition) \
({ \
    int __ret = 0; \
    might_sleep(); \
    if (!(condition)) \
        __ret = __wait_event_interruptible(wq, condition); \
    __ret; \
})

#define __wait_event_interruptible(wq, condition) \
    ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
                  schedule())
#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \
({ \
    __label__ __out; \
    wait_queue_t __wait; \
    long __ret = ret; /* explicit shadow */ \
 \
    INIT_LIST_HEAD(&__wait.task_list); \
    if (exclusive) \
        __wait.flags = WQ_FLAG_EXCLUSIVE; \
    else \
        __wait.flags = 0; \
 \
    for (;;) { \
        long __int = prepare_to_wait_event(&wq, &__wait, state); \
 \
        if (condition) \
            break; \
 \
        if (___wait_is_interruptible(state) && __int) { \
            __ret = __int; \
            if (exclusive) { \
                abort_exclusive_wait(&wq, &__wait, \
                                     state, NULL); \
                goto __out; \
            } \
            break; \
        } \
 \
        cmd; \
    } \
    finish_wait(&wq, &__wait); \
__out:  __ret; \
})
As the listing shows, wait_event_interruptible is actually a macro, so calling it a "function" earlier was not strictly accurate.
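Before going deeper into the expansion, note one practical consequence already visible in __ret: wait_event_interruptible returns 0 when the condition became true (including the case where it was already true, in which case the wait queue is never touched) and -ERESTARTSYS when the sleep was interrupted by a signal. A caller should therefore always check the return value, as in this minimal hypothetical read routine (reusing the pcie_demo_wq and data_ready names from the sketch at the top):

static ssize_t demo_read(struct file *filp, char __user *buf,
                         size_t count, loff_t *ppos)
{
    int ret;

    ret = wait_event_interruptible(pcie_demo_wq, data_ready);
    if (ret)            /* -ERESTARTSYS: woken by a signal, not by the device */
        return ret;

    /* the condition is true here: the data can safely be consumed */
    data_ready = 0;
    return 0;
}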
might_sleep() annotates that the current context is allowed to sleep. If the condition is not yet true, __wait_event_interruptible is entered, which in turn expands to ___wait_event. The task-state mask passed in its argument list is defined in the kernel as follows:
/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state are
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */
#define TASK_RUNNING            0
#define TASK_INTERRUPTIBLE      1
#define TASK_UNINTERRUPTIBLE    2
#define __TASK_STOPPED          4
#define __TASK_TRACED           8
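The state argument is what distinguishes the members of the wait_event family: wait_event_interruptible sleeps in TASK_INTERRUPTIBLE, so a signal can end the wait (the -ERESTARTSYS path above), while the plain wait_event variant goes through the same ___wait_event with TASK_UNINTERRUPTIBLE and ignores signals. A rough comparison, using the illustrative names from earlier:

/* signals may abort the wait; the return value must be checked */
if (wait_event_interruptible(pcie_demo_wq, data_ready))
    return -ERESTARTSYS;

/* signals are ignored; the call returns only once data_ready is true */
wait_event(pcie_demo_wq, data_ready);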
As the ___wait_event macro shows, a wait-queue element wait_queue_t __wait is declared on the stack and initialized in the familiar way:
INIT_LIST_HEAD(&__wait.task_list);
static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}
Because the exclusive argument passed in is 0, __wait.flags is set to 0, and the code then enters prepare_to_wait_event:
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
    unsigned long flags;

    if (signal_pending_state(state, current))
        return -ERESTARTSYS;

    wait->private = current;
    wait->func = autoremove_wake_function;

    spin_lock_irqsave(&q->lock, flags);
    if (list_empty(&wait->task_list)) {
        if (wait->flags & WQ_FLAG_EXCLUSIVE)
            __add_wait_queue_tail(q, wait);
        else
            __add_wait_queue(q, wait);
    }
    set_current_state(state);
    spin_unlock_irqrestore(&q->lock, flags);

    return 0;
}
prepare_to_wait_event continues filling in the element: wait->private = current; wait->func = autoremove_wake_function;
wait->func is the callback that will be invoked when the wait is satisfied, i.e. at wake-up time, so its details deserve a closer look:
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int ret = default_wake_function(wait, mode, sync, key);

    if (ret)
        list_del_init(&wait->task_list);
    return ret;
}

int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
                          void *key)
{
    return try_to_wake_up(curr->private, mode, wake_flags);
}
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
* re-schedule is in progress), and as such you're allowed to do
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
* Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
    unsigned long flags;
    int cpu, success = 0;

    /*
     * If we are going to wake up a thread waiting for CONDITION we
     * need to ensure that CONDITION=1 done by the caller can not be
     * reordered with p->state check below. This pairs with mb() in
     * set_current_state() the waiting thread does.
     */
    smp_mb__before_spinlock();
    raw_spin_lock_irqsave(&p->pi_lock, flags);
    if (!(p->state & state))
        goto out;

    trace_sched_waking(p);

    success = 1; /* we're going to change ->state */
    cpu = task_cpu(p);

    /*
     * Ensure we load p->on_rq _after_ p->state, otherwise it would
     * be possible to, falsely, observe p->on_rq == 0 and get stuck
     * in smp_cond_load_acquire() below.
     *
     *  sched_ttwu_pending()                 try_to_wake_up()
     *    [S] p->on_rq = 1;                    [L] P->state
     *        UNLOCK rq->lock  -----.
     *                               \
     *                                +---   RMB
     *  schedule()                   /
     *        LOCK rq->lock    -----'
     *        UNLOCK rq->lock
     *
     *  [task p]
     *    [S] p->state = UNINTERRUPTIBLE       [L] p->on_rq
     *
     * Pairs with the UNLOCK+LOCK on rq->lock from the
     * last wakeup of our task and the schedule that got our task
     * current.
     */
    smp_rmb();
    if (p->on_rq && ttwu_remote(p, wake_flags))
        goto stat;

#ifdef CONFIG_SMP
    /*
     * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
     * possible to, falsely, observe p->on_cpu == 0.
     *
     * One must be running (->on_cpu == 1) in order to remove oneself
     * from the runqueue.
     *
     *  [S] ->on_cpu = 1;           [L] ->on_rq
     *      UNLOCK rq->lock
     *                      RMB
     *      LOCK rq->lock
     *  [S] ->on_rq = 0;            [L] ->on_cpu
     *
     * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
     * from the consecutive calls to schedule(); the first switching to our
     * task, the second putting it to sleep.
     */
    smp_rmb();

    /*
     * If the owning (remote) cpu is still in the middle of schedule() with
     * this task as prev, wait until its done referencing the task.
     */
    while (p->on_cpu)
        cpu_relax();
    /*
     * Combined with the control dependency above, we have an effective
     * smp_load_acquire() without the need for full barriers.
     *
     * Pairs with the smp_store_release() in finish_lock_switch().
     *
     * This ensures that tasks getting woken will be fully ordered against
     * their previous state and preserve Program Order.
     */
    smp_rmb();

    p->sched_contributes_to_load = !!task_contributes_to_load(p);
    p->state = TASK_WAKING;

    if (p->sched_class->task_waking)
        p->sched_class->task_waking(p);

    cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
    if (task_cpu(p) != cpu) {
        wake_flags |= WF_MIGRATED;
        set_task_cpu(p, cpu);
    }
#endif /* CONFIG_SMP */

    ttwu_queue(p, cpu);
stat:
    ttwu_stat(p, cpu, wake_flags);
out:
    raw_spin_unlock_irqrestore(&p->pi_lock, flags);

    return success;
}
The wake-up code above is admittedly hard to follow; for the purpose of this article it is enough to know that it wakes up the thread recorded in the wait element's private field. Once the wake-up succeeds, autoremove_wake_function unlinks the element from the queue with list_del_init(&wait->task_list):
static inline void list_del_init(struct list_head *entry)
{
    __list_del_entry(entry);
    INIT_LIST_HEAD(entry);
}

static inline void __list_del_entry(struct list_head *entry)
{
    __list_del(entry->prev, entry->next);
}

static inline void __list_del(struct list_head * prev, struct list_head * next)
{
    next->prev = prev;
    WRITE_ONCE(prev->next, next);
}

#define WRITE_ONCE(x, val) \
({ \
    union { typeof(x) __val; char __c[1]; } __u = \
        { .__val = (__force typeof(x)) (val) }; \
    __write_once_size(&(x), __u.__c, sizeof(x)); \
    __u.__val; \
})
Back in prepare_to_wait_event, execution reaches if (list_empty(&wait->task_list)). Since __wait.task_list was just initialized to point to itself, list_empty() returns true, so the element still has to be queued; and because wait->flags is 0 the else branch is taken and __add_wait_queue is called:
static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
{
    list_add(&new->task_list, &head->task_list);
}

static inline void list_add(struct list_head *new, struct list_head *head)
{
    __list_add(new, head, head->next);
}

static inline void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
{
    next->prev = new;
    new->next = next;
    new->prev = prev;
    prev->next = new;
}
This links the wait element into the wait queue headed by q (list_add inserts it right after the head). The queue now contains exactly one waiting element, which completes the analysis of wait_event_interruptible.
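For comparison, what ___wait_event expands to is essentially the open-coded wait loop that drivers used to write by hand with the lower-level helpers. A sketch of that manual pattern (prepare_to_wait/finish_wait behave like the prepare_to_wait_event/finish_wait pair above; demo_wait_for_data, ring_has_data() and dev are hypothetical names):

static int demo_wait_for_data(struct my_dev *dev)
{
    DEFINE_WAIT(wait);

    for (;;) {
        prepare_to_wait(&pcie_demo_wq, &wait, TASK_INTERRUPTIBLE);
        if (ring_has_data(dev))             /* the wait condition */
            break;
        if (signal_pending(current)) {      /* what prepare_to_wait_event checks for us */
            finish_wait(&pcie_demo_wq, &wait);
            return -ERESTARTSYS;
        }
        schedule();                         /* the cmd argument of ___wait_event */
    }
    finish_wait(&pcie_demo_wq, &wait);
    return 0;
}

Either way the key points are the same: the element is linked into the queue before the condition is tested, and it is unlinked again, by finish_wait or by autoremove_wake_function, once the wait is over.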
3. Analysis of wake_up_interruptible
As before, it is a chain of macro definitions:
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
void __wake_up(wait_queue_head_t *q, unsigned int mode,
               int nr_exclusive, void *key)
{
    unsigned long flags;

    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, 0, key);
    spin_unlock_irqrestore(&q->lock, flags);
}
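wake_up_interruptible is only one of a family of thin wrappers around __wake_up; its siblings in the same header differ only in the state mask and in nr_exclusive (quoted from memory, so the exact text may vary slightly between kernel versions):

#define wake_up(x)                      __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)               __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x)                  __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_interruptible(x)        __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x)    __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)

Note that nr_exclusive only limits how many exclusive waiters are woken; the waiter added in section 2 was non-exclusive (flags = 0), so it is woken regardless of which variant is used.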
The interesting work happens in __wake_up_common:
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                             int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        if (curr->func(curr, mode, wake_flags, key) &&
            (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}
list_for_each_entry_safe is essentially a for loop that walks the list starting from the queue head; the test &pos->member != (head) ends the loop once the iteration has come back around to the head, and if the list is empty the loop body is never executed at all. The helper macros it relies on are:
#define list_first_entry(ptr, type, member) \
    list_entry((ptr)->next, type, member)

#define list_entry(ptr, type, member) \
    container_of(ptr, type, member)

#define container_of(ptr, type, member) ({ \
    const typeof( ((type *)0)->member ) *__mptr = (ptr); \
    (type *)( (char *)__mptr - offsetof(type,member) );})
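For completeness, one common form of list_for_each_entry_safe itself is shown below (the exact expansion varies between kernel versions, but all of them use &pos->member != (head) as the loop-termination test; list_next_entry(pos, member) is simply list_entry on pos->member.next):

#define list_for_each_entry_safe(pos, n, head, member)              \
    for (pos = list_first_entry(head, typeof(*pos), member),        \
             n = list_next_entry(pos, member);                      \
         &pos->member != (head);                                    \
         pos = n, n = list_next_entry(n, member))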
Because wait_event_interruptible placed one wait element on the queue, the loop body runs at least once, and the if condition evaluates curr->func(curr, mode, wake_flags, key). That func is the autoremove_wake_function installed earlier by prepare_to_wait_event: it wakes up the process stored in curr->private and, when the wake-up succeeds, removes the element from the queue.
With both macros analyzed, the answer to the opening question is clear: one wait creates and queues exactly one wait_queue_t element on the wait-queue head's list. When wake_up is then called two or more times, the first call wakes the process and removes its element; on the second and any further calls, list_for_each_entry_safe finds the queue empty, its body never runs, and the call returns immediately. The extra wake-ups therefore have no effect on the driver or on the thread; they are effectively no-ops.
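This no-op behaviour can be reproduced outside the kernel with nothing more than the list primitives quoted above. The following is a plain user-space C program, a toy re-implementation of list_head, INIT_LIST_HEAD, list_add, list_del_init and list_empty written only for illustration (it is not the kernel code itself), that mimics one wait followed by two wake-ups: the first "wake-up" finds the element and removes it, the second finds an empty queue and does nothing.

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *l) { l->next = l->prev = l; }
static int  list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *new, struct list_head *head)
{
    new->next = head->next;
    new->prev = head;
    head->next->prev = new;
    head->next = new;
}

static void list_del_init(struct list_head *e)
{
    e->prev->next = e->next;
    e->next->prev = e->prev;
    INIT_LIST_HEAD(e);          /* the entry points to itself again */
}

int main(void)
{
    struct list_head q, wait;   /* the "queue head" and one "wait element" */

    INIT_LIST_HEAD(&q);
    INIT_LIST_HEAD(&wait);
    list_add(&wait, &q);        /* wait_event: one element queued */

    /* first wake_up: the element is found, "woken" and removed */
    if (!list_empty(&q))
        list_del_init(q.next);
    printf("after 1st wake-up, queue empty: %d\n", list_empty(&q));

    /* second wake_up: the queue is already empty, the loop body never runs */
    if (!list_empty(&q))
        list_del_init(q.next);
    printf("after 2nd wake-up, queue empty: %d\n", list_empty(&q));

    return 0;
}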