等待队列本质上是一个双向链表,由等待队列头和队列节点构成。
当运行的线程要获得某一个资源而暂不可得时,线程有时候需要等待,此时它可以进入睡眠状态,内核为此生成一个新的等待队列节点,将睡眠的线程挂载到等待队列中。
定义和初始化等待队列头:
1) 动态定义
/* Declare a wait-queue head and initialise it at run time. */
wait_queue_head_t my_queue;
init_waitqueue_head(&my_queue);
/*
 * init_waitqueue_head(q) - run-time initialiser for a wait-queue head.
 * Each expansion defines its own static lock_class_key and hands it,
 * together with the stringified argument (#q), to __init_waitqueue_head().
 */
#define init_waitqueue_head(q) \
do { \
static struct lock_class_key __key; \
\
__init_waitqueue_head((q), #q, &__key); \
} while (0)
/*
 * __init_waitqueue_head - back end of the init_waitqueue_head() macro.
 * @q:    wait-queue head to set up
 * @name: name handed to lockdep for q->lock
 * @key:  lock class key handed to lockdep for q->lock
 *
 * Afterwards q->lock is initialised and q->task_list is a list head
 * pointing at itself.
 */
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
	/* The list set-up does not touch the lock, so it can go first. */
	INIT_LIST_HEAD(&q->task_list);

	spin_lock_init(&q->lock);
	lockdep_set_class_and_name(&q->lock, key, name);
}
/*
 * INIT_LIST_HEAD - make @list an empty list.
 * Both links are pointed back at the head itself.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list->prev = list;
}
2) 静态定义(通过宏定义)
/* Define and statically initialise a wait-queue head in one step. */
DECLARE_WAIT_QUEUE_HEAD(my_queue);
/*
 * Static initialiser for a wait_queue_head_t called @name: the lock starts
 * unlocked and both task_list links point at name.task_list itself.
 */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
.task_list = { &(name).task_list, &(name).task_list } }
/* Define a wait_queue_head_t variable @name using the initialiser above. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
展开看宏定义, kernel/include/linux/wait.h
/*
 * Static initialiser for a wait-queue entry: .private records @tsk,
 * .func is default_wake_function, and both task_list links start as NULL.
 * Note that @name is not used inside the initialiser itself.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) { \
.private = tsk, \
.func = default_wake_function, \
.task_list = { NULL, NULL } }
/* Define a wait_queue_t variable @name bound to task @tsk. */
#define DECLARE_WAITQUEUE(name, tsk) \
wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
以后面会用到的 DECLARE_WAITQUEUE(wait, current); 为例,可以看到wait就是一个变量名称,那么这个current是什么呢?
kernel/include/asm-generic/current.h
/* `current` expands to the task pointer stored in the running thread's thread_info. */
#define get_current() (current_thread_info()->task)
#define current get_current()
/*
 * current_thread_info - locate the thread_info of the running thread.
 *
 * The stack pointer is rounded down to a THREAD_SIZE boundary and the
 * result is used as the address of struct thread_info.
 * (Assumes THREAD_SIZE is a power of two - TODO confirm.)
 */
static inline struct thread_info *current_thread_info(void)
{
	register unsigned long sp asm ("sp");
	unsigned long stack_base = sp & ~(THREAD_SIZE - 1);

	return (struct thread_info *)stack_base;
}
/*
 * Low-level per-thread data returned by current_thread_info().
 * The `task` member is what the `current` macro reads to reach the
 * thread's task_struct.
 */
struct thread_info {
unsigned long flags; /* low level flags */
int preempt_count; /* 0 => preemptable, <0 => bug */
mm_segment_t addr_limit; /* address limit */
struct task_struct *task; /* main task structure */
struct exec_domain *exec_domain; /* execution domain */
__u32 cpu; /* cpu */
__u32 cpu_domain; /* cpu domain */
struct cpu_context_save cpu_context; /* cpu context */
__u32 syscall; /* syscall number */
__u8 used_cp[16]; /* thread used copro */
unsigned long tp_value;
/* Only present when CONFIG_CRUNCH is enabled. */
#ifdef CONFIG_CRUNCH
struct crunch_state crunchstate;
#endif
union fp_state fpstate __attribute__((aligned(8)));
union vfp_state vfpstate;
/* Only present when CONFIG_ARM_THUMBEE is enabled. */
#ifdef CONFIG_ARM_THUMBEE
unsigned long thumbee_state; /* ThumbEE Handler Base register */
#endif
struct restart_block restart_block;
};
可以看到current其实是获取当前进程的task_struct 结构体,DECLARE_WAITQUEUE(wait, current); 是定义一个名为wait的wait_queue_t局部变量,并初始化了此结构体。
/*
 * get_pkmap_wait_queue_head - return the wait-queue head used while waiting
 * for a free pkmap entry.
 * @color: not used by this implementation; every caller gets the same
 *         function-local static queue head.
 */
static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
{
static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
return &pkmap_map_wait;
}
/*
 * Excerpt of a function body that uses a wait queue by hand. The function
 * header, the local declarations and the `start` label targeted by the
 * goto below are not part of this excerpt.
 */
{
for (;;) {
last_pkmap_nr = get_next_pkmap_nr(color);
if (no_more_pkmaps(last_pkmap_nr, color)) {
flush_all_zero_pkmaps();
count = get_pkmap_entries_count(color);
}
if (!pkmap_count[last_pkmap_nr])
break; /* Found a usable entry */
if (--count)
continue;
/*
* Sleep for somebody else to unmap their entries
*/
{
/* On-stack wait-queue entry bound to the current task. */
DECLARE_WAITQUEUE(wait, current);
wait_queue_head_t *pkmap_map_wait =
get_pkmap_wait_queue_head(color);
/* Set the sleeping state and queue ourselves before dropping the kmap lock. */
__set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(pkmap_map_wait, &wait);
unlock_kmap();
schedule();
/* Back from schedule(): unlink our entry, then retake the lock. */
remove_wait_queue(pkmap_map_wait, &wait);
lock_kmap();
/* Somebody else might have mapped it while we slept */
if (page_address(page))
return (unsigned long)page_address(page);
/* Re-start */
goto start;
}
}
=======================唤醒此等待队列======================
/*
 * Wake-up helpers; all of them expand to __wake_up() on wait-queue head x.
 * The third argument is nr_exclusive: 0 wakes every waiter, a positive
 * value wakes all non-exclusive waiters plus at most that many exclusive
 * ones.
 */
/* Wake all non-exclusive waiters on x and at most one exclusive waiter. */
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
/* As wake_up(), but allow up to nr exclusive waiters to be woken. */
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
/* Wake every waiter queued on x. */
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
/* As wake_up(), but the mode passed down is TASK_INTERRUPTIBLE only. */
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
其实这几个接口最后都是如下调用
/*
 * __wake_up - wake waiters queued on a wait-queue head.
 * @q:            the wait-queue head
 * @mode:         passed through to each entry's wake function
 * @nr_exclusive: how many exclusive waiters to wake (0 means no limit)
 * @key:          opaque value passed through to each entry's wake function
 *
 * The queue is walked with q->lock held and local interrupts saved/disabled.
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode,
	       int nr_exclusive, void *key)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&q->lock, irq_flags);
	__wake_up_common(q, mode, nr_exclusive, 0, key);
	spin_unlock_irqrestore(&q->lock, irq_flags);
}
/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			     int nr_exclusive, int wake_flags, void *key)
{
	wait_queue_t *entry, *following;

	list_for_each_entry_safe(entry, following, &q->task_list, task_list) {
		/* Read the flags before the wake callback runs. */
		unsigned entry_flags = entry->flags;

		/*
		 * ->func is default_wake_function for entries created with
		 * DECLARE_WAITQUEUE. A zero return means nothing was woken,
		 * so the entry does not count against nr_exclusive.
		 */
		if (!entry->func(entry, mode, wake_flags, key))
			continue;

		/* Only exclusive entries consume the budget. */
		if (!(entry_flags & WQ_FLAG_EXCLUSIVE))
			continue;

		if (--nr_exclusive == 0)
			break;
	}
}
可以看到这里默认是唤醒wait_queue_head_t 上的每个wait_queue_t,但会根据队列项自身的flags(是否带WQ_FLAG_EXCLUSIVE)和传进来的参数nr_exclusive决定是否break,所以不同参数也可以只唤醒部分等待队列项。
=========================do_select里等待队列的使用==========================
/*
 * do_select - core loop behind select().
 * @n:        number of descriptors to examine (replaced by the value
 *            returned from max_select_fd())
 * @fds:      input bitmaps (in/out/ex) and result bitmaps (res_in/res_out/res_ex)
 * @end_time: optional timeout; NULL means no timeout is applied
 *
 * Each pass calls ->poll on every requested descriptor and records ready
 * bits in the result bitmaps. If nothing is ready, the task sleeps in
 * poll_schedule_timeout() and rescans afterwards.
 *
 * Returns the number of ready bits counted, table.error if that was set,
 * or the negative value returned by max_select_fd().
 */
int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
poll_table *wait;
int retval, i, timed_out = 0;
u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_end = 0;
rcu_read_lock();
retval = max_select_fd(n, fds);
rcu_read_unlock();
if (retval < 0)
return retval;
n = retval;
/* Set up the poll table; its queueing callback becomes __pollwait. */
poll_initwait(&table);
wait = &table.pt;
/* A zero timeout means "just check once": clear the queueing callback and mark as timed out. */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
wait->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
retval = 0;
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
bool can_busy_loop = false;
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
/* Walk the bitmaps one unsigned long (BITS_PER_LONG descriptors) at a time. */
for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
unsigned long in, out, ex, all_bits, bit = 1, mask, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
/* Nothing requested in this word: skip all of its descriptors. */
if (all_bits == 0) {
i += BITS_PER_LONG;
continue;
}
for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
struct fd f;
if (i >= n)
break;
if (!(bit & all_bits))
continue;
f = fdget(i);
if (f.file) {
const struct file_operations *f_op;
f_op = f.file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op->poll) {
wait_key_set(wait, in, out,
bit, busy_flag);
mask = (*f_op->poll)(f.file, wait); // calls the driver's poll handler (module_proc_poll in the example driver)
}
fdput(f);
/* Once any descriptor is ready, clear the queueing callback for the rest of the scan. */
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
wait->_qproc = NULL;
}
if ((mask & POLLOUT_SET) && (out & bit)) {
res_out |= bit;
retval++;
wait->_qproc = NULL;
}
if ((mask & POLLEX_SET) && (ex & bit)) {
res_ex |= bit;
retval++;
wait->_qproc = NULL;
}
/* got something, stop busy polling */
if (retval) {
can_busy_loop = false;
busy_flag = 0;
/*
* only remember a returned
* POLL_BUSY_LOOP if we asked for it
*/
} else if (busy_flag & mask)
can_busy_loop = true;
}
}
if (res_in)
*rinp = res_in;
if (res_out)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
cond_resched();
}
/* The first full pass is done; later passes run with the queueing callback cleared. */
wait->_qproc = NULL;
if (retval || timed_out || signal_pending(current))
break;
if (table.error) {
retval = table.error;
break;
}
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
if (!busy_end) {
busy_end = busy_loop_end_time();
continue;
}
if (!busy_loop_timeout(busy_end))
continue;
}
busy_flag = 0;
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
* pointer to the expiry value.
*/
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
/* Sleep here; a zero return is treated as the timeout having expired. */
if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
to, slack))
timed_out = 1;
}
poll_freewait(&table);
return retval;
}
poll_initwait(&table);
/*
 * poll_initwait - prepare a poll_wqueues for one select/poll call.
 * @pwq: structure to initialise
 *
 * Installs __pollwait as the queueing callback of pwq->pt, records the
 * calling task and clears the remaining bookkeeping fields.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);

	/* Start with clean bookkeeping. */
	pwq->inline_index = 0;
	pwq->table = NULL;
	pwq->error = 0;
	pwq->triggered = 0;

	/*
	 * Remember the polling task here; the wake-up side reaches pwq
	 * through the queue entry and wakes this task.
	 */
	pwq->polling_task = current;
}
/*
 * init_poll_funcptr - set up a poll_table.
 * @pt:    table to initialise
 * @qproc: queueing callback stored in pt->_qproc
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->_key = ~0UL; /* all events enabled */
	pt->_qproc = qproc;
}
/*
 * __pollwait - add a new entry.
 * @filp:         file being polled; a reference is taken with get_file()
 * @wait_address: wait-queue head supplied by the driver
 * @p:            poll table embedded in a struct poll_wqueues
 *
 * Takes a free poll_table_entry, fills it in and queues its wait-queue
 * entry on @wait_address. Returns silently if no entry is available.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p)
{
	struct poll_wqueues *queues = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *slot = poll_get_entry(queues);

	if (!slot)
		return;

	slot->filp = get_file(filp);
	slot->wait_address = wait_address;
	slot->key = p->_key;

	/* Initialise the wait_queue_t with pollwake as its wake function. */
	init_waitqueue_func_entry(&slot->wait, pollwake);
	/*
	 * Store the poll_wqueues in ->private (after the initialiser, which
	 * resets it to NULL) so the wake-up side can get from the queue
	 * entry back to the polling task.
	 */
	slot->wait.private = queues;
	/*
	 * wait_address is the queue head the driver initialised; once the
	 * entry is on it, a wake-up on that head will find this entry.
	 */
	add_wait_queue(wait_address, &slot->wait);
}
/*
 * init_waitqueue_func_entry - initialise a wait_queue_t with a custom
 * wake function.
 * @q:    entry to initialise; flags are cleared and ->private is set to NULL
 * @func: wake function stored in q->func
 */
static inline void init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
{
	q->func = func;
	q->private = NULL;
	q->flags = 0;
}
do_select 里然后就可以睡下去了
/* Excerpt from do_select(): sleep here; a zero return is recorded as a timeout. */
if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
模块里启动时初始化一个队列,rxwait
/* Run-time initialisation of the driver's rxwait wait-queue head. */
init_waitqueue_head(&module_proc->assert.rxwait);
do_select里fd->poll进入模块驱动里poll了,然后将此队列项加入驱动里rxwait队列中
const struct file_operations mdbg_proc_fops = {
.open = mdbg_proc_open,
.release = mdbg_proc_release,
.read = mdbg_proc_read,
.write = mdbg_proc_write,
.poll = mdbg_proc_poll,
};
/*
 * module_proc_poll - ->poll handler of the example driver.
 * @filp: file being polled; filp->private_data holds the module_proc_entry
 * @wait: poll table handed in by the caller
 *
 * Returns POLLIN | POLLRDNORM when the "assert" entry has received data
 * (assert.rcv_len > 0), otherwise 0.
 */
static unsigned int module_proc_poll(struct file *filp, poll_table *wait)
{
	struct module_proc_entry *entry = (struct module_proc_entry *)filp->private_data;
	char *type = entry->name;
	unsigned int mask = 0;

	if (strcmp(type, "assert") == 0) {
		/*
		 * poll_wait() runs the callback installed by
		 * init_poll_funcptr(&pwq->pt, __pollwait), i.e. __pollwait,
		 * so the wait_queue_t behind `wait` is added to the
		 * assert.rxwait queue.
		 */
		poll_wait(filp, &module_proc->assert.rxwait, wait);
		if (module_proc->assert.rcv_len > 0)
			mask |= POLLIN | POLLRDNORM;
	}

	/*
	 * NOTE(review): the original excerpt stopped before this point; the
	 * real driver may handle further entry types before returning.
	 */
	return mask;
}
驱动模块里监测到消息时就可以wakeup唤醒do_select;然后do_select就可以获取驱动里的信息并返回给上层了。
wake_up_interruptible(&module_proc->assert.rxwait); // note: rxwait is a wait_queue_head_t
使用select函数的过程一般是:
先调用宏FD_ZERO将指定的fd_set清零,然后调用宏FD_SET将需要测试的fd加入fd_set,接着调用函数select测试fd_set中的所有fd,最后用宏FD_ISSET检查某个fd在函数select调用后,相应位是否仍然为1
/* Create the listening socket that will be watched with select(). */
pWcndManger->listen_fd = socket_local_server(
	WCND_SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT, SOCK_STREAM);
fd_set read_fds;
/* Start from an empty set, then add the one descriptor to test. */
FD_ZERO(&read_fds);
FD_SET(pWcndManger->listen_fd, &read_fds);
/*
 * NULL timeout: block until a watched descriptor becomes readable.
 * NOTE(review): max is presumably the highest descriptor in the set;
 * it is computed outside this excerpt - confirm.
 */
rc = select(max + 1, &read_fds, NULL, NULL, NULL);
/*
 * select() rewrites read_fds; only look at it when the call reported at
 * least one ready descriptor (on error the set contents are unspecified).
 */
if (rc > 0 && FD_ISSET(pWcndManger->listen_fd, &read_fds)) {
	client = accept(pWcndManger->listen_fd, &addr, &alen);
	read(client, buffer, sizeof(buffer) - 1);
}
=============下面是以一个sysdump问题来理解链表里各指针的含义===========
死机现场及调用栈:
[121256.724192] c1 Unable to handle kernel paging request at virtual address dead000000000100
[121256.724217] c1 pgd = ffffffc04f1d3000
[121256.724232] c0 [dead000000000100] *pgd=0000000082e35003, *pud=0000000082e35003, *pmd=0000000000000000
[121256.724273] c1 Internal error: Oops: 96000004 [#1] PREEMPT SMP
[121256.730095] c0 Modules linked in: sprdwl_ng(O) mtty marlin2_fm mali_kbase(O) [last unloaded: sprdwl_ng]
[121256.730163] c1 CPU: 1 PID: 1197 Comm: kworker/u13:2 Tainted: G S W O 4.4.83 #4
[121256.730178] c1 Hardware name: Spreadtrum SP9850KHsmt 2c20 Board (DT)
[121256.730224] c0 Workqueue: mali_jd kbase_jd_done_worker [mali_kbase]
[121256.730245] c1 task: ffffffc0aa978d80 task.stack: ffffffc0841a4000
[121256.730271] c1 PC is at __wake_up_common+0x78/0xa4
[121256.730290] c1 LR is at __wake_up_common+0x64/0xa4
[121256.730306] c1 pc : [<ffffff80080f3658>] lr : [<ffffff80080f3644>] pstate: 600001c5
[121256.730319] c1 sp : ffffffc0841a7be0
[121256.730333] c0 x29: ffffffc0841a7be0 x28: 0000000000000001
[121256.730359] c0 x27: ffffff800a2bacc8 x26: 0000000000000001
[121256.730384] c0 x25: 0000000000000000 x24: 0000000000000001
[121256.730409] c0 x23: 0000000000000001 x22: 0000000000000000
[121256.730434] c0 x21: 0000000000000000 x20: ffffff800a2ac310
[121256.730459] c0 x19: dead0000000000e8 x18: 000000000004c001
[121256.730485] c0 x17: 0000000000000001 x16: 0000000000000004
[121256.730510] c0 x15: 000000000000061f x14: 0000000000000001
[121256.730535] c0 x13: 0000000000000000 x12: 0000000000000204
[121256.730560] c0 x11: ffffffffffffff78 x10: 0000000000000090
[121256.730585] c0 x9 : 0000000000000001 x8 : ffffff8009082090
[121256.730610] c0 x7 : ffffffc0bfeb5090 x6 : ffffff800909d3d8
[121256.730635] c0 x5 : 0000000000000001 x4 : 00000040b6e33000
[121256.730661] c0 x3 : 00000000000004ad x2 : ffffff8009085f00
[121256.730686] c0 x1 : dead000000000100 x0 : 0000000000000000
[121257.015504] c1 [<ffffff80080f3658>] __wake_up_common+0x78/0xa4
[121257.015526] c1 [<ffffff80080f395c>] __wake_up+0x48/0x60
[121257.015573] c1 [<ffffff8000b61554>] kbase_event_wakeup+0x2c/0x38 [mali_kbase]
[121257.015610] c1 [<ffffff8000b53248>] kbase_event_post+0xd0/0x1a8 [mali_kbase]
[121257.015646] c1 [<ffffff8000b4d360>] kbase_jd_done_worker+0x230/0x3e8 [mali_kbase]
[121257.015667] c1 [<ffffff80080be3bc>] process_one_work+0x154/0x458
[121257.015686] c1 [<ffffff80080bf088>] worker_thread+0x134/0x4a4
[121257.015704] c1 [<ffffff80080c57c8>] kthread+0xdc/0xf0
看此函数的C代码:
/*
 * The function named in the crash (PC at __wake_up_common+0x78). It walks
 * q->task_list and calls each entry's ->func; the walk stops early only
 * after nr_exclusive exclusive entries have reported a successful wake-up.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
/* `next` is fetched ahead of the loop body by the _safe iterator. */
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
/* Flags are read before the wake callback is invoked. */
unsigned flags = curr->flags;
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos: the type * to use as a loop cursor.
 * @n: another type * to use as temporary storage
 * @head: the head for your list.
 * @member: the name of the list_head within the struct.
 *
 * @n is loaded with the entry after @pos before the loop body runs, so the
 * body may unlink @pos. Nothing here protects the entry held in @n.
 */
#define list_for_each_entry_safe(pos, n, head, member)\
for (pos = list_first_entry(head, typeof(*pos), member),\
n = list_next_entry(pos, member);\
&pos->member != (head); \
pos = n, n = list_next_entry(n, member))
再次分析汇编
crash_arm64> dis -xl __wake_up_common
/kernel/kernel/sched/wait.c: 67
0xffffff80080f35e0 <__wake_up_common>: stp x29, x30, [sp,#-80]!
0xffffff80080f35e4 <__wake_up_common+0x4>: mov x29, sp
0xffffff80080f35e8 <__wake_up_common+0x8>: stp x19, x20, [sp,#16]
0xffffff80080f35ec <__wake_up_common+0xc>: stp x21, x22, [sp,#32]
0xffffff80080f35f0 <__wake_up_common+0x10>: stp x23, x24, [sp,#48]
0xffffff80080f35f4 <__wake_up_common+0x14>: str x25, [sp,#64]
0xffffff80080f35f8 <__wake_up_common+0x18>: mov x20, x0
0xffffff80080f35fc <__wake_up_common+0x1c>: mov x0, x30
0xffffff80080f3600 <__wake_up_common+0x20>: mov x21, x4
0xffffff80080f3604 <__wake_up_common+0x24>: mov w23, w1
0xffffff80080f3608 <__wake_up_common+0x28>: mov w24, w2
0xffffff80080f360c <__wake_up_common+0x2c>: mov w22, w3
0xffffff80080f3610 <__wake_up_common+0x30>: nop
kernel/kernel/sched/wait.c: 70
0xffffff80080f3614 <__wake_up_common+0x34>: ldr x4, [x20,#8]! x4是q->task_list->next指向第一个task_list节点的地址,即0xfc18
0xffffff80080f3618 <__wake_up_common+0x38>: cmp x20, x4 这个就是for语句条件判断,链表头X20跟第一个节点比
0xffffff80080f361c <__wake_up_common+0x3c>: ldr x19, [x4],#-24 取0xfc18指向下一个task_list节点地址给x19(即dead100),X4上移到wait_queue_t结构体的首地址
0xffffff80080f3620 <__wake_up_common+0x40>: sub x19, x19, #0x18 下一个task_list偏移成wait_queue_t结构体的首地址,给X19(图中最右边节点首地址)
0xffffff80080f3624 <__wake_up_common+0x44>: b.eq 0xffffff80080f366c <__wake_up_common+0x8c>
/kernel/kernel/sched/wait.c: 73
0xffffff80080f3628 <__wake_up_common+0x48>: ldr x5, [x4,#16] 取其func
0xffffff80080f362c <__wake_up_common+0x4c>: mov w1, w23
0xffffff80080f3630 <__wake_up_common+0x50>: mov x0, x4
0xffffff80080f3634 <__wake_up_common+0x54>: mov w2, w22
0xffffff80080f3638 <__wake_up_common+0x58>: mov x3, x21
/kernel/kernel/sched/wait.c: 71
0xffffff80080f363c <__wake_up_common+0x5c>: ldr w25, [x4] 取flag
kernel/kernel/sched/wait.c: 73
0xffffff80080f3640 <__wake_up_common+0x60>: blr x5
/kernel/kernel/sched/wait.c: 70
0xffffff80080f3644 <__wake_up_common+0x64>: add x1, x19, #0x18 x1又偏移成task_list
kernel/kernel/sched/wait.c: 73
0xffffff80080f3648 <__wake_up_common+0x68>: cbz w0, 0xffffff80080f3658 <__wake_up_common+0x78>
0xffffff80080f364c <__wake_up_common+0x6c>: tbz w25, #0, 0xffffff80080f3658 <__wake_up_common+0x78>
/kernel/kernel/sched/wait.c: 74
0xffffff80080f3650 <__wake_up_common+0x70>: subs w24, w24, #0x1
0xffffff80080f3654 <__wake_up_common+0x74>: b.eq 0xffffff80080f366c <__wake_up_common+0x8c>
kernel/kernel/sched/wait.c: 70
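0xffffff80080f3658 <__wake_up_common+0x78>: ldr x0, [x19,#24] 再取下下个节点的task_list(即 n = list_next_entry(n, member),for 循环第三句);此时x19=0xdead0000000000e8,[x19+24]即0xdead000000000100,正是死机时访问的非法地址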
0xffffff80080f365c <__wake_up_common+0x7c>: cmp x20, x1 for语句条件判断,链表头X20跟第二个节点比,相等则链表一圈走完回到头了。
crash_arm64> list -s list_head ffffff800a2ac310
ffffff800a2ac310
struct list_head {
next = 0xffffffc09e93fc18,
prev = 0xffffffc09e93fc18
}
ffffffc09e93fc18
struct list_head {
next = 0xdead000000000100,
prev = 0xdead000000000200
}
dead000000000100
struct list_head Cannot access memory at address 0xdead000000000100
crash_arm64> rd ffffff800a2ac310
ffffff800a2ac310: ffffffc09e93fc18 ........
crash_arm64> rd ffffff800a2ac318
ffffff800a2ac318: ffffffc09e93fc18 ........
crash_arm64> rd 0xffffffc09e93fc18
ffffffc09e93fc18: dead000000000100 ........
crash_arm64> rd 0xffffffc09e93fc20
ffffffc09e93fc20: dead000000000200 ........
此问题就是链表第一个节点已经被删除了,里面的next和prev分别被赋值为0xdead000000000100和0xdead000000000200(即内核list_del时写入的LIST_POISON1/LIST_POISON2毒化值)。然而代码仍访问此被删除节点,并根据0xdead000000000100这个异常值去获取下一个节点,于是访问了非法地址。